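# Cosine Similarity Inconsistency Issue

**Summary:** `_cosine_similarity` is documented to return a matrix of shape `(n, k)`, but whenever either input is empty it returns a 1-D array of shape `(0,)` (for example `(0,)` instead of `(1, 0)`).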

In [1]:
import numpy as np
from langchain_core.vectorstores.utils import _cosine_similarity

# Inputs: a single two-column row for x, and an empty 1-D array for y.
x, y = np.array([[1.0, 2.0]]), np.array([])

# Minimal reproduction: print the shape returned for the empty second argument.
result = _cosine_similarity(x, y)
print(result.shape)  # Got (0,), expected (1, 0)

(0,)


In [2]:
import numpy as np
from langchain_core.vectorstores.utils import _cosine_similarity

# Each entry is one (x, y) input pair. All pairs go through the same
# call-and-report step below instead of repeating that step once per case.
cases = [
    # Test case 1: The problematic case
    (np.array([[1.0, 2.0]]), np.array([])),
    # Test case 2: Normal case for comparison
    (np.array([[1.0, 2.0]]), np.array([[3.0, 4.0], [1.0, 1.0]])),
    # Test case 3: Empty x, non-empty y
    (np.array([]).reshape(0, 2), np.array([[3.0, 4.0]])),
    # Test case 4: Both empty
    (np.array([]).reshape(0, 2), np.array([]).reshape(0, 2)),
]

# Case numbers start at 1 so the printed labels match the comments above.
for case_number, (x, y) in enumerate(cases, start=1):
    result = _cosine_similarity(x, y)
    print(f"Case {case_number} - x: {x.shape}, y: {y.shape}, result: {result.shape}")

Case 1 - x: (1, 2), y: (0,), result: (0,)
Case 2 - x: (1, 2), y: (2, 2), result: (1, 2)
Case 3 - x: (0, 2), y: (1, 2), result: (0,)
Case 4 - x: (0, 2), y: (0, 2), result: (0,)


In [3]:
# Fetch the source text of the imported _cosine_similarity and print it for inspection.
import inspect

implementation_source = inspect.getsource(_cosine_similarity)
print(implementation_source)

def _cosine_similarity(x: Matrix, y: Matrix) -> np.ndarray:
    """Row-wise cosine similarity between two equal-width matrices.

    Args:
        x: A matrix of shape (n, m).
        y: A matrix of shape (k, m).

    Returns:
        A matrix of shape (n, k) where each element (i, j) is the cosine similarity
        between the ith row of X and the jth row of Y.

    Raises:
        ValueError: If the number of columns in X and Y are not the same.
        ImportError: If numpy is not installed.
    """
    try:
        import numpy as np
    except ImportError as e:
        msg = (
            "cosine_similarity requires numpy to be installed. "
            "Please install numpy with `pip install numpy`."
        )
        raise ImportError(msg) from e

    if len(x) == 0 or len(y) == 0:
        return np.array([])

    x = np.array(x)
    y = np.array(y)
    if x.shape[1] != y.shape[1]:
        msg = (
            f"Number of columns in X and Y must be the same. X has shape {x.shape}

## Test Cases

In [4]:
def test_cosine_similarity_edge_cases():
    """Report the result shape of ``_cosine_similarity`` for empty-input combinations.

    For every scenario, one line is printed containing the scenario label, the
    shapes of both inputs and the shape of the returned array. If the call (or
    the formatting of its result) raises, the exception message is printed for
    that scenario instead and the loop moves on to the next one.
    """
    labels = (
        "non-empty x, empty y",
        "empty x, non-empty y",
        "both empty",
        "multiple x, empty y",
    )
    # Input pairs, listed in the same order as the labels above.
    input_pairs = (
        (np.array([[1.0, 2.0]]), np.array([])),
        (np.array([]).reshape(0, 2), np.array([[3.0, 4.0]])),
        (np.array([]).reshape(0, 2), np.array([]).reshape(0, 2)),
        (np.array([[1.0, 2.0], [3.0, 4.0]]), np.array([])),
    )

    for label, (lhs, rhs) in zip(labels, input_pairs):
        try:
            similarity = _cosine_similarity(lhs, rhs)
            print(f"{label}: x{lhs.shape} × y{rhs.shape} → result{similarity.shape}")
        except Exception as err:
            # Deliberately broad: the failure itself is part of the report.
            print(f"{label}: ERROR - {err}")

In [5]:
# Run the edge-case report; it prints one line per input combination.
test_cosine_similarity_edge_cases()

non-empty x, empty y: x(1, 2) × y(0,) → result(0,)
empty x, non-empty y: x(0, 2) × y(1, 2) → result(0,)
both empty: x(0, 2) × y(0, 2) → result(0,)
multiple x, empty y: x(2, 2) × y(0,) → result(0,)


In [6]:
from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine

# Reference check: call scikit-learn's cosine_similarity with a (0, 2) second argument.
x = np.array([[1.0, 2.0]])
y = np.empty((0, 2))  # Properly shaped empty array: zero rows, two columns

try:
    sklearn_result = sklearn_cosine(x, y)
    print(f"sklearn result shape: {sklearn_result.shape}")
except Exception as e:
    # The exception is the point of this cell, so show it instead of failing.
    print(f"sklearn error: {e}")

sklearn error: Found array with 0 sample(s) (shape=(0, 2)) while a minimum of 1 is required by check_pairwise_arrays.
