In [1]:
# (a) IMPORTS  — sentence-transformers, pandas, scikit-learn (plus numpy)
import pandas as pd
import numpy as np

# SentenceTransformers for MiniLM model + cosine sim helper
from sentence_transformers import SentenceTransformer, util

# We’ll also import scikit-learn as requested (we'll show both cosine paths)
from sklearn.metrics.pairwise import cosine_similarity


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# (b) LOAD DATA — expects columns: 'title' and 'plot'
# Use a forward slash as the separator: a backslash followed by 'm' is an
# invalid string escape (Python emits a warning for it), and a backslash path
# only resolves on Windows. Forward slashes work on every platform.
csv_path = "Assignment-1/movies.csv"  # change if your CSV is elsewhere

df = pd.read_csv(csv_path)

# Basic sanity checks / normalization
# Map lowercased header -> header as written in the file, so the lookup below
# is case-insensitive. str() guards against non-string column labels.
lower_map = {str(c).lower(): c for c in df.columns}
if "title" in lower_map and "plot" in lower_map:
    # Rename whatever casing the file used to the canonical lowercase names
    df = df.rename(columns={lower_map["title"]: "title", lower_map["plot"]: "plot"})
else:
    raise ValueError("CSV must contain 'title' and 'plot' columns.")

# Ensure string dtype.
# Missing cells are filled with "" first; astype(str) alone would turn them
# into the literal text "nan", which would then be treated as real content.
df["title"] = df["title"].fillna("").astype(str)
df["plot"]  = df["plot"].fillna("").astype(str)

# Peek
df.head()


  csv_path = "Assignment-1\movies.csv"  # change if your CSV is elsewhere


Unnamed: 0,title,plot
0,Spy Movie,A spy navigates intrigue in Paris to stop a te...
1,Romance in Paris,A couple falls in love in Paris under romantic...
2,Action Flick,A high-octane chase through New York with expl...


In [6]:
# (c) EMBEDDINGS — turn every plot into a normalized MiniLM vector
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

# The encoder is fed a plain Python list holding one string per movie
plots = df["plot"].tolist()

# Request a NumPy array back (convert_to_numpy) whose rows are L2-normalized
# (normalize_embeddings), so a dot product between rows equals cosine similarity
embeddings = model.encode(plots, convert_to_numpy=True, normalize_embeddings=True)

# One row per plot
print("Embeddings shape:", embeddings.shape)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Embeddings shape: (3, 384)


In [7]:
def search_movies(query: str, top_n: int = 5,
                  use_sklearn: bool = False) -> pd.DataFrame:
    """
    Rank the movies in the notebook-level `df` against a free-text query.

    The query is encoded with the notebook-level `model`, scored against the
    precomputed notebook-level `embeddings` by cosine similarity, and the
    best-scoring rows are returned in descending order of similarity.

    Parameters
    ----------
    query : str
        Search text, e.g. 'spy thriller in Paris'.
    top_n : int
        Maximum number of rows to return. The value is clamped to the range
        [0, number of scored rows]; a clamped value of 0 yields an empty frame.
    use_sklearn : bool
        True  -> score with scikit-learn's cosine_similarity.
        False -> (default) score with sentence-transformers' util.cos_sim.

    Returns
    -------
    pd.DataFrame
        Columns ['title', 'plot', 'similarity'], index reset to 0..k-1,
        ordered from most to least similar.
    """
    result_columns = ["title", "plot", "similarity"]

    # Encode the query as a one-row batch, normalized like the plot embeddings
    query_vec = model.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=True,
    )

    # Score the query against every stored embedding -> flat array of scores
    if not use_sklearn:
        # Result is moved to CPU and converted to a flat NumPy array
        scores = util.cos_sim(query_vec, embeddings).cpu().numpy().flatten()
    else:
        # Row 0 of the returned matrix holds the scores for our single query
        scores = cosine_similarity(query_vec, embeddings)[0]

    # Clamp the requested count to what is actually available
    k = max(0, min(top_n, len(scores)))
    if k == 0:
        return pd.DataFrame(columns=result_columns)

    # Partial selection of the k best (scores negated so "largest" comes
    # first), then a full sort of just those k candidates.
    candidates = np.argpartition(-scores, k - 1)[:k]
    order = np.argsort(-scores[candidates])
    ranked = candidates[order]

    # Pull the matching rows by position and attach their scores
    hits = df.iloc[ranked].copy().reset_index(drop=True)
    hits["similarity"] = scores[ranked].astype(float)
    return hits[result_columns]


In [8]:
# Demo: default scoring path (sentence-transformers' util.cos_sim)
results = search_movies(query="spy thriller in Paris", top_n=5)
results


Unnamed: 0,title,plot,similarity
0,Spy Movie,A spy navigates intrigue in Paris to stop a te...,0.769684
1,Romance in Paris,A couple falls in love in Paris under romantic...,0.388029
2,Action Flick,A high-octane chase through New York with expl...,0.256777


In [9]:
# Demo: same query, scored through scikit-learn's cosine_similarity instead
results_sklearn = search_movies(
    query="spy thriller in Paris",
    top_n=5,
    use_sklearn=True,
)
results_sklearn


Unnamed: 0,title,plot,similarity
0,Spy Movie,A spy navigates intrigue in Paris to stop a te...,0.769684
1,Romance in Paris,A couple falls in love in Paris under romantic...,0.388029
2,Action Flick,A high-octane chase through New York with expl...,0.256777
