In [2]:
import pandas as pd
# Read the movie table from the CSV file into a DataFrame.
movies = pd.read_csv(filepath_or_buffer="movies_final.csv")

# Preview the first five rows, then report the (rows, columns) dimensions.
display(movies.head(n=5))
print("shape:", movies.shape)

Unnamed: 0,id,title,tags
0,19995,Avatar,action adventure fantasy science fiction cultu...
1,285,Pirates of the Caribbean: At World's End,adventure fantasy action ocean drug abuse exot...
2,206647,Spectre,action adventure crime spy based on novel secr...
3,49026,The Dark Knight Rises,action crime drama thriller dc comics crime fi...
4,49529,John Carter,action adventure science fiction based on nove...


shape: (4803, 3)


In [3]:
# STEP 2: TF-IDF + Cosine Similarity + recommend() function
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# 1) TF-IDF vectorizer (ignore English stop words, keep at most 5000 terms)
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
# Replace missing tags with an empty string first: a NaN entry is not a valid
# text document for the vectorizer. This is a no-op when no tags are missing.
tfidf_matrix = tfidf.fit_transform(movies['tags'].fillna(''))   # one row per movie, up to 5000 columns

# 2) Compute the pairwise cosine similarity matrix (movies x movies)
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 3) Build a reverse mapping (title -> row index).
# Series.drop_duplicates() compares *values*, and the values here are the
# already-unique row index, so it would remove nothing. De-duplicate on the
# title labels instead, keeping the first occurrence, so that every lookup
# `indices[title]` yields a single scalar row index.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

# 4) recommend function
def recommend(title, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Reads three notebook-level objects: ``indices`` (title -> row index),
    ``cosine_sim`` (pairwise similarity matrix) and ``movies`` (source of
    the returned titles).

    Parameters
    ----------
    title : str
        Exact title to look up in ``indices``.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of str
        Titles ordered from most to least similar, never including the
        queried row itself. An empty list is returned when ``title`` is not
        found (a message is printed) or when ``top_n`` is not positive.
    """
    # handle title not found
    if title not in indices:
        print(f"Title not found: {title}")
        return []

    idx = indices[title]
    # When several rows share the same title the lookup yields a Series
    # rather than a scalar; fall back to the first matching row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    if top_n <= 0:
        return []

    # Similarity of the queried movie against every movie.
    scores = np.asarray(cosine_sim[idx], dtype=float)
    # Stable descending order: ties keep their original row order.
    order = np.argsort(-scores, kind="stable")
    # Drop the queried row explicitly instead of assuming it sorts first;
    # that assumption fails when another movie has an identical vector or
    # when the movie's own vector is all zeros.
    order = order[order != idx][:top_n]
    return movies['title'].iloc[order].tolist()

# 5) Sanity check: report the dimensions of both matrices
tfidf_shape = tfidf_matrix.shape
cosine_shape = cosine_sim.shape
print("TF-IDF matrix shape:", tfidf_shape)
print("Cosine matrix shape:", cosine_shape)

# Sample query: five recommendations for 'Avatar' (the title must match exactly)
avatar_recs = recommend("Avatar", top_n=5)
print("Recommendations for 'Avatar':", avatar_recs)


TF-IDF matrix shape: (4803, 5000)
Cosine matrix shape: (4803, 4803)
Recommendations for 'Avatar': ['Lifeforce', 'Moonraker', 'Lost in Space', 'Guardians of the Galaxy', 'Aliens']


### ✅ Day 4 — Model Building (TF-IDF + Cosine Similarity)

- Loaded `movies_final.csv`
- Converted movie "tags" into numerical vectors using TF-IDF
- Capped the vocabulary at 5000 features (`max_features=5000`) and removed English stop words
- Computed cosine similarity between all movies (4803 × 4803 matrix)
- Built a `recommend(title, top_n=5)` function that returns the `top_n` most similar movies (5 by default)
- Tested the model successfully (e.g., recommendations for "Avatar")
