# Content-Based Movie Recommender

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported!")

# Both input files live in the same folder, so the location is named once.
# NOTE(review): the folder is spelled 'date' (not 'data'). The recorded output
# of this cell shows the files loading from this path, so it is kept as-is;
# confirm before renaming anything on disk.
PROCESSED_DIR = Path('../date/processed')

features_df = pd.read_csv(PROCESSED_DIR / 'movies_features.csv')
titles_df   = pd.read_csv(PROCESSED_DIR / 'movie_titles.csv')

# Row i of the feature table is treated as the vector for title i (the titles
# later become the labels of the similarity matrix built from these rows), so
# the two files must have the same number of rows. Fail here with a clear
# message instead of a shape error further down the notebook.
if len(features_df) != len(titles_df):
    raise ValueError(
        f"Row count mismatch: {len(features_df)} feature rows vs "
        f"{len(titles_df)} titles; the two files must be aligned row-for-row."
    )

# Plain NumPy matrix of features and a plain Python list of titles.
X_features = features_df.values
titles = titles_df['Title'].tolist()

print("Loaded data successfully!")
print(f"Shape of feature matrix: {X_features.shape[0]} movies × {X_features.shape[1]} embedding features")
print("\nFirst 5 movie titles:")
print(titles[:5])

Libraries imported!
Loaded data successfully!
Shape of feature matrix: 2160 movies × 385 embedding features

First 5 movie titles:
['Radiohead: In Rainbows - From the Basement', 'Band of Brothers', 'Harakiri', 'Stop Making Sense', 'No Half Measures: Creating the Final Season of Breaking Bad']


In [3]:
# Build the movie-to-movie similarity table from the feature matrix.
print("Calculating similarity matrix using cosine similarity...")
print("(This will be very fast with embeddings)")

# np.nan_to_num swaps NaN for 0 (and infinities for large finite numbers) so
# the similarity computation never sees non-finite values.
X_features = np.nan_to_num(X_features)

# One row/column per movie: entry (i, j) is the cosine similarity of rows i and j.
similarity_matrix = cosine_similarity(X_features)
n_rows, n_cols = similarity_matrix.shape

print("Done!")
print(f"Similarity matrix shape: {similarity_matrix.shape}")
print(f"   → {n_rows} movies × {n_cols} movies")

# Label both axes with the titles so similarities can be looked up by name.
similarity_df = pd.DataFrame(data=similarity_matrix, index=titles, columns=titles)

# Sanity check: show the six highest scores for the first movie in the list
# (the movie itself is expected to lead with a score of 1.0).
print("\nExample: How similar is the first movie to the others?")
first_title = titles[0]
print(f"\nMovie: {first_title}")
first_movie_scores = similarity_df.loc[first_title]
print(first_movie_scores.sort_values(ascending=False).head(6))


Calculating similarity matrix using cosine similarity...
(This will be very fast with embeddings)
Done!
Similarity matrix shape: (2160, 2160)
   → 2160 movies × 2160 movies

Example: How similar is the first movie to the others?

Movie: Radiohead: In Rainbows - From the Basement
Radiohead: In Rainbows - From the Basement    1.000000
Nirvana: Unplugged In New York                0.684276
The Pop Out: Ken & Friends                    0.663512
Chernobyl                                     0.663323
Band of Brothers                              0.658962
Planet Earth                                  0.643614
Name: Radiohead: In Rainbows - From the Basement, dtype: float64


In [4]:
def get_recommendations(movie_title, n_recommendations=10):
    """Return the movies most similar to ``movie_title``.

    Scores are read from the notebook-level ``similarity_df`` (a square
    DataFrame whose row and column labels are movie titles).

    Parameters
    ----------
    movie_title : str
        Exact title as it appears in ``similarity_df.index``.
    n_recommendations : int, default 10
        Maximum number of movies to return. Values below 1 yield an empty table.

    Returns
    -------
    pandas.DataFrame or None
        One row per recommendation, best match first, with the columns
        'Rank', 'Movie', 'Similarity Score' and 'Match %' (the score times
        100, rounded to one decimal). ``None`` is returned, after printing a
        hint, when the title is not in the database.
    """
    if movie_title not in similarity_df.index:
        print(f"Error: '{movie_title}' not found in the database!")
        print("Tip: Try using search_movie() to find exact titles.")
        return None

    similarity_scores = similarity_df.loc[movie_title]

    # If the same title occurs more than once in the index, .loc yields one row
    # per occurrence (a DataFrame); use the first occurrence as the query.
    if similarity_scores.ndim > 1:
        similarity_scores = similarity_scores.iloc[0]

    # Remove the query movie by label rather than by position: when another
    # movie ties with it at the top score, "skip the first sorted entry" can
    # drop the wrong movie and recommend the query to itself.
    candidates = similarity_scores.drop(labels=movie_title)

    # Clamp so a negative request behaves like "no recommendations".
    top_similar = candidates.sort_values(ascending=False).head(max(n_recommendations, 0))

    recommendations = pd.DataFrame({
        'Rank': range(1, len(top_similar) + 1),
        'Movie': top_similar.index,
        'Similarity Score': top_similar.values,
        'Match %': (top_similar.values * 100).round(1)
    })

    return recommendations


In [5]:
def search_movie(keyword):
    """Find every known title that contains ``keyword``, ignoring case.

    Looks through the notebook-level ``titles`` list. The matches are printed
    one per line and also returned as a list, in the order they appear in
    ``titles``. When nothing matches, a message is printed and ``None`` is
    returned.
    """
    needle = keyword.lower()

    matches = []
    for candidate in titles:
        if needle in candidate.lower():
            matches.append(candidate)

    # Guard clause: nothing to list.
    if not matches:
        print(f" No movies found containing '{needle}'.")
        return None

    print(f"Found {len(matches)} movies containing '{needle}':\n")
    print("\n".join(f"  → {hit}" for hit in matches))

    return matches


In [6]:
import pickle
from pathlib import Path


# Bundle everything needed to serve recommendations without recomputing:
# the raw similarity array, its title-labelled DataFrame, the title list and
# the cleaned feature matrix.
recommender_system = {
    'similarity_matrix': similarity_matrix,
    'similarity_df': similarity_df,
    'titles': titles,
    'X_features': X_features,
}

# Make sure the target folder exists (no error if it is already there).
output_path = Path('../models')
output_path.mkdir(parents=True, exist_ok=True)

# NOTE: pickle files can execute arbitrary code when loaded, so this file
# should only ever be loaded from a trusted location.
model_file = output_path / 'content_recommender.pkl'
with model_file.open('wb') as f:
    pickle.dump(recommender_system, f)

# Short report of what was written.
report_lines = (
    "\nRecommender system saved successfully!",
    "Saved to: ../models/content_recommender.pkl",
    "\nModel includes:",
    f"  • Similarity matrix: {similarity_matrix.shape}",
    f"  • Number of movies: {len(titles)}",
    f"  • Embedding dimensions: {X_features.shape[1]}",
)
for line in report_lines:
    print(line)



Recommender system saved successfully!
Saved to: ../models/content_recommender.pkl

Model includes:
  • Similarity matrix: (2160, 2160)
  • Number of movies: 2160
  • Embedding dimensions: 385
