In [1]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



In [2]:
# Read the cleaned movie metadata; rows without an overview get an empty
# string so that later text processing only ever receives strings.
movies = (
    pd.read_csv("movies_metadata_cleaned.csv")
    .assign(overview=lambda frame: frame['overview'].fillna(""))
)

# Read the user ratings
ratings = pd.read_csv("ratings_small.csv")


In [3]:
# Fixed seed shared by the train/test split and the SVD initialisation so that
# a fresh "Restart & Run All" reproduces the same model and the same RMSE.
RANDOM_STATE = 42

# Define rating scale (0.5 to 5.0)
reader = Reader(rating_scale=(0.5, 5))

# Load dataset into Surprise format (column order must be user, item, rating)
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split dataset into train (80%) and test (20%) -- seeded for reproducibility
trainset, testset = train_test_split(data, test_size=0.2, random_state=RANDOM_STATE)

# Train SVD model -- seeded so the learned factors are repeatable
svd = SVD(random_state=RANDOM_STATE)
svd.fit(trainset)

# Evaluate model using RMSE on the held-out ratings.
# accuracy.rmse prints the score itself and also returns it as a float.
predictions = svd.test(testset)
rmse = accuracy.rmse(predictions)
print("SVD Model RMSE:", rmse)


RMSE: 0.8998
SVD Model RMSE: 0.8998056963571979


In [4]:
# Turn each movie overview into a TF-IDF vector, ignoring English stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['overview'])

# Pairwise cosine similarity of every movie against every other movie
# (with a single argument the matrix is compared against itself)
cosine_sim = cosine_similarity(tfidf_matrix)


In [5]:
def hybrid_recommendation(user_id, n=5, n_similar=2):
    """Combines Collaborative Filtering (SVD) & Content-Based Filtering (TF-IDF).

    The trained ``svd`` model scores every movie in ``movies`` that the user has
    not rated in ``ratings``; the ``n`` highest-scoring movies are kept. For each
    of them, the ``n_similar`` movies with the highest overview similarity
    (rows of ``cosine_sim``) are attached.

    Parameters
    ----------
    user_id
        Raw user id as it appears in ``ratings['userId']``.
    n : int, default 5
        Number of collaborative-filtering recommendations to return.
    n_similar : int, default 2
        Number of content-based neighbours to list per recommendation.

    Returns
    -------
    list of dict
        One ``{"Movie": title, "Similar Movies": [titles...]}`` entry per
        recommendation, best predicted rating first. The similar titles are
        ordered from most to least similar.

    Notes
    -----
    Relies on the notebook-level ``movies``, ``ratings``, ``svd`` and
    ``cosine_sim`` objects, and assumes row ``i`` of ``cosine_sim`` corresponds
    to the ``i``-th row (by position) of ``movies``.
    """
    # Step 1: Get Top N Recommended Movies for User (Collaborative Filtering)
    # A set gives O(1) membership tests instead of scanning an array per movie.
    rated_movie_ids = set(ratings.loc[ratings['userId'] == user_id, 'movieId'])
    unrated_movie_ids = [
        movie_id for movie_id in movies['id'].unique()
        if movie_id not in rated_movie_ids
    ]

    # Predict ratings for unrated movies and keep the n best estimates
    predictions = [svd.predict(user_id, movie_id) for movie_id in unrated_movie_ids]
    top_predictions = sorted(predictions, key=lambda pred: pred.est, reverse=True)[:max(n, 0)]

    # Positional lookups: cosine_sim is indexed by row position, not by the
    # DataFrame's index labels, so work with plain lists from here on.
    movie_ids = movies['id'].tolist()
    movie_titles = movies['title'].tolist()
    position_of = {}
    for position, movie_id in enumerate(movie_ids):
        # Keep the first row for an id, mirroring the original "first match" lookup
        position_of.setdefault(movie_id, position)

    # Step 2: Find Similar Movies for Each Recommended Movie (Content-Based)
    recommended_movies = []
    for pred in top_predictions:
        movie_id = pred.iid
        idx = position_of[movie_id]
        sim_row = cosine_sim[idx]

        # Exclude the movie itself explicitly. Relying on "self sorts first"
        # breaks when the overview is empty (all-zero similarity row), and
        # duplicate rows of the same id would otherwise be listed as neighbours.
        candidates = [
            position for position in range(len(sim_row))
            if movie_ids[position] != movie_id
        ]
        nearest = sorted(
            candidates, key=lambda position: sim_row[position], reverse=True
        )[:max(n_similar, 0)]

        # Read titles by position so they stay in similarity order
        similar_movie_titles = [movie_titles[position] for position in nearest]

        # Add to final recommendation list
        recommended_movies.append(
            {"Movie": movie_titles[idx], "Similar Movies": similar_movie_titles}
        )

    return recommended_movies

# Demo: fetch five hybrid recommendations for user 1 and print each one
# together with its content-based neighbours.
hybrid_recs = hybrid_recommendation(user_id=1, n=5)
for rec in hybrid_recs:
    similar_titles = ', '.join(rec['Similar Movies'])
    print(f"\nRecommended Movie: {rec['Movie']}")
    print(f"→ Similar Movies: {similar_titles}")



Recommended Movie: The Million Dollar Hotel
→ Similar Movies: The Silence, Hotel Rwanda

Recommended Movie: Sleepless in Seattle
→ Similar Movies: The Shaggy Dog, Why Him?

Recommended Movie: Nell
→ Similar Movies: The Cabin in the Woods, Cabin Fever

Recommended Movie: Terminator 3: Rise of the Machines
→ Similar Movies: Terminator 2: Judgment Day, Terminator Salvation

Recommended Movie: Once Were Warriors
→ Similar Movies: The First Day of the Rest of Your Life, 2012: Ice Age
