In [1]:
!kaggle datasets download grouplens/movielens-20m-dataset

Dataset URL: https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset
License(s): unknown
Downloading movielens-20m-dataset.zip to /content
 92% 180M/195M [00:00<00:00, 268MB/s]
100% 195M/195M [00:00<00:00, 256MB/s]


In [2]:

import zipfile
import os

!kaggle datasets download grouplens/movielens-20m-dataset

# Assuming the zip file is named 'movielens-20m-dataset.zip'
# Replace with the actual filename if it's different
zip_file_name = 'movielens-20m-dataset.zip'

# Specify the directory where you want to extract the files
extract_dir = 'movielens-20m'  # Or any other name you prefer

with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print(f"Files extracted to: {extract_dir}")


Dataset URL: https://www.kaggle.com/datasets/grouplens/movielens-20m-dataset
License(s): unknown
movielens-20m-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Files extracted to: movielens-20m


In [3]:
!pip install surprise --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m153.6/154.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone


In [4]:
import pandas as pd
import numpy as np
from surprise import SVD, Dataset, Reader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load datasets
ratings = pd.read_csv("/content/movielens-20m/rating.csv")
movies = pd.read_csv("/content/movielens-20m/movie.csv")
tags = pd.read_csv("/content/movielens-20m/genome_tags.csv")
tag_names = pd.read_csv("/content/movielens-20m/genome_scores.csv")

# Merge tags and compute TF-IDF for content-based filtering
tags = tags.merge(tag_names, on="tagId", how="left")
movie_tags = tags.groupby("movieId")["tag"].apply(lambda x: " ".join(x)).reset_index()

# Merge with movie metadata
movies = movies.merge(movie_tags, on="movieId", how="left").fillna("")

# Content-Based Filtering (CBF) - Compute Movie Similarity
tfidf = TfidfVectorizer(stop_words="english")
movie_tfidf_matrix = tfidf.fit_transform(movies["tag"])
cosine_sim = cosine_similarity(movie_tfidf_matrix, movie_tfidf_matrix)

# Collaborative Filtering (CF) - Train an SVD Model
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)
trainset = data.build_full_trainset()
svd = SVD()
svd.fit(trainset)

# Weighted Hybrid Prediction Function
def hybrid_recommend(user_id, movie_id, alpha=0.7):
    """Combines CF & CBF scores using a weighted sum."""

    # Collaborative Filtering Prediction (SVD)
    cf_score = svd.predict(user_id, movie_id).est

    # Content-Based Filtering Prediction (Cosine Similarity)
    try:
        movie_idx = movies[movies["movieId"] == movie_id].index[0]
        similarities = cosine_sim[movie_idx]
        similar_movie_indices = np.argsort(similarities)[::-1][1:11]
        cbf_score = np.mean([svd.predict(user_id, movies.iloc[i]["movieId"]).est for i in similar_movie_indices])
    except:
        cbf_score = 3.0  # Default neutral score if no similarity found

    # Weighted Combination
    final_score = alpha * cf_score + (1 - alpha) * cbf_score
    return final_score

# Example: Predict for user 1 and movie 50
predicted_rating = hybrid_recommend(user_id=1, movie_id=50)
print(f"Hybrid Predicted Rating: {predicted_rating:.2f}")



Hybrid Predicted Rating: 3.84


In [5]:
# prompt: generate some recommendations

def get_recommendations(user_id, top_n=10):
    """Provides top N movie recommendations for a user."""

    # Get movies the user has not rated
    rated_movie_ids = ratings[ratings["userId"] == user_id]["movieId"].unique()
    unrated_movie_ids = movies[~movies["movieId"].isin(rated_movie_ids)]["movieId"]

    # Predict ratings for unrated movies
    predictions = []
    for movie_id in unrated_movie_ids:
        predictions.append((movie_id, hybrid_recommend(user_id, movie_id)))

    # Sort predictions and get top N
    predictions.sort(key=lambda x: x[1], reverse=True)
    top_recommendations = predictions[:top_n]

    # Get movie titles
    recommended_movies = []
    for movie_id, predicted_rating in top_recommendations:
        movie_title = movies[movies["movieId"] == movie_id]["title"].iloc[0]
        recommended_movies.append((movie_title, predicted_rating))

    return recommended_movies

# Example: Get recommendations for user 1
recommendations = get_recommendations(user_id=1)
print("Top Recommendations for User 1:")
for movie, rating in recommendations:
    print(f"- {movie} (Predicted Rating: {rating:.2f})")


Top Recommendations for User 1:
- Lifted (2006) (Predicted Rating: 4.24)
- Twelve Tasks of Asterix, The (Les douze travaux d'Astérix) (1976) (Predicted Rating: 4.21)
- Prime Suspect (1991) (Predicted Rating: 4.21)
- The War (2007) (Predicted Rating: 4.21)
- Personal Journey with Martin Scorsese Through American Movies, A (1995) (Predicted Rating: 4.20)
- Zero Motivation (Efes beyahasei enosh) (2014) (Predicted Rating: 4.19)
- Something Is Happening (Kuch Kuch Hota Hai) (1998) (Predicted Rating: 4.18)
- Crooks in Clover (a.k.a. Monsieur Gangster) (Les tontons flingueurs) (1963) (Predicted Rating: 4.17)
- Frozen Planet (2011) (Predicted Rating: 4.17)
- North & South (2004) (Predicted Rating: 4.16)


In [None]:
# prompt: save the model

import pickle

# Save the trained SVD model
with open('svd_model.pkl', 'wb') as f:
    pickle.dump(svd, f)

# Save the TF-IDF vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# Save the cosine similarity matrix
with open('cosine_sim.pkl', 'wb') as f:
    pickle.dump(cosine_sim, f)

print("Model saved successfully!")
