In [None]:
# Movie Recommendation Project using MovieLens 100k Dataset

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from sklearn.neighbors import NearestNeighbors
import urllib.request
import zipfile
import os

# Step 1: Fetch the MovieLens 100k archive and unpack it (first run only)
dataset_url = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
dataset_path = "ml-100k.zip"
extract_folder = "ml-100k"

# The extracted folder doubles as the "already done" marker: when it exists,
# neither the download nor the extraction is repeated.
dataset_ready = os.path.exists(extract_folder)

if not dataset_ready:
    print("Downloading MovieLens 100k dataset...")
    urllib.request.urlretrieve(dataset_url, filename=dataset_path)
    print("Extracting...")
    # No target path is given, so members are unpacked relative to the current
    # working directory (presumably the archive carries its own 'ml-100k/'
    # top-level folder, since later steps read from extract_folder).
    with zipfile.ZipFile(dataset_path) as zip_ref:
        zip_ref.extractall()
    print("Done!")

# Step 2: Load movies and ratings data

# Column layout of 'u.item': five descriptive fields followed by one 0/1 flag
# per genre. The file has no header row, so names are supplied explicitly.
movies_cols = [
    'movieId', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Movies info: pipe-separated, latin-1 (ISO-8859-1) encoded for special chars
movies_path = os.path.join(extract_folder, 'u.item')
movies = pd.read_csv(movies_path, sep='|', names=movies_cols, encoding='latin-1')

# Ratings data: tab-separated, one (user, movie, rating, timestamp) row per rating
ratings_cols = ['userId', 'movieId', 'rating', 'timestamp']
ratings_path = os.path.join(extract_folder, 'u.data')
ratings = pd.read_csv(ratings_path, sep='\t', names=ratings_cols)

print(f"Movies dataset shape: {movies.shape}")
print(f"Ratings dataset shape: {ratings.shape}")

# Step 3: Content-based similarity using genre vectors

# Every column after the five descriptive ones is a 0/1 genre flag
genre_cols = movies.columns[5:]

# Genre matrix: one row per movie, one column per genre
genre_matrix = movies.loc[:, genre_cols].to_numpy()

# Pairwise cosine similarity between the movies' genre vectors;
# entry [i, j] compares row i of `movies` with row j.
genre_sim = cosine_similarity(genre_matrix)

# Helper function to get top N similar movies by genre
def get_similar_movies(movie_title, top_n=5):
    """Return the ``top_n`` movies whose genre vectors are closest to a title.

    Args:
        movie_title: Exact title to look up in ``movies['title']``; the first
            matching row is used.
        top_n: Maximum number of neighbours to return.

    Returns:
        A list of ``(title, cosine_similarity)`` tuples in descending order of
        similarity, or the string ``"Movie '<title>' not found."`` when no row
        has that title.
    """
    matches = movies.index[movies['title'] == movie_title]
    if len(matches) == 0:
        return f"Movie '{movie_title}' not found."
    idx = matches[0]

    ranked = sorted(enumerate(genre_sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Drop the queried movie by index rather than by rank: other movies with an
    # identical genre vector tie with it at the top, and the stable sort may
    # place them ahead of it, so "skip the first entry" can skip the wrong one.
    neighbours = [(i, score) for i, score in ranked if i != idx][:top_n]

    return [(movies.iloc[i]['title'], score) for i, score in neighbours]

print("Example similar movies to 'Toy Story (1995)':")
print(get_similar_movies('Toy Story (1995)'))

# Step 4: Collaborative filtering with SVD

# User x movie rating table over ALL ratings; unrated cells are filled with 0
all_ratings_pivot = ratings.pivot(index='userId', columns='movieId', values='rating')
user_movie_matrix = all_ratings_pivot.fillna(0)

# Hold out 20% of the individual ratings (rows), with a fixed seed
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

# Same table built from the training ratings only. Users or movies with no
# training rating do not appear as rows/columns here.
train_pivot = train_data.pivot(index='userId', columns='movieId', values='rating')
train_matrix = train_pivot.fillna(0)

# Factorise the training matrix into 20 latent components
svd = TruncatedSVD(n_components=20, random_state=42)
user_factors = svd.fit_transform(train_matrix)
# One row per column of train_matrix (i.e. per training movie), in that order
movie_factors = svd.components_.T

# Reconstruct a dense score for every (training user, training movie) pair
predicted_ratings = user_factors.dot(movie_factors.T)
pred_df = pd.DataFrame(
    data=predicted_ratings,
    index=train_matrix.index,
    columns=train_matrix.columns,
)

# Recommend movies for a user based on predicted ratings
def recommend_movies_svd(user_id, top_n=5):
    """Return the ``top_n`` highest-scored movies the user has not rated yet.

    Scores come from ``pred_df``; movies the user rated in ``train_data`` are
    removed before ranking.

    Args:
        user_id: Row label of the user in ``pred_df``.
        top_n: Number of recommendations to return.

    Returns:
        A list of ``(title, predicted_score)`` tuples, best first, or the
        string ``"User <id> not in training data."`` for an unknown user.
    """
    if user_id not in pred_df.index:
        return f"User {user_id} not in training data."

    # Movies already rated in the training split are not candidates
    already_rated = train_data.loc[train_data['userId'] == user_id, 'movieId'].tolist()
    candidates = pred_df.loc[user_id].drop(already_rated, errors='ignore')
    best = candidates.sort_values(ascending=False).head(top_n)

    recommendations = []
    for movie_id, score in best.items():
        title = movies.loc[movies['movieId'] == movie_id, 'title'].values[0]
        recommendations.append((title, score))
    return recommendations

print("\nSVD Recommendations for User 1:")
print(recommend_movies_svd(1))

# Step 5: Hybrid approach using KNN on movie latent factors

# Index the SVD movie vectors for cosine-distance neighbour queries.
# The default of 6 neighbours covers a queried movie plus five others.
knn = NearestNeighbors(metric='cosine', n_neighbors=6).fit(movie_factors)

def get_similar_movies_svd(movie_title, top_n=5):
    """Return the ``top_n`` movies nearest to a title in SVD latent-factor space.

    Args:
        movie_title: Exact title to look up in ``movies['title']``; the first
            matching row is used.
        top_n: Maximum number of neighbours to return.

    Returns:
        A list of ``(title, cosine_similarity)`` tuples, nearest first, where
        similarity is ``1 - cosine distance``. Returns a message string when
        the title is unknown or the movie has no rating in the training data
        (and therefore no latent vector).
    """
    title_matches = movies.loc[movies['title'] == movie_title, 'movieId']
    if title_matches.empty:
        return f"Movie '{movie_title}' not found."
    movie_id = title_matches.iloc[0]

    # Rows of movie_factors follow train_matrix.columns (only the movies rated
    # in the training split), NOT the row order of `movies`. Positions must be
    # translated through movieId in both directions.
    factor_movie_ids = train_matrix.columns
    if movie_id not in factor_movie_ids:
        return f"Movie '{movie_title}' has no ratings in the training data."
    row = factor_movie_ids.get_loc(movie_id)

    # Ask for one extra neighbour because the query vector is part of the index
    movie_vec = movie_factors[row].reshape(1, -1)
    distances, indices = knn.kneighbors(movie_vec, n_neighbors=top_n + 1)

    similar_movies = []
    for neighbour_row, distance in zip(indices.ravel(), distances.ravel()):
        if len(similar_movies) == top_n:
            break
        # Exclude the query by position, not by rank: movies with an identical
        # direction in factor space tie at distance 0 and may be listed first.
        if neighbour_row == row:
            continue
        neighbour_id = factor_movie_ids[neighbour_row]
        title = movies.loc[movies['movieId'] == neighbour_id, 'title'].values[0]
        similar_movies.append((title, 1 - distance))
    return similar_movies

print("\nHybrid SVD-KNN similar movies to 'Toy Story (1995)':")
print(get_similar_movies_svd('Toy Story (1995)'))

# Step 6: Evaluate with NDCG on test data

# For every user with held-out ratings, score those movies with the SVD
# predictions and compare the induced ranking with the true ratings.
test_users = test_data['userId'].unique()
ndcg_scores = []

# A single groupby pass replaces one full scan of test_data per user;
# sort=False visits users in first-appearance order, the same order as test_users.
for user, user_test in test_data.groupby('userId', sort=False):
    if user not in pred_df.index:
        continue  # user not in training set
    if len(user_test) <= 1:
        continue  # users with fewer than two held-out ratings are skipped

    true_ratings = user_test['rating'].to_numpy()
    # Fetch all of this user's predictions in one aligned lookup. Movies that
    # are absent from the training matrix (cold start) get a score of 0.
    pred_ratings = (
        pred_df.loc[user]
        .reindex(user_test['movieId'].to_numpy(), fill_value=0)
        .to_numpy()
    )
    ndcg_scores.append(ndcg_score([true_ratings], [pred_ratings]))

# No evaluable user at all -> report 0.0 instead of the mean of an empty list
avg_ndcg = np.mean(ndcg_scores) if ndcg_scores else 0.0

print(f"\nAverage NDCG score on test data: {avg_ndcg:.4f}")


Downloading MovieLens 100k dataset...
Extracting...
Done!
Movies dataset shape: (1682, 24)
Ratings dataset shape: (100000, 4)
Example similar movies to 'Toy Story (1995)':
[('Aladdin and the King of Thieves (1996)', np.float64(1.0000000000000002)), ('Aladdin (1992)', np.float64(0.8660254037844388)), ('Goofy Movie, A (1995)', np.float64(0.8660254037844388)), ('Santa Clause, The (1994)', np.float64(0.816496580927726)), ('Home Alone (1990)', np.float64(0.816496580927726))]

SVD Recommendations for User 1:
[('Trainspotting (1996)', 3.58499270571421), ('Leaving Las Vegas (1995)', 3.4292645715416903), ('Heathers (1989)', 3.386705696541518), ('Blues Brothers, The (1980)', 3.2973188919510976), ('Rosencrantz and Guildenstern Are Dead (1990)', 3.055549454436237)]

Hybrid SVD-KNN similar movies to 'Toy Story (1995)':
[('Willy Wonka and the Chocolate Factory (1971)', np.float64(0.6106061925529391)), ('101 Dalmatians (1996)', np.float64(0.6045771872047642)), ('In the Line of Duty 2 (1987)', np.floa