In [1]:
data_path = "./data/Dataset.csv"
title_path = "./data/Movie_Id_Titles.csv"

In [3]:
# Load the data
import numpy as np
import pandas as pd

ratings_df = pd.read_csv(data_path)  
movies_df = pd.read_csv(title_path)

In [14]:
# Merge movies and ratings data
movie_data = pd.merge(ratings_df, movies_df, on='item_id')
movie_data.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,290,50,5,880473582,Star Wars (1977)
2,79,50,4,891271545,Star Wars (1977)
3,2,50,5,888552084,Star Wars (1977)
4,8,50,5,879362124,Star Wars (1977)


In [15]:
# Create a pivot table of user ratings
movie_ratings = movie_data.pivot_table(index='user_id', columns='title', values='rating')
movie_ratings.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [16]:
# Fill missing values with 0 (assuming no rating means a rating of 0)
movie_ratings = movie_ratings.fillna(0)
movie_ratings.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
movie_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 944 entries, 0 to 943
Columns: 1664 entries, 'Til There Was You (1997) to Á köldum klaka (Cold Fever) (1994)
dtypes: float64(1664)
memory usage: 12.0 MB


In [18]:
# Convert the pivot table to a dense matrix
movie_ratings_matrix = movie_ratings.values

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Split the data into training and test sets
train_data, test_data = train_test_split(movie_ratings_matrix, test_size=0.2, random_state=42)

# Build a k-nearest neighbors model for collaborative filtering
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(train_data)

model_knn

In [22]:
# Function to get movie recommendations
def get_movie_recommendations(movie_title, num_recommendations=10):
    movie_id = movies_df.loc[movies_df['title'] == movie_title]['item_id'].values[0]
    distances, indices = model_knn.kneighbors(movie_ratings.iloc[movie_id - 1, :].values.reshape(1, -1), n_neighbors=num_recommendations + 1)
    recommended_movie_indices = [i + 1 for i in indices.flatten()]
    recommended_movies = [movies_df.loc[movies_df['item_id'] == idx]['title'].values[0] for idx in recommended_movie_indices][1:]
    return recommended_movies

# Example usage: Get recommendations for a movie
movie_title = "Toy Story (1995)"
recommendations = get_movie_recommendations(movie_title)
print(f"Recommended movies for '{movie_title}':")
for i, movie in enumerate(recommendations):
    print(f"{i + 1}. {movie}")

Recommended movies for 'Toy Story (1995)':
1. Monty Python and the Holy Grail (1974)
2. How to Be a Player (1997)
3. Clerks (1994)
4. Raging Bull (1980)
5. Son in Law (1993)
6. Bio-Dome (1996)
7. Hoop Dreams (1994)
8. Citizen Kane (1941)
9. French Twist (Gazon maudit) (1995)
10. Three Colors: Red (1994)


In [24]:
import joblib

joblib.dump(model_knn, "recommender.pkl")

['recommender.pkl']