In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

In [2]:
# Read the raw ratings file. It is tab-delimited and has no header line,
# so the four column names are supplied explicitly.
ratings = pd.read_csv(
    "ml-100k/u.data",
    names=['userId', 'movieId', 'rating', 'timestamp'],
    header=None,
    sep='\t',
)

# Peek at the first rows to confirm the columns parsed as expected.
print(ratings.head())

   userId  movieId  rating  timestamp
0     196      242       3  881250949
1     186      302       3  891717742
2      22      377       1  878887116
3     244       51       2  880606923
4     166      346       1  886397596


In [3]:
# Load movie metadata from the pipe-delimited u.item file, reading only its
# first three fields (movieId, title, release date). The remaining fields
# are not read in this cell.
movies = pd.read_csv(
    "ml-100k/u.item",
    names=['movieId', 'title', 'release_date'],
    usecols=[0, 1, 2],
    header=None,
    sep='|',
    encoding='latin-1',  # decode special characters that appear in titles
)

In [4]:
# Show the first five movies as loaded (titles still carry the year here).
print(movies.head(n=5))

   movieId              title release_date
0        1   Toy Story (1995)  01-Jan-1995
1        2   GoldenEye (1995)  01-Jan-1995
2        3  Four Rooms (1995)  01-Jan-1995
3        4  Get Shorty (1995)  01-Jan-1995
4        5     Copycat (1995)  01-Jan-1995


In [5]:
# Remove the " (YYYY)" year marker from every title,
# e.g. "Toy Story (1995)" → "Toy Story".
# The pattern is not anchored to the end of the string, so any
# space + four-digit parenthesised group inside a title is removed.
movies['title'] = movies['title'].str.replace(
    pat=r' \(\d{4}\)',
    repl='',
    regex=True,
)

In [6]:
# Timestamps are not needed for the similarity computations, so discard them.
# Note: this rebinds `ratings`; re-running the cell on the already-trimmed
# frame raises KeyError because the column is gone.
ratings = ratings.drop(columns=['timestamp'])

In [7]:
# Build the dense user × movie rating table: one row per userId and one
# column per movieId. pivot_table averages duplicate (user, movie) pairs
# (its default aggregation); pairs without a rating are set to 0.
user_item_matrix = (
    ratings
    .pivot_table(values='rating', index='userId', columns='movieId')
    .fillna(0)
)

print(user_item_matrix.head())

movieId  1     2     3     4     5     6     7     8     9     10    ...  \
userId                                                               ...   
1         5.0   3.0   4.0   3.0   3.0   5.0   4.0   1.0   5.0   3.0  ...   
2         4.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   2.0  ...   
3         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
4         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   
5         4.0   3.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   

movieId  1673  1674  1675  1676  1677  1678  1679  1680  1681  1682  
userId                                                               
1         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4         0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
5         0.0   0.0   0.0   0.0   0.0   0.0   0

In [8]:
# Content-based similarity (genres)
# Re-read u.item, this time with every field. Everything from positional
# column 5 onward is taken as the per-movie genre flags.
# NOTE(review): the earlier comment gave the genre range as "5-24" —
# confirm the exact column range / index base against the dataset README.
movies_with_genres = pd.read_csv(
    "ml-100k/u.item",
    header=None,
    encoding='latin-1',
    sep='|',
)
genres = movies_with_genres.iloc[:, 5:]

# Pairwise cosine similarity between movie rows. Entry [i, j] compares the
# i-th and j-th rows of u.item (0-based row position, not movieId).
content_sim = cosine_similarity(genres)

In [9]:
# Collaborative filtering
# Project each user's rating row onto 20 latent components, then compare
# users by cosine similarity in that reduced space.
# random_state is pinned because TruncatedSVD's default solver is
# randomized; without a seed the factors (and the similarities saved from
# them) change from run to run.
svd = TruncatedSVD(n_components=20, random_state=42)
matrix_svd = svd.fit_transform(user_item_matrix)

# NOTE(review): rows of user_item_matrix are users, so collab_sim is
# user × user, whereas content_sim is movie × movie — confirm this is what
# the code consuming these matrices expects.
collab_sim = cosine_similarity(matrix_svd)

In [10]:
import pickle

# Persist the cleaned movie table, both similarity matrices, the user-item
# matrix and the fitted SVD model in a single pickle file.
# The `with` block guarantees the file is flushed and closed even if
# pickling fails part-way; the previous bare open() never closed the handle.
# Reminder: only unpickle this file from a trusted location — pickle.load
# can execute arbitrary code.
with open('models.pkl', 'wb') as model_file:
    pickle.dump({
        'movies': movies,
        'content_sim': content_sim,
        'collab_sim': collab_sim,
        'user_item_matrix': user_item_matrix,
        'svd': svd
    }, model_file)