In [2]:
!pip install scikit-learn scipy numpy matplotlib

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
Collecting scipy
  Using cached scipy-1.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.6 MB)
Collecting matplotlib
  Using cached matplotlib-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)
Collecting threadpoolctl>=3.1.0
  Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Collecting cycler>=0.10
  Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)
Collecting fonttools>=4.22.0
  Using cached fonttools-4.56.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
Collecting pillow>=8
  Using cached pillow-11.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (4.5 MB)
Collecting contourpy>=1.0.1
  Using cached contourpy-1.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (324 kB)
Collecting kiwisolver>=1.3.1
  Using cached kiwisolver-1.4.8-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86

In [3]:
import pandas as pd
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# File paths
RATINGS_FILE = "../data/processed_rate_entries.csv"
ITEM_SIMILARITY_MODEL = "./item_similarity.pkl"

In [4]:
df = pd.read_csv(RATINGS_FILE)
df

Unnamed: 0,timestamp,user_id,movie_title,rating
0,2025-02-28T03:36:48,265143,rare exports a christmas tale 2010,4
1,2025-02-28T03:36:49,284982,far away 2001,2
2,2025-02-28T03:36:49,301905,the princess and the frog 2009,4
3,2025-02-28T03:36:50,104416,soul assassin 2001,3
4,2025-02-28T03:36:51,251315,civil brand 2003,3
...,...,...,...,...
462959,2025-03-03T00:35:35,318940,the vanishing 1993,4
462960,2025-03-03T00:35:35,20815,up in arms 1944,4
462961,2025-03-03T00:35:35,290958,lucky 7 2003,3
462962,2025-03-03T00:35:35,220777,the corruptor 1999,2


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462964 entries, 0 to 462963
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   timestamp    462964 non-null  object
 1   user_id      462964 non-null  int64 
 2   movie_title  462964 non-null  object
 3   rating       462964 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 14.1+ MB


In [14]:
# Create a mapping of movie titles to indices
movie_mapping = {movie: idx for idx, movie in enumerate(df["movie_title"].unique())}
movie_mapping

{'rare exports a christmas tale 2010': 0,
 'far away 2001': 1,
 'the princess and the frog 2009': 2,
 'soul assassin 2001': 3,
 'civil brand 2003': 4,
 'the lord of the rings the return of the king 2003': 5,
 'logans run 1976': 6,
 'even angels eat beans 1973': 7,
 'interstellar 2014': 8,
 'star wars 1977': 9,
 'the goonies 1985': 10,
 'lagaan once upon a time in india 2001': 11,
 'raiders of the lost ark 1981': 12,
 'the offence 1973': 13,
 'mississippi mermaid 1969': 14,
 'wreck-it ralph 2012': 15,
 'american beauty 1999': 16,
 'the town 2010': 17,
 'a kiss before dying 1991': 18,
 'pirates of the caribbean the curse of the black pearl 2003': 19,
 'avatar 2009': 20,
 'overnight 2003': 21,
 'bells are ringing 1960': 22,
 'crouching tiger_ hidden dragon 2000': 23,
 'mountains of the moon 1990': 24,
 'the sheriff and the satellite kid 1979': 25,
 'ponyo 2008': 26,
 'brave 2012': 27,
 'whisper of the heart 1995': 28,
 'the girl who played with fire 2009': 29,
 'reckless 1984': 30,
 'ive 

In [15]:
# Create user-movie matrix in sparse format
user_ids = df["user_id"].astype("category").cat.codes
movie_ids = df["movie_title"].map(movie_mapping)

user_ids, movie_ids

(0         177977
 1         191263
 2         202651
 3          70219
 4         168645
            ...  
 462959    211383
 462960     14085
 462961    195321
 462962    148207
 462963     76461
 Length: 462964, dtype: int32,
 0             0
 1             1
 2             2
 3             3
 4             4
           ...  
 462959    10937
 462960    11675
 462961     2932
 462962    14308
 462963    18097
 Name: movie_title, Length: 462964, dtype: int64)

In [16]:
# Build a sparse matrix
ratings_sparse = csr_matrix((df["rating"], (user_ids, movie_ids)))
ratings_sparse

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 462603 stored elements and shape (212294, 26548)>

In [17]:
# Compute item-item similarity using Cosine Similarity
movie_similarity = cosine_similarity(ratings_sparse.T, dense_output=False)  # Keeps it sparse
movie_similarity

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 846968 stored elements and shape (26548, 26548)>

In [None]:
# Save model as a dictionary (movie title → similarity row)
# movie_sim_dict = {movie: movie_similarity[idx].toarray().flatten() for movie, idx in movie_mapping.items()}
# movie_sim_dict

# with open(ITEM_SIMILARITY_MODEL, "wb") as f:
#     pickle.dump(movie_sim_dict, f)

In [18]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Assuming df, ratings_sparse, and movie_mapping are already defined

# Create a reverse mapping from index to movie title
reverse_movie_mapping = {idx: movie for movie, idx in movie_mapping.items()}

# Fit NearestNeighbors model directly on sparse matrix
nn_model = NearestNeighbors(metric="cosine", algorithm="auto", n_jobs=-1)
nn_model.fit(ratings_sparse.T)  # Transposed sparse matrix

# Find top-k similar movies for each movie
k = 10  # Number of neighbors
distances, indices = nn_model.kneighbors(ratings_sparse.T, n_neighbors=k)

# Create similarity dictionary with only top-k neighbors
movie_sim_dict = {}
for i, movie in enumerate(movie_mapping.keys()):
    similar_movies = {}
    for j, idx in enumerate(indices[i]):
        if idx in reverse_movie_mapping:  # Check if the index exists in our mapping
            similar_movie = reverse_movie_mapping[idx]
            similarity = 1 - distances[i][j]  # Convert distance to similarity
            similar_movies[similar_movie] = similarity
    movie_sim_dict[movie] = similar_movies

# Now movie_sim_dict contains each movie and its top-k similar movies with similarity scores


In [19]:
movie_sim_dict

{'rare exports a christmas tale 2010': {'rare exports a christmas tale 2010': np.float64(0.9999999999999999),
  'the living idol 1957': np.float64(0.10053171088078183),
  'eternally yours 1939': np.float64(0.09284489000865015),
  'love the beast 2009': np.float64(0.08980174575841637),
  'love meetings 1965': np.float64(0.08167234800792311),
  'splinter 2008': np.float64(0.07184139660673305),
  'just cause 1995': np.float64(0.07092850823103869),
  'the king of escape 2009': np.float64(0.0651519950185655),
  'satans blood 1978': np.float64(0.06482541654590579),
  'manhattan 1979': np.float64(0.0580979275104041)},
 'far away 2001': {'far away 2001': np.float64(0.9999999999999999),
  'tales from the organ trade 2013': np.float64(0.14606543485462808),
  'safe house 2012': np.float64(0.06239927229006337),
  'lone star 1996': np.float64(0.05314835774019566),
  'a single girl 1995': np.float64(0.04674961983623438),
  'gasland 2010': np.float64(0.04001350683822158),
  'colossus the forbin proje

In [20]:
def get_movie_recommendations(movie_title, movie_sim_dict, n=5):
    if movie_title not in movie_sim_dict:
        return "Movie not found in the database."
    
    similar_movies = movie_sim_dict[movie_title]
    recommendations = sorted(similar_movies.items(), key=lambda x: x[1], reverse=True)[:n]
    
    return [movie for movie, similarity in recommendations]


In [None]:
model_filename = "knn.pkl"

# Save the movie similarity dictionary to a file
with open(model_filename, "wb") as file:
    pickle.dump(movie_sim_dict, file)

print(f"Model saved to {model_filename}")

In [22]:
# Example usage
movie_title = "faraway 2001"
recommendations = get_movie_recommendations(movie_title, movie_sim_dict)
print(f"Recommendations for {movie_title}:")
for i, movie in enumerate(recommendations, 1):
    print(f"{i}. {movie}")


Recommendations for faraway 2001:
1. M
2. o
3. v
4. i
5. e
6.  
7. n
8. o
9. t
10.  
11. f
12. o
13. u
14. n
15. d
16.  
17. i
18. n
19.  
20. t
21. h
22. e
23.  
24. d
25. a
26. t
27. a
28. b
29. a
30. s
31. e
32. .


In [26]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import pickle

def load_data():
    # Load your data here
    # Assume we have a DataFrame with columns: user_id, movie_id, rating
    return pd.read_csv(RATINGS_FILE)

def create_user_item_matrix(df):
    # Create a user-item matrix
    user_item_matrix = df.pivot_table(index='user_id', columns='movie_title', values='rating', aggfunc='mean').fillna(0)
    return user_item_matrix

def train_knn_model(user_item_matrix, n_neighbors=5):
    # Create a sparse matrix
    user_item_sparse = csr_matrix(user_item_matrix.values)
    
    # Train KNN model
    model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_neighbors, n_jobs=-1)
    model.fit(user_item_sparse)
    
    return model, user_item_sparse

def get_movie_recommendations(user_id, model, user_item_matrix, user_item_sparse, n_recommendations=5):
    if user_id not in user_item_matrix.index:
        return "User not found in the database."
    
    # Find the user's index
    user_index = user_item_matrix.index.get_loc(user_id)
    
    # Find similar users
    distances, indices = model.kneighbors(user_item_sparse[user_index].reshape(1, -1), n_neighbors=n_recommendations+1)
    
    # Get the movies watched by similar users
    similar_users_movies = user_item_matrix.iloc[indices.flatten()[1:]]
    
    # Get movies the user hasn't watched
    user_movies = user_item_matrix.loc[user_id]
    unwatched_movies = user_movies[user_movies == 0].index
    
    # Calculate the average rating for unwatched movies among similar users
    recommendations = similar_users_movies[unwatched_movies].mean().sort_values(ascending=False)
    
    return recommendations.head(n_recommendations)

# Main execution
df = load_data()
user_item_matrix = create_user_item_matrix(df)
knn_model, user_item_sparse = train_knn_model(user_item_matrix)

# Example usage
user_id = 1  # Replace with an actual user ID from your dataset
recommendations = get_movie_recommendations(user_id, knn_model, user_item_matrix, user_item_sparse)
print(f"Recommendations for user {user_id}:")
print(recommendations)

# Save the model
model_data = {
    'knn_model': knn_model,
    'user_item_matrix': user_item_matrix,
    'user_item_sparse': user_item_sparse
}

with open('knn_movie_recommender.pkl', 'wb') as f:
    pickle.dump(model_data, f)

print("Model saved successfully!")


  user_item_matrix = df.pivot_table(index='user_id', columns='movie_title', values='rating', aggfunc='mean').fillna(0)


MemoryError: Unable to allocate 42.0 GiB for an array with shape (212294, 26548) and data type float64

In [None]:
import pickle

# Load the model
with open('knn_movie_recommender.pkl', 'rb') as f:
    loaded_model_data = pickle.load(f)

knn_model = loaded_model_data['knn_model']
user_item_matrix = loaded_model_data['user_item_matrix']
user_item_sparse = loaded_model_data['user_item_sparse']

# Get recommendations for a user
user_id = 1  # Replace with an actual user ID
recommendations = get_movie_recommendations(user_id, knn_model, user_item_matrix, user_item_sparse)
print(f"Recommendations for user {user_id}:")
print(recommendations)