In [55]:
import numpy as np, pandas as pd
import sklearn
import matplotlib.pyplot as plt

In [56]:
# Load the rating events; the preview shows the columns userId, movieId,
# rating and timestamp.
ratings = pd.read_csv('ratings.csv')
print(ratings.head())

   userId  movieId  rating   timestamp
0     877     4155       5  1651201566
1     305     7661       2  1639553712
2     381     8423       2  1610704432
3     208     6433       1  1650223767
4      47     7752       4  1663998365


In [57]:
# Load the movie catalogue; the preview shows movieId, title and a
# pipe-separated genres column.
movies = pd.read_csv('movies.csv')
print(movies.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [58]:
# Number of distinct userId values present in the ratings table.
num_users = ratings['userId'].nunique()
print(num_users)

999


In [59]:
# How many ratings each user has submitted: one row per userId, with the
# non-null movieId count stored in `rating_counts`.
user_rating_freq = (
    ratings
    .groupby('userId')['movieId']
    .count()
    .rename('rating_counts')
    .reset_index()
)
print(user_rating_freq.head())

   userId  rating_counts
0       1            120
1       2            105
2       3             89
3       4            100
4       5            107


In [60]:
# Average movie ratings
mean_ratings = ratings.groupby('movieId')[['rating']].mean()

# movieId with the lowest / highest mean rating (first one wins on ties).
lowest = mean_ratings['rating'].idxmin()
highest = mean_ratings['rating'].idxmax()

# Only the last bare expression of a cell is rendered, so these lookups were
# previously computed and thrown away. Print them explicitly instead.
print(movies.loc[movies['movieId'] == lowest])
print(movies.loc[movies['movieId'] == highest])
print(ratings[ratings['movieId'] == highest])
print(ratings[ratings['movieId'] == lowest])

# Per-movie rating count and mean. The outer 'rating' column level is dropped
# so the columns are simply 'count' and 'mean'.
movie_stats = ratings.groupby('movieId')[['rating']].agg(['count', 'mean'])
movie_stats.columns = movie_stats.columns.droplevel()

In [61]:
from scipy.sparse import csr_matrix

def movie_matrix(df):
    """Build a sparse movie-by-user rating matrix plus ID/index lookups.

    Args:
        df: DataFrame with 'userId', 'movieId' and 'rating' columns.

    Returns:
        A 5-tuple ``(mat, user_mapper, movie_mapper, inv_user_mapper,
        inv_movie_mapper)``:

        * ``mat`` - ``csr_matrix`` of shape (n_movies, n_users); entry
          ``[m, u]`` holds the rating for that movie/user pair.
        * ``user_mapper`` / ``movie_mapper`` - ID -> row/column position,
          positions following the sorted order of the unique IDs.
        * ``inv_user_mapper`` / ``inv_movie_mapper`` - position -> ID.
    """
    # Sorted unique IDs; the position in these arrays is the matrix index.
    unique_users = np.unique(df['userId'])
    unique_movies = np.unique(df['movieId'])
    n_users = len(unique_users)
    n_movies = len(unique_movies)

    # Forward lookups: raw ID -> position in the matrix.
    user_mapper = {uid: pos for pos, uid in enumerate(unique_users)}
    movie_mapper = {mid: pos for pos, mid in enumerate(unique_movies)}

    # Reverse lookups: position in the matrix -> raw ID.
    inv_user_mapper = {pos: uid for pos, uid in enumerate(unique_users)}
    inv_movie_mapper = {pos: mid for pos, mid in enumerate(unique_movies)}

    # Coordinates of every rating: movies index the rows, users the columns.
    col_positions = [user_mapper[uid] for uid in df['userId']]
    row_positions = [movie_mapper[mid] for mid in df['movieId']]

    mat = csr_matrix(
        (df['rating'], (row_positions, col_positions)),
        shape=(n_movies, n_users),
    )
    return mat, user_mapper, movie_mapper, inv_user_mapper, inv_movie_mapper

# Build the movie x user matrix and the ID <-> index lookups once.
# find_similar_movies reads `movie_mapper` and `movie_inv_mapper` from this
# module-level scope, so these names must exist before it is called.
movie_mat, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = movie_matrix(ratings)

In [62]:
from sklearn.neighbors import NearestNeighbors

# Find movie similarity using KNN based on Cosine Similarity
def find_similar_movies(movieId, matrix, k, metric='cosine'):
    """Return the IDs of up to ``k`` movies closest to ``movieId``.

    A brute-force ``NearestNeighbors`` model is fitted on ``matrix`` on every
    call and queried with the row that belongs to ``movieId``. Row positions
    are translated through the module-level ``movie_mapper`` and
    ``movie_inv_mapper``, so ``matrix`` must be the matrix those lookups
    were built for.

    Args:
        movieId: ID of the movie to find neighbours for.
        matrix: Matrix with one row per movie.
        k: Number of similar movies wanted.
        metric: Distance metric name forwarded to ``NearestNeighbors``.

    Returns:
        List of movie IDs, nearest first, never containing ``movieId``.
        Empty (after printing a notice) when ``movieId`` is unknown. Shorter
        than ``k`` when ``matrix`` has fewer than ``k + 1`` rows.
    """
    if movieId not in movie_mapper:
        print(f"Movie ID {movieId} not found in movie_mapper")
        return []

    movie_idx = movie_mapper[movieId]

    # Ask for one extra neighbour because the query row is part of the fitted
    # data. Cap at the row count: sklearn rejects n_neighbors > n_samples.
    n_neighbors = min(k + 1, matrix.shape[0])
    KNN = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, algorithm='brute')
    KNN.fit(matrix)

    movie_vec = matrix[movie_idx].reshape(1, -1)
    neighbour = KNN.kneighbors(movie_vec, return_distance=False)

    # Remove the query by index, not by position. When several rows tie at
    # distance 0 (identical or all-zero rating vectors) the query is not
    # guaranteed to be the first hit, so dropping element 0 could discard a
    # real neighbour and return the query movie itself.
    neighbour_ids = [
        movie_inv_mapper[int(idx)]
        for idx in neighbour.ravel()
        if int(idx) != movie_idx
    ]
    return neighbour_ids[:k]

In [63]:
def recommend_movies(userId, matrix, user_mapper, movie_mapper, inv_movie_mapper, k=10):
    """Print movies similar to the user's highest-rated movie.

    The seed movie is the first row (in ``ratings`` order) that carries the
    user's maximum rating. Neighbours come from ``find_similar_movies`` and
    titles are looked up in the module-level ``movies`` frame. Neighbours
    without a title there are skipped, so fewer than ``k`` lines may print.

    Args:
        userId: User to recommend for; looked up in the module-level
            ``ratings`` frame.
        matrix: Movie-by-user matrix forwarded to ``find_similar_movies``.
        user_mapper: Unused; kept so existing calls keep working.
        movie_mapper: Unused here; ``find_similar_movies`` reads the
            module-level mapper instead.
        inv_movie_mapper: Unused; kept so existing calls keep working.
        k: Number of neighbours to request.

    Returns:
        None. Results are printed.
    """
    # Rows with a missing rating cannot seed a recommendation.
    user_ratings = ratings[ratings['userId'] == userId].dropna(subset=['rating'])
    if user_ratings.empty:
        # Without this guard an unknown user crashed on max() of an empty sequence.
        print(f"No ratings found for userId: {userId}")
        return

    top_rating = user_ratings['rating'].max()
    movieId = user_ratings.loc[user_ratings['rating'] == top_rating, 'movieId'].iloc[0]

    titles = dict(zip(movies['movieId'], movies['title']))
    similar_movies = find_similar_movies(movieId, matrix, k)

    if movieId in titles:
        print(f"Based on movies like {titles[movieId]}, you may also like:")
    else:
        print("Movie title not found for movieId:", movieId)

    for similar_id in similar_movies:
        if similar_id in titles:
            print(titles[similar_id])


In [65]:
# Demo: recommendations for one user. recommend_movies skips neighbours whose
# movieId has no title in `movies`, so fewer than k titles may be printed.
user_id = 10
recommend_movies(user_id, movie_mat, user_mapper, movie_mapper, movie_inv_mapper, k=10)

Movie title not found for movieId: 9483
Owning Mahowny (2003)
Night of the Iguana, The (1964)
Motel Hell (1980)
Naqoyqatsi (2002)
So I Married an Axe Murderer (1993)
