In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Read the ratings table from disk and preview its first five rows.
ratings = pd.read_csv(filepath_or_buffer="ratings.csv")
print(ratings.head(n=5))

   userId  movieId  rating  timestamp
0       1       17     4.0  944249077
1       1       25     1.0  944250228
2       1       29     2.0  943230976
3       1       30     5.0  944249077
4       1       32     5.0  943228858


In [3]:
# Per-column count of missing values in the ratings table.
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [4]:
# Read the movie catalogue from disk and preview its first five rows.
movies = pd.read_csv(filepath_or_buffer="movies.csv")
print(movies.head(n=5))

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [5]:
# Per-column count of missing values in the movie catalogue.
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [6]:
# One-hot encode the pipe-separated `genres` string into one 0/1 column per
# genre, then attach those columns to the movie table in place of the raw string.
genre_dummies = movies['genres'].str.get_dummies(sep='|')
movies_new = pd.concat([movies.drop(columns='genres'), genre_dummies], axis=1)
print(movies_new.head())


   movieId                               title  (no genres listed)  Action  \
0        1                    Toy Story (1995)                   0       0   
1        2                      Jumanji (1995)                   0       0   
2        3             Grumpier Old Men (1995)                   0       0   
3        4            Waiting to Exhale (1995)                   0       0   
4        5  Father of the Bride Part II (1995)                   0       0   

   Adventure  Animation  Children  Comedy  Crime  Documentary  ...  Film-Noir  \
0          1          1         1       1      0            0  ...          0   
1          1          0         1       0      0            0  ...          0   
2          0          0         0       1      0            0  ...          0   
3          0          0         0       1      0            0  ...          0   
4          0          0         0       1      0            0  ...          0   

   Horror  IMAX  Musical  Mystery  Romance  

In [7]:
# How many movies are flagged with the "(no genres listed)" indicator column?
count_no_genres = movies_new['(no genres listed)'].eq(1).sum()
print(f"Number of  movies with no genres listed: {count_no_genres }")

Number of  movies with no genres listed: 7080


In [8]:
# Headline size statistics of the ratings table.
n_ratings = ratings.shape[0]
n_movies = ratings['movieId'].nunique(dropna=False)
n_users = ratings['userId'].nunique(dropna=False)

# One print call, one statistic per output line.
print(
    f"Number of ratings: {n_ratings}",
    f"Number of unique movieId's: {n_movies}",
    f"Number of unique users: {n_users}",
    f"Average ratings per user: {round(n_ratings/n_users, 2)}",
    f"Average ratings per movie: {round(n_ratings/n_movies, 2)}",
    sep="\n",
)

Number of ratings: 32000204
Number of unique movieId's: 84432
Number of unique users: 200948
Average ratings per user: 159.25
Average ratings per movie: 379.01


In [None]:
# Select the rating that userId 1 gave to movieId 10, if such a row exists.
rating_value = ratings.loc[ratings['movieId'].eq(10) & ratings['userId'].eq(1), 'rating']

# An empty selection means no matching (movieId, userId) row was found.
if rating_value.empty:
    print("No rating found for movieId 10 and userId 1.")
else:
    print(rating_value.to_numpy()[0])

No rating found for movieId 10 and userId 1.


In [None]:
# Number of ratings submitted by each user, as a two-column frame
# (userId, n_ratings) ordered by userId.
user_freq = (
    ratings.groupby('userId')['movieId']
    .count()
    .rename('n_ratings')
    .reset_index()
)
print(user_freq.head())


   userId  n_ratings
0       1        141
1       2         52
2       3        147
3       4         27
4       5         33


In [11]:
# Average rating received by each movie (index: movieId, single column: rating).
# The lowest- and highest-rated movies are looked up from this frame.
mean_rating = ratings.groupby('movieId')['rating'].mean().to_frame()
print(mean_rating)


           rating
movieId          
1        3.897438
2        3.275758
3        3.139447
4        2.845331
5        3.059602
...           ...
292731   4.000000
292737   1.500000
292753   4.000000
292755   1.000000
292757   3.500000

[84432 rows x 1 columns]


In [12]:
# Lowest rated movie: the movieId with the smallest mean rating
# (idxmin returns the first such movieId when several movies tie).
lowest_rated = mean_rating['rating'].idxmin()
print(movies.loc[movies['movieId'] == lowest_rated])

# Show every individual rating this movie received (one row per rating);
# the number of rows is the number of ratings behind the mean.
print(ratings[ratings['movieId'] == lowest_rated])

      movieId            title genres
5694     5805  Besotted (2001)  Drama
          userId  movieId  rating   timestamp
7806435    48903     5805     0.5  1184971591
14341972   89686     5805     0.5  1092107651


In [13]:
# Highest rated movie: the movieId with the largest mean rating
# (idxmax returns the first such movieId when several movies tie).
highest_rated = mean_rating['rating'].idxmax()
print(movies.loc[movies['movieId'] == highest_rated])

# Show every individual rating this movie received (one row per rating);
# the number of rows is the number of ratings behind the mean.
print(ratings[ratings['movieId'] == highest_rated])

      movieId                       title       genres
9644    31945  Always a Bridesmaid (2000)  Documentary
          userId  movieId  rating   timestamp
14497441   90716    31945     5.0  1553102189


In [14]:
# Per-movie rating statistics: how many ratings each movie has and their mean.
# Aggregating the `rating` Series directly yields flat `count` / `mean` columns.
movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])
print(movie_stats.head())

         count      mean
movieId                 
1        68997  3.897438
2        28904  3.275758
3        13134  3.139447
4         2806  2.845331
5        13154  3.059602


In [15]:
# Create the movie-by-user rating matrix (rows = movies, columns = users) as a SciPy CSR matrix:
from scipy.sparse import csr_matrix

def create_matrix(df):
    """Build a sparse movie-by-user rating matrix plus ID/index lookup tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``userId``, ``movieId`` and ``rating``.

    Returns
    -------
    X : scipy.sparse.csr_matrix
        Shape ``(M, N)`` with M distinct movies as rows and N distinct users
        as columns; cell ``[m, u]`` holds the rating. Note that SciPy sums
        duplicate ``(movie, user)`` entries when building the matrix.
    user_mapper : dict
        userId -> column index.
    movie_mapper : dict
        movieId -> row index.
    user_inv_mapper : dict
        column index -> userId.
    movie_inv_mapper : dict
        row index -> movieId.

    Indices follow the sorted order of the distinct IDs.
    """
    # np.unique returns the sorted distinct IDs and, with return_inverse, the
    # position of every row's ID inside that sorted array -- which is exactly
    # the matrix index -- in one vectorised pass (no per-row dict lookups).
    user_ids, user_index = np.unique(df["userId"].to_numpy(), return_inverse=True)
    movie_ids, movie_index = np.unique(df["movieId"].to_numpy(), return_inverse=True)

    N = len(user_ids)
    M = len(movie_ids)

    # Map IDs to indices
    user_mapper = dict(zip(user_ids, range(N)))
    movie_mapper = dict(zip(movie_ids, range(M)))

    # Map indices back to IDs
    user_inv_mapper = dict(zip(range(N), user_ids))
    movie_inv_mapper = dict(zip(range(M), movie_ids))

    # COO-style constructor: (values, (row indices, column indices))
    X = csr_matrix(
        (df["rating"].to_numpy(), (movie_index, user_index)),
        shape=(M, N),
    )

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

# Build the sparse rating matrix and the four ID/index lookup tables
# from the full ratings table.
(
    X,
    user_mapper,
    movie_mapper,
    user_inv_mapper,
    movie_inv_mapper,
) = create_matrix(ratings)

In [16]:
# Translate one matrix cell (row 0, column 9) back to its movieId / userId
# via the inverse mappers and report the rating actually stored in that cell.
movie_id = movie_inv_mapper[0]  # movieId for row index 0
user_id = user_inv_mapper[9]    # userId for column index 9
stored_rating = X[0, 9]         # read the value instead of hard-coding it

# A zero cell is treated as "no rating stored"
# (assumes real ratings are never 0 -- TODO confirm against the dataset).
if stored_rating:
    print(f"User {user_id} rated Movie {movie_id} with a rating of {stored_rating}.")
else:
    print(f"User {user_id} has not rated Movie {movie_id}.")

User 10 rated Movie 1 with a rating of 2.5.


In [17]:
# Find similar movies using KNN
from sklearn.neighbors import NearestNeighbors
def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
    """Return the k movies whose rating vectors are closest to `movie_id`'s.

    Reads the module-level `movie_mapper` (movieId -> row index) and
    `movie_inv_mapper` (row index -> movieId) dictionaries.

    Parameters
    ----------
    movie_id : int
        ID of the query movie; must be a key of `movie_mapper`
        (a KeyError is raised otherwise).
    X : matrix with one row per movie
        Rating matrix the neighbour search is fitted on.
    k : int
        Number of similar movies wanted. Fewer are returned when X has
        fewer than k other rows.
    metric : str, default 'cosine'
        Distance metric passed to NearestNeighbors.
    show_distance : bool, default False
        When True, return ``(movieId, distance)`` pairs instead of bare IDs.

    Returns
    -------
    list
        movieIds ordered from nearest to farthest, or ``(movieId, distance)``
        tuples when `show_distance` is True. The query movie is never included.
    """
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind].reshape(1, -1)

    # Ask for one extra neighbour because the query movie is itself a row of X,
    # but never more neighbours than there are rows to choose from.
    n_neighbors = min(k + 1, X.shape[0])
    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    kNN.fit(X)

    # Always fetch distances and indices together so the result has one shape
    # regardless of `show_distance` (kneighbors returns a tuple in that mode).
    distances, indices = kNN.kneighbors(movie_vec, return_distance=True)

    # Drop the query movie by its index rather than by position: with tied
    # distances it is not guaranteed to be the first neighbour returned.
    neighbours = [
        (movie_inv_mapper[int(ind)], float(dist))
        for ind, dist in zip(indices[0], distances[0])
        if ind != movie_ind
    ][:k]

    if show_distance:
        return neighbours
    return [neighbour_id for neighbour_id, _ in neighbours]


# Lookup table movieId -> title for turning neighbour IDs into readable names.
movie_titles = {
    mid: title for mid, title in zip(movies['movieId'], movies['title'])
}

movie_id = 3  # changeable

# Title of the query movie and the IDs of its 10 nearest neighbours.
movie_title = movie_titles[movie_id]
similar_ids = find_similar_movies(movie_id, X, k=10)

print(f"Since you watched {movie_title}")
for i in similar_ids:
    print(movie_titles[i])

Since you watched Grumpier Old Men (1995)
Father of the Bride Part II (1995)
Twister (1996)
Nutty Professor, The (1996)
Mr. Holland's Opus (1995)
Executive Decision (1996)
Sabrina (1995)
Eraser (1996)
Birdcage, The (1996)
Broken Arrow (1996)
Phenomenon (1996)


In [18]:
def recommend_movies_for_user(user_id, X, user_mapper, movie_mapper, movie_inv_mapper, k=10):
    """Print k movies similar to the movie that `user_id` rated highest.

    Reads the module-level `ratings` and `movies` DataFrames. The seed movie
    is the first row, in `ratings` order, among the user's top-rated movies.

    Parameters
    ----------
    user_id : int
        User to recommend for. A message is printed if the user has no ratings.
    X : matrix with one row per movie
        Rating matrix forwarded to `find_similar_movies`.
    user_mapper, movie_mapper, movie_inv_mapper : dict
        Accepted for interface compatibility; not used directly in this
        function body.
    k : int, default 10
        Number of recommendations to print.

    Returns
    -------
    None
        All results are printed.
    """
    df1 = ratings[ratings['userId'] == user_id]

    if df1.empty:
        print(f"User with ID {user_id} does not exist.")
        return

    # First movie (in ratings order) that carries the user's maximum rating.
    movie_id = df1.loc[df1['rating'] == df1['rating'].max(), 'movieId'].iloc[0]

    movie_titles = dict(zip(movies['movieId'], movies['title']))

    # Validate the seed movie before the expensive neighbour search, and use
    # None rather than a sentinel string to detect a missing title.
    movie_title = movie_titles.get(movie_id)
    if movie_title is None:
        print(f"Movie with ID {movie_id} not found.")
        return

    similar_ids = find_similar_movies(movie_id, X, k)

    print(f"Since you watched {movie_title}, you might also like:")
    for i in similar_ids:
        print(movie_titles.get(i, "Movie not found"))

In [19]:
# Recommend 10 movies based on this user's top-rated movie.
user_id = 150  # changeable
recommend_movies_for_user(
    user_id,
    X,
    user_mapper,
    movie_mapper,
    movie_inv_mapper,
    k=10,
)

Since you watched City of Lost Children, The (Cité des enfants perdus, La) (1995), you might also like:
Delicatessen (1991)
Brazil (1985)
Dark City (1998)
Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
Pi (1998)
Blade Runner (1982)
Clockwork Orange, A (1971)
Trainspotting (1996)
Akira (1988)
Ghost in the Shell (Kôkaku kidôtai) (1995)


In [20]:
# Same recommendation run for a second user.
user_id = 415  # another user_id
recommend_movies_for_user(
    user_id,
    X,
    user_mapper,
    movie_mapper,
    movie_inv_mapper,
    k=10,
)

Since you watched Matrix, The (1999), you might also like:
Fight Club (1999)
Lord of the Rings: The Fellowship of the Ring, The (2001)
Lord of the Rings: The Return of the King, The (2003)
Lord of the Rings: The Two Towers, The (2002)
Star Wars: Episode V - The Empire Strikes Back (1980)
Star Wars: Episode IV - A New Hope (1977)
Gladiator (2000)
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
Shawshank Redemption, The (1994)
Sixth Sense, The (1999)
