# **Dataset**

[https://www.kaggle.com/datasets/parasharmanas/movie-recommendation-system](https://www.kaggle.com/datasets/parasharmanas/movie-recommendation-system)

# **Importing necessary libraries**

Loads essential libraries (pandas, numpy, scipy.sparse, sklearn.neighbors, pickle) for data manipulation, matrix operations, and model training.

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import pickle

# **Reading movie and rating datasets**

Loads movies.csv and ratings.csv into Pandas DataFrames.

In [None]:
# Single place to change where the Kaggle CSV files live on this machine.
DATA_DIR = 'C:/Users/Wasseem/Desktop/project/movie recommendor'

# Movie metadata: one row per movie (movieId, title, genres).
movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
# User ratings: one row per (userId, movieId) rating event.
rating = pd.read_csv(f'{DATA_DIR}/ratings.csv')

# **Exploring the movies dataset**

Displays basic info, shape, and a sample movie title to understand the dataset.

# **Exploring the ratings dataset**

Checks for null values, dataset shape, and unique users in the ratings dataset.

In [None]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Summarise the movies table: column names, non-null counts and dtypes.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [None]:
# Spot-check a single title; iloc is positional, so this reads the 5001st row.
movies.iloc[5000]['title']

'Crossroads (2002)'

In [None]:
# (rows, columns) of the movies table.
movies.shape

(62423, 3)

In [None]:
# Summarise the ratings table: column dtypes and memory footprint.
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


In [None]:
# Preview the first rows of the ratings table.
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [None]:
# Count the missing values in every column of the ratings table.
null_counts = rating.isna().sum()
null_counts

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [None]:
# Number of ratings submitted by each user, most active users first.
rating['userId'].value_counts()

userId
72315     32202
80974      9178
137293     8913
33844      7919
20055      7488
          ...  
12094        20
119539       20
156759       20
12084        20
36207        20
Name: count, Length: 162541, dtype: int64

In [None]:
# Number of distinct users, displayed as a one-element shape tuple.
rating['userId'].unique().shape

(162541,)

# **Filtering active users**

Retains users with more than 200 ratings to focus on active users.


In [None]:
# A user is "active" when they have strictly more than this many ratings.
MIN_RATINGS_PER_USER = 200

# Ratings per user, then the ids of the users above the threshold.
user_counts = rating['userId'].value_counts()
active_users = user_counts[user_counts > MIN_RATINGS_PER_USER].index

# Keep only the rows written by active users.
# NOTE: this rebinds `rating`; the unfiltered table is no longer reachable
# afterwards without re-running the loading cell.
rating = rating[rating['userId'].isin(active_users)]

In [None]:
# Ids of the users that passed the activity filter.
active_users

Index([ 72315,  80974, 137293,  33844,  20055, 109731,  92046,  49403,  30879,
       115102,
       ...
        92476, 101711, 106680, 106790,  96099, 156456, 115756,   6761,  68693,
       145404],
      dtype='int64', name='userId', length=32848)

In [None]:
# Preview the ratings that remain after the activity filter.
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
254,3,1,4.0,1439472215
255,3,29,4.5,1484754967
256,3,32,4.5,1439474635
257,3,50,5.0,1439474391
258,3,111,4.0,1484753849


In [None]:
# Only the user, the movie and the score are needed downstream; the
# timestamp column is left behind.
rating = rating.loc[:, ['userId', 'movieId', 'rating']]
rating.head()

Unnamed: 0,userId,movieId,rating
254,3,1,4.0
255,3,29,4.5
256,3,32,4.5
257,3,50,5.0
258,3,111,4.0


# **Merging ratings with movies**

Combines ratings with movie titles using movieId for better readability.

In [None]:
# Attach title and genres to every rating by joining on the shared movieId key.
ratings_with_movies = pd.merge(rating, movies, on='movieId')
ratings_with_movies.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,3,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,3,29,4.5,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,3,32,4.5,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,3,50,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
4,3,111,4.0,Taxi Driver (1976),Crime|Drama|Thriller


In [None]:
# (rows, columns) of the merged ratings/movies table.
ratings_with_movies.shape

(16063558, 5)

In [None]:
# One row per movie with the number of ratings it received; the count
# column is named directly while turning the grouped Series into a frame.
number_rating = (
    ratings_with_movies
    .groupby('movieId')['rating']
    .count()
    .reset_index(name='num_of_rating')
)

In [None]:
# Add the per-movie rating count to every rating row (joined on movieId).
final_ratings_with_movies = pd.merge(ratings_with_movies, number_rating, on='movieId')
final_ratings_with_movies.head()

Unnamed: 0,userId,movieId,rating,title,genres,num_of_rating
0,3,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,22843
1,3,29,4.5,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi,4706
2,3,32,4.5,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,19421
3,3,50,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,21007
4,3,111,4.0,Taxi Driver (1976),Crime|Drama|Thriller,14437


In [None]:
# (rows, columns) after adding the num_of_rating column.
final_ratings_with_movies.shape

(16063558, 6)

# **Filtering movies with at least 5000 ratings**

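Keeps only movies that received at least 5000 ratings from the retained active users, so recommendations are based on frequently rated movies (the filter is on the number of ratings, not on how high they are).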

In [None]:
# A movie is kept when it has at least this many ratings. The counts in
# `num_of_rating` were computed from the merged ratings table above.
MIN_RATINGS_PER_MOVIE = 5000

# NOTE: this rebinds `final_ratings_with_movies` to the filtered rows.
final_ratings_with_movies = final_ratings_with_movies[
    final_ratings_with_movies['num_of_rating'] >= MIN_RATINGS_PER_MOVIE
]
final_ratings_with_movies.shape

(7653202, 6)

In [None]:
# Number of rows that are exact duplicates of an earlier row.
final_ratings_with_movies.duplicated().sum()

0

# **Creating a pivot table**

Converts the dataset into a matrix where rows are movies, columns are users, and values are ratings (filling missing values with 0).

In [None]:
# Movie-by-user rating matrix: one row per title, one column per userId,
# with 0 wherever a user has not rated a movie.
movies_pivot = pd.pivot_table(
    final_ratings_with_movies,
    values='rating',
    index='title',
    columns='userId',
    fill_value=0,
)

In [None]:
# Display the (truncated) title-by-user rating matrix.
movies_pivot

userId,3,4,12,13,19,23,31,38,43,57,...,162507,162508,162512,162516,162519,162521,162524,162529,162533,162534
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(500) Days of Summer (2009),0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.5,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Things I Hate About You (1999),0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,3.0,5.0,3.5,0.0,0.0,3.0,0.0,2.5,0.0
101 Dalmatians (One Hundred and One Dalmatians) (1961),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0,0.0,4.0,...,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,4.5
2001: A Space Odyssey (1968),5.0,4.0,0.0,3.5,5.0,4.0,1.0,0.0,3.5,0.0,...,2.0,0.0,4.0,4.5,5.0,4.0,0.0,5.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
You've Got Mail (1998),0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0
Young Frankenstein (1974),0.0,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,4.0,3.0,0.0,0.0,5.0,0.0,4.0
Zodiac (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,4.0,0.0,0.0,0.0,0.0,3.5,3.0
Zombieland (2009),4.0,0.0,0.0,4.0,0.0,0.0,0.0,3.5,0.0,0.0,...,0.0,3.0,3.0,3.0,0.0,4.5,0.0,0.0,3.5,0.0


# **Converting the pivot table to a sparse matrix**

Uses csr_matrix to optimize memory usage for Nearest Neighbors computation.

In [None]:
# Compressed sparse row copy of the pivot values; most entries are the 0 fill.
movie__pivot_sparse = csr_matrix(movies_pivot.to_numpy())

# **Training a Nearest Neighbors model**

Initializes and fits a NearestNeighbors model using a brute-force algorithm.

In [None]:
# Brute-force nearest-neighbour index over the movie rows of the sparse matrix.
# NOTE(review): no metric is passed, so scikit-learn's default distance is
# used (presumably Euclidean, which matches the magnitude of the distances
# printed further down) - confirm this is the intended similarity measure.
movie_recommindor = NearestNeighbors(algorithm= 'brute')
# fit() only stores the training rows here; each movie (row) is one sample.
movie_recommindor.fit(movie__pivot_sparse)

## **Finding similar movies for "Spider-Man (2002)"**

Retrieves recommendations by computing nearest neighbors based on user rating patterns.

## **Finding similar movies for "Batman Begins (2005)"**

Repeats the recommendation process for a different movie.

In [None]:
# Look up the row position of the query title in the pivot index.
movie_name = "Spider-Man (2002)"
movie_index = movies_pivot.index.get_loc(movie_name)
movie_index

669

In [None]:
# Query with the row located in the previous cell instead of a hard-coded
# position, so the lookup stays correct if the pivot table changes.
query_row = movies_pivot.iloc[movie_index, :].values.reshape(1, -1)
# Six neighbours are requested; the output below shows the query movie itself
# coming back first at distance 0.
distance, suggestion = movie_recommindor.kneighbors(query_row, n_neighbors=6)


In [None]:
# Distances from the query movie to each returned neighbour (closest first).
distance

array([[  0.        , 291.03693924, 332.73788483, 333.58619576,
        342.36019044, 345.56620205]])

In [None]:
# Row positions (into movies_pivot) of the returned neighbours.
suggestion

array([[669, 670, 796, 801, 685, 557]], dtype=int64)

In [None]:
# `suggestion` holds one row of neighbour positions per query; translate each
# row back into movie titles.
for neighbour_positions in suggestion:
    print(movies_pivot.index[neighbour_positions])


Index(['Spider-Man (2002)', 'Spider-Man 2 (2004)', 'X-Men (2000)',
       'X2: X-Men United (2003)',
       'Star Wars: Episode II - Attack of the Clones (2002)',
       'Pirates of the Caribbean: The Curse of the Black Pearl (2003)'],
      dtype='object', name='title')


In [None]:
# Look up the row position of the query title in the pivot index.
movie_name = "Batman Begins (2005)"  # Replace with the movie title you are looking for
movie_index = movies_pivot.index.get_loc(movie_name)
movie_index

70

In [None]:
# Query with the row located in the previous cell instead of a hard-coded
# position, so the lookup stays correct if the pivot table changes.
query_row = movies_pivot.iloc[movie_index, :].values.reshape(1, -1)
# Six neighbours are requested; the output below shows the query movie itself
# coming back first at distance 0.
distance, suggestion = movie_recommindor.kneighbors(query_row, n_neighbors=6)

In [None]:
# Distances from the query movie to each returned neighbour (closest first).
distance

array([[  0.        , 318.77382264, 341.76161282, 344.11662267,
        354.11615326, 355.77907471]])

In [None]:
# `suggestion` holds one row of neighbour positions per query; translate each
# row back into movie titles.
for neighbour_positions in suggestion:
    print(movies_pivot.index[neighbour_positions])

Index(['Batman Begins (2005)', 'Dark Knight, The (2008)', 'Iron Man (2008)',
       'V for Vendetta (2006)', 'Casino Royale (2006)', 'Spider-Man 2 (2004)'],
      dtype='object', name='title')


# **Defining a function for movie recommendations**

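movie_recommindation(movie_name, number_of_movies) finds and prints recommendations for a given movie. `number_of_movies` is the number of neighbours requested from the model and includes the selected movie itself, so `number_of_movies - 1` suggestions are printed.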

In [None]:
def movie_recommindation(movie_name,number_of_movies):
    """Print the movies whose rating patterns are closest to ``movie_name``.

    Reads the notebook-level ``movies_pivot`` (title x user rating matrix) and
    the fitted ``movie_recommindor`` nearest-neighbours model.

    Parameters
    ----------
    movie_name : str
        Title exactly as it appears in ``movies_pivot.index``.
    number_of_movies : int
        Number of neighbours requested from the model. The selected movie
        counts as one of them, so at most ``number_of_movies - 1`` suggestions
        are printed.

    Raises
    ------
    IndexError
        If ``movie_name`` is not present in ``movies_pivot.index``.
    """
    matches = np.where(movies_pivot.index == movie_name)[0]
    if matches.size == 0:
        # Same exception type as the bare ``[0][0]`` lookup used to raise,
        # but with a message that names the missing title.
        raise IndexError(f"Movie '{movie_name}' was not found in movies_pivot.index")
    movie_id = matches[0]

    query_row = movies_pivot.iloc[movie_id, :].values.reshape(1, -1)
    _, suggestion = movie_recommindor.kneighbors(query_row, n_neighbors=number_of_movies)

    # A single query row yields a single row of neighbour positions.
    neighbour_titles = movies_pivot.index[suggestion[0]]
    # Drop the selected movie by title rather than by position: when another
    # movie ties at distance 0 the query is not guaranteed to come back first.
    recommended = [title for title in neighbour_titles if title != movie_name]
    recommended = recommended[:number_of_movies - 1]

    print(f"You selected '{movie_name}'\n")
    print("The suggestion movies are: \n")
    for title in recommended:
        print(title)

# **Testing the recommendation function**

Calls the function for "Batman Begins (2005)" and "Spider-Man (2002)".

In [None]:
# Ask for 6 neighbours: the selected title is one of them, so five
# suggestions are printed.
movie_name = "Batman Begins (2005)"
movie_recommindation(movie_name,6)

You selected 'Batman Begins (2005)'

The suggestion movies are: 

Dark Knight, The (2008)
Iron Man (2008)
V for Vendetta (2006)
Casino Royale (2006)
Spider-Man 2 (2004)


In [None]:
# Ask for 7 neighbours: the selected title is one of them, so six
# suggestions are printed.
movie_name = "Spider-Man (2002)"
movie_recommindation(movie_name,7)

You selected 'Spider-Man (2002)'

The suggestion movies are: 

Spider-Man 2 (2004)
X-Men (2000)
X2: X-Men United (2003)
Star Wars: Episode II - Attack of the Clones (2002)
Pirates of the Caribbean: The Curse of the Black Pearl (2003)
Matrix Reloaded, The (2003)


# **Saving models and processed data using pickle**

Serializes and saves the trained model, final ratings dataset, pivot table, and movie names for later use.

In [None]:
# Folder that receives the pickled artifacts.
SAVE_DIR = 'C:/Users/Wasseem/Desktop/project/movie recommendor/saved'

# File name -> object to serialize.
artifacts = {
    'model.pkl': movie_recommindor,
    'final_rating.pkl': final_ratings_with_movies,
    'movies_pivot.pkl': movies_pivot,
}

for file_name, artifact in artifacts.items():
    # The context manager closes (and therefore flushes) each file even if
    # pickling fails; a bare open(...) left the handle open.
    with open(f'{SAVE_DIR}/{file_name}', 'wb') as pickle_file:
        pickle.dump(artifact, pickle_file)

In [None]:
# Titles in the same order as the rows of the pivot table.
movie_names = movies_pivot.index

In [None]:
# Spot-check: position 70 is the title looked up earlier for Batman Begins.
movie_names[70]

'Batman Begins (2005)'

In [None]:
# Folder that receives the pickled artifacts.
SAVE_DIR = 'C:/Users/Wasseem/Desktop/project/movie recommendor/saved'

# The context manager closes (and therefore flushes) the file even if
# pickling fails; a bare open(...) left the handle open.
with open(f'{SAVE_DIR}/movie_names.pkl', 'wb') as pickle_file:
    pickle.dump(movie_names, pickle_file)