<a href="https://colab.research.google.com/github/PrabhatGhm7/Movie-Recommendation-Model/blob/main/Movie_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import re



In [3]:
import kagglehub

# Download latest version
movielens_20m_dataset_path = kagglehub.dataset_download('grouplens/movielens-20m-dataset')

# Specify the path to the desired CSV file within the dataset directory
ratings_file_path = movielens_20m_dataset_path + '/rating.csv'
movie_file_path = movielens_20m_dataset_path + '/movie.csv'


Downloading from https://www.kaggle.com/api/v1/datasets/download/grouplens/movielens-20m-dataset?dataset_version_number=1...


100%|██████████| 195M/195M [00:01<00:00, 103MB/s]

Extracting files...





In [4]:
df_rating = pd.read_csv(ratings_file_path)
df_movie = pd.read_csv(movie_file_path)


In [5]:
df_movie['title'] = df_movie['title'].apply(lambda x: re.sub(r'\s?\(\d{4}\)', '', x))


In [6]:
df_rating.shape

(20000263, 4)

In [7]:
def sanity_check(dataframe):
  print("******************Shape************************")
  print(dataframe.shape)

  print("******************Info************************")
  print(dataframe.info())

  print("********************null**********************")
  print(dataframe.isnull().sum())


  print("********************duplicate**********************")
  print(dataframe.duplicated().sum())


In [8]:
df_rating['rating'] = df_rating['rating'].astype(int)


In [9]:
df_rating.drop('timestamp',axis=1)

Unnamed: 0,userId,movieId,rating
0,1,2,3
1,1,29,3
2,1,32,3
3,1,47,3
4,1,50,3
...,...,...,...
20000258,138493,68954,4
20000259,138493,69526,4
20000260,138493,69644,3
20000261,138493,70286,5


In [10]:
sanity_check(df_movie)

******************Shape************************
(27278, 3)
******************Info************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB
None
********************null**********************
movieId    0
title      0
genres     0
dtype: int64
********************duplicate**********************
0


In [11]:
df_movie['genres'] = df_movie['genres'].str.split('|')
one_hot = df_movie['genres'].str.join('|').str.get_dummies()
df_movie = pd.concat([df_movie, one_hot],axis =1)



In [12]:
df_movie  = df_movie.drop('genres',axis =1 )

In [13]:
df_movie.head()

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
movie_rating = df_rating.merge(df_movie , on = 'movieId')

In [15]:
movie_rating.drop('timestamp',axis=1,inplace=True)

In [16]:

# Filter the movie ratings dataset
movie_rating = movie_rating.iloc[:1000000,:]

# Count the number of ratings for each movie
movie_count = movie_rating['movieId'].value_counts()

In [17]:
# Filter movies with at least 50 ratings
filter_movie = movie_count[movie_count >= 50].index
movie_rating = movie_rating[movie_rating['movieId'].isin(filter_movie)]



In [18]:
# Reset movieId and userId to continuous integer indices for the matrix
movie_rating['movieId'] = movie_rating['movieId'].astype("category").cat.codes
movie_rating['userId'] = movie_rating['userId'].astype("category").cat.codes

In [19]:
# Create the movie-user matrix (csr_matrix)
movie_user_matrix = csr_matrix((movie_rating['rating'], (movie_rating['movieId'], movie_rating['userId'])))

In [20]:
# Compute cosine similarity
item_similarity = cosine_similarity(movie_user_matrix)

# Get the unique movie IDs after filtering
unique_movie_ids = movie_rating['movieId'].unique()

# Create the DataFrame using the unique movie IDs as indices and columns
item_similarity_df = pd.DataFrame(item_similarity, index=unique_movie_ids, columns=unique_movie_ids)


In [21]:
movie_rating

Unnamed: 0,userId,movieId,rating,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,1,3,Jumanji,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,28,3,"City of Lost Children, The (Cité des enfants p...",0,0,1,0,0,0,...,0,0,0,0,1,0,1,0,0,0
2,0,31,3,Twelve Monkeys (a.k.a. 12 Monkeys),0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,0,0
3,0,42,3,Seven (a.k.a. Se7en),0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,0,44,3,"Usual Suspects, The",0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,6742,895,4,Men in Black (a.k.a. MIB),0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
999996,6742,896,3,Contact,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
999997,6742,897,3,G.I. Jane,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
999998,6742,900,4,Cop Land,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [22]:
def get_recommendations_by_name(movie_name, similarity_matrix, movie_rating, top_n=5):
    """
    Gets movie recommendations based on the given movie name.

    Args:
        movie_name: The name of the movie to get recommendations for.
        similarity_matrix: The movie similarity matrix.
        movie_rating: The DataFrame containing movie ratings and titles.
        top_n: The number of top recommendations to return.

    Returns:
        A list of tuples containing the recommended movie titles and their similarity scores.
    """
    # Get the movie ID for the given movie name
    movie_id_row = movie_rating[movie_rating['title'] == movie_name]['movieId']

    # Check if the movie exists in the DataFrame
    if movie_id_row.empty:
        print(f"Movie '{movie_name}' not found in the dataset.")
        return []  # Return an empty list if movie not found

    movie_id = movie_id_row.values[0]

    # Get similarity scores for all movies with the given movie ID
    similar_movies = similarity_matrix[movie_id].sort_values(ascending=False)

    # Skip the first movie (itself) and get top N most similar movies
    top_similar_movies = similar_movies.iloc[1:top_n+1]

    # Get movie names corresponding to the top similar movie IDs
    recommended_movies = []
    for movie_id, score in top_similar_movies.items():
        movie_title = movie_rating[movie_rating['movieId'] == movie_id]['title'].values[0]
        recommended_movies.append((movie_title, score))

    return recommended_movies

In [23]:


recommendations = get_recommendations_by_name(movie_name="Thor",
                                              similarity_matrix=item_similarity_df,
                                              movie_rating=movie_rating,
                                              top_n=10)


for movie_title, score in recommendations:
    print(f"Movie Title: {movie_title}, Similarity Score: {score}")

Movie Title: National Lampoon's Van Wilder, Similarity Score: 0.42528362430378774
Movie Title: X-Men: First Class, Similarity Score: 0.4248980771713031
Movie Title: Rise of the Planet of the Apes, Similarity Score: 0.41746264653849835
Movie Title: Hangover Part II, The, Similarity Score: 0.4149249856179804
Movie Title: Beverly Hills Cop, Similarity Score: 0.40950927776706975
Movie Title: Beverly Hills Cop II, Similarity Score: 0.40805380897837795
Movie Title: Harry Potter and the Deathly Hallows: Part 2, Similarity Score: 0.4052124656796467
Movie Title: WarGames, Similarity Score: 0.3962121504732302
Movie Title: Triplets of Belleville, The (Les triplettes de Belleville), Similarity Score: 0.3910043401257703
Movie Title: City Slickers, Similarity Score: 0.3902134367938313
