In [22]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movie-recommendation-system/movies.csv
/kaggle/input/movie-recommendation-system/ratings.csv


# Data Loading

In [23]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

movies_df = pd.read_csv('/kaggle/input/movie-recommendation-system/movies.csv')
movies_df.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [24]:
#text preprocessing
import re

def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [25]:

movies_df['genres_list'] = movies_df['genres'].str.replace('|', ' ')
movies_df['clean_title'] = movies_df['title'].apply(clean_title)

# Drop the 'genres' and 'title' columns
movies_df = movies_df.drop(columns=['genres', 'title'])

# Display the first few rows of the cleaned data
movies_df.head()

Unnamed: 0,movieId,genres_list,clean_title
0,1,Adventure Animation Children Comedy Fantasy,Toy Story 1995
1,2,Adventure Children Fantasy,Jumanji 1995
2,3,Comedy Romance,Grumpier Old Men 1995
3,4,Comedy Drama Romance,Waiting to Exhale 1995
4,5,Comedy,Father of the Bride Part II 1995


In [26]:
ratings_df = pd.read_csv('/kaggle/input/movie-recommendation-system/ratings.csv')
ratings_df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
5,1,1088,4.0,1147868495
6,1,1175,3.5,1147868826
7,1,1217,3.5,1147878326
8,1,1237,5.0,1147868839
9,1,1250,4.0,1147868414


In [27]:
#we would not be needing the timestamp column
ratings_df = ratings_df.drop(['timestamp'], axis=1)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [28]:
#merge the two datasets together
combined_data = ratings_df.merge(movies_df, on='movieId')
combined_data.head()

Unnamed: 0,userId,movieId,rating,genres_list,clean_title
0,1,296,5.0,Comedy Crime Drama Thriller,Pulp Fiction 1994
1,1,306,3.5,Drama,Three Colors Red Trois couleurs Rouge 1994
2,1,307,5.0,Drama,Three Colors Blue Trois couleurs Bleu 1993
3,1,665,5.0,Comedy Drama War,Underground 1995
4,1,899,3.5,Comedy Musical Romance,Singin in the Rain 1952


# Generating the Top 10 Recommendation by Popularity 


In [34]:

# Calculate the rating count (popularity) for each movie
rating_count = combined_data.groupby('movieId').size().reset_index(name='popularity')

# Calculate the average rating for each movie
average_rating = combined_data.groupby('movieId')['rating'].mean().reset_index(name='average_rating')

# Merge the rating count and average rating with the movies data
popular_movies = movies_df.merge(rating_count, on='movieId').merge(average_rating, on='movieId')

# Filter movies with an average rating greater than 4
popular_high_rated_movies = popular_movies[popular_movies['average_rating'] >= 4]

# Sort the movies by rating count in descending order
popular_high_rated_movies = popular_high_rated_movies.sort_values('popularity', ascending=False)

# Select the top 10 most popular movies with an average rating greater than 4
top_10_high_rated_movies = popular_high_rated_movies.head(10)

# Display the top 10 movies with their title, popularity, and average rating
print("Top 10 Recommendations:\t")
print(top_10_high_rated_movies[['clean_title', 'popularity', 'average_rating']])

Top 10 Recommendations:	
                                            clean_title  popularity  \
351                                   Forrest Gump 1994       81491   
314                       Shawshank Redemption The 1994       81482   
292                                   Pulp Fiction 1994       79672   
585                       Silence of the Lambs The 1991       74127   
2480                                    Matrix The 1999       72674   
257               Star Wars Episode IV  A New Hope 1977       68717   
522                                Schindlers List 1993       60411   
108                                     Braveheart 1995       59184   
2867                                    Fight Club 1999       58773   
1166  Star Wars Episode V  The Empire Strikes Back 1980       57361   

      average_rating  
351         4.048011  
314         4.413576  
292         4.188912  
585         4.151342  
2480        4.154099  
257         4.120189  
522         4.247579  
108      

# Generating the Top 10 Recommendation by Title using TfidfVectorizer and Cosine Similarity
# 

In [33]:
#we use the TF-IDF matrix to search for top 10 movies with similar title.
vectorizer_title = TfidfVectorizer(ngram_range=(1,2))

tfidf_title = vectorizer_title.fit_transform(movies_df['clean_title'])

def search_by_title(title):
    title = clean_title(title)
    query_vec = vectorizer_title.transform([title])
    similarity = cosine_similarity(query_vec, tfidf_title).flatten()
    indices = np.argpartition(similarity, -10)[-10:]
    results = movies_df.iloc[indices][::-1]
     # Drop duplicates to ensure unique results
    results = results.drop_duplicates(subset=['clean_title'])
    return results

movie_results = search_by_title("Toy Story")
print("Top 10 Recommendations:\t")
print(movie_results[['clean_title', 'genres_list']])

Top 10 Recommendations:	
                                  clean_title  \
3021                         Toy Story 2 1999   
14813                        Toy Story 3 2010   
0                              Toy Story 1995   
20497                Toy Story of Terror 2013   
59767                        Toy Story 4 2019   
24064         Toy Story That Time Forgot 2014   
22634          Toy Story Toons Small Fry 2011   
22633  Toy Story Toons Hawaiian Vacation 2011   
24062    Toy Story Toons Partysaurus Rex 2012   
4823                             Toy The 1982   

                                            genres_list  
3021        Adventure Animation Children Comedy Fantasy  
14813  Adventure Animation Children Comedy Fantasy IMAX  
0           Adventure Animation Children Comedy Fantasy  
20497                         Animation Children Comedy  
59767               Adventure Animation Children Comedy  
24064                                Animation Children  
22634       Adventure Animati

# Generating the Top 10 Recommendation by Genre using TfidfVectorizer and Cosine Similarity

In [32]:
vectorizer_genres = TfidfVectorizer(ngram_range=(1,2))

tfidf_genres = vectorizer_genres.fit_transform(movies_df['genres_list'])

def search_similar_genres(genres):
    query_vec = vectorizer_genres.transform([genres])
    similarity = cosine_similarity(query_vec, tfidf_genres).flatten()
    indices = np.argpartition(similarity, -10)[-10:]
    results = movies_df.iloc[indices][::-1]
    return results

gen = 'Adventure Comedy'
top10movies = search_similar_genres(gen)
print("Top 10 Recommendations:\t")
print(top10movies[['clean_title', 'genres_list']])

Top 10 Recommendations:	
                                             clean_title       genres_list
25659                          The Fuller Brush Man 1948  Adventure Comedy
2379                               Crocodile Dundee 1986  Adventure Comedy
21378                         Prisoner of Zenda The 1979  Adventure Comedy
37582                     Hunt for the Wilderpeople 2016  Adventure Comedy
5808      Ace of Aces aka Super Ace The As des as L 1982  Adventure Comedy
57227                                     Fools Day 2014  Adventure Comedy
3651                               Me Myself  Irene 2000  Adventure Comedy
53513                      Blondie Takes a Vacation 1939  Adventure Comedy
18675  Asterix  Obelix God Save Britannia Astrix et O...  Adventure Comedy
37581                                        Lusers 2015  Adventure Comedy
