In [163]:
# importing the required packages

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [164]:
# Load the ratings table from the CSV at a hard-coded path.
rating_df = pd.read_csv(filepath_or_buffer='/content/rating.csv')

In [165]:
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [166]:
rating_df.tail()

Unnamed: 0,userId,movieId,rating,timestamp
8509062,58771,587,4.0,1996-07-05 21:00:21
8509063,58771,588,5.0,1996-07-05 20:54:01
8509064,58771,589,4.0,1996-07-05 20:58:12
8509065,58771,590,5.0,1996-07-05 20:53:23
8509066,58771,592,3.0,1996-07-05 20:53:20


In [167]:
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8509067 entries, 0 to 8509066
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 259.7+ MB


In [168]:
# Restrict the ratings frame to the three columns selected here.
needed_columns = ['userId', 'movieId', 'rating']
rating_df = rating_df[needed_columns]
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [169]:
# Use an f-string so the shape prints as a plain tuple; passing {rating_df.shape}
# as a separate print argument builds a set and shows it wrapped in braces.
print(f"Data shape: {rating_df.shape}")

DAta shape: {(8509067, 3)}


In [170]:
# Number of distinct users that appear in the ratings table.
number_of_user = rating_df['userId'].nunique()
# f-string so the count prints as a bare number; {number_of_user} as a separate
# print argument would be a set literal and print with braces.
print(f"Number of unique users: {number_of_user}")

Number of unique users: {58771}


In [171]:
# Number of distinct movies that appear in the ratings table.

movie_count = rating_df['movieId'].nunique()
# f-string so the count prints as a bare number rather than as a one-element set.
print(f"Number of unique movies: {movie_count}")

Number of unique movies: {22315}


In [172]:
# Data-quality check on the ratings table.

# Count the missing values in each column (isna is the same check as isnull).
rating_df.isna().sum()

Unnamed: 0,0
userId,0
movieId,0
rating,0


In [173]:
# Drop any rows that contain missing values. This is a safeguard: the null check
# in the previous cell reported no missing values for this data.
# Rebind instead of using inplace=True: rating_df was produced by column selection,
# and an in-place dropna on such a frame can raise SettingWithCopyWarning.
rating_df = rating_df.dropna()

In [174]:
rating_df.isnull().sum().sum()

0

In [175]:
# No duplicate check is needed here: the same user can legitimately give the same rating to several movies.

In [176]:
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [177]:
# Load the movie catalogue from the CSV at a hard-coded path.
movie_data = pd.read_csv(filepath_or_buffer='/content/movie.csv')

In [178]:
movie_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [179]:
# Shape of the movie catalogue.
# f-string so the shape prints as a plain tuple instead of a one-element set.
print(f"Data shape: {movie_data.shape}")

Data shape: {(27278, 3)}


In [180]:
# Number of distinct movies in the catalogue.
# NOTE: this rebinds movie_count, which an earlier cell set to the number of
# distinct movies appearing in rating_df.
movie_count = movie_data['movieId'].nunique()
# f-string so the count prints as a bare number rather than as a one-element set.
print(f"Number of unique movies: {movie_count}")

Number of unique movies: {27278}


In [181]:
# data check:
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


In [182]:
# Count the missing values in each column of the movie catalogue
# (isna is the same check as isnull).
movie_data.isna().sum()

Unnamed: 0,0
movieId,0
title,0
genres,0


In [183]:
movie_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [184]:
# Simplification for the analysis: treat the first entry of the '|'-separated
# genre string as the movie's primary genre and overwrite the 'genres' column with it.

primary_genre = movie_data['genres'].str.split('|', n=1).str[0]
movie_data['genres'] = primary_genre
movie_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
1,2,Jumanji (1995),Adventure
2,3,Grumpier Old Men (1995),Comedy
3,4,Waiting to Exhale (1995),Comedy
4,5,Father of the Bride Part II (1995),Comedy


In [185]:
movie_data.tail()

Unnamed: 0,movieId,title,genres
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)
27277,131262,Innocence (2014),Adventure


In [186]:
genre_types = movie_data['genres'].unique()
print(genre_types)

['Adventure' 'Comedy' 'Action' 'Drama' 'Crime' 'Children' 'Mystery'
 'Documentary' 'Animation' 'Thriller' 'Horror' 'Fantasy' 'Western'
 'Film-Noir' 'Romance' 'War' 'Sci-Fi' 'Musical' 'IMAX'
 '(no genres listed)']


In [187]:
# Keep only the movie name by removing the trailing "(YYYY)" release year.
# Splitting on the first '(' instead would also cut titles that contain parentheses
# of their own (a title beginning with '(' would become empty) and would leave
# trailing whitespace behind.
movie_data['title'] = (
    movie_data['title']
    .str.replace(r'\s*\(\d{4}\)\s*$', '', regex=True)
    .str.strip()
)

In [188]:
movie_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story,Adventure
1,2,Jumanji,Adventure
2,3,Grumpier Old Men,Comedy
3,4,Waiting to Exhale,Comedy
4,5,Father of the Bride Part II,Comedy


In [189]:
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


## Find the list of the most popular and best-liked genres

In [190]:
# Attach title and genre to every rating.
# how='inner' is pandas' default, made explicit: ratings whose movieId is absent
# from movie_data are dropped.
# validate='many_to_one' makes the merge fail loudly if movie_data ever holds a
# duplicated movieId, which would otherwise silently duplicate rating rows.
df_merge = pd.merge(rating_df, movie_data, on='movieId', how='inner', validate='many_to_one')
df_merge

Unnamed: 0,userId,movieId,rating,title,genres
0,1,2,3.5,Jumanji,Adventure
1,1,29,3.5,"City of Lost Children, The",Adventure
2,1,32,3.5,Twelve Monkeys,Mystery
3,1,47,3.5,Seven,Mystery
4,1,50,3.5,"Usual Suspects, The",Crime
...,...,...,...,...,...
8509062,58771,587,4.0,Ghost,Comedy
8509063,58771,588,5.0,Aladdin,Adventure
8509064,58771,589,4.0,Terminator 2: Judgment Day,Action
8509065,58771,590,5.0,Dances with Wolves,Adventure


In [191]:
# Per-genre popularity: number of ratings and average rating.
ratings_by_genre = df_merge.groupby('genres')['rating']
popular_genre = ratings_by_genre.agg(['count', 'mean'])
popular_genre

Unnamed: 0_level_0,count,mean
genres,Unnamed: 1_level_1,Unnamed: 2_level_1
(no genres listed),104,3.254808
Action,2393236,3.444287
Adventure,818215,3.556188
Animation,185695,3.575414
Children,178416,3.216247
Comedy,2144466,3.456861
Crime,590137,3.850628
Documentary,88132,3.768512
Drama,1636925,3.666087
Fantasy,27871,3.508468


In [192]:
# Benchmark: keep only genres with more than 150 ratings and an average rating above 3.
MIN_RATING_COUNT = 150
MIN_MEAN_RATING = 3

has_enough_ratings = popular_genre['count'] > MIN_RATING_COUNT
is_well_rated = popular_genre['mean'] > MIN_MEAN_RATING
popular_genre = popular_genre[has_enough_ratings & is_well_rated]

popular_genre_list = popular_genre.index.tolist()
print(popular_genre_list)

['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


In [193]:
df_merge[df_merge['genres'].isin(popular_genre_list)]

Unnamed: 0,userId,movieId,rating,title,genres
0,1,2,3.5,Jumanji,Adventure
1,1,29,3.5,"City of Lost Children, The",Adventure
2,1,32,3.5,Twelve Monkeys,Mystery
3,1,47,3.5,Seven,Mystery
4,1,50,3.5,"Usual Suspects, The",Crime
...,...,...,...,...,...
8509062,58771,587,4.0,Ghost,Comedy
8509063,58771,588,5.0,Aladdin,Adventure
8509064,58771,589,4.0,Terminator 2: Judgment Day,Action
8509065,58771,590,5.0,Dances with Wolves,Adventure


In [194]:
# 2.
# Create Model that finds the best suited Movie for one
# user in every genre.

In [195]:
df_merge

Unnamed: 0,userId,movieId,rating,title,genres
0,1,2,3.5,Jumanji,Adventure
1,1,29,3.5,"City of Lost Children, The",Adventure
2,1,32,3.5,Twelve Monkeys,Mystery
3,1,47,3.5,Seven,Mystery
4,1,50,3.5,"Usual Suspects, The",Crime
...,...,...,...,...,...
8509062,58771,587,4.0,Ghost,Comedy
8509063,58771,588,5.0,Aladdin,Adventure
8509064,58771,589,4.0,Terminator 2: Judgment Day,Action
8509065,58771,590,5.0,Dances with Wolves,Adventure


# Making the recommendation of best suited movie for user1 in 'Horror Genre'

In [196]:
# All merged rating rows that belong to user 1.
is_user_1 = df_merge['userId'].eq(1)
user_1 = df_merge[is_user_1]
user_1

Unnamed: 0,userId,movieId,rating,title,genres
0,1,2,3.5,Jumanji,Adventure
1,1,29,3.5,"City of Lost Children, The",Adventure
2,1,32,3.5,Twelve Monkeys,Mystery
3,1,47,3.5,Seven,Mystery
4,1,50,3.5,"Usual Suspects, The",Crime
...,...,...,...,...,...
170,1,8507,5.0,Freaks,Crime
171,1,8636,4.5,Spider-Man 2,Action
172,1,8690,3.5,Slaughterhouse-Five,Comedy
173,1,8961,4.0,"Incredibles, The",Action


In [197]:
# Finding the best-suited movie for user 1 in the Horror genre:
# start from the Horror rows among this user's ratings.

is_horror = user_1['genres'].eq('Horror')
horror_genre = user_1[is_horror]
horror_genre

Unnamed: 0,userId,movieId,rating,title,genres
35,1,1214,4.0,Alien,Horror
44,1,1258,4.0,"Shining, The",Horror
53,1,1333,4.0,"Birds, The",Horror
54,1,1348,3.5,Nosferatu,Horror
55,1,1350,3.5,"Omen, The",Horror
66,1,1994,3.5,Poltergeist,Horror
67,1,1997,3.5,"Exorcist, The",Horror
83,1,2644,3.5,Dracula,Horror
85,1,2664,3.5,Invasion of the Body Snatchers,Horror
105,1,3476,3.5,Jacob's Ladder,Horror


In [198]:
# Group user 1's Horror ratings by title and rank the titles by rating, so the
# highest-rated ones can be recommended for this genre.

rating_per_title = horror_genre.groupby('title')['rating'].mean()
ranked_titles = rating_per_title.sort_values(ascending=False)
best_horror_movies = ranked_titles.reset_index()
best_horror_movies

Unnamed: 0,title,rating
0,Alien,4.0
1,"Birds, The",4.0
2,"Shining, The",4.0
3,Dracula,3.5
4,"Exorcist, The",3.5
5,Invasion of the Body Snatchers,3.5
6,Jacob's Ladder,3.5
7,Nosferatu,3.5
8,"Omen, The",3.5
9,Poltergeist,3.5


In [199]:
# The first row of the ranking is the recommendation.
best_suited_movie = best_horror_movies['title'].iat[0]

In [200]:
print(f"best horror movie for user 1 is:{best_suited_movie}")

best horror movie for user 1 is:Alien 


**NOTE: Although we have successfully found the best-suited movie for user 1 in the Horror genre, this way of coding is not efficient. Repeating it by hand for n users across n genres does not scale, so let's find the same thing using a function.**

In [201]:
df_merge

Unnamed: 0,userId,movieId,rating,title,genres
0,1,2,3.5,Jumanji,Adventure
1,1,29,3.5,"City of Lost Children, The",Adventure
2,1,32,3.5,Twelve Monkeys,Mystery
3,1,47,3.5,Seven,Mystery
4,1,50,3.5,"Usual Suspects, The",Crime
...,...,...,...,...,...
8509062,58771,587,4.0,Ghost,Comedy
8509063,58771,588,5.0,Aladdin,Adventure
8509064,58771,589,4.0,Terminator 2: Judgment Day,Action
8509065,58771,590,5.0,Dances with Wolves,Adventure


In [202]:
def find_best_movie_for_user(user_id, df, genres=None):
  """Return the given user's highest-rated movie in each genre.

  Parameters
  ----------
  user_id : value compared against df['userId'] to select the user's rows.
  df : DataFrame with 'userId', 'genres', 'title' and 'rating' columns.
  genres : optional iterable of genre labels to report on. When omitted, the
      notebook-level ``popular_genre_list`` is used, so existing two-argument
      calls keep working.

  Returns
  -------
  DataFrame with columns 'Genre', 'Movie' and 'Rating', one row per requested
  genre in the order given. A genre in which the user has no rated movie gets
  "No movies found" and a missing rating. When several movies share the top
  rating, the first one in df's row order is returned.
  """
  if genres is None:
    genres = popular_genre_list  # defined in an earlier cell of the notebook

  # Rows without a rating can never be the best movie, so discard them up front.
  user_data = df[df['userId'] == user_id].dropna(subset=['rating'])
  recommendations = []

  for genre in genres:
    genre_movies = user_data[user_data['genres'] == genre]  # user's movies in this genre
    if genre_movies.empty:
      recommendations.append((genre, "No movies found", None))
      continue
    # Positional lookup: argmax gives the first maximum, and iloc always yields
    # exactly one row even when df's index labels are not unique.
    best_position = genre_movies['rating'].to_numpy().argmax()
    best_movie = genre_movies.iloc[best_position]
    recommendations.append((genre, best_movie['title'], best_movie['rating']))

  return pd.DataFrame(recommendations, columns=['Genre', 'Movie', 'Rating'])

# Try the function out on user 1.
user_recommendations = find_best_movie_for_user(user_id=1, df=df_merge)
print(user_recommendations)

          Genre                                              Movie  Rating
0        Action    Lord of the Rings: The Return of the King, The      5.0
1     Adventure  Lord of the Rings: The Fellowship of the Ring,...     5.0
2     Animation                                 Last Unicorn, The      4.0
3      Children                        E.T. the Extra-Terrestrial      4.0
4        Comedy                                            Clerks      4.0
5         Crime                                            Freaks      5.0
6   Documentary                                    No movies found     NaN
7         Drama  Interview with the Vampire: The Vampire Chroni...     4.0
8       Fantasy                                     Sleepy Hollow      4.0
9     Film-Noir                                    No movies found     NaN
10       Horror                                             Alien      4.0
11      Musical                                    No movies found     NaN
12      Mystery          

# 3. Find which genres have received the best and worst ratings, based on user ratings.
   

In [203]:
# Average rating per genre, ordered from highest to lowest.

mean_by_genre = df_merge.groupby('genres')['rating'].mean()
genre_ratings = mean_by_genre.sort_values(ascending=False)

# Labels of the genres with the highest and the lowest average rating.
best_genre, worst_genre = genre_ratings.idxmax(), genre_ratings.idxmin()
best_avg = genre_ratings[best_genre]
worst_avg = genre_ratings[worst_genre]

print(f"Best-rated genre: {best_genre}, Average Rating: {best_avg:.2f}")
print(f"Worst-rated genre: {worst_genre}, Average Rating: {worst_avg:.2f}")

Best-rated genre: Film-Noir, Average Rating: 4.07
Worst-rated genre: IMAX, Average Rating: 2.41
