In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movie-recommendation-system/rating.csv
/kaggle/input/movie-recommendation-system/link.csv
/kaggle/input/movie-recommendation-system/genome_tags.csv
/kaggle/input/movie-recommendation-system/tag.csv
/kaggle/input/movie-recommendation-system/movie.csv


In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
movie_df = pd.read_csv('/kaggle/input/movie-recommendation-system/movie.csv')
rating_df = pd.read_csv('/kaggle/input/movie-recommendation-system/rating.csv')

print(f"Movie df shape : {movie_df.shape}")
print(f"Rating df shape : {rating_df.shape}")

Movie df shape : (27278, 3)
Rating df shape : (20000263, 4)


In [4]:
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [6]:
movie_rating_df = movie_df.merge(rating_df, how="left", on="movieId")

print(f"Movie rating df : {movie_rating_df.shape}")

Movie rating df : (20000797, 6)


In [7]:
movie_rating_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41


In [8]:
#Finding the number of ratings for each movie

ratings_count_df = pd.DataFrame(movie_rating_df["title"].value_counts())

In [9]:
ratings_count_df.head()

Unnamed: 0_level_0,count
title,Unnamed: 1_level_1
Pulp Fiction (1994),67310
Forrest Gump (1994),66172
"Shawshank Redemption, The (1994)",63366
"Silence of the Lambs, The (1991)",63299
Jurassic Park (1993),59715


In [10]:
#Storing the average rating of each movie
average_rating_df = movie_rating_df.groupby(['title','movieId'])['rating'].mean().reset_index()

In [11]:
average_rating_df.head()

Unnamed: 0,title,movieId,rating
0,#chicagoGirl: The Social Network Takes on a Di...,111878,3.666667
1,$ (Dollars) (1971),85177,2.833333
2,$5 a Day (2008),80361,2.871795
3,$9.99 (2008),74486,3.009091
4,$ellebrity (Sellebrity) (2012),107501,2.0


In [12]:
# Separating the full set into rare and common movies - rare movies have total number of ratings less than 1000. 
rare_movies = ratings_count_df[ratings_count_df["count"] <= 1000].index

In [13]:
common_movies = movie_rating_df[~movie_rating_df["title"].isin(rare_movies)]

In [14]:
len(rare_movies)

24103

In [15]:
print(f"Total movies : {movie_rating_df['title'].nunique()}")
print(f"Rare movies : {len(rare_movies)}")
print(f"Common movies : {common_movies['title'].nunique()}")

Total movies : 27262
Rare movies : 24103
Common movies : 3159


In [16]:
user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")

In [17]:
#This table represents the ratings given for all the movies for all users. If there is a movies which has been rated, its marked as NaN 
user_movie_df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zulu (1964),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,,,,,,,,,,,...,,,,,,,,,,
2.0,,,,,,,,,,,...,,,,,,,,,,
3.0,,,,,,,,,,,...,,,,,,,,,,
4.0,,,,,,,,,,,...,,,,,,,,,,
5.0,,,,,,,,,,,...,,,,,,,,,,


In [18]:
user_movie_df.shape

(138493, 3159)

We have 138493 users and 3159 movies to consider.

### Item based film suggestions

In [19]:
movie_to_consider = "Matrix, The (1999)"

In [20]:
movie_rating_by_user = user_movie_df[movie_to_consider]

In [21]:
#movie_rating_by_user
user_movie_df.corrwith(movie_rating_by_user).sort_values(ascending=False).head(10)

title
Matrix, The (1999)                                           1.000000
Matrix Reloaded, The (2003)                                  0.516906
Matrix Revolutions, The (2003)                               0.449588
Animatrix, The (2003)                                        0.367151
Blade (1998)                                                 0.334493
Terminator 2: Judgment Day (1991)                            0.333882
Minority Report (2002)                                       0.332434
Edge of Tomorrow (2014)                                      0.326762
Mission: Impossible (1996)                                   0.320815
Lord of the Rings: The Fellowship of the Ring, The (2001)    0.318726
dtype: float64

In [22]:
#Select a movie name randomly 
movie_to_consider = pd.Series(user_movie_df.columns).sample(1).values[0]

movie_rating_by_user = user_movie_df[movie_to_consider]

similar_movies = user_movie_df.corrwith(movie_rating_by_user).drop(index = movie_to_consider).sort_values(ascending=False).head(10)

print(f"SELECTED MOVIE : {movie_to_consider}")
print(f"SIMILAR MOVIES : {similar_movies}")

SELECTED MOVIE : Captain America: The Winter Soldier (2014)
SIMILAR MOVIES : title
Funny Farm (1988)                            0.636050
Amateur (1994)                               0.616188
Happy, Texas (1999)                          0.603688
Avengers, The (2012)                         0.602150
Captain America: The First Avenger (2011)    0.596595
Withnail & I (1987)                          0.591081
Thor (2011)                                  0.567944
X-Men: Days of Future Past (2014)            0.564051
Winslow Boy, The (1999)                      0.560011
Associate, The (1996)                        0.546994
dtype: float64


### User based suggestions

In [2]:
import pandas as pd

Read and prepare data

In [3]:
movie_df = pd.read_csv('/kaggle/input/movie-recommendation-system/movie.csv')
rating_df = pd.read_csv('/kaggle/input/movie-recommendation-system/rating.csv')

print(f"Movie df shape : {movie_df.shape}")
print(f"Rating df shape : {rating_df.shape}")

Movie df shape : (27278, 3)
Rating df shape : (20000263, 4)


In [4]:
movie_rating_df = movie_df.merge(rating_df, how="left", on="movieId")

print(f"Movie rating df : {movie_rating_df.shape}")

Movie rating df : (20000797, 6)


In [5]:
#Finding the number of ratings for each movie

ratings_count_df = pd.DataFrame(movie_rating_df["title"].value_counts())

In [6]:
# Separating the full set into rare and common movies - rare movies have total number of ratings less than 1000. 
rare_movies = ratings_count_df[ratings_count_df["count"] <= 1000].index

common_movies = movie_rating_df[~movie_rating_df["title"].isin(rare_movies)]


print(f"Total movies : {movie_rating_df['title'].nunique()}")
print(f"Rare movies : {len(rare_movies)}")
print(f"Common movies : {common_movies['title'].nunique()}")

Total movies : 27262
Rare movies : 24103
Common movies : 3159


In [7]:
user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")

In [8]:
pd.Series(user_movie_df.index).sample(1, random_state=12).astype(int)

88238    88239
Name: userId, dtype: int64

In [9]:
#random_user = int(pd.Series(user_movie_df.index).sample(1, random_state=12))
random_user = 28941

print(f"Random user selected : {random_user}")

Random user selected : 28941


In [10]:
# Extracting the list of movies and ratings (if given) for the random user

random_user_movie_df = user_movie_df[user_movie_df.index == random_user]

In [11]:
random_user_movie_df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


title,"'burbs, The (1989)",(500) Days of Summer (2009),*batteries not included (1987),...And Justice for All (1979),10 Things I Hate About You (1999),"10,000 BC (2008)",101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),102 Dalmatians (2000),12 Angry Men (1957),...,Zero Dark Thirty (2012),Zero Effect (1998),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zulu (1964),[REC] (2007),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
28941.0,,,,,,,,,,,...,,,,,,,,,,


In [12]:
random_user_movie_df.notna().any()

title
'burbs, The (1989)                   False
(500) Days of Summer (2009)          False
*batteries not included (1987)       False
...And Justice for All (1979)        False
10 Things I Hate About You (1999)    False
                                     ...  
Zulu (1964)                          False
[REC] (2007)                         False
eXistenZ (1999)                      False
xXx (2002)                           False
¡Three Amigos! (1986)                False
Length: 3159, dtype: bool

In [13]:
# If rating is not NaN, we consider that movie as watched movie

movies_watched_by_random_user = random_user_movie_df.columns[random_user_movie_df.notna().any()].tolist()

In [14]:
print(f"Movies watched by random user : {len(movies_watched_by_random_user)}")

Movies watched by random user : 33


In [15]:
"Silence of the Lambs, The (1991)" in movies_watched_by_random_user

True

In [16]:
#Fetching the rating of a given movie and a particular user

user_movie_df.loc[user_movie_df.index == random_user,
                  user_movie_df.columns == "Silence of the Lambs, The (1991)"]

title,"Silence of the Lambs, The (1991)"
userId,Unnamed: 1_level_1
28941.0,1.0


In [17]:
# Extracting all movies watched by random user

all_movies_by_random_user = user_movie_df[movies_watched_by_random_user]

In [18]:
# Stores the number of movies rated for each user

user_movie_count = all_movies_by_random_user.T.notnull().sum()
user_movie_count = user_movie_count.reset_index()
user_movie_count.columns = ["userId", "movie_count"]

In [19]:
user_movie_count.head()

Unnamed: 0,userId,movie_count
0,1.0,1
1,2.0,2
2,3.0,4
3,4.0,6
4,5.0,11


In [20]:
# Top 20 user with max movie rated
user_movie_count[user_movie_count["movie_count"] > 20].sort_values("movie_count", ascending=False).head(20)

Unnamed: 0,userId,movie_count
94230,94231.0,33
100398,100399.0,33
118204,118205.0,33
15918,15919.0,33
124051,124052.0,33
81217,81218.0,33
81595,81596.0,33
28940,28941.0,33
8404,8405.0,33
125911,125912.0,33


In [21]:
# Out of all similar users, the max no of movies rated by individual users
max_ratings_given = user_movie_count["movie_count"].max()
user_given_max_ratings = user_movie_count[user_movie_count["movie_count"] == max_ratings_given].shape[0]

print(f"Out of all similar users, the max no of movies rated by individual users : {max_ratings_given}")
print(f"{user_given_max_ratings} users have given maximum ratings")

Out of all similar users, the max no of movies rated by individual users : 33
17 users have given maximum ratings


In [22]:
#How many number of users have given the max number of ratings
user_movie_count[user_movie_count["movie_count"] == 33].shape[0]

17

In [29]:
# Selecting users which have rated min 60% of movies watched by the user

min_movies_rated = round(len(movies_watched_by_random_user) * 0.6)
print(f"Selecting users who have rated minimum {min_movies_rated} movies")

users_same_movies = user_movie_count[user_movie_count["movie_count"] > min_movies_rated]["userId"]
print(f"Users selected - {len(users_same_movies)}")

Selecting users who have rated minimum 20 movies
Users selected - 3202


Finding users similar to the user selected for generating recommendations

In [37]:
# We concat 2 dataframes 
# 1. df with ratings given to the movies watched by the random user. These ratings are given by the subset of users selected above who have watched min 60% movies of random user
# 2. df with ratings given by random user to the movies watched by self

final_df = pd.concat([all_movies_by_random_user[all_movies_by_random_user.index.isin(users_same_movies)],
                      random_user_movie_df[movies_watched_by_random_user]])

In [38]:
final_df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


title,Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),Aladdin (1992),"American President, The (1995)",Apollo 13 (1995),Babe (1995),Bullets Over Broadway (1994),Clueless (1995),Disclosure (1994),Forrest Gump (1994),...,Ready to Wear (Pret-A-Porter) (1994),"Remains of the Day, The (1993)",Sabrina (1995),Schindler's List (1993),"Secret Garden, The (1993)",Sense and Sensibility (1995),Shadowlands (1993),"Silence of the Lambs, The (1991)",Star Trek: Generations (1994),Stargate (1994)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
130.0,4.0,3.0,,3.0,3.0,,,3.0,5.0,5.0,...,,3.0,,5.0,,,3.0,5.0,,3.0
156.0,3.0,,,5.0,5.0,3.0,,,4.0,5.0,...,,,4.0,5.0,,4.0,4.0,5.0,3.0,4.0
158.0,2.0,1.0,4.0,4.0,3.0,5.0,,4.0,,5.0,...,,5.0,3.0,5.0,5.0,4.0,5.0,5.0,,
184.0,2.0,3.0,3.0,4.0,4.0,,3.0,,4.0,3.0,...,,4.0,4.0,5.0,4.0,,4.0,5.0,3.0,4.0
295.0,,,3.0,3.0,3.0,3.0,3.0,2.0,,4.0,...,,3.0,3.0,4.0,3.0,4.0,,4.0,3.0,2.0


In [47]:
# Finding correlation

#corr_df = final_df.T.corr().unstack().sort_values()
corr_df = final_df.T.corr()

In [1]:
corr_df

NameError: name 'corr_df' is not defined