In [1]:
import pandas as pd
# Load the movie catalogue; later cells use its movieId, title and genres columns.
movies = pd.read_csv("movies.csv")

In [2]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [3]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
import re
def clean_title(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    Punctuation such as parentheses, commas and apostrophes is dropped, so
    "Toy Story (1995)" becomes "Toy Story 1995".
    """
    disallowed = re.compile("[^A-Za-z0-9 ]")
    return disallowed.sub("", title)

In [5]:
# Add a punctuation-free copy of each title; this is the text the search index is built from.
movies["clean_title"] = movies["title"].apply(clean_title)

In [6]:
movies.head(10)

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
5,6,Heat (1995),Action|Crime|Thriller,Heat 1995
6,7,Sabrina (1995),Comedy|Romance,Sabrina 1995
7,8,Tom and Huck (1995),Adventure|Children,Tom and Huck 1995
8,9,Sudden Death (1995),Action,Sudden Death 1995
9,10,GoldenEye (1995),Action|Adventure|Thriller,GoldenEye 1995


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Index single words and two-word phrases (ngram_range=(1,2)) so a query such as
# "Toy Story" can match on the phrase as well as on the individual words.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# One TF-IDF row per movie, in the same row order as `movies`; search() relies on that order.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [10]:
def search(title, top_n=5):
    """Return the movies whose titles best match ``title``, best match first.

    The query is cleaned with ``clean_title``, vectorised with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered from most to least similar.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises if kth is out of range.
    k = min(top_n, similarity.size)
    if k < 1:
        return movies.iloc[0:0]

    # argpartition only guarantees that the last k positions hold the k largest
    # values; it says nothing about their order, so sort them explicitly.
    candidates = np.argpartition(similarity, -k)[-k:]
    best_first = candidates[np.argsort(-similarity[candidates], kind="stable")]
    return movies.iloc[best_first]

In [11]:
search("Toy Story (1995)")

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013


In [12]:
import ipywidgets as widgets
from IPython.display import display

In [13]:
# Text box for the movie title, pre-filled with an example query.
movie_input = widgets.Text(
    value = "Spider-Man",
    description = "Movie Title: ",
    disabled = False
)

In [14]:
movie_input    # Show the text box on its own; no search runs yet because no observer is attached at this point.

Text(value='Spider-Man', description='Movie Title: ')

In [15]:
# Output area that on_type clears and refills with search results.
movie_list = widgets.Output()

def on_type(data):
    """Refresh the search results whenever the text box value changes.

    ``data`` is the change notification passed by the widget; only its
    "new" entry (the current text) is read. The output area is always
    cleared, and results are shown only for text longer than 5 characters.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        # Too short to search meaningfully: leave the output area empty.
        if len(typed) <= 5:
            return
        display(search(typed))
            
# Call on_type on every change of the text box's value, then show the box followed by the results area.
movie_input.observe(on_type, names="value")
display(movie_input, movie_list)

Text(value='Spider-Man', description='Movie Title: ')

Output()

In [16]:
# Load the ratings table; later cells use its userId, movieId and rating columns.
ratings = pd.read_csv('ratings.csv')

In [17]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [18]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [19]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [20]:
movies.dtypes

movieId         int64
title          object
genres         object
clean_title    object
dtype: object

In [21]:
# Example input: movieId 1 is Toy Story (1995) in the movies table shown earlier.
movie_id = 1
# Ids of the users who rated this movie above 4; these are the "similar users".
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

`movie_id` is the id of the movie chosen by the user. We select all rows for that movie with the condition `ratings["movieId"] == movie_id`.

Then, from those rows, we keep only the ones with a rating above 4 (meaning the user liked the movie), using the condition `ratings["rating"] > 4`.

We only need the `userId` values, so the closing square bracket of the filter is followed by `["userId"]`.

The result could contain duplicate user ids, so we call unique() (similar to DISTINCT in SQL).

In practice there will be no duplicates here, since one user can give at most one rating to a movie (provided the integrity constraints are maintained properly).

In [22]:
similar_users

array([    36,     75,     86, ..., 162527, 162530, 162533], dtype=int64)

In [23]:
len(similar_users)

18835

In [24]:
# Same selection as before but without unique(), to check that no user appears twice.
# Note: this rebinds similar_users to a Series of userId values instead of an array of unique ids.
movie_id = 1
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"]
len(similar_users)

18835

### With and without unique() we get the same length, i.e. each user has at most one rating for a given movie.

In [25]:
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"]>4)]
# Every rating above 4 given by the similar users, i.e. all the movies those users liked.

In [26]:
similar_user_recs

Unnamed: 0,userId,movieId,rating,timestamp
5101,36,1,5.0,857131378
5105,36,34,5.0,834413787
5111,36,110,5.0,834412999
5114,36,150,5.0,839928587
5127,36,260,5.0,857131062
...,...,...,...,...
24998854,162533,60069,4.5,1280919889
24998861,162533,67997,4.5,1280920712
24998876,162533,78499,4.5,1281405901
24998884,162533,81591,4.5,1297289876


In [27]:
# Keep only the movieId column: one entry per (similar user, liked movie) pair.
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"]>4)]["movieId"]
similar_user_recs

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

In [28]:
similar_user_recs.value_counts()
# For each movie: how many similar users (those who rated the input movie above 4) also rated it above 4.

1         18835
318        8393
260        7605
356        6973
296        6918
          ...  
128478        1
125125        1
119701        1
107563        1
7625          1
Name: movieId, Length: 19282, dtype: int64

In [29]:
# Turn the counts into the fraction of similar users who liked each movie ...
similar_user_recs = similar_user_recs.value_counts()/len(similar_users)
# ... and keep only the movies liked by more than 10% of them.
similar_user_recs = similar_user_recs[similar_user_recs > 0.1]

Here we compute, for each movie, the fraction (0-1; multiply by 100 for a percentage) of similar users who liked it.
Then we keep only the movies that a sizeable share of similar users liked. A stricter cutoff such as >30% would also be reasonable; here we use >10% (i.e. > 0.1).

In [30]:
similar_user_recs

1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: movieId, Length: 113, dtype: float64

### We now have the movies liked by more than 10% of the people who liked the input movie.<br><br> But there is a special case to handle. <br>Some movies are liked by most people irrespective of their favourite genre; Toy Story is an example. <br>We want movies that are liked specifically by users with similar interests, not movies that are simply liked by everyone, so next we compare against how often all users like each of these movies.

In [31]:
# Collect every rating above 4, from any user, for the candidate movies kept in similar_user_recs.
# These rows are used next to measure how often users in general like each candidate.
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"]>4)]
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484


In [32]:
# Same fraction as similar_user_recs, but over users in general. The denominator is the number of
# distinct users in all_users (users who liked at least one candidate), not every user in ratings.
all_users_recs = all_users["movieId"].value_counts() /len(all_users["userId"].unique())

In [33]:
all_users_recs

318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: movieId, Length: 113, dtype: float64

In [34]:
# Put both fractions side by side, aligned on movieId (the index of both Series).
rec_percentages = pd.concat([similar_user_recs,all_users_recs], axis = 1)
rec_percentages.columns = ["similar_users", "all_users"]

In [35]:
rec_percentages

Unnamed: 0,similar_users,all_users
1,1.000000,0.124728
318,0.445607,0.342220
260,0.403770,0.222207
356,0.370215,0.235266
296,0.367295,0.284674
...,...,...
953,0.103053,0.045792
551,0.101195,0.040918
1222,0.100876,0.066877
745,0.100345,0.037031


In [36]:
# A score above 1 means similar users like the movie more often than users in general do.
rec_percentages["score"] = rec_percentages["similar_users"]/rec_percentages["all_users"]
# Highest score first, so the strongest recommendations are at the top.
rec_percentages = rec_percentages.sort_values("score", ascending = False)

In [37]:
rec_percentages

Unnamed: 0,similar_users,all_users,score
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


In [38]:
# Attach title and genres to the ten highest-scoring movies by joining the movieId index to movies["movieId"].
rec_percentages.head(10).merge(movies, left_index = True, right_on="movieId")

Unnamed: 0,similar_users,all_users,score,movieId,title,genres,clean_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


In [39]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.1, top_n=10,
                        ratings_df=None, movies_df=None):
    """Recommend movies that fans of ``movie_id`` like unusually often.

    A user "likes" a movie when their rating is strictly above ``min_rating``.
    Users who like ``movie_id`` are the "similar users". Each movie liked by
    more than ``min_share`` of them is scored as

        (fraction of similar users who like it) / (fraction of users in general who like it)

    where "users in general" are the distinct users who like at least one of
    the candidate movies.

    Parameters
    ----------
    movie_id : int
        movieId of the movie to base recommendations on.
    min_rating : float, default 4
        A rating must be strictly greater than this to count as a like.
    min_share : float, default 0.1
        A candidate must be liked by more than this fraction of similar users.
    top_n : int, default 10
        Number of highest-scoring movies to return.
    ratings_df : pandas.DataFrame, optional
        Ratings with userId, movieId and rating columns. Defaults to the
        notebook-level ``ratings`` frame.
    movies_df : pandas.DataFrame, optional
        Movies with movieId, title and genres columns. Defaults to the
        notebook-level ``movies`` frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
    """
    # Fall back to the notebook-level frames so existing one-argument calls keep working.
    ratings_df = ratings if ratings_df is None else ratings_df
    movies_df = movies if movies_df is None else movies_df

    # The "liked" filter is needed by every step below; build it once instead of
    # re-scanning the full ratings table three times.
    liked = ratings_df[ratings_df["rating"] > min_rating]

    # Users who liked the input movie.
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()

    # Fraction of similar users who liked each movie, keeping only the common ones.
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Fraction of users in general who liked each of those candidate movies.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_users_recs = all_users["movieId"].value_counts() / all_users["userId"].nunique()

    # Side-by-side table, aligned on movieId.
    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar_users", "all_users"]

    # A higher score means the movie is more specific to the similar users' taste.
    rec_percentages["score"] = rec_percentages["similar_users"] / rec_percentages["all_users"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # Attach titles and genres to the best candidates.
    top_recs = rec_percentages.head(top_n).merge(movies_df, left_index=True, right_on="movieId")
    return top_recs[["score", "title", "genres"]]

# Creating an Interactive Recommendation widget

In [40]:
# Text box for the title to get recommendations for, pre-filled with an example.
movie_input_name = widgets.Text(
    value = "Toy Story",
    description = "Movie Title: ",
    disabled = False
)

# Output area that on_type clears and refills with recommendations.
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh the recommendations whenever the text box value changes.

    ``data`` is the change notification passed by the widget; only its
    "new" entry (the current text) is read. The output area is always
    cleared. For text longer than 5 characters, the first row returned by
    ``search`` is used as the movie to recommend from.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data["new"]
        # Too short to search meaningfully: leave the output area empty.
        if len(typed) <= 5:
            return
        matches = search(typed)
        first_match_id = matches.iloc[0]["movieId"]
        display(find_similar_movies(first_match_id))
            
# Call on_type on every change of the text box's value, then show the box followed by the results area.
movie_input_name.observe(on_type, names = "value")
display(movie_input_name, recommendation_list)

Text(value='Toy Story', description='Movie Title: ')

Output()