In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import re
import ipywidgets as widgets
from IPython.display import display

In [3]:
movies = pd.read_csv('data/movies.csv')

In [5]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


Features of this dataframe - **movieId, title & genres**

## Clean the movie title - Remove special characters

In [6]:
def clean_title(title):
    """Return ``title`` with every character removed that is not an ASCII
    letter (a-z, A-Z), a digit (0-9) or a space.

    Spaces are kept so the words stay separated, e.g.
    "Toy Story (1995)" becomes "Toy Story 1995".
    """
    disallowed_chars = "[^a-zA-Z0-9 ]"
    return re.sub(disallowed_chars, "", title)

In [7]:
#The cleaned title is then added to a new column
movies["clean_title"] = movies["title"].apply(clean_title)

In [8]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


## Creating TFIDF Vectorizer

* TF (Term Frequency) - No of times a term appears in a particular document

* IDF (Inverse Document Frequency) - It measures how common or how rare the occurrence of the word is across all the documents

 * log ((total number of documents) / (No of documents in which that term occurs))
 
TF-IDF is calculated by using the product of TF and IDF. 
<br>
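If a term occurs in every document then the IDF term becomes 0 (since log (n/n) = 0), so very common terms carry little weight. (scikit-learn's `TfidfVectorizer` uses a smoothed variant of this formula by default, so its IDF values never reach exactly 0.)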


In [9]:
# Build a TF-IDF model over the cleaned titles. ngram_range=(1,2) makes the
# vocabulary contain both single words and two-word sequences.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# fit_transform learns the vocabulary from the titles and returns the TF-IDF
# matrix with one row per title.
tfidf = vectorizer.fit_transform(movies['clean_title'])

In [10]:
tfidf.shape

(62423, 170073)

There are **62423 movie titles** and **170073 unique terms** (single words and two-word phrases, because `ngram_range=(1,2)`) across all the titles.

## Searching for titles similar to the one given as a parameter

<br>
* Clean the parameter/title - Remove special characters <br>
* Transform that title into a TF-IDF vector using the already fitted vectorizer <br>
* Find the cosine similarity score between the query term and other titles (Higher score -> Better match) <br>
* Take the top 5 results <br>

In [11]:
def search_title(title, top_n=5):
    """Return the movies whose titles best match ``title``.

    The query is cleaned with ``clean_title``, converted to a TF-IDF vector
    with the already fitted ``vectorizer`` and compared against every row of
    ``tfidf`` using cosine similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered from the most to the least similar title,
        so ``.iloc[0]`` is the best match. Holds fewer than ``top_n`` rows
        when fewer titles are available.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist: argpartition raises when the
    # requested position is out of bounds.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the k largest scores land in the last
    # k slots; their relative order is undefined. Rank them explicitly so the
    # result really is best-match-first.
    candidates = np.argpartition(similarity, -k)[-k:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]

In [12]:
search_title("toy Story 1995")

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013


The table above shows the 5 titles with the highest cosine similarity to the query **toy Story 1995**

## Input and output widgets for notebook

In [13]:
# Text box the user types a movie title into.
movie_input = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)

# Output area that shows the matches for the current text.
movie_list = widgets.Output()


def on_type(data):
    """Refresh ``movie_list`` whenever the text box value changes.

    ``data`` is the change notification passed by ``observe``; the new text
    is read from ``data["new"]``. The previous output is always cleared, and
    a search is only run once more than 5 characters have been typed.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        display(search_title(query))


movie_input.observe(on_type, names="value")

In [14]:
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

## So far we have implemented recommendations based on the title given as input

## Now we'll make a system to give recommendations based on movie rating given by users

Importing the ratings for all movies given by all the users.

In [15]:
rating = pd.read_csv('data/ratings.csv')
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [16]:
rating.userId.nunique()

162541

In [17]:
rating.movieId.nunique()

59047

Total unique users who have given ratings - 162541 <br>
Total unique movies which have received ratings - 59047

In [18]:
movie_id = 1

**Finding users who've liked the same movie** <br>

From the ratings data we find the users who have rated a particular movie id with a rating greater than 4. We extract the userId of every user who matches this condition.

In [19]:
# Users who rated the chosen movie strictly above 4.
similar_users = rating.loc[
    (rating["movieId"] == movie_id) & (rating["rating"] > 4),
    "userId",
].unique()

This is the list of users who've watched the movie with movie id = 1 and given it a rating greater than 4.

In [20]:
similar_users

array([    36,     75,     86, ..., 162527, 162530, 162533])

Find other movies which were also liked (given a rating greater than 4) by the people who liked the movie with id = 1 in this case.

In [21]:
# Movie ids of every rating above 4 given by one of the similar users
# (one entry per rating, so a movie id can appear many times).
similar_users_recs = rating.loc[
    rating["userId"].isin(similar_users) & (rating["rating"] > 4),
    "movieId",
]

In [22]:
similar_users_recs

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

Finding the movies which **more than 10%** of the users similar to us also liked 

In [23]:
# Turn the per-movie like counts into the share of similar users who liked
# each movie.
# Note: this cell overwrites `similar_users_recs`, so re-running it without
# first re-running the cell that builds `similar_users_recs` gives a
# different result.
similar_users_recs = similar_users_recs.value_counts() / len(similar_users)

# Keep only the movies liked by more than 10% of the similar users.
similar_users_recs = similar_users_recs[similar_users_recs > 0.1]

In [120]:
similar_users_recs

1       1.000000
318     0.414556
260     0.404561
356     0.347253
296     0.342663
          ...   
1259    0.102991
7361    0.101881
1206    0.101362
1307    0.101066
1208    0.100918
Name: movieId, Length: 92, dtype: float64

* 100% of users similar to us liked movie with id = 1. <br>
* 41.4% of users similar to us liked movie with id = 318. <br>
* 40.4% of users similar to us liked movie with id = 260. <br>

There are 92 movies which more than 10% of users similar to us have rated greater than 4.

List of all the ratings greater than 4, from any user, for the movies which similar users have also rated greater than 4.

In [27]:
# Every rating above 4, from any user, for one of the candidate movies.
all_users = rating.loc[
    rating["movieId"].isin(similar_users_recs.index) & rating["rating"].gt(4)
]

In [26]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484


What fraction of these users (everyone who liked at least one of those movies) liked each movie which similar users have liked.

In [30]:
# Like count per candidate movie divided by the number of distinct users
# that appear in `all_users`.
all_users_recs = all_users["movieId"].value_counts().div(
    len(all_users["userId"].unique())
)

In [31]:
all_users_recs

318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: movieId, Length: 113, dtype: float64

In [32]:
# Put both shares side by side, one row per movie id.
rec_percentage = pd.concat(
    [similar_users_recs, all_users_recs],
    axis=1,
    keys=['Similar Users', 'All Users'],
)

In [33]:
# Score = share among similar users divided by share among all users.
rec_percentage["Score"] = rec_percentage["Similar Users"].div(
    rec_percentage["All Users"]
)

In [34]:
# Highest score first.
rec_percentage = rec_percentage.sort_values(by="Score", ascending=False)

In [36]:
rec_percentage

Unnamed: 0,Similar Users,All Users,Score
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


In [37]:
rec_percentage.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,Similar Users,All Users,Score,movieId,title,genres,clean_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


In [39]:
rec_percentage

Unnamed: 0,Similar Users,All Users,Score
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


In [42]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.1, top_n=10):
    """Recommend movies liked by the users who also liked ``movie_id``.

    A "like" is a rating strictly greater than ``min_rating``. Candidate
    movies are those liked by more than ``min_share`` of the users who liked
    ``movie_id``. Each candidate is scored by how much more often it is liked
    by those users than by all users who liked at least one candidate.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        A rating must be strictly above this value to count as a like.
    min_share : float, default 0.1
        A candidate must be liked by strictly more than this fraction of the
        similar users.
    top_n : int, default 10
        Number of highest-scoring candidates to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``Score``, ``title`` and ``genres``, ordered by descending
        ``Score``. Reads the notebook-level ``rating`` and ``movies`` frames.
    """
    # Filter the ratings once; every step below only looks at likes.
    liked = rating[rating["rating"] > min_rating]

    # Users who liked the reference movie, and everything those users liked.
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    similar_users_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]

    # Share of similar users who liked each movie, thresholded.
    similar_users_recs = similar_users_recs.value_counts() / len(similar_users)
    similar_users_recs = similar_users_recs[similar_users_recs > min_share]

    # Share of all users (who liked at least one candidate) who liked each candidate.
    all_users = liked[liked["movieId"].isin(similar_users_recs.index)]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentage = pd.concat([similar_users_recs, all_users_recs], axis=1)
    rec_percentage.columns = ['Similar Users', 'All Users']

    # A score above 1 means the movie is liked more often by similar users
    # than by users in general.
    rec_percentage['Score'] = rec_percentage['Similar Users'] / rec_percentage['All Users']
    rec_percentage = rec_percentage.sort_values("Score", ascending=False)

    top = rec_percentage.head(top_n)
    return top.merge(movies, left_index=True, right_on="movieId")[["Score", "title", "genres"]]

In [43]:
# Text box the user types a movie title into.
movie_input_name = widgets.Text(
    value = "Toy Story",
    description = "Movie Title:",
    disabled = False)

# Output area that shows the recommendations for the current text.
recommendation_list = widgets.Output()

# Named differently from the title-search handler `on_type` defined earlier in
# the notebook, so that running this cell does not silently rebind that name.
def on_recommendation_type(data):
    """Refresh ``recommendation_list`` whenever the text box value changes.

    ``data`` is the change notification passed by ``observe``; the new text
    is read from ``data["new"]``. The previous output is always cleared, and
    recommendations are only computed once more than 5 characters have been
    typed.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search_title(title)
            # Nothing to recommend from if the search returned no rows.
            if results.empty:
                return
            # NOTE(review): uses the first returned row as the matched movie,
            # which assumes search_title returns the best match first - confirm.
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_input_name.observe(on_recommendation_type, names="value")

display(movie_input_name, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()