# Movie Recommendation System V2
Advance system with more logical thinking

### Import all dependencies

In [26]:
import re
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

### Data Preprocessing

In [27]:
#load movies dataset
movies = pd.read_csv("data/movies.csv")

In [28]:
# display movies dataset
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [29]:
# text clean function
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [30]:
# create a new column that contain clean title
movies["clean_title"] = movies["title"].apply(clean_title)

In [31]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


### Feature Extraction

In [32]:
# creating a tfidf matrix
vectorizer = TfidfVectorizer(ngram_range = (1,2))
tfidf = vectorizer.fit_transform(movies["clean_title"])

### Search Function

In [33]:
def search(title):
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices][::-1]
    return results

### Search Box

In [34]:
# input label and box
movie_input = widgets.Text(
    value = "Toy Story",
    description = "Movie Title: ",
    desabled = False
)

# show search results
movie_list = widgets.Output()

# when type this will be called 
def on_type(data):
    with movie_list:
        movie_list.clear_output()
        display(data)
        title = data["new"]
        if len(title) > 5:
            display(search(title))

# hook the ontype func with input
movie_input.observe(on_type, names = "value")

# show input and output
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title: ')

Output()

## Making more logical recommendations

### Movie Ratings Data

In [35]:
# load ratings dataset
ratings = pd.read_csv("data/ratings.csv")

In [36]:
# display ratings dataset
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [37]:
# show datatypes of ratings dataset
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

### Find Similar Users

In [38]:
movie_id = 5

In [39]:
# anyone who watched and rated that movie avobe 4
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4)]["userId"].unique()
similar_users

array([    18,     75,    105, ..., 162388, 162407, 162510], dtype=int64)

In [40]:
# get ids of other movies they liked and rated above 4
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] >= 4)]["movieId"]
similar_user_recs

3018            5
3019            7
3020           10
3021           11
3023           19
            ...  
24991092    33794
24991093    44191
24991094    45722
24991095    48516
24991096    49272
Name: movieId, Length: 362923, dtype: int64

In [41]:
# times each movie appears in dataset
similar_user_recs = similar_user_recs.value_counts()
similar_user_recs

movieId
5         3161
356       1513
1         1511
780       1412
62        1271
          ... 
72601        1
86286        1
94005        1
108575       1
3804         1
Name: count, Length: 13097, dtype: int64

In [42]:
# convert into %
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

# take the movies that appears more (> 10%)
similar_user_recs = similar_user_recs[similar_user_recs > .1]
similar_user_recs

count
1    1.316356
2    0.499209
3    0.296109
4    0.200886
5    0.150902
7    0.109143
Name: count, dtype: float64

In [43]:
# how much popular these movies are (all people liked)
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
all_users

Unnamed: 0,userId,movieId,rating,timestamp
1459,9,2,5.0,859383142
3864,23,3,5.0,943135696
5101,36,1,5.0,857131378
7276,58,3,5.0,1337179995
9939,75,1,5.0,1537207651
...,...,...,...,...
24996419,162519,1,5.0,1000946439
24997459,162524,1,4.5,1072919304
24997758,162527,1,4.5,1301688215
24998300,162530,1,5.0,989808332


In [44]:
# convert into % 
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
all_users_recs

movieId
1    0.822094
2    0.116407
7    0.069748
3    0.050762
5    0.042250
4    0.007944
Name: count, dtype: float64

### Create Recommendation

In [45]:
# how much similar users liked them and all users liked them
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis = 1)
rec_percentages.columns = ["similar", "all"]
rec_percentages

Unnamed: 0,similar,all
1,1.316356,0.822094
2,0.499209,0.116407
3,0.296109,0.050762
4,0.200886,0.007944
5,0.150902,0.04225
7,0.109143,0.069748


In [46]:
# ratio similar and all users liked movies 
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

# sort all the values
rec_percentages = rec_percentages.sort_values("score", ascending = False)

rec_percentages

Unnamed: 0,similar,all,score
4,0.200886,0.007944,25.288431
3,0.296109,0.050762,5.833318
2,0.499209,0.116407,4.288481
5,0.150902,0.04225,3.571598
1,1.316356,0.822094,1.601222
7,0.109143,0.069748,1.564811


In [47]:
# take top 10 recommendations and add them to movies data to get the titles
rec_percentages.head(10).merge(movies, left_index = True, right_on = "movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
3,0.200886,0.007944,25.288431,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
2,0.296109,0.050762,5.833318,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
1,0.499209,0.116407,4.288481,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
4,0.150902,0.04225,3.571598,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
0,1.316356,0.822094,1.601222,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
6,0.109143,0.069748,1.564811,7,Sabrina (1995),Comedy|Romance,Sabrina 1995


### Putting all things together

In [48]:
# recommendation function
def find_similar_movies(movie_id):
    # anyone who watched and rated that movie avobe 4
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 4)]["userId"].unique()
    # get ids of other movies they liked and rated above 4
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] >= 4)]["movieId"]
    
    # convert into %
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    # take the movies that appears more (> 10%)
    similar_user_recs = similar_user_recs[similar_user_recs > .1]
    
    # how much popular these movies are (all people liked)
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
    # convert into % 
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    
    # how much similar users liked them and all users liked them
    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis = 1)
    rec_percentages.columns = ["similar", "all"]
    
    # ratio similar and all users liked movies 
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    
    # sort all the values
    rec_percentages = rec_percentages.sort_values("score", ascending = False)
    
    # take top 10 recommendations and add them to movies data to get the titles
    return rec_percentages.head(10).merge(movies, left_index = True, right_on = "movieId")[["score", "title", "genres"]]

In [49]:
# create widget to take input and show results and recommendations

# input field and label
movie_name_input = widgets.Text(
    value = "Toy Story",
    description = "Movie Title:",
    disabled = False
)

# create output widget
recommandation_list = widgets.Output()

# function for output
def on_type(data):
    with recommandation_list:
        recommandation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

# make search box active on type
movie_name_input.observe(on_type, names = "value")

# display the widgets
display(movie_name_input, recommandation_list)

Text(value='Toy Story', description='Movie Title:')

Output()

### Demo

In [53]:
# recommendation for "Batman" (just to show without running codes)
results = search("Batman")
movie_id = results.iloc[0]["movieId"]
display(find_similar_movies(movie_id))

Unnamed: 0,score,title,genres
8614,1541.533981,Batman (1966),Action|Adventure|Comedy
5741,111.649168,Scanners (1981),Horror|Sci-Fi|Thriller
3120,93.492316,Batman: Mask of the Phantasm (1993),Animation|Children
7358,91.567474,Octopussy (1983),Action|Adventure|Thriller
2525,90.442064,Dick Tracy (1990),Action|Crime
1506,90.121672,Batman & Robin (1997),Action|Adventure|Fantasy|Thriller
7122,75.650137,Chitty Chitty Bang Bang (1968),Adventure|Children|Comedy|Fantasy|Musical
6072,73.586952,"Man Who Fell to Earth, The (1976)",Drama|Sci-Fi
3303,72.525602,"Muppets Take Manhattan, The (1984)",Children|Comedy|Musical
3829,69.470199,"Invisible Man, The (1933)",Horror|Sci-Fi
