# 1. Reading the Dataset with Pandas

In [2]:
import pandas as pd

# Load the movie table; the path is relative to the notebook's working directory.
movies = pd.read_csv("movies.csv")

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


# 2. Cleaning Movie Titles with Regex

In [4]:
import re

def title_cleaning(title):
    """Return ``title`` with everything except ASCII letters, digits and spaces removed.

    Punctuation, brackets and any non-ASCII characters are dropped rather
    than replaced, so surrounding text closes up (``"E.T."`` -> ``"ET"``).
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [5]:
# Store the normalised title next to the original so both stay available.
movies["title_cleaning"] = movies["title"].map(title_cleaning)

In [6]:
movies

Unnamed: 0,movieId,title,genres,title_cleaning
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


# 3. Creating a TFIDF matrix

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1,2) makes the vectorizer index single words and two-word phrases.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Learn the vocabulary from the cleaned titles and encode each title as one TF-IDF row.
tfidf = vectorizer.fit_transform(movies["title_cleaning"])

# 4. Creating a SEARCH function

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    """Return the movies whose cleaned titles best match ``title``.

    The query is normalised with ``title_cleaning``, encoded with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` using cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.

    Returns
    -------
    pandas.DataFrame
        At most 8 rows of ``movies``, ordered from the most to the least
        similar title, so row 0 is the best match.
    """
    title = title_cleaning(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    # A full descending sort is used instead of np.argpartition: argpartition
    # leaves the selected top-k in arbitrary order (so reversing it does not
    # put the best match first) and raises when fewer than k rows exist.
    # kind="stable" keeps ties in catalogue order, making results repeatable.
    indices = np.argsort(-similarity, kind="stable")[:8]
    return movies.iloc[indices]

# 5. Building an Interactive SEARCH BOX

In [10]:
import ipywidgets as widgets
from IPython.display import display

# Text box pre-filled with an example query.
movie_input = widgets.Text(
    description="Movie Title:",
    value="Harry Potter",
    disabled=False,
)
# Output area that holds the current search results.
movie_list = widgets.Output()


def on_type(data):
    """Refresh ``movie_list`` with the matches for the text box's new value.

    ``data`` is the change dictionary delivered by ``observe``; its ``"new"``
    entry holds the updated text. Queries of five characters or fewer only
    clear the output area.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        display(search(query))


# Call on_type whenever the text box's value changes.
movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

Text(value='Harry Potter', description='Movie Title:')

Output()

# 6. Reading in Movie Ratings Dataset

In [11]:
# Load the user ratings table; the path is relative to the notebook's working directory.
ratings = pd.read_csv("ratings.csv")

In [12]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [13]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

# 7. Finding similar users who like the same movie

In [28]:
# movieId used as the worked example in the following cells (1 is "Toy Story (1995)" in the movies table).
movie_id = 1

In [29]:
# Users who gave the chosen movie a rating of 5 or more.
similar_users = ratings.loc[
    ratings["movieId"].eq(movie_id) & ratings["rating"].ge(5),
    "userId",
].unique()

In [30]:
similar_users

array([    36,     75,     86, ..., 162518, 162519, 162530], dtype=int64)

In [31]:
# Every movie those users rated above 4, one row per (user, movie) rating.
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & ratings["rating"].gt(4),
    "movieId",
]

In [32]:
similar_user_recs

5101           1
5105          34
5111         110
5114         150
5127         260
            ... 
24998388    3706
24998389    3735
24998391    3763
24998392    4187
24998393    4321
Name: movieId, Length: 912084, dtype: int64

In [33]:
# Turn raw movieId occurrences into the share of similar users who liked each
# movie, then keep movies liked by more than 15% of them.
# NOTE: this rebinds similar_user_recs, so re-running this cell on its own
# would count the shares themselves; re-run the cell that builds the raw
# movieId series first.
similar_user_recs = (
    similar_user_recs.value_counts()
    .div(len(similar_users))
    .loc[lambda share: share > .15]
)

In [34]:
similar_user_recs

1        1.000000
318      0.414556
260      0.404561
356      0.347253
296      0.342663
1196     0.316304
1198     0.304605
593      0.304235
527      0.299052
2571     0.296609
3114     0.295498
1210     0.268695
50       0.262180
1270     0.254183
858      0.250555
364      0.249149
110      0.244484
589      0.241448
588      0.233674
608      0.231601
480      0.229898
4993     0.229083
1197     0.225233
2028     0.213239
4886     0.210647
150      0.205242
2959     0.205020
47       0.203613
5952     0.202577
7153     0.202058
2762     0.200281
595      0.198949
457      0.195469
2858     0.191619
1136     0.191322
6377     0.191026
1291     0.179698
4306     0.178958
8961     0.169554
1265     0.166963
1214     0.165038
1036     0.163335
58559    0.160743
1240     0.160743
1097     0.158448
34       0.158226
32       0.157856
541      0.155560
919      0.153858
1193     0.153191
1221     0.150748
Name: movieId, dtype: float64

# 8. Finding how much all users like these movies

In [35]:
# Ratings above 4, from any user, for the candidate movies found above.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & ratings["rating"].gt(4)
]

In [36]:
# Share of those users who liked each candidate movie.
all_users_recs = all_users["movieId"].value_counts().div(
    len(all_users["userId"].unique())
)

In [37]:
all_users_recs

318      0.359607
296      0.299136
2571     0.256431
356      0.247218
593      0.237386
260      0.233496
2959     0.227727
527      0.226141
858      0.220727
50       0.210700
1196     0.197583
4993     0.197040
7153     0.181341
2858     0.176150
5952     0.174786
1198     0.172977
110      0.169045
58559    0.153910
47       0.151809
1210     0.151732
608      0.151628
589      0.141601
2028     0.138219
1221     0.133598
1        0.131065
1136     0.128706
1197     0.125387
1193     0.124830
2762     0.124308
541      0.120843
1270     0.120433
1214     0.106550
32       0.105388
480      0.104964
457      0.102758
1291     0.097156
150      0.095180
364      0.090121
1036     0.085354
1240     0.085215
1265     0.083942
4306     0.083517
6377     0.075939
4886     0.074408
8961     0.071931
588      0.070943
1097     0.064360
919      0.063998
595      0.063024
3114     0.056434
34       0.054883
Name: movieId, dtype: float64

# 9. Creating a Recommendation Score

In [38]:
# Put both shares side by side, aligned on movieId; `keys` names the two columns.
rec_percentages = pd.concat(
    [similar_user_recs, all_users_recs],
    axis=1,
    keys=["similar", "all"],
)

In [39]:
rec_percentages

Unnamed: 0,similar,all
1,1.0,0.131065
32,0.157856,0.105388
34,0.158226,0.054883
47,0.203613,0.151809
50,0.26218,0.2107
110,0.244484,0.169045
150,0.205242,0.09518
260,0.404561,0.233496
296,0.342663,0.299136
318,0.414556,0.359607


In [40]:
# Score = how much more often similar users liked a movie than users overall.
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["all"])

In [41]:
# Highest score first.
rec_percentages = rec_percentages.sort_values(by="score", ascending=False)

In [42]:
rec_percentages

Unnamed: 0,similar,all,score
1,1.0,0.131065,7.629785
3114,0.295498,0.056434,5.23615
588,0.233674,0.070943,3.293828
595,0.198949,0.063024,3.156708
34,0.158226,0.054883,2.882995
4886,0.210647,0.074408,2.830961
364,0.249149,0.090121,2.764604
6377,0.191026,0.075939,2.515514
1097,0.158448,0.06436,2.461899
919,0.153858,0.063998,2.404089


In [43]:
# Attach titles and genres to the ten best-scoring movieIds.
pd.merge(
    rec_percentages.head(10),
    movies,
    left_index=True,
    right_on="movieId",
)

Unnamed: 0,similar,all,score,movieId,title,genres,title_cleaning
0,1.0,0.131065,7.629785,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.295498,0.056434,5.23615,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
580,0.233674,0.070943,3.293828,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
587,0.198949,0.063024,3.156708,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
33,0.158226,0.054883,2.882995,34,Babe (1995),Children|Drama,Babe 1995
4780,0.210647,0.074408,2.830961,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
359,0.249149,0.090121,2.764604,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994
6258,0.191026,0.075939,2.515514,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
1070,0.158448,0.06436,2.461899,1097,E.T. the Extra-Terrestrial (1982),Children|Drama|Sci-Fi,ET the ExtraTerrestrial 1982
898,0.153858,0.063998,2.404089,919,"Wizard of Oz, The (1939)",Adventure|Children|Fantasy|Musical,Wizard of Oz The 1939


# 10. Building a Recommendation Function

In [44]:
def find_similar_movies(movie_id, min_rating=4, min_share=.18, top_n=15):
    """Recommend movies favoured by the users who liked ``movie_id``.

    Users who rated ``movie_id`` above ``min_rating`` are treated as
    "similar users". Each movie those users also rated above ``min_rating``
    is scored by the share of similar users who liked it divided by the
    share of all users (among those who liked any candidate movie) who
    liked it.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base recommendations on.
    min_rating : float, default 4
        A rating counts as a "like" only when strictly greater than this.
    min_share : float, default .18
        A movie becomes a candidate only when more than this fraction of
        the similar users liked it.
    top_n : int, default 15
        Number of best-scoring movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres`` for the best-scoring
        candidates, highest score first.
    """
    # Computed once and reused by all three filters below.
    liked = ratings["rating"] > min_rating

    similar_users = ratings[(ratings["movieId"] == movie_id) & liked]["userId"].unique()
    similar_user_recs = ratings[ratings["userId"].isin(similar_users) & liked]["movieId"]

    # Share of similar users who liked each movie, thresholded to the candidates.
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Share of users overall who liked each candidate movie.
    all_users = ratings[ratings["movieId"].isin(similar_user_recs.index) & liked]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    # A ratio above 1 means similar users like the movie more than users overall.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

# 11. Creating an Interactive Recommendation Widget

In [45]:
# Text box pre-filled with an example title.
movie_name_input = widgets.Text(
    description="Movie Title:",
    value="Toy Story",
    disabled=False,
)

# Output area that holds the current recommendations.
recommendation_list = widgets.Output()


def on_type(data):
    """Refresh ``recommendation_list`` for the text box's new value.

    ``data`` is the change dictionary delivered by ``observe``; its ``"new"``
    entry holds the updated text. Queries of five characters or fewer only
    clear the output area. Otherwise the first row returned by ``search`` is
    used as the movie to recommend from.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        first_hit = search(query).iloc[0]
        display(find_similar_movies(first_hit["movieId"]))


# Call on_type whenever the text box's value changes.
movie_name_input.observe(on_type, names="value")

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()