In [51]:
import pandas as pd


In [2]:
movies = pd.read_csv("movies.csv")

In [3]:
movies

Unnamed: 0,09),Adventure|Mystery|Sci-Fi
164037,The Candy Tangerine Man (1975),Action|Drama
164039,Return of the Atom (2015),Documentary
164041,The Piano Tuner (2011),Drama|Thriller
164043,Desculpe o Transtorno (2016),Comedy
164045,Daisy-Head Mayzie (1995),(no genres listed)
...,...,...
209157,We (2018),Drama
209159,Window of the Soul (2001),Documentary
209163,Bad Poems (2018),Comedy|Drama
209169,A Girl Thing (2001),(no genres listed)


In [4]:
import re

def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [7]:
movies["clean_title"] = movies ["09)"].apply(clean_title)

In [8]:
movies


Unnamed: 0,09),Adventure|Mystery|Sci-Fi,clean_title
164037,The Candy Tangerine Man (1975),Action|Drama,The Candy Tangerine Man 1975
164039,Return of the Atom (2015),Documentary,Return of the Atom 2015
164041,The Piano Tuner (2011),Drama|Thriller,The Piano Tuner 2011
164043,Desculpe o Transtorno (2016),Comedy,Desculpe o Transtorno 2016
164045,Daisy-Head Mayzie (1995),(no genres listed),DaisyHead Mayzie 1995
...,...,...,...
209157,We (2018),Drama,We 2018
209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [10]:
#Inverse Document Frequency
#Tf*Idf

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(movies["clean_title"])

In [57]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    indices = np.argpartition(similarity, -5) [-5:]
    results  = movies.iloc[indices] [::-1]

    return results

In [54]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value = "Toy Story",
    description = "Movie Title:",
    disabled = False
)
movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 5:
                     display(search(title))

movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

<class 'ModuleNotFoundError'>: No module named 'ipywidgets'

In [58]:
ratings = pd.read_csv("ratings.csv")

In [60]:
ratings

Unnamed: 0,31776,714,3.0,1526670549
0,31776,733,5.0,1429821067
1,31776,736,3.0,1484618080
2,31776,743,2.0,1444758524
3,31776,750,4.0,1429819992
4,31776,761,2.0,1445016319
...,...,...,...,...
38988,32035,110,4.0,1063723756
38989,32035,111,4.5,1064454924
38990,32035,145,3.5,1063724548
38991,32035,163,4.0,1063723987


In [61]:
ratings.dtypes

31776           int64
714             int64
3.0           float64
1526670549      int64
dtype: object

In [66]:
similar_users = ratings[(ratings["714"] == 714) & (ratings["3.0"] >= 5)]["31776"].unique()

In [67]:
similar_users

array([32033], dtype=int64)

In [77]:
similar_users_recs = ratings[(ratings["31776"].isin(similar_users)) & (ratings["3.0"] > 4)]["714"]

In [78]:
similar_users_recs

38712       1
38715      21
38716      25
38717      32
38718      36
38719      52
38720      58
38721      62
38724      85
38726     123
38730     194
38734     232
38735     235
38738     265
38741     296
38742     300
38745     306
38746     307
38747     308
38748     318
38750     337
38760     471
38766     534
38768     538
38772     588
38774     595
38775     608
38781     714
38782     728
38783     750
38784     778
38786     908
38787     912
38793    1041
38796    1079
38797    1080
38799    1094
38800    1136
38801    1172
38803    1177
38806    1193
38808    1197
38810    1199
38811    1206
38812    1208
38818    1230
38821    1235
38822    1245
38823    1246
38824    1247
38826    1265
38827    1266
38832    1295
38833    1296
38834    1307
Name: 714, dtype: int64

In [84]:
similar_users_recs.value_counts() / len(similar_users)

similar_users_recs = similar_users_recs[similar_users_recs > .1]

In [85]:
similar_users_recs

38712       1
38715      21
38716      25
38717      32
38718      36
38719      52
38720      58
38721      62
38724      85
38726     123
38730     194
38734     232
38735     235
38738     265
38741     296
38742     300
38745     306
38746     307
38747     308
38748     318
38750     337
38760     471
38766     534
38768     538
38772     588
38774     595
38775     608
38781     714
38782     728
38783     750
38784     778
38786     908
38787     912
38793    1041
38796    1079
38797    1080
38799    1094
38800    1136
38801    1172
38803    1177
38806    1193
38808    1197
38810    1199
38811    1206
38812    1208
38818    1230
38821    1235
38822    1245
38823    1246
38824    1247
38826    1265
38827    1266
38832    1295
38833    1296
38834    1307
Name: 714, dtype: int64

In [90]:
all_users = ratings[(ratings["31776"].isin(similar_users_recs.index)) & (ratings["3.0"] > 4)]

In [91]:
all_users

Unnamed: 0,31776,714,3.0,1526670549


In [93]:
all_users_recs = all_users["714"].value_counts() / len(all_users["31776"].unique())

In [94]:
all_users_recs

Series([], Name: 714, dtype: float64)

In [97]:
rec_percentages = pd.concat([similar_users_recs, all_users_recs] , axis=1)
rec_percentages.columns = ["similar", "all"]

In [98]:
rec_percentages

Unnamed: 0,similar,all
38712,1,
38715,21,
38716,25,
38717,32,
38718,36,
38719,52,
38720,58,
38721,62,
38724,85,
38726,123,


In [103]:
rec_percentages["1526670549"] = rec_percentages["similar"] / rec_percentages["all"]

In [106]:
rec_percentages = rec_percentages.sort_values("1526670549", ascending=False)

In [107]:
rec_percentages

Unnamed: 0,similar,all,1526670549
38712,1,,
38715,21,,
38716,25,,
38717,32,,
38718,36,,
38719,52,,
38720,58,,
38721,62,,
38724,85,,
38726,123,,


In [120]:
rec_percentages.head(10).merge(movies, left_index = True, right_on="movieId")

<class 'KeyError'>: 'movieId'

In [119]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_users_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]
    
    similar_users_recs = similar_users_recs.value_counts() / len(similar_users)
    similar_users_recs = similar_users_recs[similar_users_recs > .10]
    
    all_users = ratings[(ratings["movieId"].isin(similar_users_recs.index)) & (ratings["rating"] > 4)]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    
    rec_percentages = pd.concat([similar_users_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    
    rec_percentages = rec_percentages.sort_values("score", ascending = False)
    return rec_percentages.head(10).merge(movies, left_index = True, right_on = "movieId")[["score", "title", "genres"]]

In [123]:
movie_name_input = widgets.Text(
    value = "Toy Story",
    description = "Movie Title:",
    disabled = False
)

recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_movie(movie_id))
            
movie_name_input.observe(on_type, names="value")

display(movie_name_input, recommendation_list)

<class 'NameError'>: name 'widgets' is not defined