In [1]:
import pandas as pd

In [2]:
# Load the movie catalogue; the preview below shows movieId, title and genres columns.
movies = pd.read_csv("datasets/movies.csv")

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [4]:
import re
def clean_title(title):
    """Return ``title`` with everything but ASCII letters, digits and spaces removed.

    Parameters
    ----------
    title : str
        Raw movie title, e.g. ``"Avengers, The (2012)"``.

    Returns
    -------
    str
        The title without punctuation, e.g. ``"Avengers The 2012"``.
    """
    # The upper-case range must be "A-Z": the range "A-z" also spans the ASCII
    # punctuation between "Z" and "a" ([ \ ] ^ _ `), which would then be kept.
    return re.sub(r"[^a-zA-Z0-9 ]", "", title)

# Add a punctuation-free copy of every title next to the original column.
movies["cleaned_title"] = movies["title"].map(clean_title)
    

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Index single words and adjacent word pairs of each title.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit on the cleaned titles: search() passes its query through clean_title(),
# so the indexed text must be normalised the same way. Otherwise a title such
# as "Wooden Man's Bride" is tokenised differently from the cleaned query
# "Wooden Mans Bride".
tfidf = vectorizer.fit_transform(movies['cleaned_title'])

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
import numpy as np

In [9]:
def search(title, top_n=5):
    """Return the rows of ``movies`` whose titles best match ``title``.

    The query is cleaned with ``clean_title``, vectorised with the fitted
    ``vectorizer`` and compared against the ``tfidf`` matrix by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text query typed by the user.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered from most to least similar.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])

    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises otherwise.
    top_n = min(top_n, len(similarity))
    if top_n <= 0:
        return movies.iloc[[]]

    # argpartition selects the top_n entries but leaves them in arbitrary
    # order, so sort just those few by similarity (best first). Callers take
    # the first row as the best match.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    indices = candidates[np.argsort(similarity[candidates])[::-1]]

    return movies.iloc[indices]

In [10]:
search("Avengers")

Unnamed: 0,movieId,title,genres,cleaned_title
34536,145676,3 Avengers (1964),(no genres listed),3 Avengers 1964
2063,2153,"Avengers, The (1998)",Action|Adventure,Avengers The 1998
17067,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012
40636,159920,Shaolin Avengers (1994),Action,Shaolin Avengers 1994
45394,170297,Ultimate Avengers 2 (2006),Action|Animation|Sci-Fi,Ultimate Avengers 2 2006


In [11]:
import ipywidgets as widgets
from IPython.display import display

# Text box pre-filled with an example query.
movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title: ",
    disabled=False
)

# Output area that holds the current search results.
movie_list = widgets.Output()

def on_type(data):
    """Refresh ``movie_list`` with search results for the text box's new value.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry is read as the current text.
    """
    with movie_list:
        # Drop the previous results before (possibly) showing new ones.
        movie_list.clear_output()
        title = data["new"]

        # Only search once more than two characters have been typed.
        if len(title) > 2:
            display(search(title))

movie_input.observe(on_type, names='value')
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title: ')

Output()

In [12]:
# Load the per-user ratings; the preview below shows userId, movieId, rating and timestamp columns.
ratings = pd.read_csv("datasets/ratings.csv")

In [13]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1.147880e+09
1,1,306,3.5,1.147869e+09
2,1,307,5.0,1.147869e+09
3,1,665,5.0,1.147879e+09
4,1,899,3.5,1.147869e+09
...,...,...,...,...
15232122,98680,55765,2.0,1.437718e+09
15232123,98680,55805,5.0,1.437719e+09
15232124,98680,55820,5.0,1.437718e+09
15232125,98680,55908,1.5,1.437719e+09


In [14]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp    float64
dtype: object

In [36]:
# Example movie used for the step-by-step walk-through in the following cells
# (movieId 137 is listed as "Man of the Year (1995)" in the merged output further down).
movie_id = 137

In [37]:
# Ids of the users who rated the chosen movie 4 or higher.
similar_users = ratings.loc[
    (ratings["rating"] >= 4) & (ratings["movieId"] == movie_id),
    "userId",
].unique()

In [38]:
similar_users

array([ 3226,  5990,  6123,  8315, 14626, 15878, 15938, 16517, 17149,
       19143, 19649, 24974, 29574, 30267, 31827, 31887, 32505, 37754,
       40210, 43269, 44711, 47458, 47628, 48097, 49996, 52071, 53538,
       61141, 63503, 64415, 67456, 71351, 71362, 72641, 74979, 75421,
       80895, 83094, 85293, 87485, 87564, 89411, 89616, 92096, 96555,
       98118])

In [41]:
# Every movie those users rated 4 or higher (one row per such rating).
similar_user_recs = ratings.loc[
    (ratings["rating"] >= 4) & ratings["userId"].isin(similar_users),
    "movieId",
]

In [42]:
similar_user_recs

470144         1
470145         5
470146         9
470147        10
470150        31
            ... 
15140314    1375
15140315    1376
15140317    1383
15140318    1384
15140319    2019
Name: movieId, Length: 9768, dtype: int64

In [46]:
# Share of the similar users who liked each movie.
# NOTE: this rebinds similar_user_recs from a column of movie ids to a series
# of shares, so re-running this cell on its own produces different numbers.
similar_user_recs = similar_user_recs.value_counts().div(len(similar_users))

In [47]:
# Keep only the movies liked by more than 10% of the similar users.
similar_user_recs = similar_user_recs.loc[similar_user_recs.gt(.1)]

In [48]:
similar_user_recs

movieId
137     1.000000
608     0.478261
1       0.434783
581     0.434783
260     0.434783
          ...   
2029    0.108696
2203    0.108696
1994    0.108696
124     0.108696
29      0.108696
Name: count, Length: 557, dtype: float64

In [51]:
# Ratings of 4 or higher, from any user, for the candidate movies kept above.
all_users = ratings.loc[
    (ratings["rating"] >= 4) & ratings["movieId"].isin(similar_user_recs.index)
]

In [53]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1.147880e+09
2,1,307,5.0,1.147869e+09
8,1,1237,5.0,1.147869e+09
9,1,1250,4.0,1.147868e+09
19,1,2692,5.0,1.147869e+09
...,...,...,...,...
15231530,98680,3147,5.0,1.437722e+09
15231534,98680,3175,4.0,1.498525e+09
15231575,98680,3671,5.0,1.437721e+09
15231620,98680,3996,4.0,1.437718e+09


In [55]:
# Share of all users (who liked at least one candidate) that liked each candidate.
all_user_recs = all_users["movieId"].value_counts().div(len(all_users["userId"].unique()))

In [57]:
all_user_recs

movieId
318     0.438732
296     0.388869
356     0.366746
593     0.360164
2571    0.347801
          ...   
1311    0.000319
138     0.000267
1471    0.000257
108     0.000257
660     0.000216
Name: count, Length: 557, dtype: float64

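**Now the prediction model:** score each candidate movie by how much more often the similar users liked it than users overall did.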

In [59]:
# Put both shares side by side, aligned on movieId, and label the columns.
rec_perc = pd.concat(
    [similar_user_recs, all_user_recs],
    axis=1,
    keys=["similar", "all"],
)

In [60]:
rec_perc

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
137,1.000000,0.000473
608,0.478261,0.225542
1,0.434783,0.235858
581,0.434783,0.007909
260,0.434783,0.323662
...,...,...
2029,0.108696,0.001512
2203,0.108696,0.010995
1994,0.108696,0.029055
124,0.108696,0.000987


In [61]:
# Ratio of the two shares: above 1 means the similar users liked the movie
# more often than users in general did.
rec_perc["score"] = rec_perc["similar"].div(rec_perc["all"])

In [62]:
rec_perc

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
137,1.000000,0.000473,2113.652174
608,0.478261,0.225542,2.120496
1,0.434783,0.235858,1.843408
581,0.434783,0.007909,54.971448
260,0.434783,0.323662,1.343323
...,...,...,...
2029,0.108696,0.001512,71.892931
2203,0.108696,0.010995,9.886119
1994,0.108696,0.029055,3.740977
124,0.108696,0.000987,110.086051


In [64]:
# Highest score first.
rec_perc = rec_perc.sort_values(by="score", ascending=False)

In [65]:
rec_perc


Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
137,1.000000,0.000473,2113.652174
660,0.195652,0.000216,905.850932
138,0.217391,0.000267,812.943144
108,0.195652,0.000257,760.914783
601,0.239130,0.000329,726.567935
...,...,...,...
4226,0.130435,0.199428,0.654044
2028,0.108696,0.211472,0.513995
2959,0.130435,0.292755,0.445542
4993,0.108696,0.259802,0.418379


In [67]:
# Attach title and genres by matching rec_perc's index to movies.movieId.
records = pd.merge(rec_perc, movies, left_index=True, right_on="movieId")

In [68]:
records

Unnamed: 0,similar,all,score,movieId,title,genres,cleaned_title
135,1.000000,0.000473,2113.652174,137,Man of the Year (1995),Documentary,Man of the Year 1995
649,0.195652,0.000216,905.850932,660,August (1996),Drama,August 1996
136,0.217391,0.000267,812.943144,138,"Neon Bible, The (1995)",Drama,Neon Bible The 1995
106,0.195652,0.000257,760.914783,108,Catwalk (1996),Documentary,Catwalk 1996
593,0.239130,0.000329,726.567935,601,"Wooden Man's Bride, The (Yan shen) (1994)",Drama,Wooden Mans Bride The Yan shen 1994
...,...,...,...,...,...,...,...
4122,0.130435,0.199428,0.654044,4226,Memento (2000),Mystery|Thriller,Memento 2000
1939,0.108696,0.211472,0.513995,2028,Saving Private Ryan (1998),Action|Drama|War,Saving Private Ryan 1998
2867,0.130435,0.292755,0.445542,2959,Fight Club (1999),Action|Crime|Drama|Thriller,Fight Club 1999
4887,0.108696,0.259802,0.418379,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,Lord of the Rings The Fellowship of the Ring T...


In [77]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.1, top_n=10):
    """Recommend titles liked unusually often by users who liked ``movie_id``.

    A user "likes" a movie when their rating is at least ``min_rating``.
    Candidates are the movies liked by more than ``min_share`` of the users
    who liked ``movie_id``. Each candidate is scored as

        share of those similar users who liked it
        / share of all users (who liked any candidate) who liked it

    and the highest-scoring titles are returned. The queried movie is not
    filtered out of the result.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to find recommendations for.
    min_rating : float, default 4
        Lowest rating that counts as liking a movie.
    min_share : float, default 0.1
        Fraction of similar users that must be exceeded for a movie to be
        considered a candidate.
    top_n : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles from ``movies``, best score first. Empty when
        no user liked ``movie_id``.
    """
    # Apply the rating threshold once; every later step only looks at likes.
    liked = ratings[ratings["rating"] >= min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie, so there is nothing to base a score on.
        return movies["title"].head(0)

    # Share of the similar users who liked each movie.
    similar_likes = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_likes.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Share of all users (restricted to those who liked a candidate) who liked each candidate.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_perc = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_perc.columns = ("similar", "all")

    rec_perc["score"] = rec_perc["similar"] / rec_perc["all"]
    rec_perc = rec_perc.sort_values("score", ascending=False)

    records = rec_perc.merge(movies, left_index=True, right_on="movieId")
    return records["title"].head(top_n)
    

In [80]:
# The original keyword names were misspelled ("values", "dasbled") and so had
# no effect. The prompt text is shown as a placeholder so the box still starts
# empty, as it did before.
movie_name_input = widgets.Text(
    placeholder='Type In:',
    description='Movie title: ',
    disabled=False
)

# Output area that holds the current recommendations.
recommendation_list = widgets.Output()

def on_type(data):
    """Show recommendations for the movie matching the text box's new value.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry is read as the current text. Note that this rebinds the name
    ``on_type`` used by the earlier search-widget cell.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]

        # Only search once more than three characters have been typed.
        if len(title) <= 3:
            return

        results = search(title)
        if results.empty:
            # Nothing matched, so there is no movie to recommend from.
            return

        # The first row returned by search() is taken as the intended movie.
        movie_id = results.iloc[0]["movieId"]
        display(find_similar_movies(movie_id))


movie_name_input.observe(on_type, names="value")

display(movie_name_input, recommendation_list)

Text(value='', description='Movie title: ')

Output()