In [None]:
import pandas as pd
import numpy as np
# Read the two input tables from CSV files in the current working directory.
movies = pd.read_csv(filepath_or_buffer="movies.csv")    # movieId, title, genres
ratings = pd.read_csv(filepath_or_buffer="ratings.csv")  # userId, movieId, rating, timestamp

In [2]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
import re
def clean(title):
    """Return ``title`` with every character other than ASCII letters, digits and spaces removed.

    Removed characters are deleted, not replaced, so text around a stripped
    symbol keeps its original spaces (e.g. "A & B" becomes "A  B").
    """
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [4]:
# Add a normalised copy of each title; this is the text the search index is built on.
movies["clean_title"] = movies["title"].map(clean)

In [5]:
# Preview the table again to check the new clean_title column.
movies.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


In [6]:
# (rows, columns) of the movies table, clean_title included.
movies.shape

(10329, 4)

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over single words and two-word phrases (ngram_range=(1, 2)).
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Learn the vocabulary from the cleaned titles and encode them;
# tfidf has one sparse row per row of movies.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
# Encode a sample query with the already-fitted vectorizer.
title = clean("Harry Potter")
query = vectorizer.transform([title])

In [9]:
# Inspect the sparse query vector: one row, one column per vocabulary term.
query

<1x34566 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [10]:
# Score every movie title against a second sample query.
title = clean("Men 1995")
query = vectorizer.transform([title])
# cosine_similarity returns a 1 x n_movies matrix; flatten it to a 1-D score array.
similarity = cosine_similarity(query, tfidf).flatten()

In [11]:
# One cosine-similarity score per row of movies, in row order.
similarity

array([0.10114346, 0.12230767, 0.56952989, ..., 0.        , 0.        ,
       0.        ])

In [12]:
# Rank the catalogue against a sample query and keep the five best matches.
title = "Toy Story 1995"
title = clean(title)
query = vectorizer.transform([title])
similarity = cosine_similarity(query, tfidf).flatten()
# argpartition only guarantees that the last five positions hold the five
# largest scores, not that they are ordered among themselves. Sort them
# explicitly (ascending) so that reversing really puts the best match first.
indices = np.argpartition(similarity, -5)[-5:]
indices = indices[np.argsort(similarity[indices])]
results = movies.iloc[indices].iloc[::-1]

In [13]:
# Positional row indices (into movies) of the five highest-scoring titles.
indices

array([4403, 3838, 2496, 8599,    0], dtype=int64)

In [14]:
# The five selected movies, in reverse order of `indices`.
results

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
8599,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
2496,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
3838,4929,"Toy, The (1982)",Comedy,Toy The 1982
4403,5843,Toy Soldiers (1991),Action|Drama,Toy Soldiers 1991


In [15]:
# Re-inspect the scores, now computed for the "Toy Story 1995" query.
similarity

array([1.        , 0.08720778, 0.05760422, ..., 0.        , 0.        ,
       0.        ])

In [16]:
def search(title, top_n=5):
    """Return the ``top_n`` movies whose titles best match ``title``, best match first.

    The query is normalised with ``clean``, encoded with the fitted
    ``vectorizer`` and compared against ``tfidf`` by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text title to look up.
    top_n : int, default 5
        Maximum number of rows to return; capped at the number of movies.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered by decreasing similarity, so ``.iloc[0]``
        is the closest match. Empty when ``top_n`` is not positive or there
        are no movies.
    """
    title = clean(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises otherwise.
    top_n = min(top_n, len(similarity))
    if top_n <= 0:
        return movies.iloc[:0]

    # argpartition isolates the top_n scores cheaply but leaves them in
    # arbitrary order, so rank that small candidate set explicitly.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movies.iloc[ranked]

In [17]:
import ipywidgets as widgets
from IPython.display import display

# Free-text search box plus an output area that shows the matching titles.
movie_input = widgets.Text(
    description='Movie Title:',
    value='The Hulk',
    disabled=False
)
movie_list = widgets.Output()

def on_type(data):
    """Refresh ``movie_list`` with search results for the newly typed text."""
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        # Inputs of 5 characters or fewer are ignored; the output stays cleared.
        if len(typed) <= 5:
            return
        display(search(typed))

# Call on_type whenever the text box value changes.
movie_input.observe(on_type, names='value')


display(movie_input, movie_list)

Text(value='The Hulk', description='Movie Title:')

Output()

In [18]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [19]:
# (rows, columns) of the ratings table.
ratings.shape

(105339, 4)

In [20]:
# Column dtypes of the ratings table.
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [21]:
# Movie used for the walkthrough below; movieId 1 is "Toy Story (1995)" in the movies preview.
movie_id = 1

In [22]:
# Users who rated the chosen movie above 4, i.e. who clearly liked it.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4),
    "userId",
].unique()

In [23]:
# Ids of the users who rated the chosen movie above 4.
similar_users

array([  2,   8,  17,  30,  38,  71,  72,  88,  96, 108, 109, 116, 122,
       147, 151, 156, 158, 165, 171, 187, 198, 213, 224, 278, 282, 286,
       289, 299, 303, 328, 335, 339, 347, 350, 387, 393, 399, 405, 409,
       432, 439, 440, 454, 455, 460, 462, 471, 484, 511, 531, 552, 555,
       560, 561, 571, 572, 575, 580, 589, 597, 627, 632, 637, 648, 650,
       662], dtype=int64)

In [24]:
# Every movie those similar users rated above 4 (one entry per user/movie pair).
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]

In [25]:
# movieId of each above-4 rating given by a similar user.
similar_user_recs

113         1
117        17
120        36
123        62
130       608
         ... 
99024    2747
99027    2804
99028    2819
99029    2918
99033    3168
Name: movieId, Length: 5263, dtype: int64

In [26]:
# How many similar users liked each movie, most-liked first.
similar_user_recs.value_counts()

1        66
260      31
318      30
1210     29
1198     28
         ..
27727     1
46970     1
50011     1
6586      1
2067      1
Name: movieId, Length: 1861, dtype: int64

In [27]:
# Turn the raw movie ids into "share of similar users who liked it" and keep
# only movies liked by more than 10% of them.
# NOTE: this overwrites similar_user_recs, so the cell is not idempotent;
# re-run the cell that builds similar_user_recs before re-running this one.
similar_user_recs = (
    similar_user_recs.value_counts()
    .div(len(similar_users))
    .loc[lambda share: share > .1]
)

In [28]:
# Share of similar users who liked each movie, restricted to shares above 0.1.
similar_user_recs

1       1.000000
260     0.469697
318     0.454545
1210    0.439394
1198    0.424242
          ...   
1954    0.106061
17      0.106061
2542    0.106061
1259    0.106061
2174    0.106061
Name: movieId, Length: 166, dtype: float64

In [29]:
# All above-4 ratings (from any user) of the candidate movies found above.
total_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]

In [30]:
# Rating rows (rating above 4) for the candidate movies, across all users.
total_users

Unnamed: 0,userId,movieId,rating,timestamp
12,1,260,4.5,1217895864
23,1,527,4.5,1217896341
27,1,593,5.0,1217895932
35,1,858,5.0,1217896428
36,1,912,5.0,1217897623
...,...,...,...,...
100354,668,1617,4.5,1134431064
100671,668,2396,4.5,1137826734
101315,668,3996,5.0,1215019197
103470,668,48516,5.0,1173424620


In [32]:
# For each candidate movie: the fraction of users in total_users who liked it.
total_users_recs = total_users["movieId"].value_counts().div(
    total_users["userId"].nunique()
)

In [33]:
# Share of users in total_users who liked each candidate movie.
total_users_recs

318     0.315705
296     0.266026
2571    0.245192
356     0.240385
527     0.224359
          ...   
2174    0.025641
1285    0.024038
2355    0.022436
736     0.020833
3033    0.017628
Name: movieId, Length: 166, dtype: float64

In [34]:
# Put both shares side by side, aligned on movieId.
rec_percentages = pd.concat(
    [similar_user_recs, total_users_recs], axis=1
).set_axis(["similar", "total"], axis="columns")

In [35]:
# Per-movie shares: "similar" (among similar users) vs "total" (among all users in total_users).
rec_percentages

Unnamed: 0,similar,total
1,1.000000,0.105769
260,0.469697,0.216346
318,0.454545,0.315705
1210,0.439394,0.145833
1198,0.424242,0.165064
...,...,...
1954,0.106061,0.035256
17,0.106061,0.051282
2542,0.106061,0.044872
1259,0.106061,0.060897


In [36]:
# Score = how much more often similar users liked a movie than users in general.
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["total"])

In [37]:
# Highest score first.
rec_percentages = rec_percentages.sort_values(by="score", ascending=False)

In [39]:
# The ten highest-scoring candidate movies (index is movieId).
rec_percentages.head(10)

Unnamed: 0,similar,total,score
1,1.0,0.105769,9.454545
2355,0.136364,0.022436,6.077922
3033,0.106061,0.017628,6.016529
1223,0.181818,0.030449,5.971292
3114,0.30303,0.051282,5.909091
1517,0.166667,0.030449,5.473684
745,0.227273,0.041667,5.454545
736,0.106061,0.020833,5.090909
78499,0.151515,0.030449,4.976077
1407,0.121212,0.025641,4.727273


In [40]:
# Attach title and genres to the ten best candidates (the index of rec_percentages is movieId).
pd.merge(rec_percentages.head(10), movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,total,score,movieId,title,genres,clean_title
0,1.0,0.105769,9.454545,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1866,0.136364,0.022436,6.077922,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
2423,0.106061,0.017628,6.016529,3033,Spaceballs (1987),Comedy|Sci-Fi,Spaceballs 1987
984,0.181818,0.030449,5.971292,1223,"Grand Day Out with Wallace and Gromit, A (1989)",Adventure|Animation|Children|Comedy|Sci-Fi,Grand Day Out with Wallace and Gromit A 1989
2496,0.30303,0.051282,5.909091,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
1225,0.166667,0.030449,5.473684,1517,Austin Powers: International Man of Mystery (1...,Action|Adventure|Comedy,Austin Powers International Man of Mystery 1997
626,0.227273,0.041667,5.454545,745,Wallace & Gromit: A Close Shave (1995),Animation|Children|Comedy,Wallace Gromit A Close Shave 1995
621,0.106061,0.020833,5.090909,736,Twister (1996),Action|Adventure|Romance|Thriller,Twister 1996
8599,0.151515,0.030449,4.976077,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
1143,0.121212,0.025641,4.727273,1407,Scream (1996),Comedy|Horror|Mystery|Thriller,Scream 1996


In [41]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies liked disproportionately by users who liked ``movie_id``.

    A user "likes" a movie when their rating is strictly above ``min_rating``.
    Users who like ``movie_id`` are the *similar users*. Each candidate movie
    is scored as

        (share of similar users who like it) / (share of all users who like it)

    where "all users" are the users who like at least one candidate movie.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base recommendations on.
    min_rating : float, default 4
        A rating must be strictly greater than this to count as a like.
    min_share : float, default 0.10
        Candidates must be liked by strictly more than this share of the
        similar users.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
        Empty when no user likes ``movie_id``.
    """
    # Filter the ratings once instead of rebuilding the same mask three times.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie: return an empty result instead of dividing by zero.
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Share of similar users who liked each movie, keeping only common picks.
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Share of all users (who liked any candidate) who liked each candidate.
    total_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    total_user_recs = total_users["movieId"].value_counts() / total_users["userId"].nunique()

    rec_percentages = pd.concat([similar_user_recs, total_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [42]:
# Text box that drives the recommender, plus an output area for the results.
movie_name_input = widgets.Text(
    description='Movie Title:',
    value='Toy Story',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    """Show recommendations based on the first search hit for the newly typed text."""
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data["new"]
        # Inputs of 5 characters or fewer are ignored; the output stays cleared.
        if len(typed) <= 5:
            return
        first_hit = search(typed).iloc[0]
        display(find_similar_movies(first_hit["movieId"]))

# Call on_type whenever the text box value changes.
movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()