In [1]:
import pandas as pd

! curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  3  955k    3 32663    0     0  11924      0  0:01:22  0:00:02  0:01:20 11951
  9  955k    9 90003    0     0  24469      0  0:00:39  0:00:03  0:00:36 24510
 21  955k   21  201k    0     0  41006      0  0:00:23  0:00:05  0:00:18 41065
 29  955k   29  279k    0     0  50394      0  0:00:19  0:00:05  0:00:14 59721
 45  955k   45  432k    0     0  64457      0  0:00:15  0:00:06  0:00:09 86883
 54  955k   54  519k    0     0  69187      0  0:00:14  0:00:07  0:00:07   98k
 71  955k   71  684k    0     0  79986      0  0:00:12  0:00:08  0:00:04  117k
 80  955k   80  769k    0     0  79978      0  0:00

In [2]:
import zipfile
# Unpack every member of the downloaded archive into the current working directory.
with zipfile.ZipFile(file='ml-latest-small.zip', mode='r') as zip_ref:
    zip_ref.extractall()

In [3]:
# Load the movie catalogue (movieId, title, genres) from the extracted archive.
movies = pd.read_csv('ml-latest-small/movies.csv')
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
import re

def clean_title(title):
    """Strip everything except ASCII letters, digits and spaces from ``title``.

    Removed characters are dropped without a replacement, so the characters on
    either side of them end up adjacent (e.g. ``"Spider-Man"`` -> ``"SpiderMan"``).
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [5]:
# Add a punctuation-free copy of each title; the search index is built from this column.
movies['clean_title'] = movies['title'].map(clean_title)

In [6]:
# Re-display the frame to check the newly added clean_title column.
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Black Butler Book of the Atlantic 2017
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,No Game No Life Zero 2017
9739,193585,Flint (2017),Drama,Flint 2017
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Bungo Stray Dogs Dead Apple 2018


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1, 2): the vocabulary covers single words and adjacent word pairs.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Fit on the cleaned titles; row i of `tfidf` is the vector for row i of `movies`.
tfidf = vectorizer.fit_transform(movies['clean_title'])

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, n_results=5):
    """Return the movies whose cleaned titles are most similar to ``title``.

    The query is normalised with ``clean_title``, projected with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    n_results : int, default 5
        Maximum number of rows to return; clamped to the number of indexed
        titles.

    Returns
    -------
    pandas.DataFrame
        Rows of ``movies`` ordered from most to least similar, so
        ``.iloc[0]`` is the best match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    n_results = min(n_results, similarity.size)
    if n_results <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the last n_results positions hold the
    # largest scores; it does not order them, so rank that small slice explicitly.
    top = np.argpartition(similarity, -n_results)[-n_results:]
    top = top[np.argsort(-similarity[top], kind='stable')]
    return movies.iloc[top]

In [9]:
import ipywidgets as widgets
from IPython.display import display

# Free-text box for the title query, pre-filled with an example.
movie_input = widgets.Text(
    description='Movie Title',
    value='Toy Story',
    disabled=False,
)

# Output area that is refreshed with the latest search hits.
movie_list = widgets.Output()


def on_type(data):
    """Refresh ``movie_list`` with search hits for the new text box value."""
    with movie_list:
        movie_list.clear_output()
        query = data['new']
        if len(query) <= 5:
            # Queries of five characters or fewer are not searched.
            return
        display(search(query))


# React only to changes of the widget's ``value`` trait.
movie_input.observe(on_type, names='value')
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title')

Output()

In [10]:
# Load the user ratings (userId, movieId, rating, timestamp) from the extracted archive.
ratings = pd.read_csv('ml-latest-small/ratings.csv')
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [11]:
# movieId of the example movie used in the step-by-step cells below
# (1 is "Toy Story (1995)" in movies.csv).
movie_id = 1

In [12]:
# Users who rated the chosen movie above 4; they are treated as having taste similar to ours.
similar_users = ratings.loc[
    ratings['movieId'].eq(movie_id) & ratings['rating'].gt(4),
    'userId',
].unique()
similar_users

array([  7,  17,  31,  40,  43,  46,  57,  63,  71,  73,  96,  98, 145,
       151, 159, 166, 169, 171, 177, 201, 206, 220, 229, 234, 240, 247,
       252, 254, 269, 270, 273, 275, 280, 282, 288, 304, 328, 341, 347,
       353, 357, 364, 367, 378, 380, 382, 389, 396, 411, 438, 448, 451,
       453, 456, 460, 471, 484, 488, 533, 559, 562, 573, 584, 587, 610])

In [13]:
# Every movie those similar users rated above 4 (one entry per rating, so popular movies repeat).
similar_user_recs = ratings.loc[
    ratings['userId'].isin(similar_users) & ratings['rating'].gt(4),
    'movieId',
]
similar_user_recs

874            1
875           50
877          150
879          260
880          356
           ...  
100821    160527
100829    164179
100832    168248
100833    168250
100834    168252
Name: movieId, Length: 3754, dtype: int64

In [14]:
# Share of the similar users who rated each movie above 4.
# Derived from `ratings` rather than from the current value of `similar_user_recs`,
# so re-running this cell gives the same result instead of counting the shares
# produced by its own previous run.
similar_user_recs = (
    ratings[(ratings['userId'].isin(similar_users)) & (ratings['rating'] > 4)]['movieId']
    .value_counts()
    / len(similar_users)
)
# Keep only movies liked by more than 10% of the similar users.
similar_user_recs = similar_user_recs[similar_user_recs > .1]
similar_user_recs

movieId
1        1.000000
318      0.430769
296      0.400000
356      0.384615
593      0.369231
           ...   
8368     0.107692
1097     0.107692
74458    0.107692
1219     0.107692
733      0.107692
Name: count, Length: 103, dtype: float64

In [15]:
# Ratings above 4, from any user, for the candidate movies selected above.
all_users = ratings.loc[
    ratings['movieId'].isin(similar_user_recs.index) & ratings['rating'].gt(4)
]
all_users

Unnamed: 0,userId,movieId,rating,timestamp
3,1,47,5.0,964983815
4,1,50,5.0,964982931
15,1,260,5.0,964981680
25,1,457,5.0,964981909
28,1,527,5.0,964984002
...,...,...,...,...
100227,610,51255,5.0,1479542571
100310,610,58559,4.5,1493844688
100326,610,60069,4.5,1493844866
100429,610,74458,4.5,1479542157


In [16]:
# Share of the users present in `all_users` who rated each candidate movie above 4.
all_user_recs = all_users['movieId'].value_counts().div(
    all_users['userId'].nunique(dropna=False)
)
all_user_recs

movieId
318      0.362007
296      0.299283
356      0.277778
2571     0.268817
2959     0.232975
           ...   
8636     0.044803
899      0.039427
733      0.037634
78499    0.037634
500      0.030466
Name: count, Length: 103, dtype: float64

In [17]:
# Put the two shares side by side, aligned on movieId.
rec_percentages = pd.concat(
    [similar_user_recs.rename('similar'), all_user_recs.rename('all')],
    axis=1,
)
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.116487
318,0.430769,0.362007
296,0.400000,0.299283
356,0.384615,0.277778
593,0.369231,0.229391
...,...,...
8368,0.107692,0.055556
1097,0.107692,0.069892
74458,0.107692,0.060932
1219,0.107692,0.062724


In [18]:
# Score = how much more often similar users liked the movie than users overall.
rec_percentages['score'] = rec_percentages['similar'].div(rec_percentages['all'])
rec_percentages = rec_percentages.sort_values(by='score', ascending=False)
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.116487,8.584615
3114,0.307692,0.059140,5.202797
78499,0.184615,0.037634,4.905495
500,0.138462,0.030466,4.544796
8961,0.184615,0.057348,3.219231
...,...,...,...
110,0.184615,0.181004,1.019954
4973,0.107692,0.105735,1.018514
79132,0.123077,0.123656,0.995318
2571,0.261538,0.268817,0.972923


In [19]:
# Attach title and genres to the ten highest-scoring movies.
pd.merge(rec_percentages.head(10), movies, left_index=True, right_on='movieId')

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.116487,8.584615,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
2355,0.307692,0.05914,5.202797,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
7355,0.184615,0.037634,4.905495,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
436,0.138462,0.030466,4.544796,500,Mrs. Doubtfire (1993),Comedy|Drama,Mrs Doubtfire 1993
5374,0.184615,0.057348,3.219231,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
32,0.169231,0.053763,3.147692,34,Babe (1995),Children|Drama,Babe 1995
2038,0.169231,0.053763,3.147692,2716,Ghostbusters (a.k.a. Ghost Busters) (1984),Action|Comedy|Sci-Fi,Ghostbusters aka Ghost Busters 1984
506,0.246154,0.082437,2.985953,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
592,0.107692,0.037634,2.861538,733,"Rock, The (1996)",Action|Adventure|Thriller,Rock The 1996
5260,0.123077,0.044803,2.747077,8636,Spider-Man 2 (2004),Action|Adventure|Sci-Fi|IMAX,SpiderMan 2 2004


In [20]:
def find_similar_movies(movie_id, rating_threshold=4, min_share=.1, top_n=10,
                        ratings_df=None, movies_df=None):
    """Recommend movies liked disproportionately by fans of ``movie_id``.

    Users who rated ``movie_id`` above ``rating_threshold`` are the "similar"
    users. Each movie they also rated above the threshold gets a score: the
    share of similar users who liked it, divided by the share of users who
    liked it among everyone who liked any of the candidate movies.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    rating_threshold : float, default 4
        A rating counts as a "like" only when strictly greater than this.
    min_share : float, default 0.1
        Keep a candidate only if strictly more than this fraction of the
        similar users liked it.
    top_n : int, default 10
        Maximum number of recommendations to return.
    ratings_df : pandas.DataFrame, optional
        Frame with ``userId``, ``movieId`` and ``rating`` columns. Defaults to
        the notebook-level ``ratings``.
    movies_df : pandas.DataFrame, optional
        Frame with ``movieId``, ``title`` and ``genres`` columns. Defaults to
        the notebook-level ``movies``.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
        Empty when nobody rated ``movie_id`` above the threshold.
    """
    if ratings_df is None:
        ratings_df = ratings
    if movies_df is None:
        movies_df = movies

    # Every step below only looks at "liked" ratings, so filter once.
    liked = ratings_df[ratings_df['rating'] > rating_threshold]

    similar_users = liked[liked['movieId'] == movie_id]['userId'].unique()
    if len(similar_users) == 0:
        # No fans to learn from; return early rather than divide by a zero user count.
        return pd.DataFrame(columns=['score', 'title', 'genres'])

    similar_user_recs = liked[liked['userId'].isin(similar_users)]['movieId']
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    all_users = liked[liked['movieId'].isin(similar_user_recs.index)]
    all_user_recs = all_users['movieId'].value_counts() / len(all_users['userId'].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ['similar', 'all']

    rec_percentages['score'] = rec_percentages['similar'] / rec_percentages['all']

    rec_percentages = rec_percentages.sort_values('score', ascending=False)
    return rec_percentages.head(top_n).merge(movies_df, left_index=True, right_on='movieId')[['score', 'title', 'genres']]

In [21]:
# Text box for the title to base recommendations on, pre-filled with an example.
# The keyword is `value`; the previous `values=` left the box empty.
movie_input_name = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)

# Output area that is refreshed with the latest recommendations.
recommendation_list = widgets.Output()

def on_recommendation_type(data):
    """Show recommendations for the first search hit of the typed title.

    Has its own name so it does not rebind `on_type`, the handler defined
    for the search box earlier in the notebook.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data['new']
        if len(title) > 5:
            results = search(title)
            if results.empty:
                # Nothing to base recommendations on.
                return
            # The first row returned by search() is used as the selected movie.
            movie_id = results.iloc[0]['movieId']
            display(find_similar_movies(movie_id))

# React only to changes of the widget's `value` trait.
movie_input_name.observe(on_recommendation_type, names='value')
display(movie_input_name, recommendation_list)

Text(value='', description='Movie Title:')

Output()