In [None]:
import pandas as pd


In [5]:
# Load the movie catalogue; later cells read its movieId, title and genres columns.
movies = pd.read_csv("/content/movies.csv")

In [6]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [8]:
import re
def clean_title(title):
    """Return ``title`` with every character removed that is not an ASCII
    letter, a digit or a space.

    Spaces are kept exactly as they are, so dropping punctuation that sat
    between two spaces leaves both spaces in the result.
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [9]:
 movies["clean_title"] = movies["title"].apply(clean_title)

In [10]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF index over the cleaned titles; ngram_range=(1,2) makes both single
# words and adjacent word pairs features.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Fit the vocabulary and transform every title in one pass; `search` compares
# queries against this matrix.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [43]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
    """Return the catalogue rows whose titles best match ``title``.

    The query is normalised with ``clean_title``, projected into the fitted
    TF-IDF space and compared with every indexed title by cosine similarity.

    Args:
        title: Free-text movie title to look up.
        top_n: Maximum number of matches to return (default 5).

    Returns:
        Up to ``top_n`` rows of ``movies``, ordered from most to least
        similar, so ``.iloc[0]`` is the best match.
    """
    query_vec = vectorizer.transform([clean_title(title)])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never request more rows than were indexed: argpartition raises when the
    # partition position is out of bounds.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the k largest scores end up in the
    # last k slots; their relative order is unspecified. Rank them explicitly
    # so the first returned row really is the closest title.
    top = np.argpartition(similarity, -k)[-k:]
    ranked = top[np.argsort(-similarity[top], kind="stable")]
    return movies.iloc[ranked]

In [44]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the title query, pre-filled with an example title.
movie_input = widgets.Text(value="Toy Story", description="Movie Title:", disabled=False)

# Output area that the on_type callback renders search results into.
movie_list = widgets.Output()

def on_type(data):
    """Refresh ``movie_list`` with search results for the newly typed title.

    ``data`` is the change notification delivered by ``observe``; only its
    ``"new"`` entry is read. The output area is always cleared first, and
    queries of five characters or fewer stop there without searching.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        display(search(query))

# Invoke on_type every time the text box's value changes.
movie_input.observe(on_type, names='value')

# Show the text box followed by the area the results are written into.
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [45]:
# Load the user ratings; later cells read its userId, movieId and rating columns.
ratings = pd.read_csv("/content/ratings.csv")

In [46]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296.0,5.0,1.147880e+09
1,1,306.0,3.5,1.147869e+09
2,1,307.0,5.0,1.147869e+09
3,1,665.0,5.0,1.147879e+09
4,1,899.0,3.5,1.147869e+09
...,...,...,...,...
85373,647,9010.0,2.5,1.330432e+09
85374,647,27402.0,4.0,1.506807e+09
85375,647,27660.0,3.0,1.456428e+09
85376,647,27904.0,3.5,1.509057e+09


In [48]:
ratings.dtypes

userId         int64
movieId      float64
rating       float64
timestamp    float64
dtype: object

In [77]:
# Example movie for the step-by-step walkthrough below (movieId 1 is Toy Story in `movies`).
movie_id = 1

In [78]:
# Users who gave the chosen movie a rating of at least 5.
# NOTE(review): find_similar_movies selects these users with "> 4" instead of
# ">= 5" — confirm which threshold is intended.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] >= 5),
    "userId",
].unique()

In [79]:
similar_users

array([ 36,  75,  86,  90,  93,  95,  96,  98, 120, 127, 143, 152, 158,
       162, 186, 188, 211, 229, 230, 249, 259, 297, 298, 302, 329, 355,
       359, 369, 371, 381, 392, 428, 435, 447, 468, 477, 484, 513, 537,
       540, 541, 551, 553, 561, 582, 609, 611, 623, 624, 631, 644])

In [80]:
# movieId of every rating above 4 given by one of the similar users.
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]

In [81]:
similar_user_recs

5101       1.0
5105      34.0
5111     110.0
5114     150.0
5127     260.0
         ...  
85171    356.0
85173    380.0
85182    588.0
85183    589.0
85186    593.0
Name: movieId, Length: 2784, dtype: float64

In [82]:
# Turn the raw counts into the fraction of similar users who liked each movie...
similar_user_recs = similar_user_recs.value_counts().div(len(similar_users))
# ...and keep only the movies liked by more than 10% of them.
similar_user_recs = similar_user_recs.loc[similar_user_recs > .1]

In [83]:
similar_user_recs

movieId
1.0       1.000000
318.0     0.490196
356.0     0.352941
260.0     0.333333
527.0     0.313725
            ...   
2396.0    0.117647
1036.0    0.117647
4226.0    0.117647
2324.0    0.117647
1201.0    0.117647
Name: count, Length: 69, dtype: float64

In [84]:
# Ratings above 4, from any user, for the candidate movies kept above.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]

In [85]:
# Fraction of those users (anyone who liked at least one candidate) that liked each candidate.
all_user_recs = all_users["movieId"].value_counts().div(len(all_users["userId"].unique()))

In [86]:
all_user_recs

movieId
318.0     0.339559
296.0     0.263158
527.0     0.251273
593.0     0.234295
2571.0    0.230900
            ...   
733.0     0.047538
5989.0    0.047538
2396.0    0.045840
500.0     0.042445
3114.0    0.040747
Name: count, Length: 69, dtype: float64

In [87]:
# Put both popularity measures side by side, aligned on movieId.
rec_percentages = pd.concat(
    [similar_user_recs, all_user_recs], axis=1
).set_axis(["similar", "all"], axis=1)

In [88]:
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,1.000000,0.120543
318.0,0.490196,0.339559
356.0,0.352941,0.227504
260.0,0.333333,0.208829
527.0,0.313725,0.251273
...,...,...
2396.0,0.117647,0.045840
1036.0,0.117647,0.083192
4226.0,0.117647,0.113752
2324.0,0.117647,0.083192


In [89]:
# Score = how much more popular a movie is among similar users than among everyone.
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["all"])

In [90]:
# Highest score first.
rec_percentages = rec_percentages.sort_values(by="score", ascending=False)

In [91]:
rec_percentages

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,1.000000,0.120543,8.295775
3114.0,0.196078,0.040747,4.812092
1073.0,0.196078,0.047538,4.124650
34.0,0.176471,0.054329,3.248162
588.0,0.235294,0.074703,3.149733
...,...,...,...
2858.0,0.156863,0.154499,1.015298
1221.0,0.117647,0.122241,0.962418
858.0,0.176471,0.183362,0.962418
4993.0,0.137255,0.152801,0.898257


In [92]:
# Attach title and genres to the ten best-scoring movieIds.
pd.merge(rec_percentages.head(10), movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.120543,8.295775,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.196078,0.040747,4.812092,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
1047,0.196078,0.047538,4.12465,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
33,0.176471,0.054329,3.248162,34,Babe (1995),Children|Drama,Babe 1995
580,0.235294,0.074703,3.149733,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
587,0.235294,0.076401,3.079739,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
718,0.137255,0.047538,2.887255,733,"Rock, The (1996)",Action|Adventure|Thriller,Rock The 1996
495,0.117647,0.042445,2.771765,500,Mrs. Doubtfire (1993),Comedy|Drama,Mrs Doubtfire 1993
2025,0.137255,0.052632,2.607843,2115,Indiana Jones and the Temple of Doom (1984),Action|Adventure|Fantasy,Indiana Jones and the Temple of Doom 1984
2826,0.156863,0.061121,2.566449,2918,Ferris Bueller's Day Off (1986),Comedy,Ferris Buellers Day Off 1986


In [95]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10,
                        *, ratings_df=None, movies_df=None):
    """Recommend movies favoured by the users who liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``min_rating``. Users who liked ``movie_id`` are the "similar users".
    Each movie they liked gets two shares: the fraction of similar users who
    liked it, and the fraction of all users (who liked at least one candidate)
    who liked it. The score is the first share divided by the second, so
    movies that are merely popular with everyone are pushed down.

    Args:
        movie_id: Identifier of the movie to find recommendations for.
        min_rating: A rating must exceed this value to count as a like.
        min_share: Candidates must be liked by more than this fraction of the
            similar users.
        top_n: Maximum number of recommendations returned.
        ratings_df: Ratings table with ``userId``, ``movieId`` and ``rating``
            columns; defaults to the notebook-level ``ratings``.
        movies_df: Movie table with ``movieId``, ``title`` and ``genres``
            columns; defaults to the notebook-level ``movies``.

    Returns:
        DataFrame with columns ``score``, ``title`` and ``genres``, best score
        first. It is empty when nobody liked ``movie_id``.
    """
    if ratings_df is None:
        ratings_df = ratings
    if movies_df is None:
        movies_df = movies

    # Every later step only looks at liked ratings, so filter once.
    liked = ratings_df[ratings_df["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nothing to base a recommendation on; also avoids dividing by zero below.
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Fraction of similar users who liked each movie, keeping the common ones.
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Fraction of all users (who liked any candidate) that liked each candidate.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    top = rec_percentages.head(top_n)
    return top.merge(movies_df, left_index=True, right_on="movieId")[["score", "title", "genres"]]

In [99]:
# Text box for the title to base recommendations on, pre-filled with an example title.
movie_name_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)

# Output area that the on_type callback renders recommendations into.
recommendation_list = widgets.Output()

def on_type(data):
    """Refresh ``recommendation_list`` for the newly typed title.

    ``data`` is the change notification delivered by ``observe``; only its
    ``"new"`` entry is read. The output area is always cleared first, and
    queries of five characters or fewer stop there. Otherwise the first row
    returned by ``search`` supplies the movieId passed to
    ``find_similar_movies``, whose result is displayed.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        if len(query) <= 5:
            return
        first_hit = search(query).iloc[0]
        display(find_similar_movies(first_hit["movieId"]))

# Invoke on_type every time the text box's value changes.
movie_name_input.observe(on_type, names='value')

# Show the text box followed by the area the recommendations are written into.
display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()