In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the movie metadata CSV into a DataFrame and print its plain-text repr.
# NOTE(review): this uses an absolute "/content/" path while ratings.csv is
# read through a relative path further down — confirm where the data files live.
movies=pd.read_csv("/content/movies.csv")
print(movies)


       movieId                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
62418   209157                           We (2018)   
62419   209159           Window of the Soul (2001)   
62420   209163                    Bad Poems (2018)   
62421   209169                 A Girl Thing (2001)   
62422   209171      Women of Devil's Island (1962)   

                                            genres  
0      Adventure|Animation|Children|Comedy|Fantasy  
1                       Adventure|Children|Fantasy  
2                                   Comedy|Romance  
3                             Comedy|Drama|Romance  
4                                           Comedy  
...                              

In [None]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
import re
def clean_title(title):
  """Return `title` with everything except ASCII letters, digits and spaces removed.

  Spaces are deliberately kept: the cleaned titles are fed to a word-based
  TF-IDF vectorizer, and stripping the spaces would collapse each title into
  a single token (e.g. "ToyStory1995") that a query such as "Toy Story"
  can never match.

  Args:
    title: The raw title string, e.g. "Toy Story (1995)".

  Returns:
    The cleaned string, e.g. "Toy Story 1995".
  """
  return re.sub("[^a-zA-Z0-9 ]", "", title)

In [None]:
# Add a "clean_title" column by running clean_title over every title,
# then display the resulting frame.
movies["clean_title"] = movies["title"].apply(clean_title)
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,ToyStory1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji1995
2,3,Grumpier Old Men (1995),Comedy|Romance,GrumpierOldMen1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,WaitingtoExhale1995
4,5,Father of the Bride Part II (1995),Comedy,FatheroftheBridePartII1995
...,...,...,...,...
62418,209157,We (2018),Drama,We2018
62419,209159,Window of the Soul (2001),Documentary,WindowoftheSoul2001
62420,209163,Bad Poems (2018),Comedy|Drama,BadPoems2018
62421,209169,A Girl Thing (2001),(no genres listed),AGirlThing2001


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF model over the cleaned titles; ngram_range=(1,2) requests both
# single-token and two-token n-grams.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Sparse TF-IDF matrix with one row per movie, in the same row order as `movies`.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
def search(title):
    """Return the five movies whose cleaned titles best match `title`.

    Uses the module-level `vectorizer` / `tfidf` pair fitted on
    movies["clean_title"] and the module-level `movies` frame.

    Args:
        title: Free-text title typed by the user.

    Returns:
        A five-row slice of `movies`, ordered from most to least similar.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    # argpartition only guarantees that the last five positions hold the five
    # largest scores; it does not order them, so rank them explicitly.
    indices = np.argpartition(similarity, -5)[-5:]
    indices = indices[np.argsort(similarity[indices])[::-1]]
    return movies.iloc[indices]


# Example query; `results` is displayed by the next cell.
results = search("Toy Story ")

In [None]:
# Display the top matches computed in the previous cell.
results

Unnamed: 0,movieId,title,genres,clean_title
62422,209171,Women of Devil's Island (1962),Action|Adventure|Drama,WomenofDevilsIsland1962
20808,107573,Apnea (Apnoia) (2010),Drama,ApneaApnoia2010
20806,107563,"Princess for Christmas, A (2011)",Children|Comedy,PrincessforChristmasA2011
20804,107557,Fun Size (2012),Comedy,FunSize2012
20807,107565,"Fuck You, Goethe (Fack Ju Göhte) (2013)",Comedy,FuckYouGoetheFackJuGhte2013


In [None]:

import ipywidgets as widgets
from IPython.display import display
# Text box for the movie title plus an output area for the matching rows.
movie_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
movie_list = widgets.Output()


def on_type(data):
    """Refresh `movie_list` with search results whenever the text box changes.

    `data` is the change notification passed by `observe`; its "new" entry
    is the current text. Titles of five characters or fewer only clear the
    output.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        display(search(typed))


movie_input.observe(on_type, names='value')
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [None]:

# Pick one movie by its movieId and select its row from the movies table.
# NOTE(review): `movie` is not referenced by any later cell shown here.
movie_id = 89745
movie = movies[movies["movieId"] == movie_id]

In [None]:

# Load the user ratings table; later cells use its userId, movieId and rating columns.
ratings = pd.read_csv("ratings.csv")

In [None]:
# Inspect the column dtypes of the ratings table.
ratings.dtypes


userId         int64
movieId        int64
rating       float64
timestamp    float64
dtype: object

In [None]:
# Users who rated the chosen movie above 4 — treated as the "similar" users.
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()

In [None]:
# movieId of every rating above 4 given by those similar users (one entry per rating row).
similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

In [None]:

# Turn the raw movieId list into, per movie, the share of similar users who liked it.
# NOTE(review): this cell rebinds its own input, so re-running it without first
# re-running the previous cell would apply value_counts to the shares themselves.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

# Keep only movies liked by more than 10% of the similar users.
similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [None]:
# Every rating above 4, from any user, for one of the candidate movies.
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]
# Per candidate movie: number of such ratings divided by the number of distinct users in `all_users`.
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [None]:

# Place the two share series side by side, aligned on their shared movieId index.
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [None]:
# Display the similar-vs-all share table.
rec_percentages


Unnamed: 0,similar,all
89745,1.000000,0.039113
58559,0.575893,0.139689
59315,0.486607,0.050288
79132,0.477679,0.126069
2571,0.437500,0.244805
...,...,...
166461,0.102679,0.010651
76251,0.102679,0.016064
4878,0.102679,0.070368
85414,0.102679,0.014493


In [None]:

# Score = share among similar users divided by share among all users, so a high
# score marks a movie that is disproportionately liked by the similar users.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
# Highest scores first.
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [None]:

# Attach title and genre columns to the ten highest-scoring movies by joining
# the movieId index against movies["movieId"].
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
17067,1.0,0.039113,25.566964,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,AvengersThe2012
25058,0.241071,0.01327,18.166001,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi,AvengersAgeofUltron2015
19678,0.209821,0.011699,17.935035,102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX,IronMan32013
16312,0.15625,0.008731,17.896875,86332,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX,Thor2011
25061,0.129464,0.007334,17.65338,122900,Ant-Man (2015),Action|Adventure|Sci-Fi,AntMan2015
16725,0.178571,0.010127,17.632389,88140,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War,CaptainAmericaTheFirstAvenger2011
21348,0.28125,0.016239,17.319556,110102,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX,CaptainAmericaTheWinterSoldier2014
25071,0.232143,0.014493,16.017857,122920,Captain America: Civil War (2016),Action|Sci-Fi|Thriller,CaptainAmericaCivilWar2016
21606,0.241071,0.016413,14.687405,111362,X-Men: Days of Future Past (2014),Action|Adventure|Sci-Fi,XMenDaysofFuturePast2014
25074,0.125,0.008905,14.036765,122926,Untitled Spider-Man Reboot (2017),Action|Adventure|Fantasy,UntitledSpiderManReboot2017


In [None]:
def find_similar_movies(movie_id, rating_threshold=4, min_similar_share=.10, top_n=10):
    """Recommend movies that are disproportionately liked by fans of `movie_id`.

    Reads the module-level `ratings` and `movies` DataFrames.

    The computation is:
      1. "Similar users" are the users who rated `movie_id` above
         `rating_threshold`.
      2. For every movie, take the share of similar users who also rated it
         above the threshold, and keep the movies whose share exceeds
         `min_similar_share`.
      3. For those candidates, take the share among all users who rated at
         least one candidate above the threshold.
      4. score = similar share / overall share.

    Args:
        movie_id: The movieId to find recommendations for.
        rating_threshold: A rating counts as a "like" when it is strictly
            greater than this value.
        min_similar_share: Minimum share (exclusive) of similar users that
            must like a movie for it to become a candidate.
        top_n: Maximum number of rows to return.

    Returns:
        A DataFrame with columns "score", "title" and "genres", ordered by
        descending score. It is empty when no user rated `movie_id` above
        the threshold.
    """
    columns = ["score", "title", "genres"]

    # Every step below only looks at ratings above the threshold, so filter once.
    liked = ratings[ratings["rating"] > rating_threshold]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # Nobody liked this movie: nothing to recommend, and returning here
        # avoids dividing by zero below.
        return pd.DataFrame(columns=columns)

    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_similar_share]

    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    # concat aligns the two series on their movieId index.
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[columns]

In [None]:

import ipywidgets as widgets
from IPython.display import display

# Text box for the movie title plus an output area for its recommendations.
movie_name_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
recommendation_list = widgets.Output()


def on_type(data):
    """Refresh `recommendation_list` whenever the text box changes.

    `data` is the change notification passed by `observe`; its "new" entry
    is the current text. Titles of five characters or fewer only clear the
    output. Otherwise the first row returned by `search` is taken as the
    intended movie and its recommendations are displayed.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data["new"]
        if len(typed) <= 5:
            return
        best_match = search(typed).iloc[0]
        display(find_similar_movies(best_match["movieId"]))


movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()