In [225]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from IPython.display import display

In [226]:
movies=pd.read_csv("movies.csv")

In [227]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [228]:
def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]","",title)
movies["clean_title"]=movies["title"].apply(clean_title)

In [229]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [230]:
vectorizer=TfidfVectorizer(ngram_range=(1,2))

tfidf=vectorizer.fit_transform(movies['clean_title'])

In [231]:
def search(title, vectorizer=vectorizer, tfidf=tfidf, movies=movies):
    title = clean_title(title)
    
    # Transform the cleaned title into a vector
    query_vec = vectorizer.transform([title])
    
    # Calculate cosine similarity between the query title and all titles
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    
    # Get the indices of the top 5 most similar titles
    indices=np.argpartition(similarity,-5)[-5:]
    # Retrieve the top 5 similar titles
    results = movies.iloc[indices][::-1]
    
    return results


In [232]:
search("TOy story 1995")

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013


In [233]:
movie_input = widgets.Text(
    value="Toy Story",
    description="Movie Title",
    disabled=False
)

movie_list=widgets.Output()
def ontype(data):
    with movie_list:
        movie_list.clear_output()
        title=data["new"]
        if title:
            display(search(title))
movie_input.observe(ontype,names='value')
display(movie_input,movie_list)

Text(value='Toy Story', description='Movie Title')

Output()

In [234]:
ratings=pd.read_csv("ratings.csv")

In [235]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [236]:
movie_id=1
similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 5)]["userId"].unique()

In [237]:
similar_users

array([    36,     75,     86, ..., 162518, 162519, 162530], dtype=int64)

In [238]:
similar_users_recs = ratings[ratings["userId"].isin(similar_users) & (ratings["rating"]>4)]["movieId"]

In [239]:
similar_users_recs

5101           1
5105          34
5111         110
5114         150
5127         260
            ... 
24998388    3706
24998389    3735
24998391    3763
24998392    4187
24998393    4321
Name: movieId, Length: 912084, dtype: int64

In [240]:
similar_users_recs.value_counts()

movieId
1         13506
318        5599
260        5464
356        4690
296        4628
          ...  
27306         1
71732         1
4739          1
190187        1
97957         1
Name: count, Length: 16797, dtype: int64

In [241]:
similar_users_recs = similar_users_recs.value_counts()/len(similar_users)
similar_users_recs

movieId
1         1.000000
318       0.414556
260       0.404561
356       0.347253
296       0.342663
            ...   
27306     0.000074
71732     0.000074
4739      0.000074
190187    0.000074
97957     0.000074
Name: count, Length: 16797, dtype: float64

In [242]:
similar_users_recs = similar_users_recs[similar_users_recs>.1]

In [243]:
similar_users_recs

movieId
1       1.000000
318     0.414556
260     0.404561
356     0.347253
296     0.342663
          ...   
1259    0.102991
7361    0.101881
1206    0.101362
1307    0.101066
1208    0.100918
Name: count, Length: 92, dtype: float64

In [244]:
all_user=ratings[(ratings["movieId"].isin(similar_users_recs.index)) & (ratings['rating']>4)]

In [245]:
all_user_recs =all_user["movieId"].value_counts()/len(all_user["userId"].unique())

In [246]:
all_user_recs

movieId
318      0.345282
296      0.287220
2571     0.246217
356      0.237370
593      0.227930
           ...   
1387     0.047886
1307     0.046195
745      0.037362
78499    0.035445
2355     0.025316
Name: count, Length: 92, dtype: float64

In [247]:
rec_percentages=pd.concat([similar_users_recs,all_user_recs],axis=1)
rec_percentages.columns=["similar","all"]

In [248]:
rec_percentages

Unnamed: 0_level_0,similar,all
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1.000000,0.125844
318,0.414556,0.345282
260,0.404561,0.224195
356,0.347253,0.237370
296,0.342663,0.287220
...,...,...
1259,0.102991,0.049349
7361,0.101881,0.105172
1206,0.101362,0.087500
1307,0.101066,0.046195


In [249]:
rec_percentages['score'] = rec_percentages["similar"]/rec_percentages["all"]

In [250]:
rec_percentages.sort_values('score',ascending=False)

Unnamed: 0_level_0,similar,all,score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.000000,0.125844,7.946323
3114,0.295498,0.054186,5.453383
2355,0.124685,0.025316,4.925186
78499,0.138161,0.035445,3.897906
588,0.233674,0.068117,3.430480
...,...,...,...
58559,0.160743,0.147779,1.087725
79132,0.129424,0.132559,0.976349
7361,0.101881,0.105172,0.968704
2959,0.205020,0.218656,0.937638


In [251]:
rec_percentages.head(10).merge(movies,left_index=True,right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.125844,7.946323,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
314,0.414556,0.345282,1.200632,318,"Shawshank Redemption, The (1994)",Crime|Drama,Shawshank Redemption The 1994
257,0.404561,0.224195,1.804507,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,Star Wars Episode IV A New Hope 1977
351,0.347253,0.23737,1.462916,356,Forrest Gump (1994),Comedy|Drama|Romance|War,Forrest Gump 1994
292,0.342663,0.28722,1.19303,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,Pulp Fiction 1994
1166,0.316304,0.189712,1.667285,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi,Star Wars Episode V The Empire Strikes Back 1980
1168,0.304605,0.166086,1.834016,1198,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,Raiders of the Lost Ark Indiana Jones and the ...
585,0.304235,0.22793,1.334777,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,Silence of the Lambs The 1991
522,0.299052,0.217132,1.37728,527,Schindler's List (1993),Drama|War,Schindlers List 1993
2480,0.296609,0.246217,1.204666,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,Matrix The 1999


In [258]:
def find_similar_movies(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] >= 5)]["userId"].unique()
    similar_users_recs = ratings[ratings["userId"].isin(similar_users) & (ratings["rating"] > 4)]["movieId"]
    similar_users_recs = similar_users_recs.value_counts() / len(similar_users)

    similar_users_recs = similar_users_recs[similar_users_recs > 0.1]

    all_user = ratings[(ratings["movieId"].isin(similar_users_recs.index)) & (ratings['rating'] > 4)]
    all_user_recs = all_user["movieId"].value_counts() / len(all_user["userId"].unique())

    rec_percentages = pd.concat([similar_users_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages['score'] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values('score', ascending=False)  # Sort by score

    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]





In [259]:
movie_input_name = widgets.Text(
    value="Thor",
    description="Movie Title:",
    disable=False
)
recommendation_list=widgets.Output()
def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title=data["new"]
        if(len(title)>3):
            results=search(title)
            movie_id=results.iloc[0]["movieId"]
            display(find_similar_movies(movie_id))

movie_input_name.observe(on_type,names="value")
display(movie_input_name,recommendation_list)

Text(value='Thor', description='Movie Title:')

Output()