In [None]:
import pandas as pd
# Load the movie catalogue (movieId, title, genres) from the working directory and preview it.
movies = pd.read_csv("movies.csv")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [None]:
movies.shape

(62423, 3)

In [None]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Cleaning movie titles with regex (i.e., removing all the extra characters such as `()`, `-`, `,`, etc.)

*  Regex (regular expressions) is a pattern-matching tool for searching and manipulating strings.
*  In ML/DL pipelines it is used in the preprocessing steps to clean and organize textual data.
*  Typical uses: text cleaning / tokenization / searching and matching / feature extraction





In [None]:
import re

def clean_title(title):
    """Return ``title`` with every character other than a-z, A-Z, 0-9 and space removed."""
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [None]:
# Store a punctuation-free copy of every title next to the original.
movies["clean_title"] = movies["title"].map(clean_title)

In [None]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


# Creating the Search Engine

Creating a TF-IDF (term frequency–inverse document frequency) matrix

tf*idf vector

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(1, 2): build features from single words and from pairs of consecutive words.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit on the cleaned titles; the resulting matrix is what search queries are compared against.
tfidf = vectorizer.fit_transform(movies["clean_title"])

## Creating a search function using cosine similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    """Return the movies whose titles best match ``title``, best match first.

    The query is normalised with ``clean_title``, transformed with the fitted
    TF-IDF ``vectorizer`` and compared with every movie title by cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.

    Returns
    -------
    pandas.DataFrame
        Up to five rows of ``movies`` ordered by decreasing similarity, so
        ``results.iloc[0]`` is the closest match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    # Compare the query with every title and flatten the (1, n_titles) result.
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises if kth is out of bounds.
    top_n = min(5, similarity.size)
    if top_n == 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the last top_n positions hold the
    # top_n largest scores, in arbitrary order, so rank them explicitly.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    indices = candidates[np.argsort(similarity[candidates])[::-1]]

    return movies.iloc[indices]


Search box widget

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the query and an output area that shows the matching movies.
movie_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
movie_list = widgets.Output()


def on_type(data):
    """Refresh ``movie_list`` with search results whenever the text box value changes."""
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        # Queries shorter than five characters are ignored.
        if len(query) < 5:
            return
        display(search(query))


# Call on_type only for changes of the widget's 'value' trait.
movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

## Searching for users who like the same movies as you

In [None]:
# Load the user ratings (userId, movieId, rating, timestamp) from the working directory.
ratings = pd.read_csv("ratings.csv")

In [None]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
209842,1481,586,2.5,1357831772
209843,1481,587,3.0,1358430669
209844,1481,588,1.0,1357831561
209845,1481,589,3.5,1357831524


In [None]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

Finding users who liked the same movie as us

In [None]:
movie_id = 1  # movieId 1 is "Toy Story (1995)" in movies.csv
# The exploratory steps below are wrapped into find_similar_movies(movie_id)
# in the "Recommendation Function" section.

In [None]:
# Ids of the users who rated the chosen movie above 4.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4),
    "userId",
].unique()
similar_users

array([  36,   75,   86,   90,   93,   95,   96,   98,  111,  120,  127,
        143,  152,  158,  160,  162,  171,  186,  188,  211,  217,  229,
        230,  235,  249,  257,  259,  297,  298,  302,  323,  329,  355,
        359,  369,  371,  381,  392,  402,  411,  428,  435,  439,  447,
        449,  468,  469,  477,  484,  513,  519,  537,  540,  541,  548,
        551,  553,  561,  567,  573,  582,  593,  607,  609,  611,  623,
        624,  626,  628,  631,  644,  653,  654,  670,  683,  686,  694,
        697,  702,  709,  727,  733,  741,  749,  752,  765,  768,  773,
        785,  791,  793,  796,  803,  805,  807,  811,  830,  834,  839,
        848,  856,  896,  904,  905,  911,  927,  947,  950,  956,  966,
        969,  986,  997, 1007, 1010, 1013, 1036, 1038, 1042, 1065, 1079,
       1092, 1096, 1101, 1118, 1123, 1131, 1138, 1140, 1141, 1143, 1146,
       1150, 1159, 1166, 1167, 1169, 1171, 1176, 1179, 1192, 1196, 1198,
       1199, 1200, 1228, 1230, 1232, 1240, 1242, 12

In [None]:
#movie

In [None]:
# Every movie those similar users rated above 4 (one entry per rating).
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]

In [None]:
similar_user_recs

5101           1
5105          34
5111         110
5114         150
5127         260
           ...  
209785    170355
209791    177765
209795    187541
209799    190089
209801    198185
Name: movieId, Length: 12385, dtype: int64

In [None]:
similar_user_recs.value_counts()

1         177
318        76
260        63
593        63
296        58
         ... 
2394        1
2124        1
2118        1
2013        1
198185      1
Name: movieId, Length: 3380, dtype: int64

In [None]:
# Find movies that more than 10% of the users similar to us liked.
# NOTE: this cell rebinds similar_user_recs (list of liked movieIds -> share of
# similar users per movieId). Run it exactly once after the cell that builds the
# raw list; running it a second time would count the shares instead of the ids.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
similar_user_recs = similar_user_recs[similar_user_recs > .1]

In [None]:
similar_user_recs

1       1.000000
318     0.429379
260     0.355932
593     0.355932
296     0.327684
          ...   
923     0.101695
1207    0.101695
1220    0.101695
1304    0.101695
3000    0.101695
Name: movieId, Length: 110, dtype: float64

Finding all users who liked (rated above 4) any of the movies recommended by the users similar to us

In [None]:
# Ratings above 4, from any user, for the movies short-listed in similar_user_recs.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]

In [None]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
209667,1480,78499,4.5,1557795750
209809,1481,47,4.5,1357831583
209828,1481,356,4.5,1357831363
209840,1481,527,5.0,1359449889


In [None]:
# Share of those users who liked each short-listed movie.
all_user_recs = all_users["movieId"].value_counts().div(all_users["userId"].nunique())

In [None]:
all_user_recs

318      0.338384
296      0.283550
2571     0.235209
593      0.235209
356      0.220779
           ...   
50872    0.035354
745      0.033189
78499    0.028139
1028     0.025974
2355     0.024531
Name: movieId, Length: 110, dtype: float64

### Creating a recommendation score

In [None]:
# Put both shares side by side, aligned on movieId (the index of both Series):
# "similar" = share among users similar to us, "all" = share among all users.
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [None]:
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.127706
318,0.429379,0.338384
260,0.355932,0.209235
593,0.355932,0.235209
296,0.327684,0.283550
...,...,...
923,0.101695,0.059885
1207,0.101695,0.048341
1220,0.101695,0.038240
1304,0.101695,0.043290


In [None]:
# Score = how much more popular a movie is among similar users than among all users.
rec_percentages["score"] = rec_percentages["similar"].div(rec_percentages["all"])
rec_percentages = rec_percentages.sort_values(by="score", ascending=False)
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.127706,7.830508
3114,0.242938,0.048341,5.025550
78499,0.135593,0.028139,4.818774
2355,0.112994,0.024531,4.606181
1028,0.112994,0.025974,4.350282
...,...,...,...
778,0.101695,0.085859,1.184447
296,0.327684,0.283550,1.155648
58559,0.146893,0.130592,1.124824
2959,0.214689,0.200577,1.070357


In [None]:
# rec_percentages is indexed by movieId and sorted by score, so join its first
# ten rows with movies on movieId to show titles and genres.
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.127706,7.830508,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.242938,0.048341,5.02555,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
14813,0.135593,0.028139,4.818774,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
2264,0.112994,0.024531,4.606181,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
1005,0.112994,0.025974,4.350282,1028,Mary Poppins (1964),Children|Comedy|Fantasy|Musical,Mary Poppins 1964
4780,0.231638,0.067821,3.415435,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
729,0.112994,0.033189,3.404569,745,Wallace & Gromit: A Close Shave (1995),Animation|Children|Comedy,Wallace Gromit A Close Shave 1995
1047,0.146893,0.047619,3.084746,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
580,0.186441,0.060606,3.076271,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
8246,0.186441,0.064214,2.903447,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004


## Recommendation Function

In [None]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies liked by the users who liked ``movie_id``.

    A rating counts as a "like" when it is strictly greater than
    ``min_rating``. Users who liked ``movie_id`` are the "similar users".
    Each movie liked by more than ``min_share`` of them is scored as
    (share of similar users who liked it) / (share of all users who liked
    it), so movies that are specifically popular with similar users rank
    above movies that everybody likes.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to base the recommendations on.
    min_rating : float, default 4
        A rating must be strictly above this value to count as a like.
    min_share : float, default 0.10
        Minimum fraction of similar users that must like a movie for it to
        be considered.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres``, highest score first.
        ``movie_id`` itself is not excluded from the result.
    """
    # Every step only looks at "liked" ratings, so filter once up front.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    # Fraction of similar users who liked each movie.
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Fraction of all users (who liked any short-listed movie) who liked each movie.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / all_users["userId"].nunique()

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]

    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

Final recommendation widget

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the movie title and an output area for the recommendations.
movie_name_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
recommendation_list = widgets.Output()


def on_type(data):
    """Show recommendations for the first search hit whenever the text box value changes."""
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        # Queries shorter than five characters are ignored.
        if len(title) < 5:
            return
        first_hit = search(title).iloc[0]
        display(find_similar_movies(first_hit["movieId"]))


# Call on_type only for changes of the widget's 'value' trait.
movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()