In [1]:
import pandas as pd

movies = pd.read_csv("C:/Users/RUDRA/Downloads/ml-25m/movies.csv")

In [2]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [3]:
import re

def clean_title(title):
    re.sub("[^a-zA-Z0-9 ]", "", title)
    return title.lower().strip()



In [4]:
movies["clean_title"] = movies["title"].apply(clean_title)



In [5]:
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story (1995)
1,2,Jumanji (1995),Adventure|Children|Fantasy,jumanji (1995)
2,3,Grumpier Old Men (1995),Comedy|Romance,grumpier old men (1995)
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,waiting to exhale (1995)
4,5,Father of the Bride Part II (1995),Comedy,father of the bride part ii (1995)
...,...,...,...,...
62418,209157,We (2018),Drama,we (2018)
62419,209159,Window of the Soul (2001),Documentary,window of the soul (2001)
62420,209163,Bad Poems (2018),Comedy|Drama,bad poems (2018)
62421,209169,A Girl Thing (2001),(no genres listed),a girl thing (2001)


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 2))

def clean_title(title):
    return re.sub("[^a-zA-Z0-9 ]", "", title)

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np  

tfidf = vectorizer.fit_transform(movies["clean_title"])

def search(title):
    title = "".join(title.split())
    if not title:
        return pd.DataFrame(columns=["title", "genres"])    
    title = clean_title(title)
    query_vector = vectorizer.transform([title])
    similarity = cosine_similarity(query_vector, tfidf).flatten()
    indices = np.argpartition(similarity, -5)[-5:]
    results = movies.iloc[indices][::-1]
    return results


In [8]:
results = search("Toy Story")
results

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story (1995)
62422,209171,Women of Devil's Island (1962),Action|Adventure|Drama,women of devil's island (1962)
62421,209169,A Girl Thing (2001),(no genres listed),a girl thing (2001)
62420,209163,Bad Poems (2018),Comedy|Drama,bad poems (2018)
62419,209159,Window of the Soul (2001),Documentary,window of the soul (2001)


In [9]:
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(
    value="Toy Story",
    description='Movie Title:',
    disabled=False
)

movie_list = widgets.Output()

def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title  = data["new"]
        if len(title) > 5:
            display(search(title))

movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [10]:
ratings = pd.read_csv("C:/Users/RUDRA/Downloads/ml-25m/ratings.csv")

In [11]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [12]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [13]:
movie_id = 1

In [14]:
similar_users =ratings[(ratings["userId"]== movie_id) & (ratings["rating"]>4)]["userId"].unique()

In [15]:
similar_users

array([1])

In [16]:
similar_user_recs= ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] >= 4)]["movieId"]

In [17]:
similar_user_recs

0       296
2       307
3       665
5      1088
8      1237
9      1250
11     1653
16     2351
17     2573
18     2632
19     2692
20     2843
21     3448
22     3569
23     3949
24     4144
26     4325
28     4703
29     4973
30     5147
33     5767
34     5878
36     5952
37     6016
38     6370
39     6377
41     6711
43     7209
44     7234
48     7361
49     7365
54     7940
56     8154
57     8327
58     8360
62     8786
64     8973
66    27266
69    32591
Name: movieId, dtype: int64

In [18]:
similar_user_recs.value_counts() / len(similar_users)

similar_user_recs = similar_user_recs[similar_user_recs > .1]

In [19]:
similar_user_recs

0       296
2       307
3       665
5      1088
8      1237
9      1250
11     1653
16     2351
17     2573
18     2632
19     2692
20     2843
21     3448
22     3569
23     3949
24     4144
26     4325
28     4703
29     4973
30     5147
33     5767
34     5878
36     5952
37     6016
38     6370
39     6377
41     6711
43     7209
44     7234
48     7361
49     7365
54     7940
56     8154
57     8327
58     8360
62     8786
64     8973
66    27266
69    32591
Name: movieId, dtype: int64

In [20]:
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] >= 4)]

In [21]:
all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [22]:
all_users_recs

movieId
34    0.246930
17    0.199007
36    0.189498
39    0.169115
21    0.167997
16    0.167838
62    0.150170
11    0.137090
2     0.120147
58    0.106820
29    0.083026
19    0.059246
3     0.053338
22    0.051537
48    0.051000
5     0.045890
41    0.040998
24    0.038530
18    0.033042
28    0.031431
69    0.031314
44    0.028614
26    0.019991
23    0.018118
43    0.015592
9     0.013559
57    0.013037
20    0.010569
30    0.009582
8     0.005720
64    0.004950
66    0.003325
54    0.003121
38    0.002613
49    0.001699
37    0.000363
33    0.000290
56    0.000116
Name: count, dtype: float64

In [23]:
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [24]:
rec_percentages

Unnamed: 0,similar,all
0,296,
2,307,0.120147
3,665,0.053338
5,1088,0.04589
8,1237,0.00572
9,1250,0.013559
11,1653,0.13709
16,2351,0.167838
17,2573,0.199007
18,2632,0.033042


In [25]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [26]:
rec_percentages = rec_percentages.sort_values("score", ascending=False) 

In [27]:
rec_percentages

Unnamed: 0,similar,all,score
56,8154,0.000116,70207980.0
33,5767,0.00029,19862120.0
37,6016,0.000363,16575760.0
66,27266,0.003325,8201470.0
49,7365,0.001699,4336034.0
54,7940,0.003121,2543828.0
38,6370,0.002613,2437657.0
64,8973,0.00495,1812546.0
69,32591,0.031314,1040766.0
57,8327,0.013037,638731.0


In [28]:
rec_percentages.head(10).merge(movies, left_index=True, right_on ="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
55,8154,0.000116,70207980.0,56,Kids of the Round Table (1995),Adventure|Children|Comedy|Fantasy,kids of the round table (1995)
32,5767,0.00029,19862120.0,33,Wings of Courage (1995),Adventure|Romance|IMAX,wings of courage (1995)
36,6016,0.000363,16575760.0,37,Across the Sea of Time (1995),Documentary|IMAX,across the sea of time (1995)
65,27266,0.003325,8201470.0,66,Lawnmower Man 2: Beyond Cyberspace (1996),Action|Sci-Fi|Thriller,lawnmower man 2: beyond cyberspace (1996)
48,7365,0.001699,4336034.0,49,When Night Is Falling (1995),Drama|Romance,when night is falling (1995)
53,7940,0.003121,2543828.0,54,"Big Green, The (1995)",Children|Comedy,"big green, the (1995)"
37,6370,0.002613,2437657.0,38,It Takes Two (1995),Children|Comedy,it takes two (1995)
63,8973,0.00495,1812546.0,64,Two if by Sea (1996),Comedy|Romance,two if by sea (1996)
68,32591,0.031314,1040766.0,69,Friday (1995),Comedy,friday (1995)
56,8327,0.013037,638731.0,57,Home for the Holidays (1995),Drama,home for the holidays (1995)


In [29]:
def find_similar_users(movie_id):
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique()
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] >= 4)]["movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > .10]
    
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] >= 4)]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())
    
    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    
    return rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score","title","genres"]]

In [30]:
movie_input = widgets.Text(
    value="Toy Story",
    description='Movie Title:',
    disabled=False
)

recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title) > 5:
            results = search(title)
            movie_id = results.iloc[0]["movieId"]
            display(find_similar_users(movie_id))

movie_input.observe(on_type, names='value')

display(movie_input, recommendation_list)
        

Text(value='Toy Story', description='Movie Title:')

Output()