In [1]:
import pandas as pd
import numpy as np
import function as fn

# feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import ipywidgets as widgets
from IPython.display import display


In [2]:
# Read the movie table (columns shown below: movieId, title, genres).
# NOTE(review): this path is absolute and machine-specific — consider a configurable data directory.
movies_csv_path = "/Users/nontanatto/Desktop/movie rec/ml-25m/movies.csv"
movie = pd.read_csv(movies_csv_path)
movie

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


Cleaning 

In [3]:
# Add a normalised copy of every title, produced by the project helper fn.clean_title.
# In the rows displayed below the punctuation (parentheses, commas, ...) is gone;
# this cleaned column is what the search index is built from.
movie['cleaned title'] = movie['title'].apply(fn.clean_title)

In [4]:
movie

Unnamed: 0,movieId,title,genres,cleaned title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [5]:
# Fit a TF-IDF model on the cleaned titles. ngram_range=(1, 2) indexes both
# single words and two-word phrases, so word order contributes to the match.
vectorizer = TfidfVectorizer(ngram_range = (1, 2))
# Sparse matrix with one row per movie, reused by every search query.
tfidf = vectorizer.fit_transform(movie["cleaned title"])

In [6]:
# Search engine based on TF-IDF (term frequency x inverse document frequency).
# Inverse document frequency measures how unique a particular word is across all titles.
def search(title, top_n=5):
    """Return the movies whose cleaned titles best match ``title``.

    The query is cleaned with ``fn.clean_title``, projected into the fitted
    TF-IDF space and compared with every indexed title by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movie``, ordered from the most to the least
        similar title.
    """
    title = fn.clean_title(title)
    query_vector = vectorizer.transform([title])

    similarity = cosine_similarity(query_vector, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises otherwise.
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movie.iloc[:0]

    # argpartition only guarantees that the last `top_n` positions hold the
    # largest scores -- their internal order is arbitrary -- so rank that
    # small candidate set explicitly before indexing the frame.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]
    return movie.iloc[ranked]

In [7]:
search('Avengers')

Unnamed: 0,movieId,title,genres,cleaned title
34536,145676,3 Avengers (1964),(no genres listed),3 Avengers 1964
2063,2153,"Avengers, The (1998)",Action|Adventure,Avengers The 1998
17067,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012
40636,159920,Shaolin Avengers (1994),Action,Shaolin Avengers 1994
45394,170297,Ultimate Avengers 2 (2006),Action|Animation|Sci-Fi,Ultimate Avengers 2 2006


In [8]:
# Text box in which the user types a movie title (starts empty and enabled).
movie_input = widgets.Text(
    value="",
    description = "Movie Title",
    disabled = False
)

In [9]:
movie_list = widgets.Output()

In [10]:
def on_type(data):
    """Refresh the search results whenever the text box content changes.

    Registered as an observer on ``movie_input``'s ``value`` trait.

    Parameters
    ----------
    data : dict-like
        Change notification passed by the widget; ``data["new"]`` holds the
        text currently in the box.
    """
    with movie_list:
        # Drop the previous results so only the latest query is shown.
        movie_list.clear_output()
        title = data["new"]
        # Only search once more than 5 characters have been typed, to avoid
        # querying on every early keystroke.
        if len(title) > 5:
            display(search(title))



In [11]:
# Call on_type every time the text box's `value` trait changes.
movie_input.observe(on_type, names='value')

# Render the input box followed by the output area that on_type writes into.
display(movie_input, movie_list)

Text(value='', description='Movie Title')

Output()

# Recommendation

In [12]:
# Read the ratings table (columns shown below: userId, movieId, rating, timestamp).
# NOTE(review): this path is absolute and machine-specific — consider a configurable data directory.
ratings_csv_path = "/Users/nontanatto/Desktop/movie rec/ml-25m/ratings.csv"
rating = pd.read_csv(ratings_csv_path)

In [13]:
rating

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [14]:
rating.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

## Find the users who also liked the same movie

In [15]:
# Users who rated movieId 1 higher than 4 -- these count as "similar to us".
liked_seed_movie = (rating["movieId"] == 1) & (rating["rating"] > 4)
similar_users = rating.loc[liked_seed_movie, "userId"].unique()
similar_users

array([    36,     75,     86, ..., 162527, 162530, 162533])

In [16]:
# Every rating higher than 4 that was given by one of the similar users.
from_similar_user = rating["userId"].isin(similar_users)
rated_highly = rating["rating"] > 4
similar_user_rec = rating[from_similar_user & rated_highly]
similar_user_rec

Unnamed: 0,userId,movieId,rating,timestamp
5101,36,1,5.0,857131378
5105,36,34,5.0,834413787
5111,36,110,5.0,834412999
5114,36,150,5.0,839928587
5127,36,260,5.0,857131062
...,...,...,...,...
24998854,162533,60069,4.5,1280919889
24998861,162533,67997,4.5,1280920712
24998876,162533,78499,4.5,1281405901
24998884,162533,81591,4.5,1297289876


In [17]:
# Keep only the movieId column: one entry per high rating given by a similar user.
# NOTE: this rebinds similar_user_rec from a DataFrame to a Series, so the cell
# cannot simply be re-run without first re-running the cell that builds the frame.
similar_user_rec = similar_user_rec["movieId"]
similar_user_rec

5101            1
5105           34
5111          110
5114          150
5127          260
            ...  
24998854    60069
24998861    67997
24998876    78499
24998884    81591
24998888    88129
Name: movieId, Length: 1358326, dtype: int64

## Recommend the movies that more than 10% of the users similar to us liked

In [18]:
# Fraction of the similar users who liked each movie ...
liked_share = similar_user_rec.value_counts() / len(similar_users)
# ... keeping only the movies liked by more than 10% of them.
similar_user_rec = liked_share[liked_share > 0.1]
similar_user_rec

1        1.000000
318      0.445607
260      0.403770
356      0.370215
296      0.367295
           ...   
953      0.103053
551      0.101195
1222     0.100876
745      0.100345
48780    0.100186
Name: movieId, Length: 113, dtype: float64

## Find how much all users in the dataset like these movies

In [19]:
similar_user_rec.index

Int64Index([    1,   318,   260,   356,   296,  2571,  1196,  1198,   593,
              527,
            ...
             8368,  4896,  1259, 59315,   778,   953,   551,  1222,   745,
            48780],
           dtype='int64', length=113)

In [20]:
# Ratings higher than 4 from ANY user, restricted to the candidate movies
# (the index of similar_user_rec).
is_candidate_movie = rating["movieId"].isin(similar_user_rec.index)
rated_highly = rating["rating"] > 4
all_user = rating[is_candidate_movie & rated_highly]

In [21]:
all_user

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000062,162541,5618,4.5,1240953299
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484


In [22]:
# Number of distinct users who gave at least one high rating to a candidate movie.
liking_user_count = len(all_user["userId"].unique())
# Fraction of those users who liked each candidate movie.
all_user_rec = all_user["movieId"].value_counts() / liking_user_count
all_user_rec

318      0.342220
296      0.284674
2571     0.244033
356      0.235266
593      0.225909
           ...   
551      0.040918
50872    0.039111
745      0.037031
78499    0.035131
2355     0.025091
Name: movieId, Length: 113, dtype: float64

## Creating a recommendation score

In [23]:
# Compare the percentages: place the similar-user share and the all-user share
# side by side, aligned on the movie id that indexes both Series.
rec_percentages = pd.concat([similar_user_rec, all_user_rec], axis = 1)
rec_percentages

Unnamed: 0,movieId,movieId.1
1,1.000000,0.124728
318,0.445607,0.342220
260,0.403770,0.222207
356,0.370215,0.235266
296,0.367295,0.284674
...,...,...
953,0.103053,0.045792
551,0.101195,0.040918
1222,0.100876,0.066877
745,0.100345,0.037031


In [24]:
# Name the columns after their source: "similar" comes from similar_user_rec,
# "all" comes from all_user_rec.
rec_percentages.columns = ["similar", "all"]
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.124728
318,0.445607,0.342220
260,0.403770,0.222207
356,0.370215,0.235266
296,0.367295,0.284674
...,...,...
953,0.103053,0.045792
551,0.101195,0.040918
1222,0.100876,0.066877
745,0.100345,0.037031


In [25]:
# similar: the share of users similar to us who liked each movie
# all: the share of average users (everyone in the dataset) who liked it

In [26]:
# Recommendation score: how much more the similar users like a movie than users in general.
rec_percentages["score"] = rec_percentages['similar']/rec_percentages['all']
# The ratio is used because the score should be directly proportional to "similar"
# but inversely proportional to "all".

rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.124728,8.017414
318,0.445607,0.342220,1.302105
260,0.403770,0.222207,1.817089
356,0.370215,0.235266,1.573604
296,0.367295,0.284674,1.290232
...,...,...,...
953,0.103053,0.045792,2.250441
551,0.101195,0.040918,2.473085
1222,0.100876,0.066877,1.508376
745,0.100345,0.037031,2.709748


In [27]:
# Rank the candidate movies from the highest to the lowest recommendation score.
rec_percentages =rec_percentages.sort_values("score", ascending=False)
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.124728,8.017414
3114,0.280648,0.053706,5.225654
2355,0.110539,0.025091,4.405452
78499,0.152960,0.035131,4.354038
4886,0.235147,0.070811,3.320783
...,...,...,...
2858,0.216724,0.167634,1.292845
296,0.367295,0.284674,1.290232
79132,0.166817,0.131384,1.269693
4973,0.142501,0.112405,1.267747


In [28]:
# Merge the first 10 rows of the score table with the movie table to attach
# title and genres: the score table's index (movie ids) is matched against
# the movieId column of `movie`.
rec_percentages.head(10).merge(movie, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,cleaned title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bug s Life A 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


## Build function

In [None]:
def find_similar_movies(movie_id):
    # Find the users similar to us
    
    # Keep the movies that more than 10% of those users recommend
    
    # Find how common the recommendations were among all of the users
    
    # Create the score