In [48]:
import pandas as pd
movies = pd.read_csv("../movies.csv")

In [49]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [50]:
import re
# clear parentheses or punctuation from title column
def clean_title(title):
    title = re.sub("[^a-zA-Z0-9 ]", "", title)
    return title

In [51]:
movies["clean_title"] = movies["title"].apply(clean_title)
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf = vectorizer.fit_transform(movies["clean_title"])


In [53]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# return movies based on search input
def search(title):
    title = clean_title(title)
    #find tfidf of title input
    query_vec = vectorizer.transform([title])
    # measure similarity between vector value of title input and tfidf of all clean title  
    similarity = cosine_similarity(query_vec, tfidf).flatten()
    # get the indices of the most similar 
    n = 5
    indices = np.argpartition(similarity, -n)[-n:]
    results = movies.iloc[indices].iloc[::-1]
    
    return results

In [54]:
import ipywidgets as widgets
from IPython.display import display

In [None]:
movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)

In [87]:
movie_list = widgets.Output()
def on_type(data):
    with movie_list:
        movie_list.clear_output()
        title = data["new"]
        if len(title) > 3:
            display(search(title))

movie_input.observe(on_type, names='value')
display(movie_input, movie_list)

Text(value='', description='Movie Title:')

Output()

In [57]:
ratings = pd.read_csv("../ratings.csv")

In [58]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [153]:
def find_similar_movies(movie_id): # find movies that are similar to movie id that is input 
    similar_users = ratings[(ratings["movieId"] == movie_id) & (ratings["rating"] > 4)]["userId"].unique() # get a list of user id that rate the same movies of the input
    similar_user_recs = ratings[(ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"] # get the records of the users that rate the same movies 
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users) # get the ratio between amount of times each movie get rated and amount of all users that rate the movie in the input  
    similar_user_recs = similar_user_recs[similar_user_recs > .10] # filter to get the movies with ratio more than 0.1 

    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)] # get a list of all ratings of the movies related to the input
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique()) # get the ratio between the amount of times each movies get rated and total amount of users that rated the related movies  

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1) 
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    return rec_percentages.head(10).merge(movies, left_index=False, left_on="movieId", right_on="movieId")[["movieId", "title", "genres", "score"]]

In [151]:
import ipywidgets as widgets
from IPython.display import display

movie_name_input = widgets.Text(
    value='',
    description='Movie Title:',
    disabled=False
)
recommendation_list = widgets.Output()

def on_type(data):
    with recommendation_list:
        recommendation_list.clear_output()
        
        results = search(str(data))
        movie_id = results['movieId'].iloc[0]

        display(find_similar_movies(movie_id))
movie_name_input.observe(on_type, names='value')

display(movie_name_input, recommendation_list)

Text(value='', description='Movie Title:')

Output()

In [121]:
search("ice age")

Unnamed: 0,movieId,title,genres,clean_title
26350,125874,2012: Ice Age (2011),Action|Adventure|Sci-Fi,2012 Ice Age 2011
5110,5218,Ice Age (2002),Adventure|Animation|Children|Comedy,Ice Age 2002
23401,117922,Ice Age: A Mammoth Christmas (2011),Adventure|Animation|Children,Ice Age A Mammoth Christmas 2011
10666,44022,Ice Age 2: The Meltdown (2006),Adventure|Animation|Children|Comedy,Ice Age 2 The Meltdown 2006
38945,156025,Ice Age: The Great Egg-Scapade (2016),Adventure|Animation|Children|Comedy,Ice Age The Great EggScapade 2016


In [156]:
!git add .



In [157]:
!git commit -m "latest 16 nov"

[main 3cdeb09] latest 16 nov
 1 file changed, 9 insertions(+), 9 deletions(-)


In [158]:
!git push

remote: Permission to SoLitaP/IR-Final-Project.git denied to Norakpichit.
fatal: unable to access 'https://github.com/SoLitaP/IR-Final-Project.git/': The requested URL returned error: 403
