In [2]:
import pandas as pd
# Load the movie metadata (movieId, title, genres) from the CSV one directory up.
movies = pd.read_csv("../movies.csv")

In [3]:
# Preview the first rows to confirm the file parsed as expected.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
import re
def clean_title(title):
    """Normalise a movie title for text matching.

    Accented letters are folded to their plain ASCII base letter (an
    e-acute becomes "e" instead of being deleted), every remaining
    character other than ASCII letters, digits and spaces is removed,
    and the result is lower-cased.

    Args:
        title: Raw title string, e.g. "Toy Story (1995)".

    Returns:
        The cleaned, lower-case title, e.g. "toy story 1995".
    """
    import unicodedata  # local import keeps this cell runnable on its own

    # NFKD splits an accented letter into its base letter plus a combining
    # mark; the regex below then drops the mark and keeps the letter.
    title = unicodedata.normalize("NFKD", title)
    # Remove parentheses, punctuation and anything else that is not
    # an ASCII letter, digit or space.
    title = re.sub("[^a-zA-Z0-9 ]", "", title)
    return title.lower()

In [5]:
# Store a normalised copy of every title next to the original one.
movies["clean_title"] = movies["title"].map(clean_title)
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,grumpier old men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,waiting to exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,father of the bride part ii 1995
...,...,...,...,...
62418,209157,We (2018),Drama,we 2018
62419,209159,Window of the Soul (2001),Documentary,window of the soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,bad poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),a girl thing 2001


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build TF-IDF features from single words and adjacent word pairs (ngram_range=(1, 2)).
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Learn the vocabulary from the cleaned titles and vectorise them in one step.
tfidf = vectorizer.fit_transform(movies["clean_title"])


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, n=5):
    """Return the movies whose titles best match ``title``.

    The query is cleaned with ``clean_title``, vectorised with the fitted
    ``vectorizer`` and compared against the TF-IDF matrix of all cleaned
    titles using cosine similarity.

    Args:
        title: Free-text title typed by the user.
        n: Maximum number of matches to return (default 5).

    Returns:
        The matching rows of ``movies``, ordered from most to least similar.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist: argpartition raises if n exceeds
    # the array length, and a slice of [-0:] would select everything.
    n = min(n, similarity.size)
    if n <= 0:
        return movies.iloc[:0]

    # argpartition only guarantees that the last n positions hold the n
    # largest scores, in no particular order, so sort them explicitly to
    # put the best match first.
    top = np.argpartition(similarity, -n)[-n:]
    top = top[np.argsort(similarity[top])[::-1]]

    return movies.iloc[top]

In [None]:
import ipywidgets as widgets
from IPython.display import display

In [None]:
# Text box for the title-search demo, pre-filled with 'Toy Story'.
movie_input = widgets.Text(
    value='Toy Story',
    description='Movie Title:',
    disabled=False
)

In [None]:
# Output area that holds the search results table.
movie_list = widgets.Output()


def on_type(data):
    """Refresh the search results whenever the text box value changes.

    ``data`` is the change notification passed by ``observe``; the newly
    typed text is read from its ``"new"`` entry.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        # Only search once more than three characters have been typed.
        if len(query) <= 3:
            return
        display(search(query))


movie_input.observe(on_type, names='value')
display(movie_input, movie_list)

In [None]:
# Load the user ratings (the cells below use its userId, movieId and rating columns).
ratings = pd.read_csv("../ratings.csv")

In [None]:
# Inspect the column dtypes of the ratings table.
ratings.dtypes

In [None]:
# Example movieId used to prototype the recommendation steps in the cells below.
movie_id = 89745

# Catalogue row(s) for that id; the same steps are wrapped up later as find_similar_movies(movie_id).
movie = movies[movies["movieId"] == movie_id]

In [None]:
# Users who rated the example movie above 4.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] > 4),
    "userId",
].unique()

In [None]:
# Every movieId those users rated above 4 (one entry per rating).
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]

In [None]:
# Fraction of the similar users who rated each movie above 4.
# NOTE: this cell overwrites its own input, so re-run the previous cell before running it again.
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

# Keep only the movies rated above 4 by more than 10% of the similar users.
similar_user_recs = similar_user_recs[similar_user_recs > .10]

In [None]:
# Ratings above 4, from any user, for the candidate movies kept above.
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4)]

In [None]:
# For each candidate movie, the fraction of the distinct users in all_users who rated it above 4.
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [None]:
# Put both fractions side by side, aligned on movieId (the shared index).
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [None]:
# Inspect the combined table.
rec_percentages

In [None]:
# Score = share among similar users divided by share among all users;
# a high value means the movie is liked disproportionately by the similar users.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [None]:
# Highest score first.
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [None]:
# Attach titles and genres to the ten best-scoring movies (the index of rec_percentages holds the movieId).
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

In [None]:
def find_similar_movies(movie_id, min_rating=4, min_share=0.10, top_n=10):
    """Recommend movies favoured by the users who liked ``movie_id``.

    A user "likes" a movie when their rating is strictly above
    ``min_rating``. Users who liked ``movie_id`` are the similar users.
    Each movie liked by more than ``min_share`` of them is scored as
    (share of similar users who liked it) / (share of all users who liked
    it), where "all users" are those who liked at least one candidate.

    Args:
        movie_id: movieId of the movie to find recommendations for.
        min_rating: A rating must be strictly greater than this to count.
        min_share: Minimum fraction of similar users that must have liked a
            movie for it to become a candidate.
        top_n: Number of recommendations to return.

    Returns:
        DataFrame with columns movieId, title, genres and score, ordered
        from highest to lowest score.
    """
    # Filter the ratings by score once instead of repeating it per step.
    liked = ratings[ratings["rating"] > min_rating]

    # Users who liked the input movie, and everything else they liked.
    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # How popular the same candidate movies are among everyone who liked any of them.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # The movieId lives in the index. Whether that index is *named* depends
    # on the pandas version (value_counts only names it from 2.0 on), so name
    # it explicitly and turn it into a column before merging.
    top = rec_percentages.head(top_n).rename_axis("movieId").reset_index()
    return top.merge(movies, on="movieId")[["movieId", "title", "genres", "score"]]

In [None]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the recommendation demo; starts empty.
movie_name_input = widgets.Text(
    value='',
    description='Movie Title:',
    disabled=False
)
# Output area that holds the recommendation table.
recommendation_list = widgets.Output()

def on_type(data):
    """Show recommendations for the best title match of the typed text.

    Args:
        data: Change notification passed by ``movie_name_input.observe``;
            the newly typed text is stored under its ``"new"`` key.
    """
    with recommendation_list:
        recommendation_list.clear_output()

        # Search on the typed text itself, not on the string form of the
        # whole change dictionary.
        title = data["new"]
        # Same threshold as the title-search widget: skip very short input
        # so half-typed titles do not trigger recommendations.
        if len(title) <= 3:
            return

        results = search(title)
        if results.empty:
            return
        movie_id = results['movieId'].iloc[0]

        display(find_similar_movies(movie_id))
# Call on_type whenever the text box value changes.
movie_name_input.observe(on_type, names='value')

# Render the text box together with the recommendation output area.
display(movie_name_input, recommendation_list)

In [None]:
# Manual check of the title search.
search("ice age")

In [None]:
# Stage every change under the current directory.
!git add .

In [None]:
# Commit the staged changes.
!git commit -m "latest 16 nov"

In [None]:
# Push the commit to the configured remote.
!git push