In [1]:
import pandas as pd
import re
import numpy as np

In [2]:
# Read the movie catalogue from the working directory and preview it.
moviesDF = pd.read_csv(filepath_or_buffer="movies.csv")
moviesDF

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [3]:
def cleanTitle(title):
    """Strip every character that is not an ASCII letter, digit, or space.

    Parameters
    ----------
    title : str
        Raw title text, e.g. ``"Toy Story (1995)"``.

    Returns
    -------
    str
        The title with all other characters removed, e.g. ``"Toy Story 1995"``.
        Spaces are kept as-is (runs of spaces are not collapsed).
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [4]:
# Add a normalised copy of every title; the TF-IDF index is built from this column.
moviesDF["cleanTitle"] = moviesDF["title"].map(cleanTitle)
moviesDF

Unnamed: 0,movieId,title,genres,cleanTitle
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TfidfVectorizer weights each term by its frequency within a title, scaled by inverse document frequency (IDF),
# producing one vector per title that can be compared against the vectors of all other titles in the dataset
from sklearn.metrics.pairwise import cosine_similarity
# cosine_similarity measures how similar the entered title's vector is to each title vector in the dataset



In [6]:
# ngram_range=(1,2): use both single words and pairs of adjacent words as features
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Fit the vectorizer on the cleaned titles and turn them into a TF-IDF matrix
# (one row of numbers per title); `search` compares queries against this matrix.
tfidf = vectorizer.fit_transform(moviesDF["cleanTitle"]) 

In [7]:
def search(title, topN=5):
    """Return the catalogue rows whose titles are most similar to ``title``.

    The query is cleaned with ``cleanTitle``, vectorised with the fitted
    ``vectorizer`` and compared to every row of ``tfidf`` by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text title typed by the user.
    topN : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``topN`` rows of ``moviesDF``, ordered from most to least
        similar, so ``.iloc[0]`` is the closest match.
    """
    title = cleanTitle(title)
    queryVec = vectorizer.transform([title])  # query -> TF-IDF vector
    similarity = cosine_similarity(queryVec, tfidf).flatten()  # one score per catalogue row

    # Clamp the request so a small catalogue (or topN <= 0) cannot make
    # argpartition raise on an out-of-range kth.
    count = max(0, min(topN, similarity.size))
    if count == 0:
        return moviesDF.iloc[[]]

    # argpartition isolates the `count` highest scores without a full sort, but
    # the order *within* that group is undefined, so it must be sorted explicitly.
    candidates = np.argpartition(similarity, -count)[-count:]
    ranking = np.argsort(-similarity[candidates], kind="stable")
    indices = candidates[ranking]
    return moviesDF.iloc[indices]
# search("batman")

In [8]:
import ipywidgets as widgets
from IPython.display import display

In [9]:
def whileTyping(data):
    """Observer callback: refresh the search results as the text box changes.

    ``data`` is the change dictionary passed by the widget; its ``"new"`` key
    holds the current text. The ``moviesList`` output area is cleared on every
    change, and results are shown only when the text is longer than 3 characters.
    """
    with moviesList:
        moviesList.clear_output()
        query = data["new"]
        if len(query) <= 3:
            return
        display(search(query))

In [10]:
# Text box the user types a movie title into.
# (`disabled` was previously misspelled as `disaled`, so the trait was never passed.)
movieTitle = widgets.Text(value="",
                          description="Movie Title:",
                          disabled=False)

# Output area that `whileTyping` clears and fills with search results.
moviesList = widgets.Output()

# Call `whileTyping` every time the text value changes, then render both widgets.
movieTitle.observe(whileTyping, names="value")
display(movieTitle, moviesList)

Text(value='', description='Movie Title:')

Output()

In [11]:
# Read the user ratings (userId, movieId, rating, timestamp) and preview them.
ratingsDF = pd.read_csv(filepath_or_buffer="ratings.csv")
ratingsDF

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [12]:
# Check the column dtypes; the filters in findRecommendations compare `rating` and `movieId` numerically.
ratingsDF.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [13]:
def findRecommendations(movieID, minRating=4, minShare=0.10, topN=20):
    """Recommend movies liked disproportionately by users who liked ``movieID``.

    A user "likes" a movie when their rating is at least ``minRating``.

    Steps
    -----
    1. Find the *similar users*: everyone who liked ``movieID``.
    2. For each movie, compute the share of similar users who liked it and
       keep movies whose share exceeds ``minShare``.
    3. For those candidate movies, compute the share of likes among *all*
       users who liked at least one candidate.
    4. ``score = similar share / overall share``; higher means the movie is
       more specific to the similar users than to the general audience.

    Parameters
    ----------
    movieID : int
        ``movieId`` of the movie the user likes.
    minRating : float, default 4
        Minimum rating that counts as a "like".
    minShare : float, default 0.10
        A candidate must be liked by more than this fraction of similar users.
    topN : int, default 20
        Number of highest-scoring movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres`` for the ``topN`` best
        scores, sorted by descending score. Empty when nobody liked ``movieID``.
    """
    # Built once and reused: the "liked" mask is the same for every filter below.
    likedMask = ratingsDF["rating"] >= minRating

    # Users who liked the same movie as us.
    similarUsers = ratingsDF.loc[likedMask & (ratingsDF["movieId"] == movieID), "userId"].unique()
    if len(similarUsers) == 0:
        # Nothing to base a recommendation on; also avoids dividing by zero users.
        return pd.DataFrame(columns=["score", "title", "genres"])

    # Fraction of similar users that liked each movie, thresholded by minShare.
    usersRecs = ratingsDF.loc[likedMask & ratingsDF["userId"].isin(similarUsers), "movieId"]
    usersRecs = usersRecs.value_counts() / len(similarUsers)
    usersRecs = usersRecs[usersRecs > minShare]

    # Every "like" of a candidate movie, from any user, as the comparison baseline.
    allUsers = ratingsDF[likedMask & ratingsDF["movieId"].isin(usersRecs.index)]
    allUsersRecs = allUsers["movieId"].value_counts() / len(allUsers["userId"].unique())

    recsPerc = pd.concat([usersRecs, allUsersRecs], axis=1)
    recsPerc.columns = ["similar", "all"]

    recsPerc["score"] = recsPerc["similar"] / recsPerc["all"]
    recsPerc = recsPerc.sort_values("score", ascending=False)

    # The index of recsPerc is the movieId, so join it to the catalogue for titles/genres.
    mergedPercentages = recsPerc.head(topN).merge(moviesDF, left_index=True, right_on="movieId")[["score", "title", "genres"]]
    return mergedPercentages

In [14]:
def whileTypingRecs(data):
    """Observer callback: refresh the recommendations as the text box changes.

    ``data`` is the change dictionary passed by the widget; its ``"new"`` key
    holds the current text. The ``recomList`` output area is cleared on every
    change. When the text is longer than 3 characters, the first row returned
    by ``search`` supplies the ``movieId`` passed to ``findRecommendations``.
    """
    with recomList:
        recomList.clear_output()
        typed = data["new"]
        if len(typed) <= 3:
            return
        firstHit = search(typed).iloc[0]
        display(findRecommendations(firstHit['movieId']))

In [15]:
# Text box for the title of a movie the user liked.
# (`disabled` was previously misspelled as `disaled`, so the trait was never passed.)
likedMovieTitle = widgets.Text(value="",
                          description="Movie Title:",
                          disabled=False)

# Output area that `whileTypingRecs` clears and fills with recommendations.
recomList = widgets.Output()

# Call `whileTypingRecs` every time the text value changes, then render both widgets.
likedMovieTitle.observe(whileTypingRecs, names="value")
display(likedMovieTitle, recomList)

Text(value='', description='Movie Title:')

Output()