READING DATA AND FILES

In [1]:
import pandas as pd
# Movie catalogue; later cells read its "movieId", "title" and "genres" columns.
movies = pd.read_csv(filepath_or_buffer="D:/movies.csv")

In [2]:
# Ratings table; later cells read its "userId", "movieId" and "rating" columns.
ratings = pd.read_csv(filepath_or_buffer="D:/ml-25m/ratings.csv")

FUNCTION TO CLEAN MOVIE TITLES/PREPROCESSING


In [3]:
import re
def clean_title(title):
    """Return ``title`` with every character removed that is not an ASCII
    letter, a digit, or a space.

    Example: ``"Toy Story (1995)"`` becomes ``"Toy Story 1995"``.
    """
    disallowed = re.compile("[^a-zA-Z0-9 ]")
    return disallowed.sub("", title)

In [4]:
# Add a punctuation-free copy of each title; this column is what gets vectorised for search.
movies["Clean_title"] = movies["title"].map(clean_title)
movies

Unnamed: 0,movieId,title,genres,Clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


CONVERTING THE CLEANED MOVIE TITLES INTO VECTORS

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer as TV

# TF-IDF over single words and two-word sequences of the cleaned titles.
title_corpus = movies["Clean_title"]
vectorizer = TV(ngram_range=(1, 2))
tfidf = vectorizer.fit_transform(title_corpus)

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
    """Return up to five rows of ``movies`` whose titles best match ``title``.

    The query is cleaned with ``clean_title``, projected into the fitted
    TF-IDF space and compared with every movie title by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text movie title typed by the user.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``movies``, ordered from most to least similar,
        so row 0 is the best match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Clamp k so a catalogue with fewer than five titles does not make
    # argpartition raise on an out-of-bounds kth.
    top_k = min(5, similarity.size)
    if top_k == 0:
        return movies.iloc[[]]

    # argpartition only guarantees that the last k positions hold the k
    # largest scores; their relative order is unspecified, so rank them
    # explicitly (descending) before selecting rows.
    candidates = np.argpartition(similarity, -top_k)[-top_k:]
    indices = candidates[np.argsort(similarity[candidates])[::-1]]

    results = movies.iloc[indices]
    return results


CREATING WIDGETS TO SEARCH MOVIE TITLE

In [7]:
import ipywidgets as widgets 
from IPython.display import display

# Text box in which the user types a movie title to search for.
movie_input = widgets.Text(value="", description="Movie Title: ", disabled=False)


In [8]:

# Output area that displays the current search results.
movie_list = widgets.Output()


def on_type(data):
    """Refresh ``movie_list`` with search results for the new text-box value.

    ``data`` is the change dictionary passed by ``observe``; its ``"new"``
    entry holds the text now in the box. Inputs of one character or fewer
    only clear the list.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        if len(query) <= 1:
            return
        matches = search(query)
        display(matches)


# Re-run the search whenever the text value changes, then show both widgets.
movie_input.observe(on_type, names="value")
display(movie_input, movie_list)


Text(value='', description='Movie Title: ')

Output()

TRIALS

In [9]:
# Seed movie for the walkthrough; movieId 1 is "Toy Story (1995)" in the table above.
movie_id = 1

# Users who rated the seed movie above 4.
liked_seed_movie = ratings["movieId"].eq(movie_id) & ratings["rating"].gt(4)
similar_users = ratings.loc[liked_seed_movie, "userId"].unique()


In [10]:
# movieId of every rating above 4 that was given by one of the similar users.
from_similar_user = ratings["userId"].isin(similar_users)
rated_above_four = ratings["rating"] > 4
similar_user_recs = ratings.loc[from_similar_user & rated_above_four, "movieId"]


In [11]:
# Fraction of the similar users who rated each movie above 4 ...
share_among_similar = similar_user_recs.value_counts().div(len(similar_users))

# ... keeping only movies liked by more than 10% of them.
similar_user_recs = share_among_similar.loc[share_among_similar.gt(.1)]

In [12]:
# All ratings above 4 (from any user) for the candidate movies.
is_candidate_movie = ratings["movieId"].isin(similar_user_recs.index)
is_rated_above_four = ratings["rating"] > 4
all_users = ratings[is_candidate_movie & is_rated_above_four]

In [13]:
# Fraction of the distinct users in `all_users` who rated each candidate above 4.
distinct_user_count = len(all_users["userId"].unique())
all_user_rec = all_users["movieId"].value_counts().div(distinct_user_count)

In [14]:
# Put both shares side by side, aligned on movieId.
rec_percentage = pd.concat((similar_user_recs, all_user_rec), axis="columns")
rec_percentage.columns = ["similar", "all"]


In [15]:
# Score = how much more often similar users liked a movie than users overall; highest first.
rec_percentage["score"] = rec_percentage["similar"].div(rec_percentage["all"])
rec_percentage = rec_percentage.sort_values(by="score", ascending=False)


In [16]:
# Attach title/genre information to the ten highest-scoring movies.
top_scored = rec_percentage.head(10)
top_scored.merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,Clean_title
0,1.0,0.124728,8.017414,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.280648,0.053706,5.225654,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.110539,0.025091,4.405452,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.15296,0.035131,4.354038,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
4780,0.235147,0.070811,3.320783,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
580,0.216618,0.067513,3.208539,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
6258,0.228139,0.072268,3.156862,6377,Finding Nemo (2003),Adventure|Animation|Children|Comedy,Finding Nemo 2003
587,0.1794,0.059977,2.99115,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
8246,0.203504,0.068453,2.972889,8961,"Incredibles, The (2004)",Action|Adventure|Animation|Children|Comedy,Incredibles The 2004
359,0.253411,0.085764,2.954762,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,Lion King The 1994


FINAL MODEL/ALGORITHM


In [17]:
def find_similar_movie(movie_id, min_rating=4, min_share=.1, top_n=10):
    """Recommend movies that fans of ``movie_id`` like more than users overall.

    Users who rated ``movie_id`` above ``min_rating`` are treated as
    "similar users". For every movie that more than ``min_share`` of them
    also rated above ``min_rating``, the score is the share of similar users
    who liked it divided by the share of all users (among those who liked any
    candidate movie) who liked it.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the movie to find recommendations for.
    min_rating : float, default 4
        A rating counts as "liked" only when strictly greater than this.
    min_share : float, default 0.1
        Minimum fraction (exclusive) of similar users that must have liked a
        movie for it to become a candidate.
    top_n : int, default 10
        Number of highest-scoring movies to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``score``, ``title`` and ``genres`` for the top candidates,
        ordered by descending score. Empty when nobody liked ``movie_id``.
    """
    # Apply the rating filter to the full ratings table once and reuse it,
    # instead of recomputing the same comparison for every selection below.
    liked = ratings[ratings["rating"] > min_rating]

    similar_users = liked.loc[liked["movieId"] == movie_id, "userId"].unique()
    if len(similar_users) == 0:
        # No reference audience, so there is nothing to score.
        return pd.DataFrame(columns=["score", "title", "genres"])

    similar_user_recs = liked.loc[liked["userId"].isin(similar_users), "movieId"]

    # Share of similar users who liked each movie, restricted to popular ones.
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # Share of all users (who liked at least one candidate) who liked each movie.
    all_users = liked[liked["movieId"].isin(similar_user_recs.index)]
    all_user_rec = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentage = pd.concat([similar_user_recs, all_user_rec], axis=1)
    rec_percentage.columns = ["similar", "all"]

    # A score above 1 means similar users like the movie more than users overall.
    rec_percentage["score"] = rec_percentage["similar"] / rec_percentage["all"]
    rec_percentage = rec_percentage.sort_values("score", ascending=False)

    return rec_percentage.head(top_n).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]
       

In [18]:
# Text box for the movie title that recommendations are based on.
# The trait is spelled `disabled`; the previous `disable` keyword was not a
# Text trait and was passed through as an unrecognised argument.
movie_input_name = widgets.Text(
    value = "",
    description = "Movie Title: ",
    disabled = False
)

# Output area that shows the recommendations for the current search text.
recommendation_list = widgets.Output()

# NOTE: this rebinds the name `on_type` used by the search-only widget cell;
# that widget keeps working because `observe` holds the function object it
# was registered with.
def on_type(data):
    """Show recommendations for the first search result of the typed title.

    ``data`` is the change dictionary passed by ``observe``; its ``"new"``
    entry holds the text now in the box. Inputs of one character or fewer
    only clear the list.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]
        if len(title)>1:
            results = search(title)
            if results.empty:
                return
            # `.iloc` is purely positional and rejects the column label
            # "movieId"; select the column by name first, then take the first
            # row so a scalar id (not a DataFrame) reaches find_similar_movie.
            movie_id = results["movieId"].iloc[0]
            display(find_similar_movie(movie_id))

movie_input_name.observe(on_type, names = "value")

display(movie_input_name, recommendation_list)


Text(value='', description='Movie Title: ')

Output()