In [1]:
# import libraries
import numpy as np
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from fuzzywuzzy import process
import pickle

from sklearn.impute import SimpleImputer

from sklearn.metrics.pairwise import cosine_similarity

# User based CS model

In [2]:
# TODO: make a frame zipping ratings with user id and title
umrT = pd.read_csv('../data/ml-latest-small/ratings.csv', na_values = 'Nan')
mtg = pd.read_csv('../data/ml-latest-small/movies.csv', na_values = 'Nan')
# merge the two frames based on the column movieid
movies = pd.merge(umrT, mtg, on='movieId')
movies['title'].unique().tolist()

['Toy Story (1995)',
 'Grumpier Old Men (1995)',
 'Heat (1995)',
 'Seven (a.k.a. Se7en) (1995)',
 'Usual Suspects, The (1995)',
 'From Dusk Till Dawn (1996)',
 'Bottle Rocket (1996)',
 'Braveheart (1995)',
 'Rob Roy (1995)',
 'Canadian Bacon (1995)',
 'Desperado (1995)',
 'Billy Madison (1995)',
 'Clerks (1994)',
 'Dumb & Dumber (Dumb and Dumber) (1994)',
 'Ed Wood (1994)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Pulp Fiction (1994)',
 'Stargate (1994)',
 'Tommy Boy (1995)',
 'Clear and Present Danger (1994)',
 'Forrest Gump (1994)',
 'Jungle Book, The (1994)',
 'Mask, The (1994)',
 'Blown Away (1994)',
 'Dazed and Confused (1993)',
 'Fugitive, The (1993)',
 'Jurassic Park (1993)',
 'Mrs. Doubtfire (1993)',
 "Schindler's List (1993)",
 'So I Married an Axe Murderer (1993)',
 'Three Musketeers, The (1993)',
 'Tombstone (1993)',
 'Dances with Wolves (1990)',
 'Batman (1989)',
 'Silence of the Lambs, The (1991)',
 'Pinocchio (1940)',
 'Fargo (1996)',
 'Mission: Impossible (1996)',

In [3]:
user_rating = {
    'lion king': 5,
    'terminator': 5,
    'star wars': 2
}

In [4]:
def get_movie_frame(method = 'user_similarity', umrT=umrT, mtg = mtg):

    if method == 'NMF':
        """
        i will get a Data Frame with movieId Title and userId 
        pivoted in a matrix with NaN where user has no rating
        """
        # use pivot to make the matrix of movie rates
        rates =umrT.pivot(index='userId',columns = 'movieId')
        rates.rename(columns=dict(zip(mtg["movieId"], mtg["title"])),inplace = True)
        movies = rates.rating
    elif method == 'user_similarity':
        """
        i will get a Data Frame with movieId Title and ratings per user.

        """
        rates =umrT.pivot(index='userId',columns = 'movieId')
        rates.rename(columns=dict(zip(mtg["movieId"], mtg["title"])),inplace = True)
        movies = rates.rating
    return movies
movies = get_movie_frame()
movies

movieId,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Gintama: The Movie (2010),anohana: The Flower We Saw That Day - The Movie (2013),Silver Spoon (2014),Love Live! The School Idol Movie (2015),Jon Stewart Has Left the Building (2015),Black Butler: Book of the Atlantic (2017),No Game No Life: Zero (2017),Flint (2017),Bungo Stray Dogs: Dead Apple (2018),Andrew Dice Clay: Dice Rules (1991)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [5]:
def match_movie_title(input_title, movie_titles):
    """
    Matches inputed movie title to existing one in the list with fuzzywuzzy
    """
    matched_title = process.extractOne(input_title, movie_titles)[0]

    return matched_title

In [6]:
# add active user with rated movies and zeros
def create_user_vector(movies= get_movie_frame(), user_rating=user_rating):
    """
    Convert dict of user_ratings to a user_vector
    """       
    ##### add a new user with movies and ratings and NaN else #######

    # ------------------------------------------------------------ #
    user = pd.DataFrame(user_rating, index=[0])
    user_t = user.T.reset_index()
    # list of the entry movies
    user_movie_entries = list(user_t["index"])
    # list of the entry movies ratings
    user_rate_entries = list(user_t[0])
    #list of the movie titles of library
    movies = get_movie_frame()
    movie_titles = list(movies.columns)
    # # matches the movies from user with the library
    intended_movies = [match_movie_title(title, movie_titles) for title in user_movie_entries]
    # intended_movies
    # # create a frame with one user
    user_new = pd.DataFrame(movies.loc[1].copy())
    user_new.columns = [['0']]
    user_new[['0']] = 0
    for mov in user_new.index:
        for idx, int_mov in enumerate(intended_movies):
            if mov == int_mov:
                user_new.loc[int_mov] = user_rate_entries[idx]
    new_user = user_new.T
    return new_user
new_user = create_user_vector()

In [7]:
# combine new user with database
tabl = pd.concat([new_user, movies], axis = 0,ignore_index=True) 
tabl

movieId,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Gintama: The Movie (2010),anohana: The Flower We Saw That Day - The Movie (2013),Silver Spoon (2014),Love Live! The School Idol Movie (2015),Jon Stewart Has Left the Building (2015),Black Butler: Book of the Atlantic (2017),No Game No Life: Zero (2017),Flint (2017),Bungo Stray Dogs: Dead Apple (2018),Andrew Dice Clay: Dice Rules (1991)
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [8]:
# Drop duplicate movies from data frame
table = tabl.T.groupby(level=0).first().T
table

movieId,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [9]:
# fill in NaN values with zeros
movie_CS_u = table.fillna(0)
movie_CS_u.sample(5)


movieId,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
486,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.5,3.5,0.0,0.0,0.0


In [10]:
# create cosine similarity table for users
cosine_similarity(movie_CS_u)


array([[1.        , 0.02012786, 0.        , ..., 0.05625625, 0.10168148,
        0.03485702],
       [0.02012786, 1.        , 0.02728287, ..., 0.29109737, 0.09357193,
        0.14532081],
       [0.        , 0.02728287, 1.        , ..., 0.04621095, 0.0275654 ,
        0.10242675],
       ...,
       [0.05625625, 0.29109737, 0.04621095, ..., 1.        , 0.12199271,
        0.32205486],
       [0.10168148, 0.09357193, 0.0275654 , ..., 0.12199271, 1.        ,
        0.05322546],
       [0.03485702, 0.14532081, 0.10242675, ..., 0.32205486, 0.05322546,
        1.        ]])

In [11]:
# We can turn this into a dataframe:
cos_sim_table = pd.DataFrame(cosine_similarity(movie_CS_u),index=movie_CS_u.index, columns = movie_CS_u.index)
cos_sim_table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,601,602,603,604,605,606,607,608,609,610
0,1.000000,0.020128,0.000000,0.000000,0.024433,0.163430,0.085438,0.071129,0.134060,0.000000,...,0.000000,0.182496,0.009601,0.076492,0.075990,0.028967,0.066229,0.056256,0.101681,0.034857
1,0.020128,1.000000,0.027283,0.059720,0.194395,0.129080,0.128152,0.158744,0.136968,0.064263,...,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
2,0.000000,0.027283,1.000000,0.000000,0.003726,0.016614,0.025333,0.027585,0.027257,0.000000,...,0.202671,0.016866,0.011997,0.000000,0.000000,0.028429,0.012948,0.046211,0.027565,0.102427
3,0.000000,0.059720,0.000000,1.000000,0.002251,0.005020,0.003936,0.000000,0.004941,0.000000,...,0.005048,0.004892,0.024992,0.000000,0.010694,0.012993,0.019247,0.021128,0.000000,0.032119
4,0.024433,0.194395,0.003726,0.002251,1.000000,0.128659,0.088491,0.115120,0.062969,0.011361,...,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.028967,0.164191,0.028429,0.012993,0.200395,0.106435,0.102123,0.200035,0.099388,0.075898,...,0.178084,0.116534,0.300669,0.066032,0.148141,1.000000,0.153063,0.262558,0.069622,0.201104
607,0.066229,0.269389,0.012948,0.019247,0.131746,0.152866,0.162182,0.186114,0.185142,0.011844,...,0.092525,0.199910,0.203540,0.137834,0.118780,0.153063,1.000000,0.283081,0.149190,0.139114
608,0.056256,0.291097,0.046211,0.021128,0.149858,0.135535,0.178809,0.323541,0.187233,0.100435,...,0.158355,0.197514,0.232771,0.155306,0.178142,0.262558,0.283081,1.000000,0.121993,0.322055
609,0.101681,0.093572,0.027565,0.000000,0.032198,0.261232,0.214234,0.090840,0.423993,0.000000,...,0.035653,0.335231,0.061941,0.236601,0.097610,0.069622,0.149190,0.121993,1.000000,0.053225


In [12]:
# use the transposed version of R
R_t = movie_CS_u.T
R_t

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0
xXx (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,2.0
xXx: State of the Union (2005),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5
¡Three Amigos! (1986),0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# create a list of unseen movies for this user
unseen_movies = list(R_t.loc[~(R_t!=0).all(axis=1)].index)
unseen_movies

["'71 (2014)",
 "'Hellboy': The Seeds of Creation (2004)",
 "'Round Midnight (1986)",
 "'Salem's Lot (2004)",
 "'Til There Was You (1997)",
 "'Tis the Season for Love (2015)",
 "'burbs, The (1989)",
 "'night Mother (1986)",
 '(500) Days of Summer (2009)',
 '*batteries not included (1987)',
 '...All the Marbles (1981)',
 '...And Justice for All (1979)',
 '00 Schneider - Jagd auf Nihil Baxter (1994)',
 '1-900 (06) (1994)',
 '10 (1979)',
 '10 Cent Pistol (2015)',
 '10 Cloverfield Lane (2016)',
 '10 Items or Less (2006)',
 '10 Things I Hate About You (1999)',
 '10 Years (2011)',
 '10,000 BC (2008)',
 '100 Girls (2000)',
 '100 Streets (2016)',
 '101 Dalmatians (1996)',
 '101 Dalmatians (One Hundred and One Dalmatians) (1961)',
 "101 Dalmatians II: Patch's London Adventure (2003)",
 '101 Reykjavik (101 Reykjavík) (2000)',
 '102 Dalmatians (2000)',
 '10th & Wolf (2006)',
 '10th Kingdom, The (2000)',
 '10th Victim, The (La decima vittima) (1965)',
 '11\'09"01 - September 11 (2002)',
 '11:14 (2

In [14]:
# Create a list of top 3 similar user (nearest neighbours)
neighbours = list(cos_sim_table.iloc[0].sort_values(ascending=False).index[1:4])
neighbours

[81, 126, 349]

In [15]:

# create the recommendation (predicted/rated movie)
predicted_ratings_movies = []

for idx, movie in enumerate(unseen_movies):
    # we check the users who watched the movie
    people_who_have_seen_the_movie = list(R_t.columns[R_t.loc[movie] > 0])

    
    num = 0
    den = 0
    for user in neighbours:
        # if this person has seen the movie
        if user in people_who_have_seen_the_movie:
        #  we want extract the ratings and similarities
            rating = R_t.loc[movie,user]
            similarity = cos_sim_table.loc[0,user]
            num += rating*similarity
            den += similarity
    
    if den != 0:
        predicted_ratings = num/den
        predicted_ratings_movies.append([int(round(predicted_ratings,0)),movie])


In [16]:
# create df pred
def_pred = pd.DataFrame(predicted_ratings_movies,columns= ['rating', 'movie']).sort_values("rating", ascending=False)
def_pred

Unnamed: 0,rating,movie
37,5,Pretty Woman (1990)
51,5,Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
10,5,"Bridges of Madison County, The (1995)"
19,5,"Firm, The (1993)"
16,4,Die Hard: With a Vengeance (1995)
15,4,Dances with Wolves (1990)
29,4,"Lion King, The (1994)"
40,4,Schindler's List (1993)
41,4,Seven (a.k.a. Se7en) (1995)
42,4,"Shawshank Redemption, The (1994)"


In [17]:
list(def_pred.iloc[:3].movie)

['Pretty Woman (1990)',
 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)',
 'Bridges of Madison County, The (1995)']

# CS model based to movies

In [None]:
# fill in NaN values with zeros

# create cosine similarity table for movies

