In [None]:
## INSTALL IF YOU DON'T HAVE SURPRISE ON YOUR MACHINE AND WOULD LIKE TO RUN THE CODE
#!pip install surprise
#!pip install import-ipynb

In [9]:
import os
import csv
import pandas as pd
import numpy as np
import heapq
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
import pickle
from random import *
from surprise import accuracy
from collections import defaultdict
from surprise import KNNBasic
from collections import defaultdict
from operator import itemgetter
import NetflixLoadData as NetflixLoadData

# Explanation

# Notebook

In [10]:
use_pickle_file = True
max_n = 2500000 #how many rows we want from data_ratings and data_rating_plus_movie_title
reader = Reader(line_format='user item rating', rating_scale=(1, 5))

## Get data

In [12]:
data_movies, data_rating, data_rating_plus_movie_title, _ = NetflixLoadData.get_data_files(use_small_dataset=True)

In [13]:
data_rating

Unnamed: 0,movie_id,customer_id,rating
0,1,1488844,3.0
1,1,822109,5.0
2,1,885013,4.0
3,1,30878,4.0
4,1,823519,3.0
...,...,...,...
813413,4499,101554,4.0
813414,4499,1334851,3.0
813415,4499,1852040,1.0
813416,4499,185372,1.0


## Helper functions

In [14]:
def save_to_pickle(name, df):
    path_name = "pickle/"+name+".pickle"
    pickle_file = open(path_name,"wb")
    pickle.dump(df, pickle_file)
    pickle_file.close()

def load_pickle(name):
    path_name = "pickle/"+name+".pickle"
    return_input = open(path_name, "rb")
    return pickle.load(return_input)

## Finding information about the data before doing anything major

In [15]:
import MovieCustomerInformation as information
max_rating = 4
min_rating = 4

### Details about all customers/movies

#### finding out size of the data being used

In [128]:
print("number of customers")
number_of_customers =  len(information.all_average_ratings(df=data_rating, type='customer_id')['customer_id'])
print(number_of_customers)

print(len(data_rating["customer_id"].unique()))

number of customers
4829
4829


In [127]:
number_of_movies = len(information.all_average_ratings(df=data_rating, type='movie_id')['movie_id'])
print("Number of movies rated", number_of_movies)
print("Total movies in dataset", len(data_movies["movie_id"].unique()))

Number of movies rated 4499
Total movies in dataset 17770


#### average rating

In [122]:
customer_average_rating = information.all_average_ratings(df=data_rating, type='customer_id')["avg_rating"].mean()
print("All customers average ratings", customer_average_rating)

All customers average ratings 3.5321404327111


In [123]:
movie_average_rating = information.all_average_ratings(df=data_rating, type='movie_id')['avg_rating'].mean()
print("All movies average ratings", movie_average_rating)

All movies average ratings 2.7833812231234294


In [130]:
# get the average movie rating for all customers
# used to determine if this user typically gives bad or good reviews
# and then we can see if he really hates or loves a movie
all_customers_average_ratings =  information.all_average_ratings(df=data_rating, type='customer_id')
## sort by how many ratings person has given (rating count)
print("Average rating for all customers (sorted by count)")
print(all_customers_average_ratings.sort_values(by=[('rating', 'count')], ascending=False))

Average rating for all customers (sorted by count)
     customer_id   rating       avg_rating
                      sum count           
549       305344   8464.0  4467   1.894784
727       387418   8079.0  4422   1.827001
4425     2439493   4973.0  4195   1.185459
3049     1664010  17104.0  4019   4.255785
3861     2118461  15390.0  3769   4.083311
...          ...      ...   ...        ...
870       470861      5.0     1   5.000000
1731      931793      5.0     1   5.000000
3953     2176039      3.0     1   3.000000
368       196497      5.0     1   5.000000
3652     1989766      4.0     1   4.000000

[4829 rows x 4 columns]


In [129]:
all_movies_average_rating = information.all_average_ratings(df=data_rating, type='movie_id')
## sort by how many ratings movie has (rating count)
print("Average rating for all movies (sorted by count)")
print(all_movies_average_rating.sort_values(by=[('rating', 'count')], ascending=False))

Average rating for all movies (sorted by count)
     movie_id   rating       avg_rating
                   sum count           
1904     1905  11738.0  3030   3.873927
570       571  12222.0  3021   4.045680
2451     2452  11777.0  2736   4.304459
4305     4306  10877.0  2642   4.116957
2861     2862  11235.0  2638   4.258908
...       ...      ...   ...        ...
1140     1141      7.0     4   1.750000
4293     4294     15.0     4   3.750000
886       887     14.0     4   3.500000
2536     2537      3.0     2   1.500000
4372     4373      1.0     1   1.000000

[4499 rows x 4 columns]


In [132]:
print("As seen as seen here below customer with customer_id = 305344 (which is the top rater) is in 4 out of 5 of the lowest rated movies,")
print("this shows that he could be responsible for many of the items having a low score in the data.")
print(data_rating[data_rating["movie_id"] == 4373])
print(data_rating[data_rating["movie_id"] == 2537])
print(data_rating[data_rating["movie_id"] == 887])
print(data_rating[data_rating["movie_id"] == 4294])
print(data_rating[data_rating["movie_id"] == 1141])

As seen as seen here below customer with customer_id = 305344 (which is the top rater) is in 4 out of 5 of the lowest rated movies,
this shows that he could be responsible for many of the items having a low score in the data.
        movie_id  customer_id  rating
789920      4373       305344     1.0
        movie_id  customer_id  rating
457875      2537       199769     2.0
457876      2537       305344     1.0
        movie_id  customer_id  rating
166319       887      1314869     3.0
166320       887      1899913     3.0
166321       887      2086129     3.0
166322       887       908626     5.0
        movie_id  customer_id  rating
769926      4294      2536523     5.0
769927      4294       721369     4.0
769928      4294       984703     5.0
769929      4294       305344     1.0
        movie_id  customer_id  rating
206926      1141      1471238     3.0
206927      1141       305344     1.0
206928      1141      2056022     2.0
206929      1141       387418     1.0


#### rating (low/high)

In [133]:
customer_ratings_low = information.get_avg_rating_less_than(df=all_customers_average_ratings , max_rating=max_rating)
print("Showing customers with scores lower than ", max_rating)
customer_ratings_low

Showing customers with scores lower than  4


Unnamed: 0_level_0,customer_id,rating,rating,avg_rating
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,count,Unnamed: 4_level_1
2,967,87.0,24,3.625000
3,1333,2007.0,750,2.676000
4,2442,404.0,118,3.423729
5,3321,2718.0,1062,2.559322
7,3718,529.0,175,3.022857
...,...,...,...,...
4824,2646591,163.0,51,3.196078
4825,2647871,814.0,248,3.282258
4826,2648122,306.0,85,3.600000
4827,2648650,83.0,27,3.074074


In [135]:
customer_ratings_high = information.get_avg_rating_higher_than(df=all_customers_average_ratings, min_rating=min_rating)
print("Showing customers with scores higher (or equal) to ", min_rating)
customer_ratings_high

Showing customers with scores higher (or equal) to  4


Unnamed: 0_level_0,customer_id,rating,rating,avg_rating
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,count,Unnamed: 4_level_1
0,685,124.0,30,4.133333
1,915,30.0,7,4.285714
6,3417,165.0,41,4.024390
12,5225,108.0,26,4.153846
18,8117,1062.0,260,4.084615
...,...,...,...,...
4800,2632269,103.0,25,4.120000
4804,2634414,33.0,8,4.125000
4806,2635437,499.0,115,4.339130
4807,2635895,314.0,75,4.186667


In [23]:
print("number of rows in low", len(customer_ratings_low))
print("number of rows in high", len(customer_ratings_high))

number of rows in low 4039
number of rows in high 721


In [28]:
customers_low_high_ratings_percentage = ((len(customer_ratings_high)-len(customer_ratings_low)) / len(customer_ratings_low))*100
if(customers_low_high_ratings_percentage <= 0):
    print("high rating dataframe is " , round(customers_low_high_ratings_percentage, 2),'% smaller than low rating dataframe')

if(customers_low_high_ratings_percentage > 0):
    print("high rating dataframe is " , round(customers_low_high_ratings_percentage, 2),'% bigger than low rating dataframe')

high rating dataframe is  -82.15 % smaller than low rating dataframe


#### average rating (low/high)

In [136]:
print("Movies with avg_rating score less than ", max_rating)
information.get_avg_rating_less_than(df=all_movies_average_rating, max_rating=max_rating)

Movies with avg_rating score less than  4


Unnamed: 0_level_0,movie_id,rating,rating,avg_rating
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,count,Unnamed: 4_level_1
0,1,2051.0,547,3.749543
1,2,516.0,145,3.558621
2,3,7326.0,2012,3.641153
3,4,389.0,142,2.739437
4,5,4468.0,1140,3.919298
...,...,...,...,...
4494,4495,162.0,60,2.700000
4495,4496,2676.0,713,3.753156
4496,4497,220.0,84,2.619048
4497,4498,88.0,36,2.444444


In [137]:
print("Movies with avg_rating score higher (or equal) to ", min_rating)
information.get_avg_rating_higher_than(df=all_movies_average_rating, min_rating=min_rating)

Movies with avg_rating score higher (or equal) to  4


Unnamed: 0_level_0,movie_id,rating,rating,avg_rating
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,count,Unnamed: 4_level_1
12,13,38.0,8,4.75
222,223,3594.0,875,4.107429
240,241,5359.0,1317,4.069096
570,571,12222.0,3021,4.04568
752,753,33.0,8,4.125
871,872,5000.0,1206,4.145937
885,886,7719.0,1918,4.024505
1019,1020,3501.0,842,4.157957
1255,1256,925.0,227,4.07489
1475,1476,1936.0,447,4.331096


In [145]:
print("count of ratings by each user")
print(all_customers_average_ratings[[('customer_id',''),('rating', 'count')]].sort_values(('rating', 'count'), ascending=False))

count of ratings by each user
     customer_id rating
                  count
549       305344   4467
727       387418   4422
4425     2439493   4195
3049     1664010   4019
3861     2118461   3769
...          ...    ...
870       470861      1
1731      931793      1
3953     2176039      1
368       196497      1
3652     1989766      1

[4829 rows x 2 columns]


find customers who rated a random movie

In [158]:
movie_to_find = data_movies['movie_title'][randrange(1000)]
information.get_customers_who_rated_movie_title(movie_title=movie_to_find)

Unnamed: 0,movie_id,customer_id,rating,movie_year,movie_title
22898,143,1331154,5.0,1997.0,The Game
22899,143,1644750,4.0,1997.0,The Game
22900,143,2031561,3.0,1997.0,The Game
22901,143,1997470,3.0,1997.0,The Game
22902,143,2467008,4.0,1997.0,The Game
...,...,...,...,...,...
24092,143,1535440,3.0,1997.0,The Game
24093,143,2456457,5.0,1997.0,The Game
24094,143,2118159,4.0,1997.0,The Game
24095,143,2154579,2.0,1997.0,The Game


In [159]:
information.get_avg_rating_for_movie_title(movie_title=movie_to_find)

Unnamed: 0_level_0,movie_id,rating,rating,avg_rating,movie_id,movie_id,movie_id
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,count,Unnamed: 4_level_1,movie_id,movie_year,movie_title
142,143,4288.0,1199,3.576314,143,1997.0,The Game


finding customers who rated a specific movie

In [74]:

information.get_customers_who_rated_movie_title(movie_title="Harold and Kumar Go to White Castle")

Unnamed: 0,movie_id,customer_id,rating,movie_year,movie_title
51402,290,1436762,4.0,2004.0,Harold and Kumar Go to White Castle
51403,290,1745265,5.0,2004.0,Harold and Kumar Go to White Castle
51404,290,1176140,2.0,2004.0,Harold and Kumar Go to White Castle
51405,290,439011,5.0,2004.0,Harold and Kumar Go to White Castle
51406,290,2632461,4.0,2004.0,Harold and Kumar Go to White Castle
...,...,...,...,...,...
52628,290,999312,5.0,2004.0,Harold and Kumar Go to White Castle
52629,290,520675,4.0,2004.0,Harold and Kumar Go to White Castle
52630,290,1550207,2.0,2004.0,Harold and Kumar Go to White Castle
52631,290,1424777,3.0,2004.0,Harold and Kumar Go to White Castle


In [75]:
information.get_avg_rating_for_movie_title(movie_title="Harold and Kumar Go to White Castle")

Unnamed: 0_level_0,movie_id,rating,rating,avg_rating,movie_id,movie_id,movie_id
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,count,Unnamed: 4_level_1,movie_id,movie_year,movie_title
289,290,4351.0,1231,3.534525,290,2004.0,Harold and Kumar Go to White Castle


In [160]:
def generate_random_user_ids(from_n, to_n):
    ids = np.unique(data_rating['customer_id'])[from_n:to_n]
    print("random user id used while in development\n")
    return ids
## use when developing the program and need new customer_ids
#generate_random_user_ids(1000, 1500)

### Details about a specific customer

In [162]:
# ids I like -> 28812, 56514, 56520, 1488844, 84100, 534046
customer_id_use = 534046

In [164]:
print("Movies/TV Shows user", customer_id_use, "has rated")
information.all_id_rows(df=data_rating_plus_movie_title, type="customer_id", item_id=customer_id_use)

Movies/TV Shows user 534046 has rated


Unnamed: 0,movie_id,customer_id,rating,movie_year,movie_title
3131,5,534046,5.0,2004.0,The Rise and Fall of ECW
8700,30,534046,5.0,2003.0,Something's Gotta Give
12247,52,534046,5.0,2002.0,The Weather Underground
31214,187,534046,3.0,2002.0,Death to Smoochy
34441,194,534046,4.0,1996.0,Arliss: The Best of Arliss
51683,290,534046,4.0,2004.0,Harold and Kumar Go to White Castle
111420,571,534046,4.0,1999.0,American Beauty
187786,1029,534046,1.0,1995.0,Tromeo and Juliet
210575,1148,534046,4.0,1981.0,For Your Eyes Only
289148,1642,534046,5.0,1995.0,Casino: 10th Anniversary Edition


In [166]:
print("User", customer_id_use, "stats (sum of all ratings, count of ratings, avg_rating)")
information.get_item_avg_rating(df=all_customers_average_ratings, type='customer_id', item_id=customer_id_use)

User 534046 stats (sum of all ratings, count of ratings, avg_rating)


Unnamed: 0_level_0,customer_id,rating,rating,avg_rating
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,count,Unnamed: 4_level_1
1005,534046,113.0,28,4.035714


In [168]:
#display_movies_customer_rated_higher_than(customer_id=customer_id_use, min_rating=4)
print("Movies/TV Shows user", customer_id_use, "has rated higher (or equal) to", min_rating)
information.display_movies_customer_rated_higher_than(df=data_rating_plus_movie_title, customer_id=customer_id_use, min_rating=min_rating)

Movies/TV Shows user 534046 has rated higher (or equal) to 4
                                  movie_title  rating
movie_id                                             
5                    The Rise and Fall of ECW     5.0
30                     Something's Gotta Give     5.0
52                    The Weather Underground     5.0
194                Arliss: The Best of Arliss     4.0
290       Harold and Kumar Go to White Castle     4.0
571                           American Beauty     4.0
1148                       For Your Eyes Only     4.0
1642         Casino: 10th Anniversary Edition     5.0
1719       The Life Aquatic with Steve Zissou     5.0
1798                            Lethal Weapon     4.0
1843                    Beverly Hills Cop III     4.0
2178                                     Lock     4.0
2464                                 Trekkies     4.0
2782                               Braveheart     4.0
2862                 The Silence of the Lambs     5.0
2939              Mic

In [169]:
print("Movies/TV Shows user", customer_id_use, "has rated lower than", max_rating)
information.display_movies_customer_rated_lower_than(df= data_rating_plus_movie_title,customer_id=customer_id_use, max_rating=max_rating)

Movies/TV Shows user 534046 has rated lower than 4
                  movie_title  rating
movie_id                             
187          Death to Smoochy     3.0
1029        Tromeo and Juliet     1.0
1700            Bottle Rocket     3.0
1744        Beverly Hills Cop     3.0
2840      Fantasm Comes Again     1.0
average rating 1005    4.035714
Name: avg_rating, dtype: float64


In [173]:
print("(Results in this cell is the same as above but now shows clearer which movies/tv shows customer likes and which he hates)")
information.get_users_loved_hated_movies(df=data_rating_plus_movie_title, customer_id=customer_id_use, minmax_rating=4)

(Results in this cell is the same as above but now shows clearer which movies/tv shows customer likes and which he hates)
User 534046 loved these movies
The Rise and Fall of ECW
Something's Gotta Give
The Weather Underground
Arliss: The Best of Arliss
Harold and Kumar Go to White Castle
American Beauty
For Your Eyes Only
Casino: 10th Anniversary Edition
The Life Aquatic with Steve Zissou
Lethal Weapon
Beverly Hills Cop III
Lock
Trekkies
Braveheart
The Silence of the Lambs
Michael Moore Hates America
The Godfather
Lethal Weapon 3
Goldfinger
The People vs. Larry Flynt
Garden State
A Mighty Wind
Road to Perdition

and disliked these movies
Death to Smoochy
Tromeo and Juliet
Bottle Rocket
Beverly Hills Cop
Fantasm Comes Again


In [177]:
print("showing user", customer_id_use, "average_rating against all other users that have given a rating\n")
all_customers_avg_rating = all_customers_average_ratings["avg_rating"].mean()

usr = information.all_average_ratings(df=data_rating, type='customer_id')
usr = usr[usr["customer_id"] == 596533]

change = ((usr["avg_rating"] - all_customers_avg_rating) / all_customers_avg_rating)*100


print("average rating for user:", float(usr["avg_rating"]))
print("average rating for all users:", all_customers_avg_rating)
print("difference between them", round(float(change), 2), "%")

showing user 534046 average_rating against all other users that have given a rating

average rating for user: 4.571428571428571
average rating for all users: 3.5321404327111
difference between them 29.42 %


## Recommendations with collaborative filtering

#### functions

In [84]:
from surprise import dataset, KNNBaseline, accuracy
from surprise.model_selection import train_test_split, LeaveOneOut

def get_drop_list(type="movie_id"):
    #>IF MOVIE:  movie_id, movie rating count, movie rating mean
    #>IF CUSTOMER: customer_id, custumer rating count, customer rating mean
    df_count_mean_summary = data_rating.groupby(type)['rating'].agg(['count', 'mean'])
    df_count_mean_summary.index = df_count_mean_summary.index.map(int)
    #>IF MOVIE: returns 1799.0 as a benchmark number
    #IF CUSTOMER: returns 52.0 as a benchmark number
    benchmark = round(df_count_mean_summary['count'].quantile(0.7),0)
    # drop all rows below benchmark
    df_drop_list = df_count_mean_summary[df_count_mean_summary['count'] < benchmark]
    # return all indexes to drop
    return df_drop_list

def get_customer_recommendations(customer_id, predictor):
    #> returns movie_id, movie_year, movie_title
    chosen_customer_pred = data_movies.copy()
    # fails if movie_id is the index so we have to reset the index back to normal (0-N)
    chosen_customer_pred = chosen_customer_pred.reset_index()
    # makes sure that we only pick movies that are not in the movie dropped list
    chosen_customer_pred = chosen_customer_pred[~chosen_customer_pred['movie_id'].isin(df_movie_drop_list)]
    # make prediction for customer with id = <customer_id> and put it into 'estimated_score'
    chosen_customer_pred['estimated_score'] = chosen_customer_pred['movie_id'].apply(lambda x: predictor.predict(customer_id, x).est)
    # sort by 'estimated score'
    chosen_customer_pred = chosen_customer_pred.sort_values('estimated_score', ascending=False).set_index('movie_id')
    return chosen_customer_pred

# print out movies/tv show user has previously rated
def display_rated_content(customer_id=customer_id_use, number_to_show=20):
    print("Movies/TV Shows rated by customer", customer_id)
    #df = all_customers_id_plus_movie_title_rows(customer_id=customer_id)
    df = information.all_id_rows(df=data_rating_plus_movie_title, type='customer_id', item_id=customer_id)
    df = df[['movie_title', 'rating']].sort_values('rating', ascending=False)
    print(df[0:number_to_show].set_index('movie_title'))

def display_customers_recommendations(customer_id=customer_id_use, df=[], number_to_show=20):
    print("Movies/TV Shows recommended to customer")
    tmp_df = df[['movie_title', 'estimated_score']][0:number_to_show]
    tmp_df = tmp_df.set_index('movie_title')
    print(tmp_df)

def display_recommendation(customer_id, number_to_show, predictor):
    chosen_customer_pred = get_customer_recommendations(customer_id=customer_id, predictor=predictor)
    display_customers_recommendations(customer_id=customer_id, df=chosen_customer_pred, number_to_show=number_to_show)

def print_evaluation_accuracy(predictions):
    print("\nEvaluating accuracy of model...")
    print("RMSE: ", accuracy.rmse(predictions, verbose=False))
    print("MSE: ", accuracy.mse(predictions, verbose=False))
    print("MAE: ", accuracy.mae(predictions, verbose=False))
    # FCP = Fraction of Concordant Pairs
    print("FCP: ", accuracy.mae(predictions, verbose=False))

#### create recommenders

In [85]:
## creating a list of the movies that we don't wanna include
df_movie_drop_list = get_drop_list(type="movie_id")
#df_customer_drop_list = get_summary(type="customer_id")

In [86]:
def test_against_our_custom_function():
    _, data_rating, _, _ = NetflixLoadData.get_data_files(use_small_dataset=True)
    dataset = Dataset.load_from_df(data_rating[['customer_id', 'movie_id', 'rating']], reader)
    fullTrainset = dataset.build_full_trainset()

    trainSet, testSet = train_test_split(dataset, test_size=.25, random_state=1)

    algo = SVD(random_state=10)
    algo.fit(trainSet)

    predictions = algo.test(testSet)
    print("\nEvaluating accuracy of model...")
    print("RMSE: ", accuracy.rmse(predictions, verbose=False))
    print("MSE: ", accuracy.mse(predictions, verbose=False))
    print("MAE: ", accuracy.mae(predictions, verbose=False))
    # FCP = Fraction of Concordant Pairs
    print("FCP: ", accuracy.mae(predictions, verbose=False))


class convert_to_raw_ratings(dataset.DatasetAutoFolds):
    def __init__(self, df, reader):
        self.raw_ratings = [(uid, iid, r, None) for (uid, iid, r) in zip(df['customer_id'], df['movie_id'], df['rating'])]
        self.reader=reader

raw_ratings = convert_to_raw_ratings(data_rating, reader)
print("\nBuilding recommendation model...")
trainSet, testSet = train_test_split(raw_ratings, test_size=.25, random_state=1)

algo = SVD(random_state=10)
algo.fit(trainSet)
algo_predictions = algo.test(testSet)


Building recommendation model...


In [87]:
data_movies

Unnamed: 0,movie_id,movie_year,movie_title
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


In [88]:
save_to_pickle("algorithm_svd", algo)
# #save_to_pickle("algo_raw_rating", raw_ratings)
# save_to_pickle("algo_trainSet", trainSet)
# save_to_pickle("algo_testSet", testSet)

In [89]:
#cross_validate(algo, raw_ratings, measures=['MSE', 'RMSE', 'MAE'], cv=5, verbose=True)

In [90]:
svd = SVD(random_state=10)
data_new = Dataset.load_from_df(data_rating[['customer_id', 'movie_id', 'rating']], reader)
trainset = data_new.build_full_trainset()
svd.fit(trainset)
svd_predictions = svd.test(testSet)
print_evaluation_accuracy(svd_predictions)



Evaluating accuracy of model...
RMSE:  0.670030900148855
MSE:  0.44894140715428493
MAE:  0.5221479828726231
FCP:  0.5221479828726231


In [91]:
#cross_validate(svd, data_new, measures=['MSE', 'RMSE', 'MAE'], cv=5, verbose=True)

In [92]:
#surprise.get_neighbours(1, 10)

In [93]:
##TODO: look into knows_user 
#       -> which tells if the user is part of the trainset,
#           -> equals true if user has at least one rating

### display recommendations

In [94]:
number_to_show = 10

#### customer = 79724

In [95]:
display_rated_content(customer_id=79724, number_to_show=number_to_show)

Movies/TV Shows rated by customer 79724
Empty DataFrame
Columns: [rating]
Index: []


In [96]:
display_recommendation(customer_id=79724, number_to_show=number_to_show, predictor=svd)

Movies/TV Shows recommended to customer
                                                    estimated_score
movie_title                                                        
Lost: Season 1                                             4.521422
The Simpsons: Season 6                                     4.369304
The Godfather                                              4.355742
Six Feet Under: Season 4                                   4.350956
The Silence of the Lambs                                   4.326521
Lord of the Rings: The Fellowship of the Ring              4.285705
Law & Order: Special Victims Unit: The Second Year         4.252380
The Simpsons: Season 3                                     4.203618
The West Wing: Season 3                                    4.195865
Nip/Tuck: Season 2                                         4.181070


In [97]:
display_recommendation(customer_id=79724, number_to_show=number_to_show, predictor=algo)

Movies/TV Shows recommended to customer
                                               estimated_score
movie_title                                                   
Lost: Season 1                                        4.542424
The Simpsons: Season 6                                4.391883
The Godfather                                         4.340253
Six Feet Under: Season 4                              4.320674
Lord of the Rings: The Fellowship of the Ring         4.278653
Family Guy: Freakin' Sweet Collection                 4.260342
The Simpsons: Treehouse of Horror                     4.247825
The Simpsons: Season 3                                4.246159
The Silence of the Lambs                              4.240932
Nip/Tuck: Season 2                                    4.239350


#### customer = customer_id_use

In [98]:
display_rated_content(customer_id=customer_id_use, number_to_show=number_to_show)

Movies/TV Shows rated by customer 534046
                                    rating
movie_title                               
The Rise and Fall of ECW               5.0
Casino: 10th Anniversary Edition       5.0
A Mighty Wind                          5.0
The Godfather                          5.0
Michael Moore Hates America            5.0
The Silence of the Lambs               5.0
Something's Gotta Give                 5.0
The Life Aquatic with Steve Zissou     5.0
Road to Perdition                      5.0
The Weather Underground                5.0


In [99]:
display_recommendation(customer_id=customer_id_use, number_to_show=number_to_show, predictor=svd)

Movies/TV Shows recommended to customer
                                   estimated_score
movie_title                                       
The Silence of the Lambs                  4.975386
Six Feet Under: Season 4                  4.920907
Curb Your Enthusiasm: Season 3            4.874172
The Simpsons: Season 6                    4.871290
The Godfather                             4.827125
The Simpsons: Season 3                    4.770718
The Life of Birds                         4.757527
The Life of Mammals                       4.720321
The Simpsons: Treehouse of Horror         4.706994
The Simpsons: Season 1                    4.687082


In [100]:
display_recommendation(customer_id=customer_id_use, number_to_show=number_to_show, predictor=algo)
# recommends for example: Reservoir Dogs, which the user has already rated

Movies/TV Shows recommended to customer
                                                    estimated_score
movie_title                                                        
The Simpsons: Season 6                                     5.000000
Lord of the Rings: The Fellowship of the Ring              5.000000
The Simpsons: Season 3                                     5.000000
The Silence of the Lambs                                   5.000000
The Godfather                                              4.965630
The Twilight Zone: Vol. 3                                  4.920416
Lost: Season 1                                             4.917772
Law & Order: Special Victims Unit: The Second Year         4.902102
The Twilight Zone: Vol. 42                                 4.847656
Aliens: Collector's Edition                                4.846262


### print accuracy

In [101]:
print_evaluation_accuracy(svd_predictions)


Evaluating accuracy of model...
RMSE:  0.670030900148855
MSE:  0.44894140715428493
MAE:  0.5221479828726231
FCP:  0.5221479828726231


In [102]:
print_evaluation_accuracy(algo_predictions)


Evaluating accuracy of model...
RMSE:  0.9048402795139483
MSE:  0.8187359314308801
MAE:  0.7014346782230714
FCP:  0.7014346782230714


In [103]:
test_against_our_custom_function() #for algo predictions


Evaluating accuracy of model...
RMSE:  0.9048402795139483
MSE:  0.8187359314308801
MAE:  0.7014346782230714
FCP:  0.7014346782230714


results are the same so we will not use the custom function since it adds no value

## other

In [39]:
def getMovieName(movieID):
  if int(movieID) in tmp_data_movies:
      return tmp_data_movies[int(movieID)]
  else:
      return ""

In [40]:
dataset = Dataset.load_from_df(data_rating[['customer_id', 'movie_id', 'rating']], reader)
fullTrainset = dataset.build_full_trainset()
fullTestSet = fullTrainset.build_anti_testset()
trainSet, testSet = train_test_split(dataset, test_size=.25, random_state=1)

In [41]:
# creating a dict of movie id and movie_title to make sure we don't recommend user something he has rated before
tmp_data_movies = data_movies[['movie_id', 'movie_title']]
tmp_data_movies = tmp_data_movies.set_index('movie_id').T
tmp_data_movies = tmp_data_movies.to_dict('list')
tmp_data_movies = {k: str(v[0]) for k,v in tmp_data_movies.items()}

In [42]:
data_movies

Unnamed: 0,movie_id,movie_year,movie_title
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004.0,Fidel Castro: American Experience
17767,17768,2000.0,Epoch
17768,17769,2003.0,The Company


In [43]:
tmp_data_movies

{1: 'Dinosaur Planet',
 2: 'Isle of Man TT 2004 Review',
 3: 'Character',
 4: "Paula Abdul's Get Up & Dance",
 5: 'The Rise and Fall of ECW',
 6: 'Sick',
 7: '8 Man',
 8: 'What the #$*! Do We Know!?',
 9: "Class of Nuke 'Em High 2",
 10: 'Fighter',
 11: 'Full Frame: Documentary Shorts',
 12: 'My Favorite Brunette',
 13: 'Lord of the Rings: The Return of the King: Extended Edition: Bonus Material',
 14: 'Nature: Antarctica',
 15: 'Neil Diamond: Greatest Hits Live',
 16: 'Screamers',
 17: '7 Seconds',
 18: 'Immortal Beloved',
 19: "By Dawn's Early Light",
 20: 'Seeta Aur Geeta',
 21: 'Strange Relations',
 22: 'Chump Change',
 23: "Clifford: Clifford Saves the Day! / Clifford's Fluffiest Friend Cleo",
 24: 'My Bloody Valentine',
 25: 'Inspector Morse 31: Death Is Now My Neighbour',
 26: 'Never Die Alone',
 27: "Sesame Street: Elmo's World: The Street We Live On",
 28: 'Lilo and Stitch',
 29: 'Boycott',
 30: "Something's Gotta Give",
 31: 'Classic Albums: Meat Loaf: Bat Out of Hell',
 32: 

In [56]:
similarity_matrix = KNNBasic(sim_options={'name': 'cosine', 'user_based': False}).fit(fullTrainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


In [57]:
#save_to_pickle("knnbasic_similarity_matrix", similarity_matrix)

In [47]:
# generate_random_user_ids(1000, 1500)

In [48]:
# for i in generate_random_user_ids(1000, 1500):
#     if (len(all_custumers_id_rows(i)) > 3):
#         print(i)

In [49]:
test_subject = 596533#16272
k = 10

In [50]:
test_subject_iid = trainSet.to_inner_uid(test_subject)

test_subject_ratings = trainSet.ur[test_subject_iid]
k_neighbours = heapq.nlargest(k, test_subject_ratings, key=lambda t: t[1])

In [51]:
def get_candidates(k_neighbours):
    candidates = defaultdict(float)
    for itemID, rating in k_neighbours:
        try:
            similarities = similarity_matrix[itemID]
            for innerID, score in enumerate(similarities):
                candidates[innerID] += score * (rating / 5.0)
        except:
            continue
    return candidates

candidates = get_candidates(k_neighbours)

In [52]:
def get_watched(trainset):
    watched = {}
    for itemID, rating in trainset.ur[test_subject_iid]:
        watched[itemID] = 1
    return watched
watched = get_watched(trainSet)


In [53]:
def get_recommendations(candidates):
    recommendations = []
    position = 0
    for itemID, rating_sum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
        if not itemID in watched:
            recommendations.append(getMovieName(trainSet.to_raw_iid(itemID)))
            position += 1
            # only want top n which in our case in 10
            if(position > 10): break
    return recommendations
recommendations = get_recommendations(candidates)

In [54]:
for rec in recommendations:
    print("Movie: ", rec)

In [55]:
print("user", test_subject, 'previous recommendations')
display_recommendation(customer_id=test_subject, number_to_show=10, predictor=svd)

user 596533 previous recommendations


NameError: name 'svd' is not defined

In [None]:
#TODO:
#for the new user problem
## find highest rated movie / tv shows that have been watched the most -> how likely is new user to click on it ?

In [None]:
#cross_validate(similarity_matrix, dataset, measures=['MSE', 'RMSE', 'MAE'], cv=5, verbose=True)