In [1]:
#!pip install surprise
#!pip install import-ipynb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
import pickle
#sns.set_style("darkgrid")
from surprise import accuracy
from collections import defaultdict
from surprise import KNNBasic

from collections import defaultdict
from operator import itemgetter
import heapq
import os
import csv

In [2]:
use_pickle_file = True

## Helper functions

In [3]:
def all_movie_id_rows(movie_id):
    return data_rating[data_rating['movie_id'] == movie_id]

def all_custumers_id_rows(customer_id):
    return data_rating[data_rating['customer_id'] == customer_id]

def all_customers_id_plus_movie_title_rows(customer_id):
    return data_rating_plus_movie_title[data_rating_plus_movie_title['customer_id'] == customer_id]

def all_movies_get_average_rating():
    movie_ratings_stats = data_rating.groupby('movie_id').agg({'rating': ['sum', 'count']}).reset_index()
    movie_ratings_stats['avg_rating'] =  movie_ratings_stats['rating']['sum'] / movie_ratings_stats['rating']['count']
    return movie_ratings_stats

def all_customers_get_average_rating():
    customers_ratings_stats = data_rating.groupby('customer_id').agg({'rating': ['sum', 'count']}).reset_index()
    customers_ratings_stats['avg_rating'] =  customers_ratings_stats['rating']['sum'] / customers_ratings_stats['rating']['count']
    return customers_ratings_stats

def all_movies_get_rated_count():
    return data_rating.groupby('movie_id').agg({'movie_id': 'count'}).reset_index()

def all_customers_get_movie_rated_count():
    return data_rating.groupby('customer_id').agg({'movie_id': 'count'}).reset_index()

def get_customers_avg_rating_less_than(max_rating):
    print(all_customers_average_ratings[all_customers_average_ratings['avg_rating'] < max_rating])

def get_movies_avg_rating_less_than(max_rating):
    print(all_movies_average_rating[all_movies_average_rating['avg_rating'] < max_rating])

def get_customers_avg_rating_higher_than(min_rating):
    print(all_customers_average_ratings[all_customers_average_ratings['avg_rating'] > min_rating])

def get_movies_avg_rating_higher_than(min_rating):
    print(all_movies_average_rating[all_movies_average_rating['avg_rating'] > min_rating])


def get_movie_avg_rating(movie_id):
    output = all_movies_average_rating[all_movies_average_rating['movie_id'] == movie_id]
    #print(output)
    return output['avg_rating']

def get_users_avg_rating(customer_id):
    output = all_customers_average_ratings[all_customers_average_ratings['customer_id'] == customer_id]
    #print(output)
    return output['avg_rating']

def get_movies_customer_rated_higher_than(customer_id, min_rating=4):
    temp = data_rating_plus_movie_title.copy()
    return temp[(temp['customer_id'] == customer_id) & (temp['rating'] >= min_rating)].set_index('movie_id')

def get_movies_customer_rated_lower_than(customer_id, max_rating=4):
    temp = data_rating_plus_movie_title.copy()
    return temp[(temp['customer_id'] == customer_id) & (temp['rating'] < max_rating)].set_index('movie_id')

def display_movies_customer_rated_higher_than(customer_id, min_rating=0):
    df_customer_liked = get_movies_customer_rated_higher_than(customer_id=customer_id, min_rating=min_rating)
    print(df_customer_liked[['movie_title', 'rating']])
    print('average rating', get_users_avg_rating(customer_id))

def display_movies_customer_rated_lower_than(customer_id, max_rating=0):
    df_customer_disliked = get_movies_customer_rated_lower_than(customer_id=customer_id, max_rating=max_rating)
    print(df_customer_disliked[['movie_title', 'rating']])
    print('average rating', get_users_avg_rating(customer_id))

def save_to_pickle(name, df):
    path_name = "pickle/"+name+".pickle"
    pickle_file = open(path_name,"wb")
    pickle.dump(df, pickle_file)
    pickle_file.close()

def load_pickle(name):
    path_name = "pickle/"+name+".pickle"
    return_input = open(path_name, "rb")
    return pickle.load(return_input)


## Get the data

In [4]:
def create_dataframe(use_pickle=True):
    if(use_pickle == True):
        in_pickle = open("pickle/movies_customers_ratings.pickle", "rb")
        data = pd.DataFrame(pickle.load(in_pickle), columns=['movie_id', 'customer_id', 'rating', 'index']).drop(['index'], axis=1)
    else:
        index = 1
        last_movie_id = "1"
        new_data = []
        ## import all combined_data files to one large pandas dataframe
        ##> returns index, customer_id (which is both movie and customer), rating (NaN = customer_id is a movie, Not NaN = customers rating)
        df_all = pd.read_csv('./data/combined_data_1.txt', header = None, names = ['customer_id', 'rating'], usecols = [0,1])
        #df_all = df_all.append(pd.read_csv('./data/combined_data_2.txt', header = None, names = ['customer_id', 'rating'], usecols = [0,1]))
        #df_all = df_all.append(pd.read_csv('./data/combined_data_3.txt', header = None, names = ['customer_id', 'rating'], usecols = [0,1]))
        #df_all = df_all.append(pd.read_csv('./data/combined_data_4.txt', header = None, names = ['customer_id', 'rating'], usecols = [0,1]))
        df_all.index = np.arange(0,len(df_all))
        df_all['rating'] = df_all['rating'].astype(float)
        
        for customer_id in df_all["customer_id"]:
            # if we find : that means this is a movie_id and not customer_id
            if(customer_id.find(":") > 0):
                movie_id = customer_id.replace(":", "")
                last_movie_id = movie_id
            else:
                # we have this row index so use it to get rating
                rating = df_all["rating"][index-1]
                new_data.append([last_movie_id, customer_id, rating, index])
            index += 1
        #output to pickle file
        movies_customers_ratings = open("pickle/movies_customers_ratings.pickle","wb")
        pickle.dump(new_data, movies_customers_ratings)
        movies_customers_ratings.close()
        data = pd.DataFrame(new_data, columns=['movie_id', 'customer_id', 'rating', 'index']).drop(['index'], axis=1)

    # change columns to numerical
    data['movie_id'] = data['movie_id'].astype(int)
    data['customer_id'] = data['customer_id'].astype(int)
    data["rating"] = data["rating"].astype(float)
    return data

def get_dataframes(use_pickle=True):
    # dataframe containing all informations about the movies
    #> returns movie_id, movie_year, movie_title
    data_movies = pd.read_csv('./data/movie_titles.csv', header = None, names = ['movie_id', 'movie_year', 'movie_title'], usecols = [0,1,2], encoding="latin1")
    #data_movies_import.set_index('movie_id', inplace = True)

    ## ------------------------------------------------------------------------------------- ##

    # dataframe containing all informations about the movie ratings by customer
    #> returns index, movie_id, customer_id, rating
    data_rating = create_dataframe(use_pickle=use_pickle)

    # ## ------------------------------------------------------------------------------------- ##

    # ##combine customer ratings to movie titles
    # ##> returns index, movie_id, customer_id, rating, movie_year, movie_title
    data_rating_plus_movie_title = data_rating.merge(data_movies, on="movie_id", how="inner")

    ## ------------------------------------------------------------------------------------- ##

    data_movies_categorize = pd.read_csv('./data/movies.csv', header = None, names = ['movie_id', 'movie_title', 'genres'], usecols = [0,1,2], encoding="latin1")[1:] #dataset is off by one
    #data_movies_categorize.set_index('movie_id', inplace = True)
    data_movies_categorize_split = data_movies_categorize['movie_title'].str.split('(', n = 1, expand=True) # split movie_title to movie_title and movie_year
    data_movies_categorize_split[1] = data_movies_categorize_split[1].str.replace(r')', '') #removing ) at the end of movie_year
    data_movies_categorize["movie_year"] = data_movies_categorize_split[1]
    data_movies_categorize["movie_title"] = data_movies_categorize_split[0]
    #data_movies_categorize["movie_year"] = data_movies_categorize["movie_year"].astype(float)
    #data_movies_categorize
    data_movies_categorize_cleaned = data_movies_categorize[pd.to_numeric(data_movies_categorize['movie_year'], errors='coerce').notnull()]
    data_movies_categorize_cleaned["movie_year"] = data_movies_categorize_cleaned["movie_year"].astype(float)
    
    return data_movies, data_rating, data_rating_plus_movie_title, data_movies_categorize_cleaned


def get_users_loved_hated_movies(customer_id, minmax_rating):
    users_ratings_higher_than_four = get_movies_customer_rated_higher_than(customer_id=customer_id_use, min_rating=minmax_rating)
    users_ratings_lower_than_four = get_movies_customer_rated_lower_than(customer_id=customer_id_use, max_rating=minmax_rating)
    print("User", customer_id_use ,"loved these movies")
    for rating in users_ratings_higher_than_four['movie_title']:
        print(rating)
    print('')
    print("and disliked these movies")
    for rating in users_ratings_lower_than_four['movie_title']:
        print(rating)

in these txt files the movie_id's have been placed amongst the customer_id's, which means that we have to split that column into two, we do this by taking all ratings that are equal to NaN 
( meaning that we are now refering to a movie and not a customer ) and from there we split it into two columns.

In [5]:
data_movies, data_rating, data_rating_plus_movie_title, data_movies_categorized = get_dataframes(use_pickle=use_pickle_file)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_movies_categorize_cleaned["movie_year"] = data_movies_categorize_cleaned["movie_year"].astype(float)


In [6]:
max_n = 250000
data_rating = data_rating[:max_n]
data_rating_plus_movie_title = data_rating_plus_movie_title[:max_n]

#### try again later

In [7]:
# # temporarly while join with genre not working correctly
# data_movies = data_movies_import.copy()
# data_movies.set_index('movie_id', inplace = True)
# print(data_movies)

In [8]:
# x_temp = data_movies_import
# x2_temp= data_movies_categorize
# #x_temp.set_index("movie_id", inplace=True)
# #x2_temp.set_index("movie_id", inplace=True)

In [9]:
# temp_merge = pd.merge(x_temp, x2_temp, on=['movie_title', 'movie_year'], how='outer')
# temp_merge = temp_merge[["movie_year", "movie_title"]].drop_duplicates(subset=None, keep="last", inplace=False)

In [10]:
# df_merged = pd.merge(x_temp, x2_temp, on=["movie_title", "movie_year"], how="left")

In [11]:
#df_merged["genres"].isnull()

In [12]:
# #data_movies = pd.merge(x, x2, on=['movie_year', 'movie_title'], how='outer')
# data_movies = pd.merge(x, x2, on=["movie_title", "movie_year"], how='outer')
# data_movies#.set_index('movie_id', inplace = True)


In [13]:
# data_movies.groupby("movie_title").count()

#### end of try again later

## Working with the data

### Details about all customers/movies

In [14]:
# get the average movie rating for all customers
# used to determine if this user typically gives bad or good reviews
# and then we can see if he really hates or loves a movie
all_customers_average_ratings = all_customers_get_average_rating()
print(all_customers_average_ratings)

       customer_id rating       avg_rating
                      sum count           
0                6    3.0     1   3.000000
1                7   14.0     3   4.666667
2               42    4.0     1   4.000000
3               59    2.0     1   2.000000
4               79    3.0     1   3.000000
...            ...    ...   ...        ...
169942     2649376    5.0     1   5.000000
169943     2649378    6.0     2   3.000000
169944     2649388    3.0     1   3.000000
169945     2649426   12.0     3   4.000000
169946     2649429    5.0     1   5.000000

[169947 rows x 4 columns]


In [15]:
all_movies_average_rating = all_movies_get_average_rating()
print(all_movies_average_rating)

   movie_id    rating         avg_rating
                  sum   count           
0         1    2051.0     547   3.749543
1         2     516.0     145   3.558621
2         3    7326.0    2012   3.641153
3         4     389.0     142   2.739437
4         5    4468.0    1140   3.919298
5         6    3143.0    1019   3.084396
6         7     198.0      93   2.129032
7         8   47560.0   14910   3.189805
8         9     249.0      95   2.621053
9        10     792.0     249   3.180723
10       11     600.0     198   3.030303
11       12    1866.0     546   3.417582
12       13     569.0     125   4.552000
13       14     357.0     118   3.025424
14       15     953.0     290   3.286207
15       16    8363.0    2699   3.098555
16       17   20636.0    7108   2.903208
17       18   40576.0   10722   3.784369
18       19    1792.0     539   3.324675
19       20     365.0     116   3.146552
20       21     755.0     218   3.463303
21       22     456.0     203   2.246305
22       23    2

In [16]:
get_customers_avg_rating_less_than(max_rating=5)

       customer_id rating       avg_rating
                      sum count           
0                6    3.0     1   3.000000
1                7   14.0     3   4.666667
2               42    4.0     1   4.000000
3               59    2.0     1   2.000000
4               79    3.0     1   3.000000
...            ...    ...   ...        ...
169940     2649351    3.0     1   3.000000
169941     2649375    4.0     1   4.000000
169943     2649378    6.0     2   3.000000
169944     2649388    3.0     1   3.000000
169945     2649426   12.0     3   4.000000

[138194 rows x 4 columns]


In [17]:
get_customers_avg_rating_less_than(max_rating=4)

       customer_id rating       avg_rating
                      sum count           
0                6    3.0     1   3.000000
3               59    2.0     1   2.000000
4               79    3.0     1   3.000000
9              188    3.0     1   3.000000
17             296    7.0     2   3.500000
...            ...    ...   ...        ...
169933     2649285    7.0     2   3.500000
169934     2649296   10.0     3   3.333333
169940     2649351    3.0     1   3.000000
169943     2649378    6.0     2   3.000000
169944     2649388    3.0     1   3.000000

[73394 rows x 4 columns]


In [18]:
get_customers_avg_rating_higher_than(min_rating=4)

       customer_id rating       avg_rating
                      sum count           
1                7   14.0     3   4.666667
6              134    5.0     1   5.000000
10             199    5.0     1   5.000000
11             201    9.0     2   4.500000
15             268    5.0     1   5.000000
...            ...    ...   ...        ...
169931     2649267   10.0     2   5.000000
169932     2649268    5.0     1   5.000000
169938     2649331    5.0     1   5.000000
169942     2649376    5.0     1   5.000000
169946     2649429    5.0     1   5.000000

[40010 rows x 4 columns]


In [19]:
get_movies_avg_rating_less_than(max_rating=5)

   movie_id    rating         avg_rating
                  sum   count           
0         1    2051.0     547   3.749543
1         2     516.0     145   3.558621
2         3    7326.0    2012   3.641153
3         4     389.0     142   2.739437
4         5    4468.0    1140   3.919298
5         6    3143.0    1019   3.084396
6         7     198.0      93   2.129032
7         8   47560.0   14910   3.189805
8         9     249.0      95   2.621053
9        10     792.0     249   3.180723
10       11     600.0     198   3.030303
11       12    1866.0     546   3.417582
12       13     569.0     125   4.552000
13       14     357.0     118   3.025424
14       15     953.0     290   3.286207
15       16    8363.0    2699   3.098555
16       17   20636.0    7108   2.903208
17       18   40576.0   10722   3.784369
18       19    1792.0     539   3.324675
19       20     365.0     116   3.146552
20       21     755.0     218   3.463303
21       22     456.0     203   2.246305
22       23    2

In [20]:
get_movies_avg_rating_less_than(max_rating=4)

   movie_id    rating         avg_rating
                  sum   count           
0         1    2051.0     547   3.749543
1         2     516.0     145   3.558621
2         3    7326.0    2012   3.641153
3         4     389.0     142   2.739437
4         5    4468.0    1140   3.919298
5         6    3143.0    1019   3.084396
6         7     198.0      93   2.129032
7         8   47560.0   14910   3.189805
8         9     249.0      95   2.621053
9        10     792.0     249   3.180723
10       11     600.0     198   3.030303
11       12    1866.0     546   3.417582
13       14     357.0     118   3.025424
14       15     953.0     290   3.286207
15       16    8363.0    2699   3.098555
16       17   20636.0    7108   2.903208
17       18   40576.0   10722   3.784369
18       19    1792.0     539   3.324675
19       20     365.0     116   3.146552
20       21     755.0     218   3.463303
21       22     456.0     203   2.246305
22       23    2187.0     615   3.556098
23       24    3

In [21]:
count_movies_rated_by_customer = all_customers_get_movie_rated_count()
count_movies_rated_by_customer

Unnamed: 0,customer_id,movie_id
0,6,1
1,7,3
2,42,1
3,59,1
4,79,1
...,...,...
169942,2649376,1
169943,2649378,2
169944,2649388,1
169945,2649426,3


In [22]:
def generate_random_user_ids(from_n, to_n):
    ids = np.unique(data_rating['customer_id'])[from_n:to_n]
    print("random user id used while in development\n")
    return ids

In [23]:
generate_random_user_ids(1000, 1500)

random user id used while in development



array([16121, 16144, 16146, 16150, 16154, 16166, 16168, 16178, 16184,
       16189, 16201, 16211, 16229, 16239, 16241, 16272, 16273, 16287,
       16313, 16326, 16345, 16361, 16364, 16386, 16388, 16408, 16427,
       16455, 16456, 16475, 16479, 16487, 16525, 16540, 16558, 16565,
       16581, 16583, 16588, 16591, 16599, 16631, 16649, 16651, 16708,
       16719, 16723, 16736, 16747, 16786, 16789, 16792, 16800, 16818,
       16819, 16833, 16837, 16855, 16962, 17008, 17035, 17041, 17042,
       17062, 17063, 17064, 17078, 17083, 17088, 17119, 17128, 17136,
       17138, 17140, 17147, 17149, 17151, 17181, 17184, 17189, 17227,
       17246, 17252, 17268, 17274, 17288, 17299, 17321, 17336, 17369,
       17375, 17382, 17385, 17433, 17451, 17462, 17503, 17510, 17519,
       17530, 17576, 17634, 17643, 17665, 17672, 17690, 17691, 17697,
       17699, 17707, 17713, 17743, 17745, 17836, 17845, 17864, 17868,
       17890, 17914, 17916, 17919, 17924, 17946, 17949, 17957, 17982,
       17984, 17995,

### Details about a specific customer

In [24]:
# ids I like -> 28812, 56514, 56520, 1488844
customer_id_use = 84100

In [25]:
#all_custumers_id_rows(customer_id=customer_id_use)
## same as line above but including movie_year and movie_title
all_customers_id_plus_movie_title_rows(customer_id=customer_id_use)

Unnamed: 0,movie_id,customer_id,rating,movie_year,movie_title
141847,30,84100,5.0,2003.0,Something's Gotta Give


In [26]:
get_users_avg_rating(customer_id=customer_id_use)

5287    5.0
Name: avg_rating, dtype: float64

In [27]:
display_movies_customer_rated_higher_than(customer_id=customer_id_use, min_rating=4)

                     movie_title  rating
movie_id                                
30        Something's Gotta Give     5.0
average rating 5287    5.0
Name: avg_rating, dtype: float64


In [28]:
display_movies_customer_rated_lower_than(customer_id=customer_id_use, max_rating=4)

Empty DataFrame
Columns: [movie_title, rating]
Index: []
average rating 5287    5.0
Name: avg_rating, dtype: float64


In [29]:
get_users_loved_hated_movies(customer_id=customer_id_use, minmax_rating=4)

User 84100 loved these movies
Something's Gotta Give

and disliked these movies


In [30]:

data_rating

Unnamed: 0,movie_id,customer_id,rating
0,1,1488844,3.0
1,1,822109,5.0
2,1,885013,4.0
3,1,30878,4.0
4,1,823519,3.0
...,...,...,...
249995,52,1691722,4.0
249996,52,1034571,5.0
249997,52,684917,4.0
249998,52,1810496,2.0


## Recommendations with collaborative filtering

In [31]:
def build_training_set(df):
    reader = Reader()
    svd = SVD()
    data_new = Dataset.load_from_df(df[['customer_id', 'movie_id', 'rating']], reader)
    trainset = data_new.build_full_trainset()
    svd.fit(trainset)
    return trainset, svd

def get_summary(type="movie_id"):
    #>IF MOVIE:  movie_id, movie rating count, movie rating mean
    #>IF CUSTOMER: customer_id, custumer rating count, customer rating mean
    df_count_mean_summary = data_rating.groupby(type)['rating'].agg(['count', 'mean'])
    df_count_mean_summary.index = df_count_mean_summary.index.map(int)
    #>IF MOVIE: returns 1799.0 as a benchmark number
    #IF CUSTOMER: returns 52.0 as a benchmark number
    benchmark = round(df_count_mean_summary['count'].quantile(0.7),0)
    # drop all rows below benchmark
    df_drop_list = df_count_mean_summary[df_count_mean_summary['count'] < benchmark]
    # return all indexes to drop
    return df_drop_list

def get_customer_recommendations(customer_id, predictor):
    #> returns movie_id, movie_year, movie_title
    chosen_customer_pred = data_movies.copy()
    # fails if movie_id is the index so we have to reset the index back to normal (0-N)
    chosen_customer_pred = chosen_customer_pred.reset_index()
    # makes sure that we only pick movies that are not in the movie dropped list
    chosen_customer_pred = chosen_customer_pred[~chosen_customer_pred['movie_id'].isin(df_movie_drop_list)]
    # make prediction for customer with id = <customer_id> and put it into 'estimated_score'
    chosen_customer_pred['estimated_score'] = chosen_customer_pred['movie_id'].apply(lambda x: predictor.predict(customer_id, x).est)
    # sort by 'estimated score'
    chosen_customer_pred = chosen_customer_pred.sort_values('estimated_score', ascending=False).set_index('movie_id')
    return chosen_customer_pred

def display_rated_content(customer_id=customer_id_use, number_to_show=20):
    print("Movies/TV Shows rated by customer", customer_id)
    df = all_customers_id_plus_movie_title_rows(customer_id=customer_id)
    df = df[['movie_title', 'rating']].sort_values('rating', ascending=False)
    print(df[0:number_to_show].set_index('movie_title'))

def display_customers_recommendations(customer_id=customer_id_use, df=[], number_to_show=20):
    print("Movies/TV Shows recommended to customer")
    print(df[['movie_title', 'estimated_score']][0:number_to_show].set_index('movie_title'))

def display_history_plus_recommended(customer_id=customer_id_use, number_to_show=25):
    display_rated_content(customer_id=customer_id, number_to_show=number_to_show)
    chosen_customer_pred = get_customer_recommendations(customer_id=customer_id, predictor=svd)
    display_customers_recommendations(customer_id=customer_id, df=chosen_customer_pred, number_to_show=number_to_show)

def get_evaluation_and_svd(use_pickle=True):
    evaluationData_file_name = "evaluationData_build_training_set"
    svd_file_name = "svd_build_training_set"
    if(use_pickle == True):
        evaluationData = load_pickle(evaluationData_file_name)
        svd = load_pickle(svd_file_name)
        return evaluationData, svd
    else:
        evaluationData, svd = build_training_set(data_rating)
        save_to_pickle(evaluationData_file_name, evaluationData)
        save_to_pickle(svd_file_name, svd)
        return evaluationData, svd

In [32]:
evaluationData, svd = get_evaluation_and_svd(use_pickle=use_pickle_file)

In [33]:
## creating a list of the movies that we don't wanna include
df_movie_drop_list = get_summary(type="movie_id")
#df_customer_drop_list = get_summary(type="customer_id")

In [34]:
display_history_plus_recommended(customer_id=79724, number_to_show=10)

Movies/TV Shows rated by customer 79724
                            rating
movie_title                       
What the #$*! Do We Know!?     4.0
Movies/TV Shows recommended to customer
                                                    estimated_score
movie_title                                                        
Aqua Teen Hunger Force: Vol. 1                             5.000000
Lord of the Rings: The Return of the King: Exte...         4.490710
Rudolph the Red-Nosed Reindeer                             4.346710
ABC Primetime: Mel Gibson's The Passion of the ...         4.163294
The Weather Underground                                    4.129848
Boycott                                                    4.119695
Zatoichi's Conspiracy                                      4.085259
Inspector Morse 31: Death Is Now My Neighbour              4.072674
The Rise and Fall of ECW                                   4.014325
Lilo and Stitch                                            3.900466

In [35]:
display_history_plus_recommended(customer_id=customer_id_use, number_to_show=10)

Movies/TV Shows rated by customer 84100
                        rating
movie_title                   
Something's Gotta Give     5.0
Movies/TV Shows recommended to customer
                                                    estimated_score
movie_title                                                        
Lord of the Rings: The Return of the King: Exte...         4.657058
Spitfire Grill                                             4.183466
Zatoichi's Conspiracy                                      4.096977
Aqua Teen Hunger Force: Vol. 1                             4.050542
ABC Primetime: Mel Gibson's The Passion of the ...         4.047677
Inspector Morse 31: Death Is Now My Neighbour              4.040900
Dinosaur Planet                                            3.982314
Lilo and Stitch                                            3.953918
The Rise and Fall of ECW                                   3.922879
Immortal Beloved                                           3.896705


In [36]:
#TODO:
#for the new user problem
## find highest rated movie / tv shows that have been watched the most -> how likely is new user to click on it ?

## SOMETHING

In [37]:
from surprise import dataset
from surprise import KNNBaseline
from surprise.model_selection import train_test_split 
from surprise.model_selection import LeaveOneOut
from surprise import accuracy

In [38]:
def build_training_set_same_as_above(df):
    reader = Reader()
    svd = SVD()
    data_new = Dataset.load_from_df(df[['customer_id', 'movie_id', 'rating']], reader)
    trainset = data_new.build_full_trainset()
    svd.fit(trainset)
    return trainset, svd

def print_evaluation_accuracy(prediction):
    print("\nEvaluating accuracy of model...")
    print("RMSE: ", accuracy.rmse(prediction, verbose=False))
    print("MAE: ", accuracy.mae(prediction, verbose=False))

In [39]:
reader = Reader(line_format='user item rating', rating_scale=(1, 5))
class convert_to_raw_ratings(dataset.DatasetAutoFolds):
    def __init__(self, df, reader):
        self.raw_ratings = [(uid, iid, r, None) for (uid, iid, r) in zip(df['customer_id'], df['movie_id'], df['rating'])]
        self.reader=reader

raw_ratings = convert_to_raw_ratings(data_rating, reader)
print("\nBuilding recommendation model...")
trainSet, testSet = train_test_split(raw_ratings, test_size=.25, random_state=1)


Building recommendation model...


In [40]:
algo = SVD(random_state=10)
algo.fit(trainSet)

print("\nComputing recommendations...")
predictions = algo.test(testSet)


Computing recommendations...


In [41]:
cross_validate(algo,  raw_ratings, cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0061  1.0043  1.0051  1.0122  1.0058  1.0067  0.0028  
MAE (testset)     0.7860  0.7829  0.7833  0.7908  0.7856  0.7857  0.0028  
Fit time          9.53    9.23    9.23    9.33    9.30    9.33    0.11    
Test time         0.47    0.45    0.25    0.24    0.25    0.33    0.11    


{'test_rmse': array([1.00611476, 1.00434891, 1.00505026, 1.0121597 , 1.00580061]),
 'test_mae': array([0.78603597, 0.78290394, 0.78333974, 0.79077806, 0.78558601]),
 'fit_time': (9.53354024887085,
  9.234821557998657,
  9.22996187210083,
  9.327168464660645,
  9.304248332977295),
 'test_time': (0.47446417808532715,
  0.4479801654815674,
  0.24709725379943848,
  0.23809337615966797,
  0.2474501132965088)}

In [42]:
print_evaluation_accuracy(predictions)


Evaluating accuracy of model...
RMSE:  1.0100800835447845
MAE:  0.7994188019027819


In [43]:
print_evaluation_accuracy(svd.test(testSet))


Evaluating accuracy of model...
RMSE:  0.7603538739190463
MAE:  0.5983802863056151


## other

In [44]:
reader = Reader(line_format='user item rating', rating_scale=(1, 5))
dataset = Dataset.load_from_df(data_rating[['customer_id', 'movie_id', 'rating']], reader)
trainset = dataset.build_full_trainset()

In [45]:
tmp_data_movies = data_movies[['movie_id', 'movie_title']]
tmp_data_movies = tmp_data_movies.set_index('movie_id').T
tmp_data_movies = tmp_data_movies.to_dict('list')
tmp_data_movies = {k: str(v[0]) for k,v in tmp_data_movies.items()}

In [46]:
similarity_matrix = KNNBasic(sim_options={'name': 'cosine', 'user_based': False}).fit(trainset).compute_similarities()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


In [47]:
# generate_random_user_ids(1000, 1500)

In [48]:
# for i in generate_random_user_ids(1000, 1500):
#     if (len(all_custumers_id_rows(i)) > 3):
#         print(i)

In [49]:
test_subject = 16272
k = 10

In [50]:
test_subject_iid = trainset.to_inner_uid(test_subject)

test_subject_ratings = trainset.ur[test_subject_iid]
k_neighbours = heapq.nlargest(k, test_subject_ratings, key=lambda t: t[1])

In [51]:
candidates = defaultdict(float)

for itemID, rating in k_neighbours:
    try:
        similarities = similarity_matrix[itemID]
        for innerID, score in enumerate(similarities):
            candidates[innerID] += score * (rating / 5.0)
    except:
        continue

In [52]:
def getMovieName(movieID):
  if int(movieID) in tmp_data_movies:
    return tmp_data_movies[int(movieID)]
  else:
      return ""

In [53]:
watched = {}
for itemID, rating in trainset.ur[test_subject_iid]:
    watched[itemID] = 1

recommendations = []
position = 0
for itemID, rating_sum in sorted(candidates.items(), key=itemgetter(1), reverse=True):
    if not itemID in watched:
        recommendations.append(getMovieName(trainset.to_raw_iid(itemID)))
        position += 1
        # only want top n which in our case in 10
        if(position > 10): break

for rec in recommendations:
    print("Movie: ", rec)


Movie:  The Weather Underground
Movie:  Full Frame: Documentary Shorts
Movie:  Searching for Paradise
Movie:  Isle of Man TT 2004 Review
Movie:  Inspector Morse 31: Death Is Now My Neighbour
Movie:  Zatoichi's Conspiracy
Movie:  My Favorite Brunette
Movie:  Ashtanga Yoga: Beginner's Practice with Nicki Doane
Movie:  ABC Primetime: Mel Gibson's The Passion of the Christ
Movie:  Boycott
Movie:  Rudolph the Red-Nosed Reindeer


In [54]:
# #TODO: keep or throw away since it's never used
# sim_options = {'name': 'pearson_baseline', 'user_based': False}
# simsAlgo = KNNBaseline(sim_options=sim_options)
# simsAlgo.fit(evaluationData)