In [96]:
#!pip install surprise
#!pip install import-ipynb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
import pickle
#sns.set_style("darkgrid")
from surprise import accuracy

## Helper functions

In [100]:
def all_movie_id_rows(movie_id):
    return data_rating[data_rating['movie_id'] == movie_id]

def all_custumers_id_rows(customer_id):
    return data_rating[data_rating['customer_id'] == customer_id]

def all_customers_id_plus_movie_title_rows(customer_id):
    return data_rating_plus_movie_title[data_rating_plus_movie_title['customer_id'] == customer_id]

def all_movies_get_average_rating():
    movie_ratings_stats = data_rating.groupby('movie_id').agg({'rating': ['sum', 'count']}).reset_index()
    movie_ratings_stats['avg_rating'] =  movie_ratings_stats['rating']['sum'] / movie_ratings_stats['rating']['count']
    return movie_ratings_stats

def all_customers_get_average_rating():
    customers_ratings_stats = data_rating.groupby('customer_id').agg({'rating': ['sum', 'count']}).reset_index()
    customers_ratings_stats['avg_rating'] =  customers_ratings_stats['rating']['sum'] / customers_ratings_stats['rating']['count']
    return customers_ratings_stats

def all_movies_get_rated_count():
    return data_rating.groupby('movie_id').agg({'movie_id': 'count'}).reset_index()

def all_customers_get_movie_rated_count():
    return data_rating.groupby('customer_id').agg({'movie_id': 'count'}).reset_index()

def get_customers_avg_rating_less_than(max_rating):
    print(all_customers_average_ratings[all_customers_average_ratings['avg_rating'] < max_rating])

def get_movies_avg_rating_less_than(max_rating):
    print(all_movies_average_rating[all_movies_average_rating['avg_rating'] < max_rating])

def get_customers_avg_rating_higher_than(min_rating):
    print(all_customers_average_ratings[all_customers_average_ratings['avg_rating'] > min_rating])

def get_movies_avg_rating_higher_than(min_rating):
    print(all_movies_average_rating[all_movies_average_rating['avg_rating'] > min_rating])


def get_movie_avg_rating(movie_id):
    output = all_movies_average_rating[all_movies_average_rating['movie_id'] == movie_id]
    #print(output)
    return output['avg_rating']

def get_users_avg_rating(customer_id):
    output = all_customers_average_ratings[all_customers_average_ratings['customer_id'] == customer_id]
    #print(output)
    return output['avg_rating']

def get_movies_customer_rated_higher_than(customer_id, min_rating=4):
    temp = data_rating_plus_movie_title.copy()
    return temp[(temp['customer_id'] == customer_id) & (temp['rating'] >= min_rating)].set_index('movie_id')

def get_movies_customer_rated_lower_than(customer_id, max_rating=4):
    temp = data_rating_plus_movie_title.copy()
    return temp[(temp['customer_id'] == customer_id) & (temp['rating'] < max_rating)].set_index('movie_id')

def display_movies_customer_rated_higher_than(customer_id, min_rating=0):
    df_customer_liked = get_movies_customer_rated_higher_than(customer_id=customer_id, min_rating=min_rating)
    print(df_customer_liked[['movie_title', 'rating']])
    print('average rating', get_users_avg_rating(customer_id))

def display_movies_customer_rated_lower_than(customer_id, max_rating=0):
    df_customer_disliked = get_movies_customer_rated_lower_than(customer_id=customer_id, max_rating=max_rating)
    print(df_customer_disliked[['movie_title', 'rating']])
    print('average rating', get_users_avg_rating(customer_id))

def save_to_pickle(name, df):
    path_name = "pickle/"+name+".pickle"
    pickle_file = open(path_name,"wb")
    pickle.dump(df, pickle_file)
    pickle_file.close()

def load_pickle(name):
    path_name = "pickle/"+name+".pickle"
    return_input = open(path_name, "rb")
    return pickle.load(return_input)


## Get the data

In [132]:
def create_dataframe(use_pickle=True):
    if(use_pickle == True):
        in_pickle = open("pickle/movies_customers_ratings.pickle", "rb")
        data = pd.DataFrame(pickle.load(in_pickle), columns=['movie_id', 'customer_id', 'rating', 'index']).drop(['index'], axis=1)
    else:
        index = 1
        last_movie_id = "1"
        new_data = []
        ## import all combined_data files to one large pandas dataframe
        ##> returns index, customer_id (which is both movie and customer), rating (NaN = customer_id is a movie, Not NaN = customers rating)
        df_all = pd.read_csv('./data/combined_data_1.txt', header = None, names = ['customer_id', 'rating'], usecols = [0,1])
        #df_all = df_all.append(pd.read_csv('./data/combined_data_2.txt', header = None, names = ['customer_id', 'rating'], usecols = [0,1]))
        #df_all = df_all.append(pd.read_csv('./data/combined_data_3.txt', header = None, names = ['customer_id', 'rating'], usecols = [0,1]))
        #df_all = df_all.append(pd.read_csv('./data/combined_data_4.txt', header = None, names = ['customer_id', 'rating'], usecols = [0,1]))
        df_all.index = np.arange(0,len(df_all))
        df_all['rating'] = df_all['rating'].astype(float)
        
        for customer_id in df_all["customer_id"]:
            # if we find : that means this is a movie_id and not customer_id
            if(customer_id.find(":") > 0):
                movie_id = customer_id.replace(":", "")
                last_movie_id = movie_id
            else:
                # we have this row index so use it to get rating
                rating = df_all["rating"][index-1]
                new_data.append([last_movie_id, customer_id, rating, index])
            index += 1
        #output to pickle file
        movies_customers_ratings = open("pickle/movies_customers_ratings.pickle","wb")
        pickle.dump(new_data, movies_customers_ratings)
        movies_customers_ratings.close()
        data = pd.DataFrame(new_data, columns=['movie_id', 'customer_id', 'rating', 'index']).drop(['index'], axis=1)

    # change columns to numerical
    data['movie_id'] = data['movie_id'].astype(int)
    data['customer_id'] = data['customer_id'].astype(int)
    data["rating"] = data["rating"].astype(float)
    return data

def get_dataframes(use_pickle=True):
    # dataframe containing all informations about the movies
    #> returns movie_id, movie_year, movie_title
    data_movies = pd.read_csv('./data/movie_titles.csv', header = None, names = ['movie_id', 'movie_year', 'movie_title'], usecols = [0,1,2], encoding="latin1")
    #data_movies_import.set_index('movie_id', inplace = True)

    ## ------------------------------------------------------------------------------------- ##

    # dataframe containing all informations about the movie ratings by customer
    #> returns index, movie_id, customer_id, rating
    data_rating = create_dataframe(use_pickle=use_pickle)

    # ## ------------------------------------------------------------------------------------- ##

    # ##combine customer ratings to movie titles
    # ##> returns index, movie_id, customer_id, rating, movie_year, movie_title
    data_rating_plus_movie_title = data_rating.merge(data_movies, on="movie_id", how="inner")

    ## ------------------------------------------------------------------------------------- ##

    data_movies_categorize = pd.read_csv('./data/movies.csv', header = None, names = ['movie_id', 'movie_title', 'genres'], usecols = [0,1,2], encoding="latin1")[1:] #dataset is off by one
    #data_movies_categorize.set_index('movie_id', inplace = True)
    data_movies_categorize_split = data_movies_categorize['movie_title'].str.split('(', n = 1, expand=True) # split movie_title to movie_title and movie_year
    data_movies_categorize_split[1] = data_movies_categorize_split[1].str.replace(r')', '') #removing ) at the end of movie_year
    data_movies_categorize["movie_year"] = data_movies_categorize_split[1]
    data_movies_categorize["movie_title"] = data_movies_categorize_split[0]
    #data_movies_categorize["movie_year"] = data_movies_categorize["movie_year"].astype(float)
    #data_movies_categorize
    data_movies_categorize_cleaned = data_movies_categorize[pd.to_numeric(data_movies_categorize['movie_year'], errors='coerce').notnull()]
    data_movies_categorize_cleaned["movie_year"] = data_movies_categorize_cleaned["movie_year"].astype(float)
    
    return data_movies, data_rating, data_rating_plus_movie_title, data_movies_categorize_cleaned


def get_users_loved_hated_movies(customer_id=customer_id_to_use, minmax_rating = minmax_rating):
    users_ratings_higher_than_four = get_movies_customer_rated_higher_than(customer_id=customer_id_use, min_rating=minmax_rating)
    users_ratings_lower_than_four = get_movies_customer_rated_lower_than(customer_id=customer_id_use, max_rating=minmax_rating)
    print("User", customer_id_use ,"loved these movies")
    for rating in users_ratings_higher_than_four['movie_title']:
        print(rating)
    print('')
    print("and disliked these movies")
    for rating in users_ratings_lower_than_four['movie_title']:
        print(rating)

in these txt files the movie_id's have been placed amongst the customer_id's, which means that we have to split that column into two, we do this by taking all ratings that are equal to NaN 
( meaning that we are now refering to a movie and not a customer ) and from there we split it into two columns.

In [4]:
data_movies, data_rating, data_rating_plus_movie_title, data_movies_categorized = get_dataframes(use_pickle=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_movies_categorize_cleaned["movie_year"] = data_movies_categorize_cleaned["movie_year"].astype(float)


#### try again later

In [5]:
# # temporarly while join with genre not working correctly
# data_movies = data_movies_import.copy()
# data_movies.set_index('movie_id', inplace = True)
# print(data_movies)

In [6]:
# x_temp = data_movies_import
# x2_temp= data_movies_categorize
# #x_temp.set_index("movie_id", inplace=True)
# #x2_temp.set_index("movie_id", inplace=True)

In [7]:
# temp_merge = pd.merge(x_temp, x2_temp, on=['movie_title', 'movie_year'], how='outer')
# temp_merge = temp_merge[["movie_year", "movie_title"]].drop_duplicates(subset=None, keep="last", inplace=False)

In [8]:
# df_merged = pd.merge(x_temp, x2_temp, on=["movie_title", "movie_year"], how="left")

In [9]:
#df_merged["genres"].isnull()

In [10]:
# #data_movies = pd.merge(x, x2, on=['movie_year', 'movie_title'], how='outer')
# data_movies = pd.merge(x, x2, on=["movie_title", "movie_year"], how='outer')
# data_movies#.set_index('movie_id', inplace = True)


In [11]:
# data_movies.groupby("movie_title").count()

#### end of try again later

## Working with the data

### Details about all customers/movies

In [58]:
# get the average movie rating for all customers
# used to determine if this user typically gives bad or good reviews
# and then we can see if he really hates or loves a movie
all_customers_average_ratings = all_customers_get_average_rating()
print(all_customers_average_ratings)

       customer_id rating       avg_rating
                      sum count           
0                6  510.0   153   3.333333
1                7  793.0   195   4.066667
2                8   84.0    21   4.000000
3               10  164.0    49   3.346939
4               25   14.0     4   3.500000
...            ...    ...   ...        ...
470753     2649404   49.0    12   4.083333
470754     2649409   40.0    10   4.000000
470755     2649421   15.0     3   5.000000
470756     2649426  301.0    74   4.067568
470757     2649429  258.0    62   4.161290

[470758 rows x 4 columns]


In [71]:
all_movies_average_rating = all_movies_get_average_rating()
print(all_movies_average_rating)

     movie_id   rating       avg_rating
                   sum count           
0           1   2051.0   547   3.749543
1           2    516.0   145   3.558621
2           3   7326.0  2012   3.641153
3           4    389.0   142   2.739437
4           5   4468.0  1140   3.919298
...       ...      ...   ...        ...
4494     4495   2136.0   614   3.478827
4495     4496  35820.0  9519   3.763000
4496     4497   1939.0   714   2.715686
4497     4498    663.0   269   2.464684
4498     4499   1119.0   428   2.614486

[4499 rows x 4 columns]


In [72]:
get_customers_avg_rating_less_than(max_rating=5)

       customer_id rating       avg_rating
                      sum count           
0                6  510.0   153   3.333333
1                7  793.0   195   4.066667
2                8   84.0    21   4.000000
3               10  164.0    49   3.346939
4               25   14.0     4   3.500000
...            ...    ...   ...        ...
470752     2649401  118.0    30   3.933333
470753     2649404   49.0    12   4.083333
470754     2649409   40.0    10   4.000000
470756     2649426  301.0    74   4.067568
470757     2649429  258.0    62   4.161290

[459896 rows x 4 columns]


In [73]:
get_customers_avg_rating_less_than(max_rating=4)

       customer_id rating       avg_rating
                      sum count           
0                6  510.0   153   3.333333
3               10  164.0    49   3.346939
4               25   14.0     4   3.500000
5               33   38.0    11   3.454545
6               42  142.0    36   3.944444
...            ...    ...   ...        ...
470747     2649376  133.0    34   3.911765
470748     2649378  233.0    72   3.236111
470750     2649384   23.0     7   3.285714
470751     2649388  257.0    77   3.337662
470752     2649401  118.0    30   3.933333

[331845 rows x 4 columns]


In [74]:
get_customers_avg_rating_higher_than(min_rating=4)

       customer_id rating       avg_rating
                      sum count           
1                7  793.0   195   4.066667
7               59  198.0    49   4.040816
9               83   45.0    10   4.500000
13             116  185.0    45   4.111111
14             126   72.0    15   4.800000
...            ...    ...   ...        ...
470749     2649379    9.0     2   4.500000
470753     2649404   49.0    12   4.083333
470755     2649421   15.0     3   5.000000
470756     2649426  301.0    74   4.067568
470757     2649429  258.0    62   4.161290

[111634 rows x 4 columns]


In [75]:
get_movies_avg_rating_less_than(max_rating=5)

     movie_id   rating       avg_rating
                   sum count           
0           1   2051.0   547   3.749543
1           2    516.0   145   3.558621
2           3   7326.0  2012   3.641153
3           4    389.0   142   2.739437
4           5   4468.0  1140   3.919298
...       ...      ...   ...        ...
4494     4495   2136.0   614   3.478827
4495     4496  35820.0  9519   3.763000
4496     4497   1939.0   714   2.715686
4497     4498    663.0   269   2.464684
4498     4499   1119.0   428   2.614486

[4499 rows x 4 columns]


In [76]:
get_movies_avg_rating_less_than(max_rating=4)

     movie_id   rating       avg_rating
                   sum count           
0           1   2051.0   547   3.749543
1           2    516.0   145   3.558621
2           3   7326.0  2012   3.641153
3           4    389.0   142   2.739437
4           5   4468.0  1140   3.919298
...       ...      ...   ...        ...
4494     4495   2136.0   614   3.478827
4495     4496  35820.0  9519   3.763000
4496     4497   1939.0   714   2.715686
4497     4498    663.0   269   2.464684
4498     4499   1119.0   428   2.614486

[4244 rows x 4 columns]


In [77]:
count_movies_rated_by_customer = all_customers_get_movie_rated_count()
count_movies_rated_by_customer

Unnamed: 0,customer_id,movie_id
0,6,153
1,7,195
2,8,21
3,10,49
4,25,4
...,...,...
470753,2649404,12
470754,2649409,10
470755,2649421,3
470756,2649426,74


In [79]:
print("random user id used while in development\n", np.unique(data_rating['customer_id'])[14100:15000])

random user id used while in development
 [79602 79612 79613 79616 79625 79627 79633 79640 79643 79652 79655 79658
 79660 79663 79666 79682 79683 79686 79692 79695 79699 79703 79712 79713
 79721 79724 79730 79731 79732 79733 79735 79740 79745 79746 79747 79750
 79755 79756 79764 79770 79772 79773 79774 79782 79794 79799 79806 79807
 79809 79811 79812 79826 79835 79838 79839 79861 79867 79870 79880 79883
 79884 79890 79898 79900 79911 79917 79918 79922 79925 79926 79927 79940
 79944 79948 79954 79961 79967 79981 79986 79988 79989 79999 80003 80007
 80010 80014 80017 80022 80025 80038 80044 80055 80058 80060 80063 80064
 80065 80091 80102 80108 80112 80117 80123 80124 80127 80135 80138 80139
 80140 80142 80146 80148 80152 80155 80158 80160 80161 80163 80164 80165
 80176 80177 80178 80187 80191 80198 80209 80218 80229 80231 80233 80241
 80250 80259 80264 80266 80276 80277 80284 80286 80290 80295 80298 80304
 80306 80308 80311 80320 80325 80327 80339 80340 80346 80348 80351 80354
 80356 80

### Details about a specific customer

In [18]:
# ids I like -> 28812, 56514, 56520, 1488844
customer_id_use = 84100

In [19]:
#all_custumers_id_rows(customer_id=customer_id_use)
## same as line above but including movie_year and movie_title
all_customers_id_plus_movie_title_rows(customer_id=customer_id_use)

Unnamed: 0,movie_id,customer_id,rating,movie_year,movie_title
141847,30,84100,5.0,2003.0,Something's Gotta Give
757653,191,84100,4.0,2003.0,X2: X-Men United
1408832,299,84100,4.0,2001.0,Bridget Jones's Diary
1936216,357,84100,5.0,2003.0,House of Sand and Fog
2417368,457,84100,4.0,2004.0,Kill Bill: Vol. 2
3936782,758,84100,4.0,2004.0,Mean Girls
5510889,1110,84100,4.0,2003.0,Secondhand Lions
6259259,1220,84100,4.0,2004.0,Man on Fire
6680950,1307,84100,4.0,2003.0,S.W.A.T.
7777139,1561,84100,3.0,2003.0,American Wedding


In [20]:
get_users_avg_rating(customer_id=customer_id_use)

14877    3.954545
Name: avg_rating, dtype: float64

In [128]:
display_movies_customer_rated_higher_than(customer_id=customer_id_use, min_rating=4)

                     movie_title  rating
movie_id                                
30        Something's Gotta Give     5.0
191             X2: X-Men United     4.0
299        Bridget Jones's Diary     4.0
357        House of Sand and Fog     5.0
457            Kill Bill: Vol. 2     4.0
758                   Mean Girls     4.0
1110            Secondhand Lions     4.0
1220                 Man on Fire     4.0
1307                    S.W.A.T.     4.0
1962              50 First Dates     4.0
2580               Freaky Friday     4.0
3320                 About a Boy     4.0
3624            The Last Samurai     5.0
3756               About Schmidt     4.0
3860              Bruce Almighty     4.0
3925        The Matrix: Reloaded     4.0
3938                     Shrek 2     4.0
4472               Love Actually     4.0
average rating 14877    3.954545
Name: avg_rating, dtype: float64


In [130]:
display_movies_customer_rated_lower_than(customer_id=customer_id_use, max_rating=4)

                 movie_title  rating
movie_id                            
1561        American Wedding     3.0
2112                Identity     3.0
2391        Along Came Polly     3.0
2457      A Cinderella Story     3.0
average rating 14877    3.954545
Name: avg_rating, dtype: float64


In [133]:
get_users_loved_hated_movies(customer_id=customer_id_use, minmax_rating=4)

User 84100 loved these movies
Something's Gotta Give
X2: X-Men United
Bridget Jones's Diary
House of Sand and Fog
Kill Bill: Vol. 2
Mean Girls
Secondhand Lions
Man on Fire
S.W.A.T.
50 First Dates
Freaky Friday
About a Boy
The Last Samurai
About Schmidt
Bruce Almighty
The Matrix: Reloaded
Shrek 2
Love Actually

and disliked these movies
American Wedding
Identity
Along Came Polly
A Cinderella Story


## Recommendations with collaborative filtering

In [99]:
def build_training_set(df):
    reader = Reader()
    svd = SVD()
    data_new = Dataset.load_from_df(df[['customer_id', 'movie_id', 'rating']], reader)
    trainset = data_new.build_full_trainset()
    svd.fit(trainset)
    return trainset, svd

def get_summary(type="movie_id"):
    #>IF MOVIE:  movie_id, movie rating count, movie rating mean
    #>IF CUSTOMER: customer_id, custumer rating count, customer rating mean
    df_count_mean_summary = data_rating.groupby(type)['rating'].agg(['count', 'mean'])
    df_count_mean_summary.index = df_count_mean_summary.index.map(int)
    #>IF MOVIE: returns 1799.0 as a benchmark number
    #IF CUSTOMER: returns 52.0 as a benchmark number
    benchmark = round(df_count_mean_summary['count'].quantile(0.7),0)
    # drop all rows below benchmark
    df_drop_list = df_count_mean_summary[df_count_mean_summary['count'] < benchmark]
    # return all indexes to drop
    return df_drop_list

def get_customer_recommendations(customer_id=customer_id_use):
    #> returns movie_id, movie_year, movie_title
    chosen_customer_pred = data_movies.copy()
    # fails if movie_id is the index so we have to reset the index back to normal (0-N)
    chosen_customer_pred = chosen_customer_pred.reset_index()
    # makes sure that we only pick movies that are not in the movie dropped list
    chosen_customer_pred = chosen_customer_pred[~chosen_customer_pred['movie_id'].isin(df_movie_drop_list)]
    # make prediction for customer with id = <customer_id> and put it into 'estimated_score'
    chosen_customer_pred['estimated_score'] = chosen_customer_pred['movie_id'].apply(lambda x: svd.predict(customer_id, x).est)
    # sort by 'estimated score'
    chosen_customer_pred = chosen_customer_pred.sort_values('estimated_score', ascending=False).set_index('movie_id')
    return chosen_customer_pred

def display_rated_content(customer_id=customer_id_use, number_to_show=20):
    print("Movies/TV Shows rated by customer", customer_id)
    df = all_customers_id_plus_movie_title_rows(customer_id=customer_id)
    df = df[['movie_title', 'rating']].sort_values('rating', ascending=False)
    print(df[0:number_to_show].set_index('movie_title'))

def display_customers_recommendations(customer_id=customer_id_use, df=[], number_to_show=20):
    print("Movies/TV Shows recommended to customer")
    print(df[['movie_title', 'estimated_score']][0:number_to_show].set_index('movie_title'))

def display_history_plus_recommended(customer_id=customer_id_use, number_to_show=25):
    display_rated_content(customer_id=customer_id, number_to_show=number_to_show)
    chosen_customer_pred = get_customer_recommendations(customer_id)
    display_customers_recommendations(customer_id=customer_id, df=chosen_customer_pred, number_to_show=number_to_show)

def get_evaluation_and_svd(use_pickle=True):
    evaluationData_file_name = "evaluationData_build_training_set"
    svd_file_name = "svd_build_training_set"
    if(use_pickle == True):
        evaluationData = load_pickle(evaluationData_file_name)
        svd = load_pickle(svd_file_name)
        return evaluationData, svd
    else:
        evaluationData, svd = build_training_set(data_rating)
        save_to_pickle(evaluationData_file_name, evaluationData)
        save_to_pickle(svd_file_name, svd)
        return evaluationData, svd

In [83]:
evaluationData, svd = get_evaluation_and_svd(use_pickle=True)

In [84]:
## creating a list of the movies that we don't wanna include
df_movie_drop_list = get_summary(type="movie_id")
#df_customer_drop_list = get_summary(type="customer_id")

In [94]:
display_history_plus_recommended(customer_id=79724, number_to_show=10)

Movies/TV Shows rated by customer 79724
                                               rating
movie_title                                          
The X-Files: Season 9                             5.0
American Beauty                                   5.0
Alien: Collector's Edition                        5.0
Lord of the Rings: The Fellowship of the Ring     5.0
La Femme Nikita: Season 2                         5.0
Buffalo '66                                       5.0
Napoleon Dynamite                                 5.0
Dogma                                             5.0
28 Days Later                                     5.0
The Silence of the Lambs                          4.0
Movies/TV Shows recommended to customer
                                              estimated_score
movie_title                                                  
Eternal Sunshine of the Spotless Mind                5.000000
Cracker: Series 1                                    4.852637
American Beauty         

In [125]:
display_history_plus_recommended(customer_id=customer_id_use, number_to_show=10)

Movies/TV Shows rated by customer 84100
                        rating
movie_title                   
Something's Gotta Give     5.0
House of Sand and Fog      5.0
The Last Samurai           5.0
X2: X-Men United           4.0
Shrek 2                    4.0
The Matrix: Reloaded       4.0
Bruce Almighty             4.0
About Schmidt              4.0
About a Boy                4.0
Freaky Friday              4.0
Movies/TV Shows recommended to customer
                                                    estimated_score
movie_title                                                        
Lost: Season 1                                             5.000000
Curb Your Enthusiasm: Season 3                             5.000000
Foyle's War: Set 2                                         5.000000
The West Wing: Season 3                                    4.944963
Mail Call: The Best of Season 1                            4.936956
As Time Goes By: Series 8                                  4.930461
Alia

In [None]:
#TODO:
#for the new user problem
## find highest rated movie / tv shows that have been watched the most -> how likely is new user to click on it ?

## SOMETHING

In [139]:
from surprise import dataset
from surprise import KNNBaseline
from surprise.model_selection import train_test_split 
from surprise.model_selection import LeaveOneOut
from surprise import accuracy

In [140]:
def build_training_set(df):
    reader = Reader()
    svd = SVD()
    data_new = Dataset.load_from_df(df[['customer_id', 'movie_id', 'rating']], reader)
    trainset = data_new.build_full_trainset()
    svd.fit(trainset)
    return trainset, svd

def print_evaluation_accuracy(prediction):
    print("\nEvaluating accuracy of model...")
    print("RMSE: ", accuracy.rmse(prediction, verbose=False))
    print("MAE: ", accuracy.mae(prediction, verbose=False))

In [None]:
reader = Reader(line_format='user item rating', rating_scale=(1, 5))
class MyDataset(dataset.DatasetAutoFolds):
    def __init__(self, df, reader):
        self.raw_ratings = [(uid, iid, r, None) for (uid, iid, r) in zip(df['customer_id'], df['movie_id'], df['rating'])]
        self.reader=reader

data = MyDataset(data_rating, reader)
print("\nBuilding recommendation model...")
trainSet, testSet = train_test_split(data, test_size=.25, random_state=1)

In [141]:
algo = SVD(random_state=10)
algo.fit(trainSet)

print("\nComputing recommendations...")
predictions = algo.test(testSet)


Computing recommendations...

Evaluating accuracy of model...
RMSE:  0.9031234558172019
MAE:  0.6975635778170713


In [145]:
print_evaluation_accuracy(predictions)


Evaluating accuracy of model...
RMSE:  0.9031234558172019
MAE:  0.6975635778170713


In [146]:
print_evaluation_accuracy(svd.test(testSet))

In [142]:
# #TODO: keep or throw away since it's never used
# sim_options = {'name': 'pearson_baseline', 'user_based': False}
# simsAlgo = KNNBaseline(sim_options=sim_options)
# simsAlgo.fit(evaluationData)

In [None]:
print(algo)

In [None]:
print(svd)

<surprise.prediction_algorithms.matrix_factorization.SVD object at 0x7fae4ae74190>


In [None]:
#save_to_pickle(name="recommendation_predictions", df=predictions)