In [5]:
#!pip install surprise
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
import pickle
#sns.set_style("darkgrid")

## Callable functions

In [123]:
## if use_pickle = True then we don't have to create a new dataframe because it's already been done
## how ever if you add another csv file to df_all then you will have to set use_pickle = False to get the new data
def create_dataframe(use_pickle=True):
    if(use_pickle == True):
        in_pickle = open("pickle/movies_customers_ratings.pickle", "rb")
        data = pd.DataFrame(pickle.load(in_pickle), columns=['movie_id', 'customer_id', 'rating', 'index']).drop(['index'], axis=1)
    else:
        index = 1
        last_movie_id = "1"
        new_data = []
        for customer_id in df_all["customer_id"]:
            # if we find : that means this is a movie_id and not customer_id
            if(customer_id.find(":") > 0):
                movie_id = customer_id.replace(":", "")
                last_movie_id = movie_id
            else:
                # we have this row index so use it to get rating
                rating = df_all["rating"][index-1]
                new_data.append([last_movie_id, customer_id, rating, index])
            index += 1
        #output to pickle file
        movies_customers_ratings = open("pickle/movies_customers_ratings.pickle","wb")
        pickle.dump(new_data, movies_customers_ratings)
        movies_customers_ratings.close()
        data = pd.DataFrame(new_data, columns=['movie_id', 'customer_id', 'rating', 'index']).drop(['index'], axis=1)

    # change columns to numerical
    data['movie_id'] = data['movie_id'].astype(int)
    data['customer_id'] = data['customer_id'].astype(int)
    data["rating"] = data["rating"].astype(float)
    return data

def all_movie_id_rows(movie_id):
    return data_rating[data_rating['movie_id'] == movie_id]

def all_custumers_id_rows(customer_id):
    return data_rating[data_rating['customer_id'] == customer_id]

def all_customers_id_plus_movie_title_rows(customer_id):
    return data_rating_plus_movie_title[data_rating_plus_movie_title['customer_id'] == customer_id]

def all_movies_get_average_rating():
    movie_ratings_stats = data_rating.groupby('movie_id').agg({'rating': ['sum', 'count']}).reset_index()
    movie_ratings_stats['avg_rating'] =  movie_ratings_stats['rating']['sum'] / movie_ratings_stats['rating']['count']
    return movie_ratings_stats

def all_customers_get_average_rating():
    customers_ratings_stats = data_rating.groupby('customer_id').agg({'rating': ['sum', 'count']}).reset_index()
    customers_ratings_stats['avg_rating'] =  customers_ratings_stats['rating']['sum'] / customers_ratings_stats['rating']['count']
    return customers_ratings_stats

def all_movies_get_rated_count():
    return data_rating.groupby('movie_id').agg({'movie_id': 'count'}).reset_index()

def all_customers_get_movie_rated_count():
    return data_rating.groupby('customer_id').agg({'movie_id': 'count'}).reset_index()

def get_avg_rating_less_than(max_rating):
    print(all_customers_average_ratings[all_customers_average_ratings['avg_rating'] < max_rating])


def get_movie_avg_rating(movie_id):
    output = all_movies_average_rating[all_movies_average_rating['movie_id'] == movie_id]
    #print(output)
    return output['avg_rating']

def get_users_avg_rating(customer_id):
    output = all_customers_average_ratings[all_customers_average_ratings['customer_id'] == customer_id]
    #print(output)
    return output['avg_rating']

def get_movies_customer_rated_higher_than(customer_id, min_rating=0):
    temp = data_rating_plus_movie_title.copy()
    df_customer_liked = temp[(temp['customer_id'] == customer_id) & (temp['rating'] >= min_rating)].set_index('movie_id')
    print(df_customer_liked[['movie_title', 'rating']])
    print('average rating', get_users_avg_rating(customer_id))

## WORK IN PROGRESS
# def get_all_customer_rated_movie_higher_than(movie_id, min_rating):
#     temp = data_rating_plus_movie_title.copy()
#     df_movie_customers = temp[temp['movie_id'] == movie_id] & (temp['rating'] >= min_rating)
#     print(df_movie_customers['movie_title', 'customer_id', 'rating'])
#     print('movie average rating', get_movie_avg_rating(movie_id))

In [39]:
def save_to_pickle(name, df):
    path_name = "pickle/"+name+".pickle"
    pickle_file = open(path_name,"wb")
    pickle.dump(df, pickle_file)
    pickle_file.close()

def load_pickle(name):
    path_name = "pickle/"+name+".pickle"
    return_input = open(path_name, "rb")
    return pickle.load(return_input)

## Get the data

in these txt files the movie_id's have been placed amongst the customer_id's, which means that we have to split that column into two, we do this by taking all ratings that are equal to NaN 
( meaning that we are now refering to a movie and not a customer ) and from there we split it into two columns.

In [10]:
## import all combined_data files to one large pandas dataframe
##> returns index, customer_id (which is both movie and customer), rating (NaN = customer_id is a movie, Not NaN = customers rating)
df_all = pd.read_csv('./data/combined_data_1.txt', header = None, names = ['customer_id', 'rating'], usecols = [0,1])
#df_all = df_all.append(pd.read_csv('./data/combined_data_2.txt', header = None, names = ['customer_id', 'rating'], usecols = [0,1]))
#df_all = df_all.append(pd.read_csv('./data/combined_data_3.txt', header = None, names = ['customer_id', 'rating'], usecols = [0,1]))
#df_all = df_all.append(pd.read_csv('./data/combined_data_4.txt', header = None, names = ['customer_id', 'rating'], usecols = [0,1]))
df_all.index = np.arange(0,len(df_all))
df_all['rating'] = df_all['rating'].astype(float)

## ------------------------------------------------------------------------------------- ##

# dataframe containing all informations about the movies
#> returns movie_id, movie_year, movie_title
data_movies = pd.read_csv('./data/movie_titles.csv', header = None, names = ['movie_id', 'movie_year', 'movie_title'], usecols = [0,1,2], encoding="latin1")
data_movies.set_index('movie_id', inplace = True)

## ------------------------------------------------------------------------------------- ##

# dataframe containing all informations about the movie ratings by customer
#> returns index, movie_id, customer_id, rating
data_rating = create_dataframe(use_pickle=True)

## ------------------------------------------------------------------------------------- ##

# combine customer ratings to movie titles
#> returns index, movie_id, customer_id, rating, movie_year, movie_title
data_rating_plus_movie_title = data_rating.merge(data_movies, on="movie_id", how="inner")
data_rating_plus_movie_title

Unnamed: 0,movie_id,customer_id,rating,movie_year,movie_title
0,1,1488844,3.0,2003.0,Dinosaur Planet
1,1,822109,5.0,2003.0,Dinosaur Planet
2,1,885013,4.0,2003.0,Dinosaur Planet
3,1,30878,4.0,2003.0,Dinosaur Planet
4,1,823519,3.0,2003.0,Dinosaur Planet
...,...,...,...,...,...
24053759,4499,2591364,2.0,2002.0,In My Skin
24053760,4499,1791000,2.0,2002.0,In My Skin
24053761,4499,512536,5.0,2002.0,In My Skin
24053762,4499,988963,3.0,2002.0,In My Skin


# Working with the data

### Details about all customers/movies

In [11]:
# get the average movie rating for all customers
# used to determine if this user typically gives bad or good reviews
# and then we can see if he really hates or loves a movie
all_customers_average_ratings = all_customers_get_average_rating()
print(all_customers_average_ratings)

       customer_id rating       avg_rating
                      sum count           
0                6  510.0   153   3.333333
1                7  793.0   195   4.066667
2                8   84.0    21   4.000000
3               10  164.0    49   3.346939
4               25   14.0     4   3.500000
...            ...    ...   ...        ...
470753     2649404   49.0    12   4.083333
470754     2649409   40.0    10   4.000000
470755     2649421   15.0     3   5.000000
470756     2649426  301.0    74   4.067568
470757     2649429  258.0    62   4.161290

[470758 rows x 4 columns]


In [12]:
get_avg_rating_less_than(5)

       customer_id rating       avg_rating
                      sum count           
0                6  510.0   153   3.333333
1                7  793.0   195   4.066667
2                8   84.0    21   4.000000
3               10  164.0    49   3.346939
4               25   14.0     4   3.500000
...            ...    ...   ...        ...
470752     2649401  118.0    30   3.933333
470753     2649404   49.0    12   4.083333
470754     2649409   40.0    10   4.000000
470756     2649426  301.0    74   4.067568
470757     2649429  258.0    62   4.161290

[459896 rows x 4 columns]


In [13]:
get_avg_rating_less_than(4)

       customer_id rating       avg_rating
                      sum count           
0                6  510.0   153   3.333333
3               10  164.0    49   3.346939
4               25   14.0     4   3.500000
5               33   38.0    11   3.454545
6               42  142.0    36   3.944444
...            ...    ...   ...        ...
470747     2649376  133.0    34   3.911765
470748     2649378  233.0    72   3.236111
470750     2649384   23.0     7   3.285714
470751     2649388  257.0    77   3.337662
470752     2649401  118.0    30   3.933333

[331845 rows x 4 columns]


In [14]:
count_movies_rated_by_customer = all_customers_get_movie_rated_count()
#count_movies_rated_by_customer

Unnamed: 0,customer_id,movie_id
0,6,153
1,7,195
2,8,21
3,10,49
4,25,4
...,...,...
470753,2649404,12
470754,2649409,10
470755,2649421,3
470756,2649426,74


In [15]:
#all_movies_average_rating = all_movies_get_average_rating()
#get_all_customer_rated_movie_higher_than(1, 4)

In [120]:
print("random user id used while in development\n", np.unique(data_rating['customer_id'])[14100:15000])

random user id used while in development
 [79602 79612 79613 79616 79625 79627 79633 79640 79643 79652 79655 79658
 79660 79663 79666 79682 79683 79686 79692 79695 79699 79703 79712 79713
 79721 79724 79730 79731 79732 79733 79735 79740 79745 79746 79747 79750
 79755 79756 79764 79770 79772 79773 79774 79782 79794 79799 79806 79807
 79809 79811 79812 79826 79835 79838 79839 79861 79867 79870 79880 79883
 79884 79890 79898 79900 79911 79917 79918 79922 79925 79926 79927 79940
 79944 79948 79954 79961 79967 79981 79986 79988 79989 79999 80003 80007
 80010 80014 80017 80022 80025 80038 80044 80055 80058 80060 80063 80064
 80065 80091 80102 80108 80112 80117 80123 80124 80127 80135 80138 80139
 80140 80142 80146 80148 80152 80155 80158 80160 80161 80163 80164 80165
 80176 80177 80178 80187 80191 80198 80209 80218 80229 80231 80233 80241
 80250 80259 80264 80266 80276 80277 80284 80286 80290 80295 80298 80304
 80306 80308 80311 80320 80325 80327 80339 80340 80346 80348 80351 80354
 80356 80

### Details about a specific customer

In [17]:
# ids I like -> 28812, 56514, 56520, 1488844
customer_id_use = 84100

In [124]:
#all_custumers_id_rows(customer_id=customer_id_use)
## same as line above but including movie_year and movie_title
all_customers_id_plus_movie_title_rows(customer_id=customer_id_use)

Unnamed: 0,movie_id,customer_id,rating,movie_year,movie_title
141847,30,84100,5.0,2003.0,Something's Gotta Give
757653,191,84100,4.0,2003.0,X2: X-Men United
1408832,299,84100,4.0,2001.0,Bridget Jones's Diary
1936216,357,84100,5.0,2003.0,House of Sand and Fog
2417368,457,84100,4.0,2004.0,Kill Bill: Vol. 2
3936782,758,84100,4.0,2004.0,Mean Girls
5510889,1110,84100,4.0,2003.0,Secondhand Lions
6259259,1220,84100,4.0,2004.0,Man on Fire
6680950,1307,84100,4.0,2003.0,S.W.A.T.
7777139,1561,84100,3.0,2003.0,American Wedding


In [19]:
get_users_avg_rating(customer_id=customer_id_use)

14877    3.954545
Name: avg_rating, dtype: float64

In [20]:
get_movies_customer_rated_higher_than(customer_id_use, 4)

                     movie_title  rating
movie_id                                
30        Something's Gotta Give     5.0
191             X2: X-Men United     4.0
299        Bridget Jones's Diary     4.0
357        House of Sand and Fog     5.0
457            Kill Bill: Vol. 2     4.0
758                   Mean Girls     4.0
1110            Secondhand Lions     4.0
1220                 Man on Fire     4.0
1307                    S.W.A.T.     4.0
1962              50 First Dates     4.0
2580               Freaky Friday     4.0
3320                 About a Boy     4.0
3624            The Last Samurai     5.0
3756               About Schmidt     4.0
3860              Bruce Almighty     4.0
3925        The Matrix: Reloaded     4.0
3938                     Shrek 2     4.0
4472               Love Actually     4.0
average rating 14877    3.954545
Name: avg_rating, dtype: float64


# Recommendations with collaborative filtering

In [107]:
def build_training_set(df):
    reader = Reader()
    svd = SVD()
    data_new = Dataset.load_from_df(df[['customer_id', 'movie_id', 'rating']], reader)
    trainset = data_new.build_full_trainset()
    svd.fit(trainset)
    return svd

def get_summary(id):
    #>IF MOVIE:  movie_id, movie rating count, movie rating mean
    #>IF CUSTOMER: customer_id, custumer rating count, customer rating mean
    df_count_mean_summary = data_rating.groupby(id)['rating'].agg(['count', 'mean'])
    df_count_mean_summary.index = df_count_mean_summary.index.map(int)
    #>IF MOVIE: returns 1799.0 as a benchmark number
    #IF CUSTOMER: returns 52.0 as a benchmark number
    benchmark = round(df_count_mean_summary['count'].quantile(0.7),0)
    # drop all rows below benchmark
    df_drop_list = df_count_mean_summary[df_count_mean_summary['count'] < benchmark]
    # return all indexes to drop
    return df_drop_list

def get_customer_recommendations(customer_id):
    #> returns movie_id, movie_year, movie_title
    chosen_customer_pred = data_movies.copy()
    # fails if movie_id is the index so we have to reset the index back to normal (0-N)
    chosen_customer_pred = chosen_customer_pred.reset_index()
    # makes sure that we only pick movies that are not in the movie dropped list
    chosen_customer_pred = chosen_customer_pred[~chosen_customer_pred['movie_id'].isin(df_movie_drop_list)]
    # make prediction for customer with id = <customer_id> and put it into 'estimated_score'
    chosen_customer_pred['estimated_score'] = chosen_customer_pred['movie_id'].apply(lambda x: svd.predict(customer_id, x).est)
    # sort by 'estimated score'
    chosen_customer_pred = chosen_customer_pred.sort_values('estimated_score', ascending=False).set_index('movie_id')
    return chosen_customer_pred

def display_rated_content(customer_id=customer_id_use):
    print("Movies/TV Shows rated by customer", customer_id)
    df = all_customers_id_plus_movie_title_rows(customer_id=customer_id)
    df = df[['movie_title', 'rating']].sort_values('rating', ascending=False)
    print(df.set_index('movie_title'))

def display_customers_recommendations(customer_id=customer_id_use, df=[], number_to_show=20):
    print("Movies/TV Shows recommended to customer")
    print(df[['movie_title', 'estimated_score']][0:number_to_show].set_index('movie_title'))

In [64]:
## create a list of the movies that we don't wanna include
df_movie_drop_list = get_summary("movie_id")
#df_customer_drop_list = get_summary("customer_id")

In [125]:
customer_id_to_use = 79724 #customer_id_use 
display_rated_content(customer_id=customer_id_to_use)

Movies/TV Shows rated by customer 79724
                                               rating
movie_title                                          
The X-Files: Season 9                             5.0
American Beauty                                   5.0
Alien: Collector's Edition                        5.0
Lord of the Rings: The Fellowship of the Ring     5.0
La Femme Nikita: Season 2                         5.0
Buffalo '66                                       5.0
Napoleon Dynamite                                 5.0
Dogma                                             5.0
28 Days Later                                     5.0
The Silence of the Lambs                          4.0
Braveheart                                        4.0
The Rocky Horror Picture Show                     4.0
Aliens: Collector's Edition                       4.0
Primer                                            4.0
Reservoir Dogs                                    4.0
What the #$*! Do We Know!?                

In [122]:
chosen_customer_pred = get_customer_recommendations(customer_id_to_use)
display_customers_recommendations(customer_id=customer_id_to_use, df=chosen_customer_pred, number_to_show=25)

Movies/TV Shows recommended to customer
                                              estimated_score
movie_title                                                  
Eternal Sunshine of the Spotless Mind                5.000000
Being John Malkovich                                 4.857555
American Beauty                                      4.789360
Invader Zim                                          4.723880
Sideways                                             4.586447
Yes                                                  4.585005
Napoleon Dynamite                                    4.554650
Wallace & Gromit in Three Amazing Adventures         4.474002
Garden State                                         4.466051
The Simpsons: Season 3                               4.454095
The Wire: Season 1                                   4.443778
Firefly                                              4.431002
This Is Spinal Tap                                   4.426263
Bowling for Columbine         

In [None]:
#for the new user problem
## find highest rated movie / tv shows that have been watched the most -> how likely is new user to click on it ?