In [39]:
#!pip install surprise
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
import pickle
#sns.set_style("darkgrid")

## Callable functions

In [40]:
def all_movie_id_rows(movie_id):
    """Return the rows of the global ``data_rating`` frame that belong to one movie.

    Parameters
    ----------
    movie_id : value compared (``==``) against the ``movie_id`` column.

    Returns
    -------
    pandas.DataFrame with the matching rows (empty when nothing matches).
    """
    is_requested_movie = data_rating['movie_id'].eq(movie_id)
    return data_rating.loc[is_requested_movie]

def all_custumers_id_rows(customer_id):
    """Return the rows of the global ``data_rating`` frame that belong to one customer.

    Parameters
    ----------
    customer_id : value compared (``==``) against the ``customer_id`` column.

    Returns
    -------
    pandas.DataFrame with the matching rows (empty when nothing matches).
    """
    is_requested_customer = data_rating['customer_id'].eq(customer_id)
    return data_rating.loc[is_requested_customer]

def all_customers_get_average_rating():
    """Aggregate the global ``data_rating`` frame into per-customer rating statistics.

    Returns
    -------
    pandas.DataFrame with one row per customer and two-level columns:
    ``customer_id``, ``rating``/``sum``, ``rating``/``count`` and ``avg_rating``
    (sum divided by count).
    """
    per_customer = data_rating.groupby('customer_id').agg({'rating': ['sum', 'count']})
    stats = per_customer.reset_index()
    rating_totals = stats[('rating', 'sum')]
    rating_counts = stats[('rating', 'count')]
    stats['avg_rating'] = rating_totals / rating_counts
    return stats

def get_avg_rating_less_than(max_rating):
    """Print the customers whose average rating is strictly below ``max_rating``.

    Reads the global ``all_customers_average_ratings`` frame; nothing is returned,
    the filtered frame is only printed.
    """
    below_threshold = all_customers_average_ratings['avg_rating'] < max_rating
    matching_customers = all_customers_average_ratings[below_threshold]
    print(matching_customers)

def get_users_avg_rating(customer_id):
    """Look up one customer's average rating in ``all_customers_average_ratings``.

    Returns
    -------
    The ``avg_rating`` column of the matching row(s) as a pandas Series
    (not a scalar); the Series is empty when the customer is unknown.
    """
    is_customer = all_customers_average_ratings['customer_id'] == customer_id
    customer_rows = all_customers_average_ratings.loc[is_customer]
    return customer_rows['avg_rating']

## Get the data

In these txt files the movie ids are interleaved with the customer ids in the same column, which means that we have to split that column into two. We do this by taking all rows whose rating is NaN
(meaning that the row refers to a movie and not a customer) and from there we split the data into two columns.

In [41]:
## Load the combined_data files into one large pandas dataframe.
##> columns: customer_id (holds both movie ids and customer ids), rating (NaN = the row is a movie id, not NaN = a customer's rating)
## Only combined_data_1.txt is loaded here; combined_data_2..4 can be read with the same arguments and concatenated onto df_all (pd.concat).
df_all = pd.read_csv(
    './data/combined_data_1.txt',
    header=None,
    names=['customer_id', 'rating'],
    usecols=[0, 1],
)
# Row labels 0..n-1, so a row's label equals its position.
df_all.index = np.arange(len(df_all))
df_all['rating'] = df_all['rating'].astype(float)

## ------------------------------------------------------------------------------------- ##

# Dataframe with the information about the movies, indexed by movie_id.
#> columns: movie_year, movie_title
df_movies = pd.read_csv(
    './data/movie_titles.csv',
    header=None,
    names=['movie_id', 'movie_year', 'movie_title'],
    usecols=[0, 1, 2],
    encoding="latin1",
).set_index('movie_id')
df_movies

Unnamed: 0_level_0,movie_year,movie_title
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW
...,...,...
17766,2002.0,Where the Wild Things Are and Other Maurice Se...
17767,2004.0,Fidel Castro: American Experience
17768,2000.0,Epoch
17769,2003.0,The Company


In [42]:
# Display the ratings frame (movie_id, customer_id, rating).
# NOTE(review): data_rating is only assigned further down, by the create_dataframe() call,
# so on a fresh kernel that cell has to run before this one.
data_rating

Unnamed: 0,movie_id,customer_id,rating
0,1,1488844,3.0
1,1,822109,5.0
2,1,885013,4.0
3,1,30878,4.0
4,1,823519,3.0
...,...,...,...
24053759,4499,2591364,2.0
24053760,4499,1791000,2.0
24053761,4499,512536,5.0
24053762,4499,988963,3.0


In [43]:
# All ratings given by customer 1488844, with the movie year/title looked up in df_movies.
all_custumers_id_rows(1488844).join(df_movies, on="movie_id")

Unnamed: 0,movie_id,customer_id,rating,movie_year,movie_title
0,1,1488844,3.0,2003.0,Dinosaur Planet
5149,8,1488844,4.0,2004.0,What the #$*! Do We Know!?
24352,17,1488844,2.0,2005.0,7 Seconds
93266,30,1488844,3.0,2003.0,Something's Gotta Give
224744,44,1488844,3.0,1996.0,Spitfire Grill
...,...,...,...,...,...
23949455,4479,1488844,5.0,1980.0,Ordinary People
23977591,4485,1488844,2.0,1953.0,House of Wax / The Mystery of the Wax Museum
23981209,4488,1488844,5.0,2000.0,Wonder Boys
24015690,4490,1488844,3.0,2004.0,Ned Kelly


In [44]:
## If use_pickle is True the rows are read from the cached pickle, so the dataframe does not have to be rebuilt.
## However, if another combined_data file is added to df_all, call with use_pickle=False once to rebuild the cache from the new data.
def create_dataframe(use_pickle=True):
    """Build the ratings dataframe with columns movie_id, customer_id, rating.

    Parameters
    ----------
    use_pickle : bool, default True
        True  -> load the previously cached rows from
                 ``pickle/movies_customers_ratings.pickle``.
        False -> rebuild the rows from the global ``df_all`` frame and overwrite
                 that pickle file with them.

    Returns
    -------
    pandas.DataFrame
        One row per rating; ``movie_id`` and ``customer_id`` are cast to int and
        ``rating`` to float.

    Raises
    ------
    OSError
        If the pickle file cannot be opened (e.g. it has not been created yet,
        or the ``pickle`` directory does not exist when writing).
    """
    pickle_path = "pickle/movies_customers_ratings.pickle"
    if use_pickle:
        # NOTE: pickle.load can execute arbitrary code - only load a cache file
        # that this notebook wrote itself.
        with open(pickle_path, "rb") as in_pickle:
            rows = pickle.load(in_pickle)
    else:
        rows = []
        last_movie_id = "1"
        # Walk ids and ratings together in one pass instead of looking each rating
        # up by label; `index` stays the 1-based row number stored in the cache.
        for index, (customer_id, rating) in enumerate(zip(df_all["customer_id"], df_all["rating"]), start=1):
            # a ":" after the first character means this row is a movie_id, not a customer_id
            if customer_id.find(":") > 0:
                last_movie_id = customer_id.replace(":", "")
            else:
                rows.append([last_movie_id, customer_id, rating, index])
        # output to pickle file; the with-block closes it even if dump() raises
        with open(pickle_path, "wb") as out_pickle:
            pickle.dump(rows, out_pickle)

    # the cached rows carry the row number as a 4th field that the dataframe does not need
    data = pd.DataFrame(rows, columns=['movie_id', 'customer_id', 'rating', 'index']).drop(['index'], axis=1)

    # change columns to numerical
    data['movie_id'] = data['movie_id'].astype(int)
    data['customer_id'] = data['customer_id'].astype(int)
    data["rating"] = data["rating"].astype(float)
    return data

#> returns index, movie_id, customer_id, rating
# Loads the cached rows from the pickle file; pass use_pickle=False to rebuild them from df_all.
data_rating = create_dataframe(use_pickle=True)

In [45]:
# Attach the movie year and title to every customer rating.
#> returns index, movie_id, customer_id, rating, movie_year, movie_title
# Inner merge: only ratings whose movie_id is present in df_movies are kept.
data_rating_plus_movie_title = pd.merge(
    data_rating,
    df_movies,
    how="inner",
    on="movie_id",
)
data_rating_plus_movie_title

Unnamed: 0,movie_id,customer_id,rating,movie_year,movie_title
0,1,1488844,3.0,2003.0,Dinosaur Planet
1,1,822109,5.0,2003.0,Dinosaur Planet
2,1,885013,4.0,2003.0,Dinosaur Planet
3,1,30878,4.0,2003.0,Dinosaur Planet
4,1,823519,3.0,2003.0,Dinosaur Planet
...,...,...,...,...,...
24053759,4499,2591364,2.0,2002.0,In My Skin
24053760,4499,1791000,2.0,2002.0,In My Skin
24053761,4499,512536,5.0,2002.0,In My Skin
24053762,4499,988963,3.0,2002.0,In My Skin


## Working with the data

In [46]:
# Show every rating row (movie_id, customer_id, rating) for one specific customer, here customer 6
all_custumers_id_rows(customer_id=6)

Unnamed: 0,movie_id,customer_id,rating
187297,30,6,3.0
539827,157,6,3.0
576723,173,6,4.0
649632,175,6,5.0
795442,191,6,2.0
...,...,...,...
23266008,4356,6,4.0
23526258,4393,6,3.0
23586501,4406,6,3.0
23769931,4432,6,3.0


In [47]:
# Compute the rating sum, rating count and average rating for every customer.
# The average tells us whether a customer typically gives low or high ratings,
# so a single rating can be judged against that customer's own baseline
# (does he really hate or love a movie?).
all_customers_average_ratings = all_customers_get_average_rating()
print(all_customers_average_ratings)

       customer_id rating       avg_rating
                      sum count           
0                6  510.0   153   3.333333
1                7  793.0   195   4.066667
2                8   84.0    21   4.000000
3               10  164.0    49   3.346939
4               25   14.0     4   3.500000
...            ...    ...   ...        ...
470753     2649404   49.0    12   4.083333
470754     2649409   40.0    10   4.000000
470755     2649421   15.0     3   5.000000
470756     2649426  301.0    74   4.067568
470757     2649429  258.0    62   4.161290

[470758 rows x 4 columns]


# Recommendations with collaborative filtering

In [48]:
# Number of movies rated per customer; the movie_id column holds the count.
count_movies_rated_by_customer = (
    data_rating
    .groupby('customer_id')[['movie_id']]
    .count()
    .reset_index()
)

In [49]:
# Display the per-customer counts; the movie_id column is the number of movies that customer rated.
count_movies_rated_by_customer

Unnamed: 0,customer_id,movie_id
0,6,153
1,7,195
2,8,21
3,10,49
4,25,4
...,...,...
470753,2649404,12
470754,2649409,10
470755,2649421,3
470756,2649426,74


In [50]:
# Rating rows of customer 25
all_custumers_id_rows(25)

Unnamed: 0,movie_id,customer_id,rating
668785,178,25,3.0
4024590,761,25,4.0
17901555,3427,25,2.0
23685758,4432,25,5.0


# Average rating stats

In [51]:
# Average rating of customer 25, returned as a pandas Series taken from all_customers_average_ratings
get_users_avg_rating(25)

4    3.5
Name: avg_rating, dtype: float64

In [52]:
# Print the customers whose average rating is strictly below 5
get_avg_rating_less_than(5)

       customer_id rating       avg_rating
                      sum count           
0                6  510.0   153   3.333333
1                7  793.0   195   4.066667
2                8   84.0    21   4.000000
3               10  164.0    49   3.346939
4               25   14.0     4   3.500000
...            ...    ...   ...        ...
470752     2649401  118.0    30   3.933333
470753     2649404   49.0    12   4.083333
470754     2649409   40.0    10   4.000000
470756     2649426  301.0    74   4.067568
470757     2649429  258.0    62   4.161290

[459896 rows x 4 columns]


In [53]:
# Print the customers whose average rating is strictly below 4
get_avg_rating_less_than(4)

       customer_id rating       avg_rating
                      sum count           
0                6  510.0   153   3.333333
3               10  164.0    49   3.346939
4               25   14.0     4   3.500000
5               33   38.0    11   3.454545
6               42  142.0    36   3.944444
...            ...    ...   ...        ...
470747     2649376  133.0    34   3.911765
470748     2649378  233.0    72   3.236111
470750     2649384   23.0     7   3.285714
470751     2649388  257.0    77   3.337662
470752     2649401  118.0    30   3.933333

[331845 rows x 4 columns]


In [59]:
# Average rating of customer 2649429 (pandas Series)
get_users_avg_rating(2649429)

470757    4.16129
Name: avg_rating, dtype: float64

In [37]:
# Save the first 10,000 rating rows as a small sample pickle.
# The with-block closes the file even when pickle.dump raises.
with open("pickle/small_movies_customers_ratings.pickle", "wb") as movies_customers_ratings:
    pickle.dump(data_rating[0:10000], movies_customers_ratings)

In [60]:
# Ratings (with movie year and title) given by customer 2648907
data_rating_plus_movie_title.loc[
    data_rating_plus_movie_title['customer_id'].eq(2648907)
]

Unnamed: 0,movie_id,customer_id,rating,movie_year,movie_title
8621267,1719,2648907,1.0,2004.0,The Life Aquatic with Steve Zissou
11592374,2200,2648907,2.0,2002.0,Collateral Damage
16953480,3282,2648907,5.0,2004.0,Sideways
17920476,3427,2648907,1.0,2002.0,Men in Black II
19147859,3638,2648907,1.0,2003.0,Bad Boys II
21832672,4123,2648907,1.0,1998.0,Patch Adams


In [None]:
def get_movies_liked_by_customer(customer_id, min_rating=0):
    """Print and return the movies a customer rated at least ``min_rating``.

    Parameters
    ----------
    customer_id : id matched against ``data_rating['customer_id']``.
    min_rating : lowest rating (inclusive) for a movie to be listed; default 0.

    Returns
    -------
    pandas.DataFrame with the columns ``movie_title`` and ``rating``; the titles
    are looked up in ``df_movies`` by movie_id.

    Side effects: prints the frame, an empty line, and the result of
    ``get_users_avg_rating(customer_id)``.
    """
    is_customer = data_rating['customer_id'] == customer_id
    is_liked = data_rating['rating'] >= min_rating
    liked_movies = (
        data_rating[is_customer & is_liked]
        .set_index('movie_id')
        .join(df_movies, on="movie_id")
    )
    liked_movies = liked_movies[['movie_title', 'rating']]
    print(liked_movies)
    print("")
    print('avg rating', get_users_avg_rating(customer_id))
    return liked_movies
# Movies that customer 1488844 rated 5 or higher (min_rating=5)
df_customer_1488844 = get_movies_liked_by_customer(1488844, 5)

In [None]:
# Per-movie rating count and mean rating.
# Uses data_rating: `data` only exists as a local variable inside create_dataframe(),
# so referencing it here raises NameError on a fresh kernel.
df_movie_summary = data_rating.groupby('movie_id')['rating'].agg(['count', 'mean'])
df_movie_summary.index = df_movie_summary.index.map(int)
# Benchmark = 70th percentile of the per-movie rating counts (rounded);
# movies with fewer ratings than the benchmark are put on the drop list.
movie_benchmark = round(df_movie_summary['count'].quantile(0.7),0)
drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index

In [None]:
# Start from the movies customer 1488844 liked (movie_id moved from the index back to a column)
# and keep only the movies that are not in drop_movie_list.
# The previous `df_movies.copy()` assignment was dead code: it was overwritten immediately.
# .copy() makes the filtered result an independent frame, so adding the estimated_score
# column afterwards does not trigger pandas' SettingWithCopyWarning.
customer_1488844_pred = df_customer_1488844.reset_index()
customer_1488844_pred = customer_1488844_pred[~customer_1488844_pred['movie_id'].isin(drop_movie_list)].copy()

In [None]:
# Fit an SVD model on every rating in data_rating.
reader = Reader()
svd = SVD()

# `data` is only a local variable inside create_dataframe(); the notebook-level
# ratings frame is data_rating, so that is what gets loaded here.
# Column order passed to surprise: user (customer_id), item (movie_id), rating.
data_new = Dataset.load_from_df(data_rating[['customer_id', 'movie_id', 'rating']], reader)
trainset = data_new.build_full_trainset()
svd.fit(trainset)

In [None]:
# Estimate customer 1488844's score for every remaining movie with the fitted SVD model,
# index the result by movie_id and list the highest estimates first.
# NOTE: movie_id becomes the index here, so this cell cannot simply be run a second time.
customer_1488844_pred['estimated_score'] = customer_1488844_pred['movie_id'].apply(
    lambda movie_id: svd.predict(1488844, movie_id).est
)
customer_1488844_pred = (
    customer_1488844_pred
    .set_index('movie_id')
    .sort_values('estimated_score', ascending=False)
)

In [None]:

# Per movie title: the customer's actual rating next to the SVD estimated score
print(customer_1488844_pred[['movie_title', 'rating', 'estimated_score']].set_index('movie_title'))

In [None]:
# For the new-user problem:
## find the highest-rated movies / TV shows that have been watched the most -> how likely is a new user to click on them?
