In [None]:
#!pip install surprise
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
import pickle
#sns.set_style("darkgrid")

## Callable functions

In [None]:
def all_movie_id_rows(movie_id):
    """Return every row of ``data_rating`` whose ``movie_id`` equals ``movie_id``."""
    is_requested_movie = data_rating['movie_id'] == movie_id
    return data_rating[is_requested_movie]

def all_custumers_id_rows(customer_id):
    """Return every row of ``data_rating`` whose ``customer_id`` equals ``customer_id``."""
    is_requested_customer = data_rating['customer_id'] == customer_id
    return data_rating[is_requested_customer]

def all_customers_id_plus_movie_title_rows(customer_id):
    """Return one customer's rows from ``data_rating_plus_movie_title``.

    Parameters
    ----------
    customer_id : int
        Customer whose rows are selected.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data_rating_plus_movie_title`` (ratings merged with the
        movie year and title) whose ``customer_id`` equals ``customer_id``.
    """
    # Filter on the argument, not on the notebook-level `customer_id_use`,
    # so the function honours whatever id the caller passes in.
    matches_customer = data_rating_plus_movie_title['customer_id'] == customer_id
    return data_rating_plus_movie_title[matches_customer]

def all_customers_get_average_rating():
    """Return per-customer rating sum, rating count and average rating.

    The frame keeps the two-level column layout produced by the aggregation
    (``rating`` -> ``sum`` / ``count``) and gains an ``avg_rating`` column.
    """
    ratings_by_customer = data_rating.groupby('customer_id')
    stats = ratings_by_customer.agg({'rating': ['sum', 'count']})
    stats = stats.reset_index()

    rating_totals = stats['rating']
    stats['avg_rating'] = rating_totals['sum'] / rating_totals['count']
    return stats

def all_customers_get_movie_rated_count():
    """Return, for each ``customer_id``, the count of its ``movie_id`` entries in ``data_rating``."""
    ratings_by_customer = data_rating.groupby('customer_id')
    rated_counts = ratings_by_customer.agg({'movie_id': 'count'})
    return rated_counts.reset_index()

def get_avg_rating_less_than(max_rating):
    """Print the rows of ``all_customers_average_ratings`` whose ``avg_rating`` is strictly below ``max_rating``."""
    below_threshold = all_customers_average_ratings['avg_rating'] < max_rating
    print(all_customers_average_ratings[below_threshold])

def get_users_avg_rating(customer_id):
    """Return the ``avg_rating`` values of the rows matching ``customer_id``.

    The lookup is done in ``all_customers_average_ratings``; the result is the
    selected ``avg_rating`` column, not a bare number.
    """
    is_customer = all_customers_average_ratings['customer_id'] == customer_id
    customer_stats = all_customers_average_ratings[is_customer]
    return customer_stats['avg_rating']



## Get the data

In [None]:
## If use_pickle is True the ratings are loaded from the existing pickle file, so the dataframe does not have to be rebuilt.
## However, if you add another csv file to df_all you have to set use_pickle = False to pick up the new data.
def create_dataframe(use_pickle=True, pickle_path="pickle/movies_customers_ratings.pickle", raw_ratings=None):
    """Build the dataframe of (movie_id, customer_id, rating) rows.

    Parameters
    ----------
    use_pickle : bool
        When truthy, load the previously parsed rows from ``pickle_path``.
        Otherwise parse ``raw_ratings`` and overwrite ``pickle_path`` with
        the parsed rows.
    pickle_path : str
        Location of the pickle cache (read when ``use_pickle`` is truthy,
        written otherwise).
    raw_ratings : pandas.DataFrame, optional
        Frame with a string ``customer_id`` column and a ``rating`` column in
        which rows such as ``"12:"`` announce the movie id for the customer
        rows that follow. Defaults to the notebook-level ``df_all``.

    Returns
    -------
    pandas.DataFrame
        Columns ``movie_id`` (int), ``customer_id`` (int) and ``rating`` (float).
    """
    columns = ['movie_id', 'customer_id', 'rating', 'index']

    if use_pickle:
        # NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
        with open(pickle_path, "rb") as in_pickle:
            rows = pickle.load(in_pickle)
    else:
        source = df_all if raw_ratings is None else raw_ratings
        last_movie_id = "1"
        rows = []
        # Walk both columns together; `index` is the 1-based row position that
        # is stored in the pickle next to each rating.
        pairs = zip(source["customer_id"], source["rating"])
        for index, (customer_id, rating) in enumerate(pairs, start=1):
            # if we find : that means this is a movie_id and not customer_id
            if customer_id.find(":") > 0:
                last_movie_id = customer_id.replace(":", "")
            else:
                rows.append([last_movie_id, customer_id, rating, index])
        # output to pickle file; the context manager closes it even on error
        with open(pickle_path, "wb") as out_pickle:
            pickle.dump(rows, out_pickle)

    data = pd.DataFrame(rows, columns=columns).drop(['index'], axis=1)

    # change columns to numerical
    data['movie_id'] = data['movie_id'].astype(int)
    data['customer_id'] = data['customer_id'].astype(int)
    data["rating"] = data["rating"].astype(float)
    return data


In these txt files the movie_ids have been placed amongst the customer_ids, which means that we have to split that column into two. We do this by taking all ratings that are equal to NaN
(meaning that the row refers to a movie and not a customer), and from there we split it into two columns.

In [None]:
## Load the combined_data files into one large pandas dataframe
##> returns index, customer_id (which is both movie and customer), rating (NaN = customer_id is a movie, Not NaN = customers rating)
df_all = pd.read_csv(
    './data/combined_data_1.txt',
    header=None,
    names=['customer_id', 'rating'],
    usecols=[0, 1],
)
#df_all = df_all.append(pd.read_csv('./data/combined_data_2.txt', header = None, names = ['customer_id', 'rating'], usecols = [0,1]))
#df_all = df_all.append(pd.read_csv('./data/combined_data_3.txt', header = None, names = ['customer_id', 'rating'], usecols = [0,1]))
#df_all = df_all.append(pd.read_csv('./data/combined_data_4.txt', header = None, names = ['customer_id', 'rating'], usecols = [0,1]))
# Renumber the rows 0 .. len(df_all)-1
df_all.index = np.arange(len(df_all))
df_all['rating'] = df_all['rating'].astype(float)

## ------------------------------------------------------------------------------------- ##

# Dataframe with the information about the movies, indexed by movie_id
#> returns movie_id, movie_year, movie_title
data_movies = pd.read_csv(
    './data/movie_titles.csv',
    header=None,
    names=['movie_id', 'movie_year', 'movie_title'],
    usecols=[0, 1, 2],
    encoding="latin1",
).set_index('movie_id')

## ------------------------------------------------------------------------------------- ##

# Dataframe with every movie rating given by a customer
#> returns index, movie_id, customer_id, rating
data_rating = create_dataframe(use_pickle=True)

## ------------------------------------------------------------------------------------- ##

# Attach the movie year and title to every customer rating
#> returns index, movie_id, customer_id, rating, movie_year, movie_title
data_rating_plus_movie_title = data_rating.merge(data_movies, how="inner", on="movie_id")
data_rating_plus_movie_title

## Working with the data

### Details about all customers

In [None]:
# Compute the average movie rating of every customer.
# This is used to determine whether a customer typically gives bad or good reviews,
# so that we can then see whether they really hate or love a movie.
all_customers_average_ratings = all_customers_get_average_rating()
print(all_customers_average_ratings)

In [None]:
# Print the customers whose average rating is strictly below 5
get_avg_rating_less_than(5)

In [None]:
# Print the customers whose average rating is strictly below 4
get_avg_rating_less_than(4)

In [None]:
# One row per customer_id; the movie_id column holds the count of that customer's rating rows
count_movies_rated_by_customer = all_customers_get_movie_rated_count()
count_movies_rated_by_customer

In [None]:
# Print a slice (positions 9000 to 13999) of the sorted unique customer ids
print(np.unique(data_rating['customer_id'])[9000:14000])

### Details about a specific customer

In [None]:
# ids I like -> 28812, 56514, 56520
# customer_id_use selects the customer inspected by the calls that follow
customer_id_use = 28812

In [None]:
#all_custumers_id_rows(customer_id=customer_id_use)
## same rows as the commented-out call above, but including movie_year and movie_title
all_customers_id_plus_movie_title_rows(customer_id=customer_id_use)

In [None]:
# Average rating given by the selected customer (returned as the selected avg_rating column)
get_users_avg_rating(customer_id=customer_id_use)

# Recommendations with collaborative filtering

In [None]:
# movies_customers_ratings = open("pickle/small_movies_customers_ratings.pickle","wb")
# pickle.dump(data_rating[0:10000], movies_customers_ratings)
# movies_customers_ratings.close()

In [None]:
def get_movies_liked_by_customer(customer_id, min_rating=0):
    """Print and return the movies a customer rated at least ``min_rating``.

    The returned frame is indexed by ``movie_id`` and holds the
    ``movie_title`` and ``rating`` columns. The customer's average rating is
    printed after the frame.
    """
    by_customer = data_rating['customer_id'] == customer_id
    rated_high_enough = data_rating['rating'] >= min_rating

    liked = data_rating[by_customer & rated_high_enough].set_index('movie_id')
    # Look up the titles in data_movies
    liked = liked.join(data_movies, on="movie_id")
    liked = liked[['movie_title', 'rating']]

    print(liked)
    print("")
    print('avg rating', get_users_avg_rating(customer_id))
    return liked
# Movies that customer 1488844 rated 5 or higher
df_customer_1488844 = get_movies_liked_by_customer(1488844, 5)

In [None]:
# Display the movie_title / rating frame of customer 1488844
df_customer_1488844

In [None]:
# Per-movie statistics: number of ratings and mean rating.
# data_rating is the notebook-level ratings frame; the bare name `data` only
# exists as a local inside create_dataframe and raises NameError here.
df_movie_summary = data_rating.groupby('movie_id')['rating'].agg(['count', 'mean'])
df_movie_summary.index = df_movie_summary.index.map(int)
# Movies with fewer ratings than the 70% quantile of the rating counts go on the drop list
movie_benchmark = round(df_movie_summary['count'].quantile(0.7),0)
drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index

In [None]:
# Start from the movies the customer already rated (movie_id, movie_title, rating),
# so the actual rating can be shown next to the estimated score later on.
customer_1488844_pred = df_customer_1488844.reset_index()
# Leave out the movies on the drop list (too few ratings)
customer_1488844_pred = customer_1488844_pred[~customer_1488844_pred['movie_id'].isin(drop_movie_list)]

In [None]:
# Train an SVD collaborative-filtering model on every rating.
reader = Reader()
svd = SVD()

# data_rating is the notebook-level ratings frame; the bare name `data` only
# exists as a local inside create_dataframe and raises NameError here.
# load_from_df is given the columns in the order customer, movie, rating.
data_new = Dataset.load_from_df(data_rating[['customer_id', 'movie_id', 'rating']], reader)
trainset = data_new.build_full_trainset()
svd.fit(trainset)

In [None]:
# Ask the fitted model for customer 1488844's estimated rating of every remaining movie
customer_1488844_pred['estimated_score'] = customer_1488844_pred['movie_id'].apply(
    lambda candidate_movie_id: svd.predict(1488844, candidate_movie_id).est
)
# Index by movie_id and list the highest estimates first
customer_1488844_pred = (
    customer_1488844_pred
    .set_index('movie_id')
    .sort_values('estimated_score', ascending=False)
)

In [None]:

# Show the actual rating next to the estimated score, labelled by movie title
print(customer_1488844_pred[['movie_title', 'rating', 'estimated_score']].set_index('movie_title'))

In [None]:
# For the new-user problem:
## find the highest-rated movies / TV shows that have been watched the most -> how likely is a new user to click on them?
