In [1]:
## INSTALL IF YOU DON'T HAVE SURPRISE ON YOUR MACHINE AND WOULD LIKE TO RUN THE CODE
#!pip install surprise
#!pip install import-ipynb

In [1]:
import os
import csv
import pandas as pd
import numpy as np
import heapq
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise import dataset, KNNBaseline, accuracy
from surprise.model_selection import cross_validate
import pickle
from random import *
from surprise import accuracy
from collections import defaultdict
from surprise import KNNBasic
from collections import defaultdict
from operator import itemgetter
import NetflixLoadData as NetflixLoadData

# Toggle for working from cached pickle files; not read anywhere in this cell —
# presumably consumed by later cells (TODO confirm).
use_pickle_file = True
# Surprise Reader: each record is laid out as "user item rating" and ratings range from 1 to 5.
reader = Reader(line_format='user item rating', rating_scale=(1, 5))

def save_to_pickle(name, df):
    """Serialize ``df`` to ``pickle/<name>.pickle``.

    Args:
        name: Base file name, without directory or extension.
        df: The object to pickle (any picklable object is accepted).

    The ``pickle/`` directory is created if it does not exist yet, and the
    file handle is closed even when ``pickle.dump`` raises.
    """
    # Make sure the target directory exists so a first run does not fail.
    os.makedirs("pickle", exist_ok=True)
    path_name = "pickle/"+name+".pickle"
    # The context manager guarantees the handle is flushed and closed.
    with open(path_name, "wb") as pickle_file:
        pickle.dump(df, pickle_file)

def load_pickle(name):
    """Load and return the object stored in ``pickle/<name>.pickle``.

    Args:
        name: Base file name, without directory or extension.

    Returns:
        Whatever object was pickled into that file.

    Raises:
        FileNotFoundError: If the pickle file does not exist.
    """
    path_name = "pickle/"+name+".pickle"
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # were produced locally by this notebook.
    # The context manager closes the handle once the object has been read.
    with open(path_name, "rb") as return_input:
        return pickle.load(return_input)

# First run only: uncomment the next line and set max_files to how many rating files
# (1 to 4) from the original Kaggle Netflix Prize dataset you want to work with; fewer is faster.
#NetflixLoadData.first_time_running(max_files=4)

how_many_files_do_you_want_to_use = 2 # choices are 1 to 4
# get_data returns three objects, unpacked here as the titles, the ratings, and their combination.
movie_titles, ratings, movie_and_rating = NetflixLoadData.get_data(number_of_files_of_rating=how_many_files_do_you_want_to_use)

### Functions

In [None]:
def all_id_rows(df, type, item_id):
    """Return every row of ``df`` whose ``type`` column equals ``item_id``."""
    matches = df[type] == item_id
    return df[matches]

def all_average_ratings(df, type='movie_id'):
    """Aggregate ratings per ``type`` value.

    Groups ``df`` by the ``type`` column, computes the sum and count of
    ``rating`` for each group, and adds an ``avg_rating`` column (sum / count).
    The result keeps the two-level column layout produced by ``agg``.
    """
    per_group = df.groupby(type).agg({'rating': ['sum', 'count']})
    ratings_stats = per_group.reset_index()
    rating_cols = ratings_stats['rating']
    ratings_stats['avg_rating'] = rating_cols['sum'] / rating_cols['count']
    return ratings_stats

def customer_average_ratings(df, type='customer_id', customer_id=0):
    """Aggregate ratings for a single ``customer_id``.

    Keeps only the rows whose ``type`` column equals ``customer_id``, then
    computes the sum, count and ``avg_rating`` (sum / count) of ``rating``
    grouped by ``type``.
    """
    only_customer = df[df[type] == customer_id]
    ratings_stats = only_customer.groupby(type).agg({'rating': ['sum', 'count']}).reset_index()
    rating_cols = ratings_stats['rating']
    ratings_stats['avg_rating'] = rating_cols['sum'] / rating_cols['count']
    return ratings_stats

def all_get_rated_count(df, type):
    """Count ``movie_id`` entries per ``type`` value.

    Returns a frame with the ``type`` column and a ``movie_id`` column holding
    the number of (non-null) movie ids in each group.
    """
    counts = df.groupby(type).agg({'movie_id': 'count'})
    return counts.reset_index()

def get_avg_rating_less_than(df, max_rating):
    """Return the rows of ``df`` whose ``avg_rating`` is strictly below ``max_rating``."""
    below_limit = df['avg_rating'] < max_rating
    return df[below_limit]

def get_avg_rating_higher_than(df, min_rating):
    """Return the rows of ``df`` whose ``avg_rating`` is strictly above ``min_rating``."""
    above_limit = df['avg_rating'] > min_rating
    return df[above_limit]

def get_item_avg_rating(df, type, item_id):
    """Return the rows of ``df`` (e.g. an average-rating table) whose ``type`` column equals ``item_id``."""
    is_item = df[type] == item_id
    return df[is_item]


def get_movies_customer_rated_higher_than(df, customer_id, min_rating=4):
    """Return the movies ``customer_id`` rated at or above ``min_rating``.

    Args:
        df: Ratings frame with ``customer_id``, ``rating`` and ``movie_id``
            columns (the notebook passes the ratings-plus-titles frame).
        customer_id: Customer whose ratings are selected.
        min_rating: Inclusive lower bound on ``rating`` (note ``>=``).

    Returns:
        A new frame indexed by ``movie_id``; ``df`` itself is not modified.
    """
    # Boolean indexing already yields a new object, so no up-front copy of the
    # (potentially very large) input frame is needed.
    selected = (df['customer_id'] == customer_id) & (df['rating'] >= min_rating)
    return df[selected].set_index('movie_id')

def get_movies_customer_rated_lower_than(df, customer_id, max_rating=4):
    """Return the movies ``customer_id`` rated strictly below ``max_rating``.

    Args:
        df: Ratings frame with ``customer_id``, ``rating`` and ``movie_id``
            columns (the notebook passes the ratings-plus-titles frame).
        customer_id: Customer whose ratings are selected.
        max_rating: Exclusive upper bound on ``rating`` (note ``<``).

    Returns:
        A new frame indexed by ``movie_id``; ``df`` itself is not modified.
    """
    # Boolean indexing already yields a new object, so no up-front copy of the
    # (potentially very large) input frame is needed.
    selected = (df['customer_id'] == customer_id) & (df['rating'] < max_rating)
    return df[selected].set_index('movie_id')


def display_movies_customer_rated_higher_than(df, customer_id, min_rating=4):
    """Print the titles/ratings a customer rated at or above ``min_rating``, then their average rating.

    ``df`` is expected to be the ratings-plus-movie-title frame. The average
    is taken over all of the customer's ratings in ``df``, not only the
    ones printed.
    """
    liked = get_movies_customer_rated_higher_than(df=df, customer_id=customer_id, min_rating=min_rating)
    print(liked[['movie_title', 'rating']])

    # Sum and count of ratings for every customer, then the derived mean.
    per_customer = df.groupby('customer_id').agg({'rating': ['sum', 'count']}).reset_index()
    rating_totals = per_customer['rating']
    per_customer['avg_rating'] = rating_totals['sum'] / rating_totals['count']

    is_this_customer = per_customer['customer_id'] == customer_id
    print('average rating', per_customer[is_this_customer]['avg_rating'])

def display_movies_customer_rated_lower_than(df, customer_id, max_rating=4):
    """Print the titles/ratings a customer rated below ``max_rating``, then their average rating.

    ``df`` is expected to be the ratings-plus-movie-title frame. The average
    is taken over all of the customer's ratings in ``df``, not only the
    ones printed.
    """
    disliked = get_movies_customer_rated_lower_than(df, customer_id=customer_id, max_rating=max_rating)
    print(disliked[['movie_title', 'rating']])

    # Sum and count of ratings for every customer, then the derived mean.
    per_customer = df.groupby('customer_id').agg({'rating': ['sum', 'count']}).reset_index()
    rating_totals = per_customer['rating']
    per_customer['avg_rating'] = rating_totals['sum'] / rating_totals['count']

    is_this_customer = per_customer['customer_id'] == customer_id
    print('average rating', per_customer[is_this_customer]['avg_rating'])


def get_users_loved_hated_movies(df, customer_id, minmax_rating):
    """Print the titles a customer loved and disliked, split at ``minmax_rating``.

    Ratings greater than or equal to ``minmax_rating`` count as "loved";
    ratings below it count as "disliked". Nothing is returned.
    """
    loved = get_movies_customer_rated_higher_than(df=df, customer_id=customer_id, min_rating=minmax_rating)
    disliked = get_movies_customer_rated_lower_than(df=df, customer_id=customer_id, max_rating=minmax_rating)

    print("User", customer_id ,"loved these movies")
    for title in loved['movie_title']:
        print(title)

    print('')
    print("and disliked these movies")
    for title in disliked['movie_title']:
        print(title)

#def get_customers_who_rated_movie_title(df=data_rating_plus_movie_title, movie_title=""):
def get_customers_who_rated_movie_title(df, movie_title=""):
    """Return the rows of ``df`` whose ``movie_title`` equals ``movie_title`` exactly."""
    title_matches = df["movie_title"] == movie_title
    return df[title_matches]

#def get_avg_rating_for_movie_title(df=data_movies, movie_title=""):
def get_avg_rating_for_movie_title(df, movie_title=""):
    """Return the average-rating row for ``movie_title`` joined with its movie metadata.

    Args:
        df: Frame with ``movie_title`` and ``movie_id`` columns (the commented-out
            signature above suggests this is the movies table — TODO confirm).
        movie_title: Exact title to look up.

    Returns:
        The per-movie rating statistics row(s) for the matched ``movie_id``,
        joined with the matching rows of ``df``.

    Raises:
        IndexError: If no row of ``df`` has this title (``.values[0]`` on an empty array).

    NOTE(review): the rating statistics are computed from the module-level
    ``data_rating`` rather than from a parameter; that name is not defined in
    the visible part of the notebook — confirm it exists before this is called.
    """
    tmp_data_movies = df[df["movie_title"] == movie_title]
    # Only the first matching row's id is used, even if several rows share the title.
    tmp_movie_id = tmp_data_movies['movie_id'].values[0]
    # Lift the columns to two levels (top level 'movie_id') so they can sit next to
    # the two-level columns produced by all_average_ratings.
    tmp_data_movies.columns = pd.MultiIndex.from_product([['movie_id'], tmp_data_movies.columns])
    all_movies_average_rating = all_average_ratings(df=data_rating, type='movie_id')
    # join() is called without `on=`, so rows are matched by index label —
    # NOTE(review): this presumably relies on the stats frame's positional index lining up
    # with df's index for the same movie; verify against the caller's data.
    return get_item_avg_rating(df=all_movies_average_rating , type='movie_id', item_id=tmp_movie_id).join(tmp_data_movies)

### Analyzing data

In [2]:
# Bare last expression so the notebook renders the combined ratings + titles frame
# (pandas shows a truncated preview of the rows).
movie_and_rating

Unnamed: 0,movie_id,customer_id,rating,movie_year,movie_title
0,1,1488844,3,2003.0,Dinosaur Planet
1,1,822109,5,2003.0,Dinosaur Planet
2,1,885013,4,2003.0,Dinosaur Planet
3,1,30878,4,2003.0,Dinosaur Planet
4,1,823519,3,2003.0,Dinosaur Planet
...,...,...,...,...,...
51031350,9210,2420260,1,2002.0,Beijing Bicycle
51031351,9210,761176,3,2002.0,Beijing Bicycle
51031352,9210,459277,3,2002.0,Beijing Bicycle
51031353,9210,2407365,4,2002.0,Beijing Bicycle


### Working with the data