In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from time import time
import os
import json
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Locate the project root: the part of the working directory that precedes the
# first occurrence of "Code".
_cwd = os.getcwd()
_code_pos = _cwd.find("Code")
if _code_pos == -1:
    # str.find returns -1 when the substring is absent; slicing with -1 would
    # silently drop the last character of the path instead of failing clearly.
    raise FileNotFoundError(
        "Expected the notebook to run from inside the 'Code' directory, got: " + _cwd
    )
_project_root = _cwd[:_code_pos]

# Reads the downsampled dataframe
df = pd.read_csv(_project_root + "/Data/netflix-prize/downsampled-csv/few_samples.csv")
# Reads the json with all the clusters for each user
path = _project_root + "/Data/user-clusters/clusters.json"
with open(path, "r") as s:
    clusters = json.load(s)

def get_predictions_for_user(user):
    """Fit a linear model from cluster-average ratings to one user's ratings.

    For every movie the user has rated, the mean rating given by the users of
    the user's cluster is the single feature and the user's own rating is the
    target. The rows are split 70/30 (stratified on the user's rating) and a
    LinearRegression is fitted on the training part.

    Reads the module-level ``df`` (columns ``movie_id``, ``user_id``,
    ``rating``) and ``clusters`` (user id -> whitespace-separated user ids).

    :param user: user id, as used for the keys of ``clusters`` (e.g. "915")
    :return: tuple ``(lr, predictions, r_squared)``: the fitted
        LinearRegression, a DataFrame with columns ``movie_id`` and
        ``user_predicted_score`` for the held-out movies, and the R^2 score
        of the model on the held-out movies
    :raises ValueError: if the user has no usable ratings or some rating value
        occurs only once, which makes the stratified split impossible
    """
    # The cluster file is keyed by string ids, while df["user_id"] is compared
    # against integers elsewhere in the notebook; normalise both forms once.
    user_key = str(user)
    user_id = int(user)

    # Gets the cluster of users for user
    similar_user_ids = [int(uid) for uid in clusters[user_key].split()]

    # Average rating per movie among the users of the cluster
    cluster_avg = (
        df.loc[df["user_id"].isin(similar_user_ids)]
        .groupby("movie_id")["rating"]
        .mean()
        .rename("cluster_avg_rating")
        .reset_index()
    )
    # The user's own ratings, named explicitly instead of relying on the
    # column order produced by the merge
    user_ratings = (
        df.loc[df["user_id"] == user_id, ["movie_id", "rating"]]
        .rename(columns={"rating": "user_rating"})
    )
    df_user = (
        cluster_avg.merge(user_ratings, on="movie_id")
        .replace("?", np.nan)
        .dropna()
    )

    # A stratified split needs at least two samples of every rating value;
    # fail with a clear message instead of an unbound-variable error later on.
    rating_counts = df_user["user_rating"].value_counts()
    if rating_counts.empty or rating_counts.min() < 2:
        raise ValueError(
            "Not enough ratings to build a stratified train/test split for user "
            + user_key
        )

    X = df_user[["movie_id", "cluster_avg_rating"]]
    y = df_user["user_rating"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # movie_id only identifies the rows; it is never used as a feature. Selecting
    # the feature columns (rather than dropping in place) leaves the split frames
    # intact, so no second split is needed to recover the movie ids.
    feature_cols = ["cluster_avg_rating"]
    lr = LinearRegression()
    lr.fit(X_train[feature_cols], y_train)
    y_pred = lr.predict(X_test[feature_cols])
    r_squared = lr.score(X_test[feature_cols], y_test)

    predictions = X_test[["movie_id"]].copy()
    predictions["user_predicted_score"] = y_pred
    return lr, predictions, r_squared

# Fit the model for user "915". Despite its name, y_pred is the DataFrame the
# function returns second: held-out movie ids with their predicted scores.
lr, y_pred, r_squared = get_predictions_for_user("915")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [11]:
y_pred.head()

Unnamed: 0,movie_id,user_predicted_score
13,8743,4.96437
2,694,5.162567
6,4429,3.067344
14,8928,3.162581
11,7184,4.15237


In [12]:
r_squared

0.6172653677144122

In [13]:
df_915 = df[df["user_id"] == 915]
# Take the sixth movie (position 5) that user 915 rated with 5 stars. Selecting
# the column before the row keeps the id an integer: pulling a whole row out
# first would upcast it to float if any other column held floats, and
# str(movie_id) would then no longer match the string ids in the metadata.
movie_id = int(df_915.loc[df_915["rating"] == 5, "movie_id"].iloc[5])

In [15]:
# Path of the Data directory: the part of the working directory that precedes
# "Code", followed by "Data".
# NOTE(review): str.find returns -1 when "Code" is absent, in which case the
# slice drops the last character of the path - confirm the notebook is always
# run from inside the Code directory.
path = os.getcwd()[:os.getcwd().find("Code")] + "Data"
def default_progress_handler(percentage):
    """
    Default progress callback: prints the parsing progress
    :param percentage: progress to report, as a percentage
    """
    print('parsing metadata: ' + str(percentage))


def load_from_txt(data_dir, progress_handler=default_progress_handler):
    """
    Function to load the movie metadata provided by netflix
    :param data_dir: path to the Data directory
    :param progress_handler: function responsible for feeding progress updates back to gui
    :return: pandas dataframe with movie metadata (string columns 'id', 'year', 'title')
    :raises ValueError: if a non-blank line has fewer than two comma-separated fields
    """
    path = os.path.join(data_dir, "netflix-prize")
    # Number of titles assumed for the progress percentage; a progress update
    # is emitted roughly every 1% of that count.
    num_movies = 17770
    progress_step = int(num_movies*0.01)
    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the frame per row.
    records = []
    with open(os.path.join(path, 'movie_titles.csv'), "r", encoding = "ISO-8859-1") as s:
        for line_number, raw_line in enumerate(s, start=1):
            line = raw_line.strip()
            if not line:
                # Skip blank lines instead of treating the first one as end of file
                continue
            # Only the first two commas are separators; the title may contain commas
            tokens = line.split(",", 2)
            if len(tokens) < 2:
                raise ValueError(
                    "Malformed line " + str(line_number) + " in movie_titles.csv: " + line
                )
            records.append(
                {'id': tokens[0], #movie id
                'year': tokens[1], #year
                'title': tokens[2] if len(tokens) > 2 else ''}) # movie_title
            movie_count = len(records)
            if movie_count % progress_step == 0:
                progress_handler(movie_count/num_movies*100)
    progress_handler(100)

    return pd.DataFrame(records, columns=['id', 'year', 'title'])
# Parse the metadata: the cells below look titles and ids up in `movie_titles`,
# so it has to be defined when the notebook is run from a fresh kernel.
movie_titles = load_from_txt(path)

parsing metadata: 0.996060776589758
parsing metadata: 1.992121553179516
parsing metadata: 2.9881823297692742
parsing metadata: 3.984243106359032
parsing metadata: 4.980303882948791
parsing metadata: 5.9763646595385485
parsing metadata: 6.972425436128306
parsing metadata: 7.968486212718064
parsing metadata: 8.964546989307824
parsing metadata: 9.960607765897581
parsing metadata: 10.95666854248734
parsing metadata: 11.952729319077097
parsing metadata: 12.948790095666855
parsing metadata: 13.944850872256612
parsing metadata: 14.94091164884637
parsing metadata: 15.936972425436128
parsing metadata: 16.933033202025886
parsing metadata: 17.929093978615647
parsing metadata: 18.925154755205405
parsing metadata: 19.921215531795163
parsing metadata: 20.91727630838492
parsing metadata: 21.91333708497468
parsing metadata: 22.909397861564436
parsing metadata: 23.905458638154194
parsing metadata: 24.90151941474395
parsing metadata: 25.89758019133371
parsing metadata: 26.893640967923467
parsing metadat

In [16]:
# Title of the selected movie; the metadata stores ids as strings
matching_titles = movie_titles.loc[movie_titles["id"] == str(movie_id), "title"]
title = matching_titles.iloc[0]
title

'Ice Age'

In [17]:
# Ratings export with the movie title attached (uses the user_id, movie_title
# and rating columns)
path = os.getcwd()[:os.getcwd().find("Code")]
path += "aaron-gauthier-individual-project/Code/export_data_netflix_full_movies.csv"
data = pd.read_csv(path)

# user x movie matrix of ratings, and the ratings column of the reference movie
movie_piv = data.pivot_table(index = 'user_id', columns = 'movie_title', values = 'rating')
movie_user_ratings = movie_piv[title]

# Correlation of every movie's ratings with the reference movie's ratings;
# movies for which no correlation could be computed are dropped
similar_to_movie = movie_piv.corrwith(movie_user_ratings)
corr_movie = similar_to_movie.dropna().to_frame('Correlation')

# Mean rating and number of ratings per movie, computed in a single pass
movie_rating_counts = (
    data.groupby('movie_title')['rating']
    .agg(['mean', 'count'])
    .rename(columns = {'mean': 'rating', 'count': 'number_ratings'})
)

corr_movie = corr_movie.join(movie_rating_counts['number_ratings'])

# Correlations computed from only a few ratings are unreliable, so only movies
# with more than this many ratings are shown
min_ratings = 100
corr_movie[corr_movie['number_ratings'] > min_ratings].sort_values('Correlation', ascending = False).head()

  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


Unnamed: 0_level_0,Correlation,number_ratings
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Ice Age,1.0,432
Look Who's Talking Now,0.596281,101
The Lion King 1 1/2,0.570257,105
Bewitched,0.539448,108
When a Man Loves a Woman,0.526263,105


In [18]:
# Numeric id of the movie titled "The Lion King 1 1/2"
lion_king_ids = movie_titles.loc[movie_titles["title"] == "The Lion King 1 1/2", "id"]
mov_id = int(lion_king_ids.iloc[0])

Let's see if the user has actually watched and rated this movie.

In [20]:
df_915[df_915["movie_id"] == mov_id]

Unnamed: 0,movie_id,user_id,rating


They have not. Let's see if we have a predicted score for it.

In [21]:
y_pred[y_pred["movie_id"] == mov_id]

Unnamed: 0,movie_id,user_predicted_score


Nope, let's try another movie.

In [23]:
# Numeric id of "Look Who's Talking Now", then the user's rating rows for it
talking_now_ids = movie_titles.loc[movie_titles["title"] == "Look Who's Talking Now", "id"]
mov_id = int(talking_now_ids.iloc[0])
df_915.loc[df_915["movie_id"] == mov_id]

Unnamed: 0,movie_id,user_id,rating


In [24]:
y_pred[y_pred["movie_id"] == mov_id]

Unnamed: 0,movie_id,user_predicted_score


Let's use a movie we have predictions for instead and see how it compares

In [29]:
y_pred.iloc[8]

movie_id                7381.000000
user_predicted_score       3.067344
Name: 12, dtype: float64

In [30]:
# Title of movie 7381, then its correlation row against the reference movie
titles_7381 = movie_titles.loc[movie_titles["id"] == str(7381), "title"]
title2 = titles_7381.iloc[0]
corr_movie.loc[title2]

Correlation         0.338701
number_ratings    146.000000
Name: Flubber, dtype: float64

As we can see, the predicted score our user would give to `Flubber` is about 3 stars, while its correlation with `Ice Age`, a movie our user gave 5 stars, is 0.338701. A middling predicted score alongside a moderate correlation suggests the two approaches roughly agree, although a single movie is only anecdotal evidence.