In [33]:
import pandas as pd
import os
import json


# Resolve the project root as the part of the working directory that precedes "Code".
# str.find returns -1 when the marker is missing, and slicing with -1 would silently
# drop the last character of the path, so stop with a clear error instead.
cwd = os.getcwd()
code_dir_pos = cwd.find("Code")
if code_dir_pos == -1:
    raise FileNotFoundError('Could not find "Code" in the working directory: ' + cwd)
project_root = cwd[:code_dir_pos]

# Reads the json with all the clusters for each user
path = project_root + "/Data/user-clusters/clusters.json"
with open(path, "r") as s:
    clusters = json.load(s)

# Gets the cluster of users for user 729846 (the stored value is a whitespace-separated
# string of user ids), then adds the user itself
cluster_members_of_729846 = [int(user) for user in clusters["729846"].split()]
similar_users_to_729846 = cluster_members_of_729846 + [729846]

# Reads the downsampled dataframe and gets the data for the users in user 729846 cluster, including itself
df = pd.read_csv(project_root + "/Data/netflix-prize/downsampled-csv/few_samples.csv", index_col=0)
df_729846 = df[df["user_id"].isin(similar_users_to_729846)]
print(df_729846.shape)
# Same selection without user 729846's own rows, for comparison
print(df[df["user_id"].isin(cluster_members_of_729846)].shape)

# Example of how to get the DataFrame to be augmented with IMDb data
# (add features for each movie) and then to be used to train our models.
# A single groupby pass replaces one full scan of the frame per movie;
# sort=False keeps the movies in order of first appearance, like unique() did.
cluster_avg_by_movie = df_729846.groupby("movie_id", sort=False)["rating"].mean()
movie_ids = list(cluster_avg_by_movie.index)
movies_avg_rating = list(cluster_avg_by_movie.values)

# First rating given by user 729846 to each movie; "?" marks movies the user has not rated
own_rows = df_729846[df_729846["user_id"] == 729846].drop_duplicates("movie_id")
own_rating_by_movie = dict(zip(own_rows["movie_id"], own_rows["rating"]))
user_729846_ratings = [own_rating_by_movie.get(movie_id, "?") for movie_id in movie_ids]

df_729846 = pd.DataFrame({"movie_id": movie_ids, "cluster_avg_rating": movies_avg_rating, "user_729846_rating": user_729846_ratings})
print(df_729846.head())
print(df_729846["user_729846_rating"].value_counts())

(5068, 3)
(4424, 3)
(0, 3)


  result = method(y)


   movie_id  cluster_avg_rating user_729846_rating
0         1            4.000000                  4
1         8            5.000000                  ?
2        17            4.000000                  ?
3        28            4.500000                  5
4        30            4.666667                  ?
?    1747
4     256
5     220
3     117
2      32
1      19
Name: user_729846_rating, dtype: int64


In [29]:
import numpy as np

# Turn the "?" placeholder (no rating from user 729846) into NaN by coercing the rating
# column to numeric, then keep only the rows without missing values.
# This avoids np.NaN, an alias removed in NumPy 2.0, and yields a float column explicitly
# instead of relying on replace() to downcast an object column.
# The frame is rebound rather than mutated in place.
df_729846 = df_729846.assign(
    user_729846_rating=pd.to_numeric(df_729846["user_729846_rating"], errors="coerce")
).dropna()
print(df_729846["user_729846_rating"].value_counts())

4.0    256
5.0    220
3.0    117
2.0     32
1.0     19
Name: user_729846_rating, dtype: int64


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Target: the rating given by user 729846. Features: every remaining column.
target_column = "user_729846_rating"
y = df_729846[target_column]
X = df_729846.drop(columns=[target_column])

# 70/30 split, stratified on the rating so both parts keep the same rating distribution
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# fit() returns the estimator itself, so construction and training can be chained
lr = LinearRegression().fit(X_train, y_train)

# Score on the held-out 30 %
lr.score(X_test, y_test)

0.8502632013835584

In [31]:
# Score on the training split, to compare against the held-out score and spot overfitting
lr.score(X_train, y_train)

0.8757437943876125

## Repeating the same procedure for all users

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
import os
import json
from time import time

# Resolve the project root as the part of the working directory that precedes "Code".
# str.find returns -1 when the marker is missing, and slicing with -1 would silently
# drop the last character of the path, so stop with a clear error instead.
cwd = os.getcwd()
code_dir_pos = cwd.find("Code")
if code_dir_pos == -1:
    raise FileNotFoundError('Could not find "Code" in the working directory: ' + cwd)
project_root = cwd[:code_dir_pos]

# Reads the downsampled dataframe
df = pd.read_csv(project_root + "/Data/netflix-prize/downsampled-csv/few_samples.csv", index_col=0)
# Reads the json with all the clusters for each user
path = project_root + "/Data/user-clusters/clusters.json"
with open(path, "r") as s:
    clusters = json.load(s)


In [27]:
"729846" in df["user_id"]

False

In [2]:
def _build_user_frame(ratings, cluster_ids, user_id):
    """Build the per-user training frame.

    Parameters
    ----------
    ratings : DataFrame with at least the columns "user_id", "movie_id" and "rating".
    cluster_ids : list of int user ids forming the user's cluster.
    user_id : int id of the user whose ratings are the prediction target.

    Returns
    -------
    DataFrame with columns "movie_id", "cluster_avg_rating" and "user_rating",
    one row per rating of `user_id` on a movie also rated by the cluster,
    ordered by movie_id, with rows containing missing values removed.
    """
    # Mean rating of the cluster for every movie; named explicitly so the merged
    # columns never depend on positional order
    cluster_avg = (
        ratings.loc[ratings["user_id"].isin(cluster_ids)]
        .groupby("movie_id")["rating"]
        .mean()
        .rename("cluster_avg_rating")
        .reset_index()
    )
    own = (
        ratings.loc[ratings["user_id"] == user_id, ["movie_id", "rating"]]
        .rename(columns={"rating": "user_rating"})
    )
    # Inner join: only movies rated by both the cluster and the user survive
    return cluster_avg.merge(own, on="movie_id").dropna()


# Trains and scores a basic linear regression model for each user
lr = LinearRegression()
r_squared = {}       # user key (str, as in the json) -> score on the held-out split
user_count = 0
problem_users = []   # users with a rating value that occurs only once (cannot be stratified)
timer = 0
for user in clusters.keys():

    start = time()

    try:
        # The json keys and values are strings; the earlier cells compare user_id
        # against integers, so convert before filtering
        cluster_ids = [int(member) for member in clusters[user].split()]
        df_user = _build_user_frame(df, cluster_ids, int(user))

        rating_counts = df_user["user_rating"].value_counts()
        if rating_counts.empty:
            raise ValueError("no movie rated by both the user and the cluster")

        if rating_counts.min() < 2:  # Condition to avoid error on train_test_split
            # ValueError: The least populated class in y has only 1 member, which is too few.
            # The minimum number of groups for any class cannot be less than 2.
            problem_users.append(user)
        else:
            X = df_user.drop("user_rating", axis=1)
            y = df_user["user_rating"]
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
            lr.fit(X_train, y_train)
            r_squared[user] = lr.score(X_test, y_test)
    except Exception as err:
        # Best effort: report the user and the reason, then continue with the next one.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        print("error:", user, "-", repr(err))

    timer += time() - start
    user_count += 1
    if user_count % 100 == 0:
        print("We have trained and scored models on", user_count, "users,",
              "and it has taken", round(timer), "seconds")

We have trained and scored models on 100 users, and it has taken 9 seconds
We have trained and scored models on 200 users, and it has taken 15 seconds
We have trained and scored models on 300 users, and it has taken 22 seconds
error: 1569028
We have trained and scored models on 400 users, and it has taken 27 seconds
We have trained and scored models on 500 users, and it has taken 32 seconds
We have trained and scored models on 600 users, and it has taken 37 seconds
We have trained and scored models on 700 users, and it has taken 42 seconds
We have trained and scored models on 800 users, and it has taken 47 seconds
We have trained and scored models on 900 users, and it has taken 51 seconds
We have trained and scored models on 1000 users, and it has taken 57 seconds
We have trained and scored models on 1100 users, and it has taken 61 seconds
We have trained and scored models on 1200 users, and it has taken 66 seconds
We have trained and scored models on 1300 users, and it has taken 71 se

In [5]:
# Average of the per-user scores stored in r_squared
per_user_scores = np.asarray(list(r_squared.values()))
per_user_scores.mean()

0.4777365567179733

In [6]:
import operator
# (user, score) pairs ordered from the highest score to the lowest
sorted_r_squared = list(r_squared.items())
sorted_r_squared.sort(key=operator.itemgetter(1), reverse=True)

In [7]:
# Highest-scoring user: the list is sorted by score in descending order
sorted_r_squared[0]

('1915847', 1.0)

In [16]:
# Entry at position 100 of the descending ranking
sorted_r_squared[100]

('1516635', 0.8236819966430549)

In [21]:
# Entry at position 200 of the descending ranking
sorted_r_squared[200]

('1392040', 0.7477141110932599)

In [8]:
# Lowest-scoring user: last entry of the descending ranking
sorted_r_squared[-1]

('262823', -1.4826263035137188)

In [10]:
# Fourth-lowest score in the ranking
sorted_r_squared[-4]

('2096898', -0.18986049201666177)

In [12]:
# Number of rating rows available for user 262823
rows_for_user = df["user_id"].eq(262823)
df.loc[rows_for_user].shape

(32, 3)

In [13]:
# Number of rating rows available for user 2096898
rows_for_user = df["user_id"].eq(2096898)
df.loc[rows_for_user].shape

(35, 3)

In [14]:
# Tenth-lowest score in the ranking
sorted_r_squared[-10]

('59510', -0.020923608297436447)

In [15]:
# Number of rating rows available for user 59510
rows_for_user = df["user_id"].eq(59510)
df.loc[rows_for_user].shape

(433, 3)