# CS 109a Recommendations

## PROJECT INFO

#### Team Members
Maciej Holubiec, Jimena Romero Pinto, Paul von Chamier

---

In [5]:
import matplotlib.pyplot as plt
import datetime as dt
import pandas as pd
import numpy as np
import json
from sklearn.linear_model import Ridge
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso

# LOAD DATA

*** Only for the first time. For later uses skip to the "RELOAD DATA" part. Make sure to download smaller (prepocessed) datasets before running it ***

In [None]:
# LOAD USER
df_user = pd.read_json("user.json", lines = True)

In [None]:
# LOAD BUSINESS
df_business = pd.read_json("business.json", lines = True)

In [None]:
# LOAD REVIEWS
with open('review.json', encoding="utf8") as json_file:      
    data_review = json_file.readlines()
    # this line below may take at least 8-10 minutes of processing for 4-5 million rows. It converts all strings in list to actual json objects. 
    data_review = list(map(json.loads, data_review)) 

df_review = pd.DataFrame(data_review)

# PROCESS

### Select restaurants with more than 30 reviews

In [None]:
df_business.shape

In [None]:
df_business = df_business.drop(["hours", "is_open", "latitude", "longitude", "postal_code", "neighborhood", "attributes"], axis = 1)

In [None]:
df_business_350 = df_business[df_business["review_count"] > 350]

In [None]:
df_business_350.shape

### Select users who gave more than 100 reviews

In [None]:
df_users.shape

In [None]:
df_user = df_user[["user_id", "review_count"]]

In [None]:
df_users_150 = df_user[df_user["review_count"] > 150]

In [None]:
df_users_150.shape

### Filter out reviews to those corresponding to selected users and restaurants

In [None]:
df_review.shape

In [None]:
df_review = df_review.drop(["cool", "date", "funny", "review_id", "text", "useful"], axis = 1)

In [24]:
df_review_350_150 = df_review[df_review["user_id"].isin(df_users_150["user_id"])]

In [27]:
df_review_350_150 = df_review_350_150[df_review_350_150["business_id"].isin(df_business_350["business_id"])]

### Save data for future reference so we deal with smaller files

In [None]:
df_users_150.to_json("df_user_150.json")

In [None]:
df_business_350.to_json("df_business_350.json")

In [30]:
df_review_350_150.to_json("df_review_350_150.json")

# RELOAD DATA

In [82]:
df_user = pd.read_json("df_user_100.json")

In [81]:
df_business = pd.read_json("df_business_30.json")

In [6]:
df_review = pd.read_json("df_review_350_150.json")

### Sample from the data frame because the dataset is still too big

In [7]:
np.random.seed(9001)
fraction_of_df = 0.15

In [8]:
df_review_smaller = df_review.sample(frac=fraction_of_df)

In [9]:
df_review_smaller.shape

(38198, 3)

In [10]:
df_review_smaller.to_json("df_review_smaller.json")

In [2]:
df_review_smaller = pd.read_json("df_review_smaller.json")

### Create latent matrix

In [11]:
r_df = df_review_smaller.pivot(index = 'user_id', columns ='business_id', values = 'stars')
r_df.head()

business_id,--9e1ONYQuAa-CB_Rrw7Tw,-050d_XIor1NpCuWkbIVaQ,-1xuC540Nycht_iWFeJ-dw,-2ToCaDFpTNmmg3QFzxcWg,-3zffZUHoY8bQjGfPSoBKQ,-6h3K1hj0d4DRcZNUtHDuw,-6tvduBzjLI1ISfs3F_qTg,-7H-oXvCxJzuT42ky6Db0g,-95mbLJsa0CxXhpaNL4LvA,-9dmhyBvepc08KPEHlEM0w,...,zcScEL0WEdFkROcnz5379g,zdE82PiD6wquvjYLyhOJNA,zgQHtqX0gqMw1nlBZl2VnQ,zlpLjbwrKuNs8zROgB_qUQ,znWHLW1pt19HzW1VY6KfCA,zoODlH40edpJYLPLkHilNA,zpoZ6WyQUYff18-z4ZU1mA,zrDi4gEaUi64lAMfJU51dw,zrTGcb83AsfyVTMrsCa65A,zwNC-Ow4eIMan2__bS9-rg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---1lKK3aKOuomHnwAkAow,,,,,,,,,,,...,,,,,,,,,,
--2vR0DIsmQ6WfcSzKWigw,,,,,,,,,,,...,,,,,,,,,,
--4q8EyqThydQm-eKZpS-A,,,,,,,,,,,...,,,,,,,,,,
--56mD0sm1eOogphi2FFLw,,,,,,,,,,,...,,,,,,,,,,
--CIuK7sUpaNzalLAlHJKA,,,,,,,,,,,...,,,,,,,,,,


In [12]:
r_df.shape

(15455, 1523)

In [13]:
fill_zero_rf = r_df.fillna(0)

In [14]:
fill_zero_rf.shape

(15455, 1523)

In [15]:
fill_zero_rf.head()

business_id,--9e1ONYQuAa-CB_Rrw7Tw,-050d_XIor1NpCuWkbIVaQ,-1xuC540Nycht_iWFeJ-dw,-2ToCaDFpTNmmg3QFzxcWg,-3zffZUHoY8bQjGfPSoBKQ,-6h3K1hj0d4DRcZNUtHDuw,-6tvduBzjLI1ISfs3F_qTg,-7H-oXvCxJzuT42ky6Db0g,-95mbLJsa0CxXhpaNL4LvA,-9dmhyBvepc08KPEHlEM0w,...,zcScEL0WEdFkROcnz5379g,zdE82PiD6wquvjYLyhOJNA,zgQHtqX0gqMw1nlBZl2VnQ,zlpLjbwrKuNs8zROgB_qUQ,znWHLW1pt19HzW1VY6KfCA,zoODlH40edpJYLPLkHilNA,zpoZ6WyQUYff18-z4ZU1mA,zrDi4gEaUi64lAMfJU51dw,zrTGcb83AsfyVTMrsCa65A,zwNC-Ow4eIMan2__bS9-rg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---1lKK3aKOuomHnwAkAow,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--2vR0DIsmQ6WfcSzKWigw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--4q8EyqThydQm-eKZpS-A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--56mD0sm1eOogphi2FFLw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--CIuK7sUpaNzalLAlHJKA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# MODELS

### Define RMSE error functions

In [16]:
def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())

In [17]:
def rmse2(model, x, y):
    predict = model.predict(x)
    mse2 = rmse(y, predict)
    return mse2

### Baseline Averages

In [18]:
avg_mean = r_df.mean().mean()
avg_mean

3.8393172911341806

In [19]:
rows_length = r_df.shape[0]
cols_length = r_df.shape[1]

In [20]:
cols_means = r_df.mean(axis = 0)
rows_means = r_df.mean(axis = 1)

In [21]:
cols_means.head()

business_id
--9e1ONYQuAa-CB_Rrw7Tw    4.060606
-050d_XIor1NpCuWkbIVaQ    3.771429
-1xuC540Nycht_iWFeJ-dw    4.333333
-2ToCaDFpTNmmg3QFzxcWg    1.625000
-3zffZUHoY8bQjGfPSoBKQ    4.027778
dtype: float64

In [22]:
rows_means.head()

user_id
---1lKK3aKOuomHnwAkAow    4.5
--2vR0DIsmQ6WfcSzKWigw    4.5
--4q8EyqThydQm-eKZpS-A    3.0
--56mD0sm1eOogphi2FFLw    4.0
--CIuK7sUpaNzalLAlHJKA    3.0
dtype: float64

In [23]:
preds_array_avg = np.fromfunction(lambda i, j: rows_means[i] + cols_means[j] - avg_mean, (rows_length, cols_length), dtype=int)

In [24]:
preds_array_avg

array([[ 4.72128877,  4.43211128,  4.99401604, ...,  4.52734938,
         4.31782557,  4.89145194],
       [ 4.72128877,  4.43211128,  4.99401604, ...,  4.52734938,
         4.31782557,  4.89145194],
       [ 3.22128877,  2.93211128,  3.49401604, ...,  3.02734938,
         2.81782557,  3.39145194],
       ..., 
       [ 2.72128877,  2.43211128,  2.99401604, ...,  2.52734938,
         2.31782557,  2.89145194],
       [ 2.22128877,  1.93211128,  2.49401604, ...,  2.02734938,
         1.81782557,  2.39145194],
       [ 4.22128877,  3.93211128,  4.49401604, ...,  4.02734938,
         3.81782557,  4.39145194]])

In [25]:
avg_preds_df = pd.DataFrame(preds_array_avg, columns = r_df.columns, index = r_df.index)

In [26]:
avg_preds_df.head()

business_id,--9e1ONYQuAa-CB_Rrw7Tw,-050d_XIor1NpCuWkbIVaQ,-1xuC540Nycht_iWFeJ-dw,-2ToCaDFpTNmmg3QFzxcWg,-3zffZUHoY8bQjGfPSoBKQ,-6h3K1hj0d4DRcZNUtHDuw,-6tvduBzjLI1ISfs3F_qTg,-7H-oXvCxJzuT42ky6Db0g,-95mbLJsa0CxXhpaNL4LvA,-9dmhyBvepc08KPEHlEM0w,...,zcScEL0WEdFkROcnz5379g,zdE82PiD6wquvjYLyhOJNA,zgQHtqX0gqMw1nlBZl2VnQ,zlpLjbwrKuNs8zROgB_qUQ,znWHLW1pt19HzW1VY6KfCA,zoODlH40edpJYLPLkHilNA,zpoZ6WyQUYff18-z4ZU1mA,zrDi4gEaUi64lAMfJU51dw,zrTGcb83AsfyVTMrsCa65A,zwNC-Ow4eIMan2__bS9-rg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---1lKK3aKOuomHnwAkAow,4.721289,4.432111,4.994016,2.285683,4.68846,3.771794,4.271794,4.478865,4.115228,4.535683,...,4.374968,4.732111,3.860683,3.73963,4.131271,5.182422,4.994016,4.527349,4.317826,4.891452
--2vR0DIsmQ6WfcSzKWigw,4.721289,4.432111,4.994016,2.285683,4.68846,3.771794,4.271794,4.478865,4.115228,4.535683,...,4.374968,4.732111,3.860683,3.73963,4.131271,5.182422,4.994016,4.527349,4.317826,4.891452
--4q8EyqThydQm-eKZpS-A,3.221289,2.932111,3.494016,0.785683,3.18846,2.271794,2.771794,2.978865,2.615228,3.035683,...,2.874968,3.232111,2.360683,2.23963,2.631271,3.682422,3.494016,3.027349,2.817826,3.391452
--56mD0sm1eOogphi2FFLw,4.221289,3.932111,4.494016,1.785683,4.18846,3.271794,3.771794,3.978865,3.615228,4.035683,...,3.874968,4.232111,3.360683,3.23963,3.631271,4.682422,4.494016,4.027349,3.817826,4.391452
--CIuK7sUpaNzalLAlHJKA,3.221289,2.932111,3.494016,0.785683,3.18846,2.271794,2.771794,2.978865,2.615228,3.035683,...,2.874968,3.232111,2.360683,2.23963,2.631271,3.682422,3.494016,3.027349,2.817826,3.391452


In [28]:
# avg_preds_df.to_json("avg_preds_df_0.15.json")

### Baseline Regression

In [29]:
categorical_columns = ['business_id', 'user_id']

In [30]:
unique_business = df_review_smaller.business_id.nunique()
unique_business

1523

In [31]:
unique_user = df_review_smaller.user_id.nunique()
unique_user

15455

In [32]:
df_review_dummies = pd.get_dummies(df_review_smaller, columns=categorical_columns, drop_first=False)

In [33]:
df_review_dummies.shape

(38198, 16979)

In [34]:
# df_review_dummies.to_json("df_review_dummies.json")

In [35]:
# df_review_smaller = pd.read_json("df_review_smaller.json")

In [36]:
np.random.seed(9001)
msk = np.random.rand(len(df_review_dummies)) < 0.5

# data_train = df_subset[msk]
# data_test = df_subset[~msk]

x_train = df_review_dummies[msk].drop(['stars'], axis=1) # DataFrame

x_test = df_review_dummies[~msk].drop(['stars'], axis=1) # DataFrame

y_train = df_review_dummies[msk].stars #series

y_test = df_review_dummies[~msk].stars # series

In [37]:
ols_lasso = Lasso(alpha=0.0001)
ols_lasso.fit(x_train,y_train)

Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [38]:
rmse2(ols_lasso, x_train, y_train)

0.86079140172462165

In [39]:
rmse2(ols_lasso, x_test, y_test)

0.97445307396512448

In [40]:
y_preds = ols_lasso.predict(x_train)

In [41]:
ols_lasso.coef_

array([ 0.29526438,  0.        ,  0.08687282, ..., -0.31021108,
       -0.        , -0.        ])

In [42]:
busienss_coeffs = ols_lasso.coef_[:unique_business]

In [43]:
user_coeffs = ols_lasso.coef_[unique_business:]

In [44]:
user_coeffs.shape, busienss_coeffs.shape

((15455,), (1523,))

In [18]:
# train_residuals = y_train - y_preds

In [20]:
# train_residuals.head()

1000283   -0.850750
100126    -0.429319
1001936   -0.850750
1002945   -0.850750
1002957    0.149250
Name: stars, dtype: float64

In [45]:
preds_array_reg = np.fromfunction(lambda i, j: user_coeffs[i] + busienss_coeffs[j] + avg_mean, (rows_length, cols_length), dtype=int)

In [46]:
preds_array_reg

array([[ 4.13458167,  3.83931729,  3.92619012, ...,  3.83931729,
         3.71403969,  3.83931729],
       [ 4.13458167,  3.83931729,  3.92619012, ...,  3.83931729,
         3.71403969,  3.83931729],
       [ 4.13458167,  3.83931729,  3.92619012, ...,  3.83931729,
         3.71403969,  3.83931729],
       ..., 
       [ 3.82437059,  3.52910621,  3.61597903, ...,  3.52910621,
         3.40382861,  3.52910621],
       [ 4.13458167,  3.83931729,  3.92619012, ...,  3.83931729,
         3.71403969,  3.83931729],
       [ 4.13458167,  3.83931729,  3.92619012, ...,  3.83931729,
         3.71403969,  3.83931729]])

In [47]:
preds_array_reg_df = pd.DataFrame(preds_array_reg, columns = r_df.columns, index = r_df.index)

In [48]:
preds_array_reg_df.head()

business_id,--9e1ONYQuAa-CB_Rrw7Tw,-050d_XIor1NpCuWkbIVaQ,-1xuC540Nycht_iWFeJ-dw,-2ToCaDFpTNmmg3QFzxcWg,-3zffZUHoY8bQjGfPSoBKQ,-6h3K1hj0d4DRcZNUtHDuw,-6tvduBzjLI1ISfs3F_qTg,-7H-oXvCxJzuT42ky6Db0g,-95mbLJsa0CxXhpaNL4LvA,-9dmhyBvepc08KPEHlEM0w,...,zcScEL0WEdFkROcnz5379g,zdE82PiD6wquvjYLyhOJNA,zgQHtqX0gqMw1nlBZl2VnQ,zlpLjbwrKuNs8zROgB_qUQ,znWHLW1pt19HzW1VY6KfCA,zoODlH40edpJYLPLkHilNA,zpoZ6WyQUYff18-z4ZU1mA,zrDi4gEaUi64lAMfJU51dw,zrTGcb83AsfyVTMrsCa65A,zwNC-Ow4eIMan2__bS9-rg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---1lKK3aKOuomHnwAkAow,4.134582,3.839317,3.92619,1.540696,3.928585,2.874029,3.839317,3.839317,3.839317,3.760319,...,3.574515,3.904549,3.254626,2.938128,3.766569,3.977102,3.925678,3.839317,3.71404,3.839317
--2vR0DIsmQ6WfcSzKWigw,4.134582,3.839317,3.92619,1.540696,3.928585,2.874029,3.839317,3.839317,3.839317,3.760319,...,3.574515,3.904549,3.254626,2.938128,3.766569,3.977102,3.925678,3.839317,3.71404,3.839317
--4q8EyqThydQm-eKZpS-A,4.134582,3.839317,3.92619,1.540696,3.928585,2.874029,3.839317,3.839317,3.839317,3.760319,...,3.574515,3.904549,3.254626,2.938128,3.766569,3.977102,3.925678,3.839317,3.71404,3.839317
--56mD0sm1eOogphi2FFLw,4.134582,3.839317,3.92619,1.540696,3.928585,2.874029,3.839317,3.839317,3.839317,3.760319,...,3.574515,3.904549,3.254626,2.938128,3.766569,3.977102,3.925678,3.839317,3.71404,3.839317
--CIuK7sUpaNzalLAlHJKA,4.134582,3.839317,3.92619,1.540696,3.928585,2.874029,3.839317,3.839317,3.839317,3.760319,...,3.574515,3.904549,3.254626,2.938128,3.766569,3.977102,3.925678,3.839317,3.71404,3.839317


In [53]:
resids_array = np.subtract(r_df, preds_array_reg)

In [58]:
resids_df = resids_array.fillna(0)

In [59]:
resids_df.head()

business_id,--9e1ONYQuAa-CB_Rrw7Tw,-050d_XIor1NpCuWkbIVaQ,-1xuC540Nycht_iWFeJ-dw,-2ToCaDFpTNmmg3QFzxcWg,-3zffZUHoY8bQjGfPSoBKQ,-6h3K1hj0d4DRcZNUtHDuw,-6tvduBzjLI1ISfs3F_qTg,-7H-oXvCxJzuT42ky6Db0g,-95mbLJsa0CxXhpaNL4LvA,-9dmhyBvepc08KPEHlEM0w,...,zcScEL0WEdFkROcnz5379g,zdE82PiD6wquvjYLyhOJNA,zgQHtqX0gqMw1nlBZl2VnQ,zlpLjbwrKuNs8zROgB_qUQ,znWHLW1pt19HzW1VY6KfCA,zoODlH40edpJYLPLkHilNA,zpoZ6WyQUYff18-z4ZU1mA,zrDi4gEaUi64lAMfJU51dw,zrTGcb83AsfyVTMrsCa65A,zwNC-Ow4eIMan2__bS9-rg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---1lKK3aKOuomHnwAkAow,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--2vR0DIsmQ6WfcSzKWigw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--4q8EyqThydQm-eKZpS-A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--56mD0sm1eOogphi2FFLw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--CIuK7sUpaNzalLAlHJKA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Matrix Factorization

Code taken from https://bugra.github.io/work/notes/2014-04-19/alternating-least-squares-method-for-collaborative-filtering/

In [61]:
Q = resids_df.values

In [62]:
W = Q>0.5
W[W == True] = 1
W[W == False] = 0
# To be consistent with our Q matrix
W = W.astype(np.float64, copy=False)

In [63]:
W.shape

(15455, 1523)

In [72]:
lambda_ = 0.1
n_factors = 20
m, n = Q.shape
n_iterations = 10

In [73]:
X = 5 * np.random.rand(m, n_factors) 
Y = 5 * np.random.rand(n_factors, n)

In [74]:
def get_error(Q, X, Y, W):
    return np.sum((W * (Q - np.dot(X, Y)))**2)

In [75]:
errors = []
for ii in range(n_iterations):
    X = np.linalg.solve(np.dot(Y, Y.T) + lambda_ * np.eye(n_factors), 
                        np.dot(Y, Q.T)).T
    Y = np.linalg.solve(np.dot(X.T, X) + lambda_ * np.eye(n_factors),
                        np.dot(X.T, Q))
    errors.append(get_error(Q, X, Y, W))
Q_hat = np.dot(X, Y)
print('Error of rated movies: {}'.format(get_error(Q, X, Y, W)))

Error of rated movies: 12955.384191909738


In [77]:
fac_preds_df = pd.DataFrame(Q_hat, columns = r_df.columns, index = r_df.index)

In [78]:
fac_preds_df.head()

business_id,--9e1ONYQuAa-CB_Rrw7Tw,-050d_XIor1NpCuWkbIVaQ,-1xuC540Nycht_iWFeJ-dw,-2ToCaDFpTNmmg3QFzxcWg,-3zffZUHoY8bQjGfPSoBKQ,-6h3K1hj0d4DRcZNUtHDuw,-6tvduBzjLI1ISfs3F_qTg,-7H-oXvCxJzuT42ky6Db0g,-95mbLJsa0CxXhpaNL4LvA,-9dmhyBvepc08KPEHlEM0w,...,zcScEL0WEdFkROcnz5379g,zdE82PiD6wquvjYLyhOJNA,zgQHtqX0gqMw1nlBZl2VnQ,zlpLjbwrKuNs8zROgB_qUQ,znWHLW1pt19HzW1VY6KfCA,zoODlH40edpJYLPLkHilNA,zpoZ6WyQUYff18-z4ZU1mA,zrDi4gEaUi64lAMfJU51dw,zrTGcb83AsfyVTMrsCa65A,zwNC-Ow4eIMan2__bS9-rg
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
---1lKK3aKOuomHnwAkAow,-0.001057,0.001861,-0.0004930082,-0.0004918077,-0.00793,-0.002952,0.000315,-0.000522,-7.3e-05,0.000288,...,0.002016,0.009009,-0.000488,-0.001545,0.000669,0.004804,0.000167,0.005785,-0.001261,-0.006878
--2vR0DIsmQ6WfcSzKWigw,-0.004634,-8.4e-05,-0.0002785871,0.0001090502,0.007119,-0.000503,-3.2e-05,-2.3e-05,-0.000135,-9.7e-05,...,0.000377,0.000877,0.000114,5.2e-05,-6.3e-05,0.000586,-0.005376,-0.001118,0.000532,-0.000522
--4q8EyqThydQm-eKZpS-A,0.015085,-0.000305,0.0002351535,-3.563541e-05,-0.000531,-0.000158,-0.000294,6.7e-05,-0.000652,0.000113,...,-0.000833,-0.000261,-5e-05,-0.001531,-0.004251,-0.000271,-8.1e-05,-0.000432,0.000208,0.000258
--56mD0sm1eOogphi2FFLw,5e-06,-2e-06,-3.324232e-07,7.451144e-07,4e-06,-1e-06,2e-06,1e-06,-2e-06,1e-06,...,1.1e-05,3e-05,2e-06,-2.5e-05,7e-06,-8e-06,3e-06,6e-06,4e-06,-3e-06
--CIuK7sUpaNzalLAlHJKA,-0.000144,-5.8e-05,0.0003799868,-2.089563e-05,3.2e-05,5.3e-05,-1.5e-05,8e-06,-3.5e-05,1.7e-05,...,-0.000534,-8.4e-05,-1.5e-05,-0.000206,6.9e-05,-0.000407,-1.8e-05,7.2e-05,7.2e-05,8.2e-05


# PREDICT

In [83]:
def get_reccomendations(user, number_rec,df):
    top_preds = df.loc[user][fill_zero_rf.loc[user] == 0].sort_values(ascending = False)[:number_rec]
    top_preds_df = pd.DataFrame(top_preds).rename(columns={user:"predicted rating"})
    predictions = pd.merge(left = top_preds_df, right = df_business, left_index = True, right_on="business_id")
#     top_preds_df.join(df_business, on="business_id")
    return predictions
#     return top_preds_df

##### Let's predict for user ---1lKK3aKOuomHnwAkAow. Let's check what are his top choices:

In [105]:
top_ratings_user_x = df_review[df_review["user_id"] == "---1lKK3aKOuomHnwAkAow"].sort_values("stars", ascending = False)[:10]["business_id"]

In [106]:
df_business[df_business["business_id"].isin(top_ratings_user_x)]

Unnamed: 0,address,business_id,categories,city,name,review_count,stars,state
104700,"750 S Rampart Blvd, Ste 7",RRw9I8pHt5PzgYGT2QeODw,"[Pizza, Restaurants]",Las Vegas,Grimaldi's Pizzeria,431,4.0,NV
110934,113 N 4th St,eJKnymd0BywNPrJw1IuXVw,"[Breakfast & Brunch, Mexican, Restaurants, Ame...",Las Vegas,Nacho Daddy Downtown,723,4.0,NV
142630,"3555 S Town Center Dr, Ste 105",bPcqucuuClxYrIM8xWoArg,"[Italian, Wine Bars, Restaurants, Nightlife, B...",Las Vegas,Due Forni,446,4.0,NV
14551,"750 S Rampart Blvd, Ste 9",rq5dgoksPHkJwJNQKlGQ7w,"[Food, Coffee & Tea, Breakfast & Brunch, Cafes...",Las Vegas,Sambalatte Torrefazione,752,4.0,NV
32230,440 S Rampart Blvd,igHYkXZMLAc9UdV5VnR_AA,"[Steakhouses, Restaurants]",Las Vegas,Echo & Rig,1665,4.5,NV
40479,"10100 W Charleston Blvd, Ste 150",qmymSqVwHYRqdwfcBatzpQ,"[American (New), Restaurants, Sandwiches, Bars...",Las Vegas,Vintner Grill,571,4.0,NV
78134,8975 S Eastern Ave,p5rpYtxS5xPQjt3MXYVEwA,"[Vegetarian, Restaurants, Burgers, Vegan, Amer...",Las Vegas,Greens and Proteins,600,4.0,NV
84520,"The Mirage Hotel Casino, 3400 Las Vegas Blvd S",mz9ltimeAIy2c2qf5ctljw,"[Arts & Entertainment, Performing Arts]",Las Vegas,Cirque du Soleil - The Beatles LOVE,1766,4.5,NV
92918,"953 E Sahara Ave, Ste A5",KskYqH1Bi7Z_61pH6Om8pg,"[Automotive, Car Dealers, Restaurants, Thai, N...",Las Vegas,Lotus of Siam,3838,4.0,NV
93528,"8751 W Charleston Blvd, Ste 110",A0X1baHPgw9IiBRivu0G9g,"[Bakeries, French, Restaurants, Food]",Las Vegas,Patisserie Manon,598,4.0,NV


We see that this user really likes American Restaurants, Pizza, etc. He probably lives in Las vegas

##### Predict using basic averages

In [86]:
get_reccomendations("---1lKK3aKOuomHnwAkAow",5,avg_preds_df)

Unnamed: 0,predicted rating,address,business_id,categories,city,name,review_count,stars,state
135187,5.660683,"4627 E Ivy St, Ste 1",lH0Ph4DiYSqj9UJBXAq8hQ,"[Home Services, Local Services, Self Storage, ...",Mesa,Just-In Time Moving and Delivery,374,5.0,AZ
100304,5.660683,"2960 S Durango Dr, Ste 112",56_j_lcGj5X9SpM2KzLm4A,"[Laser Hair Removal, Beauty & Spas, Skin Care,...",Las Vegas,Fabulous Eyebrow Threading,453,5.0,NV
107956,5.660683,"7910 S Rainbow Blvd, Ste 110",Hp8k_RpSIWSeJguyaQpfIw,"[Gelato, Food, Desserts, Ice Cream & Frozen Yo...",Las Vegas,Gelatology,473,5.0,NV
87518,5.660683,10520 S Eastern Ave,Wcuo6YmYj3xhCso5sMQcOw,"[Pizza, Gluten-Free, Restaurants, Fast Food, S...",Henderson,Blaze Fast-Fire'd Pizza,364,4.5,NV
143283,5.660683,7608 W Cactus Rd,ZKsVCA89iXMccf3fEhS3iw,"[Restaurants, Seafood, Cajun/Creole, American ...",Peoria,Angry Crab Peoria,365,4.5,AZ


Our baseline model recomends only one restaurant with American food and a few places with weird categories.

##### Predict using lasso regression

In [88]:
get_reccomendations("---1lKK3aKOuomHnwAkAow",5,preds_array_reg_df)

Unnamed: 0,predicted rating,address,business_id,categories,city,name,review_count,stars,state
36525,4.71955,3799 Las Vegas Blvd S,XnJeadLrlj9AZB8qSdIR2Q,"[Restaurants, French]",Las Vegas,Joël Robuchon,831,4.5,NV
89310,4.710545,115 Federal St,X-b4-QvZLENnf3yFwhpSXQ,"[Baseball Fields, Stadiums & Arenas, Active Li...",Pittsburgh,PNC Park,426,4.5,PA
15676,4.675836,Flamingo Rd,ty5KQYqYRxwXDG_e4pz-4w,"[Arts & Entertainment, Performing Arts]",Las Vegas,Absinthe,1452,4.5,NV
95839,4.672092,3600 S Las Vegas Blvd,NCFwm2-TDb-oBQ2medmYDg,"[Street Art, Performing Arts, Public Services ...",Las Vegas,Fountains of Bellagio,1083,4.5,NV
87314,4.649546,,jeTvVMOR8W_04xFsPjzOEQ,"[Local Services, Movers, Home Services, Self S...",Phoenix,Camelback Moving,394,5.0,AZ


Our baseline model based on lasso predicts restaurants completely unrelated to users preferences. Bad... At least it's mostly in Las Vegas

##### Predict using matrix factorization

In [87]:
get_reccomendations("---1lKK3aKOuomHnwAkAow",5,fac_preds_df)

Unnamed: 0,predicted rating,address,business_id,categories,city,name,review_count,stars,state
43284,0.202872,"3400 E Sky Harbor Blvd, Ste 3300",JmI9nslLD7KZqRr__Bg6NQ,"[Hotels & Travel, Airports]",Phoenix,Phoenix Sky Harbor International Airport,2103,3.0,AZ
36211,0.12217,3950 S Las Vegas Blvd,Cni2l-VKG_pdospJ6xliXQ,"[Bars, Nightlife, Burgers, American (New), Res...",Las Vegas,Burger Bar,2396,4.0,NV
155316,0.091723,3355 South Las Vegas Boulevard,Wxxvi3LZbHNIDwJ-ZimtnA,"[Resorts, Arts & Entertainment, Event Planning...",Las Vegas,The Venetian Las Vegas,2951,4.0,NV
115819,0.06804,3131 Las Vegas Blvd S,MpmFFw0GE_2iRFPdsRpJbA,"[Nightlife, Dance Clubs]",Las Vegas,XS Nightclub,2848,4.0,NV
106982,0.056872,3799 Las Vegas Blvd S,El4FC8jcawUVgw_0EIcbaQ,"[Restaurants, Casinos, Hotels & Travel, Event ...",Las Vegas,MGM Grand Hotel,3285,3.0,NV
