In [2]:
import gzip
import math
import numpy as np
import random
import pandas as pd
import sklearn
import string
from collections import defaultdict
from nltk.stem.porter import *
from sklearn import linear_model
from gensim.models import Word2Vec
import dateutil
from scipy.sparse import lil_matrix # To build sparse feature matrices, if you like
import requests
import json
from sklearn.model_selection import train_test_split


In [3]:
# Run this if to download datafile to local
# URL of the gzipped JSON file
url = "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/review-Hawaii_10.json.gz"

# Define a local file to save the gzipped content
local_file = "review-Hawaii.json.gz"

# Download the file in chunks
with requests.get(url, stream=True) as response:
    response.raise_for_status()  # Raise an error if the download fails
    with open(local_file, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):  # Adjust chunk size as needed
            f.write(chunk)

In [4]:
# Decompress and load the JSON data
local_file = "review-Hawaii.json.gz"
dataset_review_Hawaii = []
with gzip.open(local_file, "rt", encoding="utf-8") as f:  # "rt" mode for text
    for line in f:
        data = json.loads(line)  # Parse each JSON object
        dataset_review_Hawaii.append(data)

# Output the length of the dataset to verify
print(f"Loaded {len(dataset_review_Hawaii)} reviews.")

Loaded 1504347 reviews.


In [5]:
review_Hawaii_clean = [i for i in dataset_review_Hawaii if i['text'] != None]
review_Hawaii_clean = pd.DataFrame(review_Hawaii_clean)
review_Hawaii_clean_eng = review_Hawaii_clean[review_Hawaii_clean['text'].str.match(r'\w')]
review_Hawaii_clean_eng

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,113965417079576625433,manuel grimaldo,1591839903487,5,Great new upgrade,,,0x7c00159b5b1b1d25:0x8d2d85d4a758290e
1,109623613356773809039,Vicki Kach,1579559747146,5,So pleased to find Dr. Mike! He’s the real de...,,,0x7c006de89f2d86e1:0x23d998532e9317a6
2,105786704025048642479,Jessica Clopton,1545530647643,1,"The doctor is extremely creepy. First of all, ...",,,0x7c006de89f2d86e1:0x23d998532e9317a6
3,117458106933327014012,Robin Hanlin,1561877267351,5,As a former R.N. was looking for big shoes to ...,,"{'time': 1561923354957, 'text': 'Thank you so ...",0x7c006de89f2d86e1:0x23d998532e9317a6
4,108985244966294061730,Connie Mark,1580241584528,5,Great place! Doctor helped my body pains.,,,0x7c006de89f2d86e1:0x23d998532e9317a6
...,...,...,...,...,...,...,...,...
852568,107984868534067220088,Joshua Collier,1530714878067,2,Wish they would let you explore the area more ...,,,0x7953b4a4114e37f7:0x374b5a1f84f48a1a
852569,110628723873286096539,Stefano Parvoli,1528753144886,5,Amazing,,,0x7953b4a4114e37f7:0x374b5a1f84f48a1a
852570,107169846833534902263,Christine Lominario,1519620209920,5,Majestic 😊,,,0x7953b4a4114e37f7:0x374b5a1f84f48a1a
852571,101666345935879309455,Allana Kate,1517440381978,5,Amazing,[{'url': ['https://lh5.googleusercontent.com/p...,,0x7953b4a4114e37f7:0x374b5a1f84f48a1a


In [6]:
review_Hawaii_feature = review_Hawaii_clean_eng[['user_id', 'gmap_id', 'text', 'rating']]
review_Hawaii_feature.head(100)

Unnamed: 0,user_id,gmap_id,text,rating
0,113965417079576625433,0x7c00159b5b1b1d25:0x8d2d85d4a758290e,Great new upgrade,5
1,109623613356773809039,0x7c006de89f2d86e1:0x23d998532e9317a6,So pleased to find Dr. Mike! He’s the real de...,5
2,105786704025048642479,0x7c006de89f2d86e1:0x23d998532e9317a6,"The doctor is extremely creepy. First of all, ...",1
3,117458106933327014012,0x7c006de89f2d86e1:0x23d998532e9317a6,As a former R.N. was looking for big shoes to ...,5
4,108985244966294061730,0x7c006de89f2d86e1:0x23d998532e9317a6,Great place! Doctor helped my body pains.,5
...,...,...,...,...
98,113954467402806825801,0x795406d3728f9b1b:0x236996c8f711cda8,Great place,3
99,117257970158561722599,0x795406d3728f9b1b:0x236996c8f711cda8,Incredible,5
100,117345116162370485994,0x7c0015d64cd48c6f:0x4cac932764bd2fac,This Kitty Cafe was MEOWzing! There were so ma...,5
101,118397406534237711570,0x7c0015d64cd48c6f:0x4cac932764bd2fac,"Friendly owners, fun thematic decor, and lots ...",5


In [7]:
# Split the review_Hawaii_feature into train (90%) and test (10%) datasets
trainData, testData = train_test_split(review_Hawaii_feature, test_size=0.1, random_state=42)

# Optionally, you can reset indices of both DataFrames if needed
trainData.reset_index(drop=True, inplace=True)
testData.reset_index(drop=True, inplace=True)

### 1. sim - user_id v.s. gmap_id predict rating

In [8]:
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)
# itemNames = {}
ratingDict = {} # To retrieve a rating for a specific user/item pair

for _, d in trainData.iterrows():
    user,item = d['user_id'], d['gmap_id']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)
    ratingDict[(user,item)] = d['rating']
    # itemNames[item] = d['product_title']

In [9]:
def MSE(y_true, y_pred):
    differences = [(x-y)**2 for x,y in zip(y_true,y_pred)]
    return sum(differences) / len(differences)

In [10]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [11]:
def Cosine(i1, i2):
    # Between two items
    inter = usersPerItem[i1].intersection(usersPerItem[i2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for u in inter:
        numer += ratingDict[(u,i1)]*ratingDict[(u,i2)]
    for u in usersPerItem[i1]:
        denom1 += ratingDict[(u,i1)]**2
    for u in usersPerItem[i2]:
        denom2 += ratingDict[(u,i2)]**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0: return 0
    return numer / denom

In [12]:
ratingMean = sum([d['rating'] for _, d in trainData.iterrows()]) / len(trainData)

In [13]:
userAverages = {}
itemAverages = {}

for u in itemsPerUser:
    rs = [ratingDict[(u,i)] for i in itemsPerUser[u]]
    userAverages[u] = sum(rs) / len(rs)
    
for i in usersPerItem:
    rs = [ratingDict[(u,i)] for u in usersPerItem[i]]
    itemAverages[i] = sum(rs) / len(rs)

In [14]:
def predictRating(user,item):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['gmap_id']
        if i2 == item: continue
        ratings.append(d['rating'] - itemAverages[i2])
        similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        ratingPrediction = itemAverages[item] + sum(weightedRatings) / sum(similarities)
        return max(1, min(5, ratingPrediction))
    else:
        # User hasn't rated any similar items
        return ratingMean

In [15]:
u, i = testData.iloc[1]['user_id'], testData.iloc[1]['gmap_id']
predictRating(u, i)

5

In [16]:
# for _, d in testData.iterrows():
#     u, i = d['user_id'], d['gmap_id']
#     predictRating(u, i)
#     d['rating']

In [17]:
simPredictions = [predictRating(d['user_id'], d['gmap_id']) for _, d in testData.iterrows()]

In [18]:
true_rating = [d['rating'] for _, d in testData.iterrows()]
MSE(simPredictions, true_rating)

0.7860738293802693

Using user-user similarity with Cosine predict

In [19]:
# based on user-user similarity with Cosine
def predictRating_cos(user,item):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['gmap_id']
        if i2 == item: continue
        ratings.append(d['rating'] - itemAverages[i2])
        similarities.append(Cosine(item,i2))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        ratingPrediction = itemAverages[item] + sum(weightedRatings) / sum(similarities)
        return max(1, min(5, ratingPrediction))
    else:
        # User hasn't rated any similar items
        return ratingMean

In [20]:
simPredictions = [predictRating_cos(d['user_id'], d['gmap_id']) for _, d in testData.iterrows()]

In [21]:
true_rating = [d['rating'] for _, d in testData.iterrows()]
MSE(simPredictions, true_rating)

0.7812147524947816

In [22]:
#based on user-user similarity Jaccard with weight
def predictRating(user,item, weight):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['gmap_id']
        if i2 == item: continue
        ratings.append(d['rating'] - itemAverages[i2])
        similarities.append(pow(Jaccard(usersPerItem[item],usersPerItem[i2]), weight))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        ratingPrediction = itemAverages[item] + sum(weightedRatings) / sum(similarities)
        return max(1, min(5, ratingPrediction))
    else:
        # User hasn't rated any similar items
        return ratingMean

In [23]:
weights = [0.1, 0.2, 0.3, 0.4, 0.5]
best_weight = None
best_MSE = None
for weight in weights:
    simPredictions = [predictRating(d['user_id'], d['gmap_id'], weight) for _, d in testData.iterrows()]
    true_rating = [d['rating'] for _, d in testData.iterrows()]
    print(f"Weight: {weight}, MSE: {MSE(simPredictions, true_rating)}")
    if best_weight is None or MSE(simPredictions, true_rating) < best_MSE:
        best_weight = weight
        best_MSE = MSE(simPredictions, true_rating)

print(f"Best weight: {best_weight}")

Weight: 0.1, MSE: 0.7712785258316813
Weight: 0.2, MSE: 0.7704268202598192
Weight: 0.3, MSE: 0.7702953403481975
Weight: 0.4, MSE: 0.7708476084532566
Weight: 0.5, MSE: 0.7720278034033904
Best weight: 0.3


### 2. text mining - text predict rating

#### 1. Bag-of-words models

In [24]:
import nltk
from nltk.corpus import stopwords

In [25]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\79250\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
stop_words = set(stopwords.words('english'))

In [27]:
#Ignore capitalization and remove punctuation
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('english'))

for d in review_Hawaii_feature['text']:
    r = ''.join([c for c in d.lower() if not c in punctuation])
    for w in r.split():
        if w not in stop_words:
            wordCount[w] += 1

len(wordCount)

150685

In [28]:
count = []
for w in wordCount:
    count.append((wordCount[w],w))
count.sort(reverse=True)
words = [x[1] for x in count[:1000]]
words

['great',
 'food',
 'good',
 'place',
 'service',
 'nice',
 'get',
 'friendly',
 'best',
 'staff',
 'go',
 'one',
 'time',
 'beautiful',
 'like',
 'love',
 'beach',
 'always',
 'really',
 'amazing',
 'parking',
 'awesome',
 'also',
 'well',
 'little',
 'back',
 'delicious',
 'people',
 'would',
 'prices',
 'excellent',
 'island',
 'see',
 'clean',
 'dont',
 'got',
 'store',
 'worth',
 'location',
 'experience',
 'much',
 'lots',
 'even',
 'fun',
 'local',
 'view',
 'lot',
 'pretty',
 'small',
 'day',
 'us',
 'area',
 'fresh',
 'restaurant',
 'everything',
 'price',
 'wait',
 'selection',
 'recommend',
 'fish',
 'definitely',
 'coffee',
 'better',
 'make',
 'need',
 'eat',
 'take',
 'way',
 'went',
 'order',
 'fast',
 'around',
 'views',
 'water',
 'helpful',
 'visit',
 'many',
 'right',
 'super',
 'try',
 'come',
 'park',
 'spot',
 'find',
 'big',
 'chicken',
 'bit',
 'shop',
 'drinks',
 'long',
 'favorite',
 'must',
 'family',
 'hawaii',
 'could',
 'menu',
 'going',
 'made',
 'first',

In [29]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [30]:
for d in review_Hawaii_feature['text']:
    r = ''.join([c for c in d.lower() if not c in punctuation])
    for w in r.split():
        if w not in stop_words:
            wordCount[w] += 1

In [31]:
def feature(datum):
    feat = [0] * len(words)
    r = ''.join([c for c in datum.lower() if not c in punctuation])
    for w in r.split():
        if w not in stop_words and w in words:
            feat[wordId[w]] += 1
            
    feat.append(1)
    return feat

In [32]:
X = [feature(d) for d in review_Hawaii_feature['text']]
y = [d for d in review_Hawaii_feature['rating']]

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [34]:
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)
predictions

array([4.2860508 , 4.20092677, 3.50004001, ..., 4.33104579, 3.91515342,
       4.1513368 ])

In [35]:
mse = MSE(y_test, predictions)
mse

0.6498943202691914

In [36]:
y = [d for d in review_Hawaii_feature['rating']]

In [37]:
for i in range(len(y)):
    if y[i] >=4:
        y[i] = 'positive'
    elif y[i] <=2:
        y[i] = 'negative'
    else:
        y[i] = 'neutral'

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = linear_model.LogisticRegression()
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)
correct = predictions == y_test
np.mean(correct)

0.875780873033744

In [39]:
def confusion_metric(y_true, y_pred, classes):
    total_correct = 0

    for cls in classes:
        TP = FP = FN = TN = 0  # Initialize counts for this class

        for true, pred in zip(y_true, y_pred):
            if true == cls and pred == cls:
                TP += 1  # True Positive
                total_correct += 1
            elif true != cls and pred == cls:
                FP += 1  # False Positive
            elif true == cls and pred != cls:
                FN += 1  # False Negative
            elif true != cls and pred != cls:
                TN += 1  # True Negative

        # Calculate precision and recall
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0

        print(f"Class '{cls}':")
        print(f"  True Positives (TP): {TP}")
        print(f"  False Positives (FP): {FP}")
        print(f"  False Negatives (FN): {FN}")
        print(f"  True Negatives (TN): {TN}")
        print(f"  Precision: {precision:.2f}")
        print(f"  Recall: {recall:.2f}\n")

classes = ["positive", "negative", "neutral"]

# Call the function
confusion_metric(y_test, predictions, classes)


Class 'positive':
  True Positives (TP): 130921
  False Positives (FP): 15615
  False Negatives (FN): 1993
  True Negatives (TN): 6906
  Precision: 0.89
  Recall: 0.99

Class 'negative':
  True Positives (TP): 3308
  False Positives (FP): 1641
  False Negatives (FN): 5649
  True Negatives (TN): 144837
  Precision: 0.67
  Recall: 0.37

Class 'neutral':
  True Positives (TP): 1898
  False Positives (FP): 2052
  False Negatives (FN): 11666
  True Negatives (TN): 139819
  Precision: 0.48
  Recall: 0.14



#### 2. TFIDF

##### TFIDF - LinearSVC

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report, accuracy_score

In [74]:
X = review_Hawaii_feature['text']
y = review_Hawaii_feature['rating']

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [76]:
tfidf = TfidfVectorizer(sublinear_tf=True, max_features=2000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [77]:
linear_svr = LinearSVC()
linear_svr.fit(X_train_tfidf, y_train)

In [78]:
y_pred_train = linear_svr.predict(X_train_tfidf)
y_pred_test = linear_svr.predict(X_test_tfidf)

In [79]:
y_pred_train

array([5, 5, 5, ..., 5, 4, 5], dtype=int64)

In [80]:
y_pred_test

array([5, 5, 3, ..., 5, 5, 4], dtype=int64)

In [81]:
train_mse = MSE(y_train, y_pred_train)
test_mse = MSE(y_test, y_pred_test)

print(f"Train MSE: {train_mse:.2f}")
print(f"Test MSE: {test_mse:.2f}")

Train MSE: 0.82
Test MSE: 0.83


In [49]:
print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print("Testing Accuracy:", accuracy_score(y_test, y_pred_test))
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_pred_test))

Training Accuracy: 0.6709953855235075
Testing Accuracy: 0.6688840994627979

Classification Report on Test Data:
              precision    recall  f1-score   support

           1       0.51      0.41      0.46      4527
           2       0.36      0.01      0.02      4430
           3       0.46      0.20      0.27     13564
           4       0.45      0.17      0.25     35525
           5       0.71      0.96      0.81     97389

    accuracy                           0.67    155435
   macro avg       0.50      0.35      0.36    155435
weighted avg       0.61      0.67      0.60    155435



##### TFIDF - Ridge Regression

In [50]:
import nltk
nltk.download('all')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\79250\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\79250\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\79250\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\79250\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\79250\AppData\Roaming\nltk_data...
[

In [82]:
tfidf = TfidfVectorizer(sublinear_tf=True, analyzer='word', max_features=2000, tokenizer=word_tokenize, stop_words='english')

In [84]:
MR_train_X_tfidf, MR_test_X_tfidf, MR_train_Y_tfidf, MR_test_Y_tfidf = train_test_split(X, y, test_size=0.1, random_state=200)
MR_train_X_tfidf = MR_train_X_tfidf
MR_test_X_tfidf = MR_test_X_tfidf

In [85]:
MR_train_vector_tfidf = tfidf.fit_transform(MR_train_X_tfidf).toarray()
MR_test_vector_tfidf = tfidf.transform(MR_test_X_tfidf).toarray()
vocabulary_tfidf_train = tfidf.vocabulary_



In [86]:
MR_test_vector_tfidf

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.30403789, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [87]:
alpha = [0.1, 0.5, 1 , 5, 10, 50, 100]

In [88]:
best_MSE = None
best_alpha = None
for a in alpha:
    clf = linear_model.Ridge(a, fit_intercept=False) # MSE + 1.0 l2
    clf.fit(MR_train_vector_tfidf, MR_train_Y_tfidf)
    predictions_tfidf = clf.predict(MR_test_vector_tfidf).clip(1, 5)
    mse = MSE(MR_test_Y_tfidf, predictions_tfidf)
    print(f"Alpha: {a}, MSE: {mse}")
    if best_alpha is None or mse < best_MSE:
        best_alpha = a
        best_MSE = mse

AttributeError: 'LinearRegression' object has no attribute 'Ridge'

##### TFIDF - Linear Regression

In [89]:
from sklearn.linear_model import LinearRegression

In [90]:
linear_model = LinearRegression()
linear_model.fit(MR_train_vector_tfidf, MR_train_Y_tfidf)

In [91]:
linear_predictions = linear_model.predict(MR_test_vector_tfidf).clip(1, 5)
mse = MSE(MR_test_Y_tfidf, linear_predictions)

print(f"Linear Regression MSE: {mse}")

Linear Regression MSE: 0.5551967362014119


##### TFIDF - SGDRegressor

In [92]:
from sklearn.linear_model import SGDRegressor

sgd_model = SGDRegressor(max_iter=1000, tol=1e-3)
sgd_model.fit(MR_train_vector_tfidf, MR_train_Y_tfidf)

In [93]:
sgd_model_predictions = sgd_model.predict(MR_test_vector_tfidf).clip(1, 5)
mse = MSE(MR_test_Y_tfidf, sgd_model_predictions)

print(f"SGD Regression MSE: {mse}")

SGD Regression MSE: 0.5789793858518669


#### Latent Factor Model

##### Baseline Model

In [112]:
from surprise import Dataset, Reader
from surprise import SVDpp
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise import BaselineOnly
from surprise import accuracy

In [98]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(review_Hawaii_feature[['user_id', 'gmap_id', 'rating']], reader)

In [109]:
trainset, testset = train_test_split(data, test_size=0.2)

In [110]:
baseline_model = BaselineOnly()
baseline_model.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x1d3a6adb710>

In [115]:
baseline_predictions = baseline_model.test(testset)
baseline_mse = MSE([d.r_ui for d in baseline_predictions], [d.est for d in baseline_predictions])

print(f"Baseline MSE: {baseline_mse}")

Baseline MSE: 0.7442353556578654


##### SVD++

In [None]:
param_grid = {
    'n_factors': [20, 50, 100],
    'lr_all': [0.01, 0.1, 0.5],
    'reg_all': [0.05, 0.1, 0.2]
}

In [131]:
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=3, n_jobs=-1, joblib_verbose=2)

In [132]:
gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:   23.1s remaining:   23.1s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed:   25.5s finished


In [133]:
# Train SVD++ using optimal parameters
best_params = gs.best_params['rmse']
model = SVDpp(n_factors=best_params['n_factors'], lr_all=best_params['lr_all'], reg_all=best_params['reg_all'])
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x1d5fb829710>

In [129]:
predictions = model.test(testset)
mse = MSE([p.r_ui for p in predictions], [p.est for p in predictions])

print(f"Best parameters: {best_params}")
print(f"Test MSE: {mse}")

Best parameters: {'n_factors': 6, 'lr_all': 0.01, 'reg_all': 1e-05}
Test MSE: 0.7955681271461569
