In [34]:
import gzip
import math
import numpy as np
import random
import pandas as pd
import sklearn
import string
from collections import defaultdict
from nltk.stem.porter import *
from sklearn import linear_model
from gensim.models import Word2Vec
import dateutil
from scipy.sparse import lil_matrix # To build sparse feature matrices, if you like
import requests
import json
from sklearn.model_selection import train_test_split

In [35]:
# Run this if to download datafile to local
# URL of the gzipped JSON file
url = "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/review-Hawaii_10.json.gz"

# Define a local file to save the gzipped content
local_file = "review-Hawaii.json.gz"

# Download the file in chunks
with requests.get(url, stream=True) as response:
    response.raise_for_status()  # Raise an error if the download fails
    with open(local_file, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):  # Adjust chunk size as needed
            f.write(chunk)

In [36]:
# Decompress and load the JSON data
local_file = "review-Hawaii.json.gz"
dataset_review_Hawaii = []
with gzip.open(local_file, "rt", encoding="utf-8") as f:  # "rt" mode for text
    for line in f:
        data = json.loads(line)  # Parse each JSON object
        dataset_review_Hawaii.append(data)

# Output the length of the dataset to verify
print(f"Loaded {len(dataset_review_Hawaii)} reviews.")

Loaded 1504347 reviews.


In [37]:
review_Hawaii_clean = [i for i in dataset_review_Hawaii if i['text'] != None]
review_Hawaii_clean = pd.DataFrame(review_Hawaii_clean)
review_Hawaii_clean_eng = review_Hawaii_clean[review_Hawaii_clean['text'].str.match(r'\w')]
review_Hawaii_clean_eng

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,113965417079576625433,manuel grimaldo,1591839903487,5,Great new upgrade,,,0x7c00159b5b1b1d25:0x8d2d85d4a758290e
1,109623613356773809039,Vicki Kach,1579559747146,5,So pleased to find Dr. Mike! He’s the real de...,,,0x7c006de89f2d86e1:0x23d998532e9317a6
2,105786704025048642479,Jessica Clopton,1545530647643,1,"The doctor is extremely creepy. First of all, ...",,,0x7c006de89f2d86e1:0x23d998532e9317a6
3,117458106933327014012,Robin Hanlin,1561877267351,5,As a former R.N. was looking for big shoes to ...,,"{'time': 1561923354957, 'text': 'Thank you so ...",0x7c006de89f2d86e1:0x23d998532e9317a6
4,108985244966294061730,Connie Mark,1580241584528,5,Great place! Doctor helped my body pains.,,,0x7c006de89f2d86e1:0x23d998532e9317a6
...,...,...,...,...,...,...,...,...
852568,107984868534067220088,Joshua Collier,1530714878067,2,Wish they would let you explore the area more ...,,,0x7953b4a4114e37f7:0x374b5a1f84f48a1a
852569,110628723873286096539,Stefano Parvoli,1528753144886,5,Amazing,,,0x7953b4a4114e37f7:0x374b5a1f84f48a1a
852570,107169846833534902263,Christine Lominario,1519620209920,5,Majestic 😊,,,0x7953b4a4114e37f7:0x374b5a1f84f48a1a
852571,101666345935879309455,Allana Kate,1517440381978,5,Amazing,[{'url': ['https://lh5.googleusercontent.com/p...,,0x7953b4a4114e37f7:0x374b5a1f84f48a1a


#### Data Analysis

In [38]:
review_Hawaii_feature = review_Hawaii_clean_eng[['user_id', 'gmap_id', 'text', 'rating']]
review_Hawaii_feature.head(100)

Unnamed: 0,user_id,gmap_id,text,rating
0,113965417079576625433,0x7c00159b5b1b1d25:0x8d2d85d4a758290e,Great new upgrade,5
1,109623613356773809039,0x7c006de89f2d86e1:0x23d998532e9317a6,So pleased to find Dr. Mike! He’s the real de...,5
2,105786704025048642479,0x7c006de89f2d86e1:0x23d998532e9317a6,"The doctor is extremely creepy. First of all, ...",1
3,117458106933327014012,0x7c006de89f2d86e1:0x23d998532e9317a6,As a former R.N. was looking for big shoes to ...,5
4,108985244966294061730,0x7c006de89f2d86e1:0x23d998532e9317a6,Great place! Doctor helped my body pains.,5
...,...,...,...,...
98,113954467402806825801,0x795406d3728f9b1b:0x236996c8f711cda8,Great place,3
99,117257970158561722599,0x795406d3728f9b1b:0x236996c8f711cda8,Incredible,5
100,117345116162370485994,0x7c0015d64cd48c6f:0x4cac932764bd2fac,This Kitty Cafe was MEOWzing! There were so ma...,5
101,118397406534237711570,0x7c0015d64cd48c6f:0x4cac932764bd2fac,"Friendly owners, fun thematic decor, and lots ...",5


In [39]:
# Split the review_Hawaii_feature into train (80%) and test (20%) datasets
trainData, testData = train_test_split(review_Hawaii_feature, test_size=0.2, random_state=42)

# Optionally, you can reset indices of both DataFrames if needed
trainData.reset_index(drop=True, inplace=True)
testData.reset_index(drop=True, inplace=True)

### 1. sim - user_id v.s. gmap_id predict rating

In [40]:
usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)
# itemNames = {}
ratingDict = {} # To retrieve a rating for a specific user/item pair

for _, d in trainData.iterrows():
    user,item = d['user_id'], d['gmap_id']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)
    ratingDict[(user,item)] = d['rating']
    # itemNames[item] = d['product_title']

In [41]:
def MSE(y_true, y_pred):
    differences = [(x-y)**2 for x,y in zip(y_true,y_pred)]
    return sum(differences) / len(differences)

In [42]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [43]:
def Cosine(i1, i2):
    # Between two items
    inter = usersPerItem[i1].intersection(usersPerItem[i2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for u in inter:
        numer += ratingDict[(u,i1)]*ratingDict[(u,i2)]
    for u in usersPerItem[i1]:
        denom1 += ratingDict[(u,i1)]**2
    for u in usersPerItem[i2]:
        denom2 += ratingDict[(u,i2)]**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0: return 0
    return numer / denom

In [44]:
ratingMean = sum([d['rating'] for _, d in trainData.iterrows()]) / len(trainData)

In [45]:
userAverages = {}
itemAverages = {}

for u in itemsPerUser:
    rs = [ratingDict[(u,i)] for i in itemsPerUser[u]]
    userAverages[u] = sum(rs) / len(rs)
    
for i in usersPerItem:
    rs = [ratingDict[(u,i)] for u in usersPerItem[i]]
    itemAverages[i] = sum(rs) / len(rs)

##### 1. Using user-user similarity with Jaccard predict

In [46]:
def predictRating_Jaccard(user,item):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['gmap_id']
        if i2 == item: continue
        ratings.append(d['rating'] - itemAverages[i2])
        similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        ratingPrediction = itemAverages[item] + sum(weightedRatings) / sum(similarities)
        return max(1, min(5, ratingPrediction))
    else:
        # User hasn't rated any similar items
        return ratingMean

In [47]:
simPredictions = [predictRating_Jaccard(d['user_id'], d['gmap_id']) for _, d in testData.iterrows()]

In [48]:
true_rating = [d['rating'] for _, d in testData.iterrows()]
mse_jaccard_user = MSE(simPredictions, true_rating)

print("Jaccard User-based MSE: ", mse_jaccard_user)

Jaccard User-based MSE:  0.8033548695931364


##### 2. Using user-user similarity with Cosine predict

In [49]:
# based on user-user similarity with Cosine
def predictRating_cos(user,item):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['gmap_id']
        if i2 == item: continue
        ratings.append(d['rating'] - itemAverages[i2])
        similarities.append(Cosine(item,i2))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        ratingPrediction = itemAverages[item] + sum(weightedRatings) / sum(similarities)
        return max(1, min(5, ratingPrediction))
    else:
        # User hasn't rated any similar items
        return ratingMean

In [50]:
simPredictions = [predictRating_cos(d['user_id'], d['gmap_id']) for _, d in testData.iterrows()]

In [51]:
true_rating = [d['rating'] for _, d in testData.iterrows()]
mse_consin_user = MSE(simPredictions, true_rating)

print("Cosine User-based MSE: ", mse_consin_user)

Cosine User-based MSE:  0.7981501661010038


##### 3. Using user-user similarity with Jaccard predict with weight

In [52]:
#based on user-user similarity Jaccard with weight
def predictRating_Jaccard_weight(user,item, weight):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['gmap_id']
        if i2 == item: continue
        ratings.append(d['rating'] - itemAverages[i2])
        similarities.append(pow(Jaccard(usersPerItem[item],usersPerItem[i2]), weight))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        ratingPrediction = itemAverages[item] + sum(weightedRatings) / sum(similarities)
        return max(1, min(5, ratingPrediction))
    else:
        # User hasn't rated any similar items
        return ratingMean

In [158]:
weights = [0.1, 0.2, 0.3, 0.4, 0.5]
best_weight = None
best_MSE = None
for weight in weights:
    simPredictions = [predictRating_Jaccard_weight(d['user_id'], d['gmap_id'], weight) for _, d in testData.iterrows()]
    true_rating = [d['rating'] for _, d in testData.iterrows()]
    print(f"Weight: {weight}, MSE: {MSE(simPredictions, true_rating)}")
    if best_weight is None or MSE(simPredictions, true_rating) < best_MSE:
        best_weight = weight
        best_MSE = MSE(simPredictions, true_rating)

print(f"Best weight: {best_weight}")

mse_best_jaccard = best_MSE

Weight: 0.1, MSE: 0.7861046558647632
Weight: 0.2, MSE: 0.7855306330473113
Weight: 0.3, MSE: 0.7857011791025986
Weight: 0.4, MSE: 0.7865753351540586
Weight: 0.5, MSE: 0.7880943934270958
Best weight: 0.2


### 2. text mining - text predict rating

#### 1. Bag-of-words models

In [54]:
import nltk
from nltk.corpus import stopwords

In [55]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\79250\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [56]:
#Ignore capitalization and remove punctuation
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('english'))

for d in review_Hawaii_feature['text']:
    r = ''.join([c for c in d.lower() if not c in punctuation])
    for w in r.split():
        if w not in stop_words:
            wordCount[w] += 1

len(wordCount)

150685

In [57]:
count = []
for w in wordCount:
    count.append((wordCount[w],w))
count.sort(reverse=True)
words = [x[1] for x in count[:1000]]
words

['great',
 'food',
 'good',
 'place',
 'service',
 'nice',
 'get',
 'friendly',
 'best',
 'staff',
 'go',
 'one',
 'time',
 'beautiful',
 'like',
 'love',
 'beach',
 'always',
 'really',
 'amazing',
 'parking',
 'awesome',
 'also',
 'well',
 'little',
 'back',
 'delicious',
 'people',
 'would',
 'prices',
 'excellent',
 'island',
 'see',
 'clean',
 'dont',
 'got',
 'store',
 'worth',
 'location',
 'experience',
 'much',
 'lots',
 'even',
 'fun',
 'local',
 'view',
 'lot',
 'pretty',
 'small',
 'day',
 'us',
 'area',
 'fresh',
 'restaurant',
 'everything',
 'price',
 'wait',
 'selection',
 'recommend',
 'fish',
 'definitely',
 'coffee',
 'better',
 'make',
 'need',
 'eat',
 'take',
 'way',
 'went',
 'order',
 'fast',
 'around',
 'views',
 'water',
 'helpful',
 'visit',
 'many',
 'right',
 'super',
 'try',
 'come',
 'park',
 'spot',
 'find',
 'big',
 'chicken',
 'bit',
 'shop',
 'drinks',
 'long',
 'favorite',
 'must',
 'family',
 'hawaii',
 'could',
 'menu',
 'going',
 'made',
 'first',

In [58]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

##### Ridge Regression

###### using MSE to evaluate

In [59]:
for d in review_Hawaii_feature['text']:
    r = ''.join([c for c in d.lower() if not c in punctuation])
    for w in r.split():
        if w not in stop_words:
            wordCount[w] += 1

In [60]:
def feature(datum):
    feat = [0] * len(words)
    r = ''.join([c for c in datum.lower() if not c in punctuation])
    for w in r.split():
        if w not in stop_words and w in words:
            feat[wordId[w]] += 1
            
    feat.append(1)
    return feat

In [62]:
X = [feature(d) for d in review_Hawaii_feature['text']]
y = [d for d in review_Hawaii_feature['rating']]

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [64]:
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)
predictions

array([4.2860508 , 4.20092677, 3.50004001, ..., 4.33104579, 3.91515342,
       4.1513368 ])

In [65]:
mse_bg_ridge = MSE(y_test, predictions)
mse_bg_ridge

0.6498943202691914

###### Try different evaluation method(Confusion Metric)

In [66]:
for i in range(len(y)):
    if y[i] >=4:
        y[i] = 'positive'
    elif y[i] <=2:
        y[i] = 'negative'
    else:
        y[i] = 'neutral'

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = linear_model.LogisticRegression()
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)
correct = predictions == y_test
np.mean(correct)

0.875780873033744

In [68]:
def confusion_metric(y_true, y_pred, classes):
    total_correct = 0

    for cls in classes:
        TP = FP = FN = TN = 0  # Initialize counts for this class

        for true, pred in zip(y_true, y_pred):
            if true == cls and pred == cls:
                TP += 1  # True Positive
                total_correct += 1
            elif true != cls and pred == cls:
                FP += 1  # False Positive
            elif true == cls and pred != cls:
                FN += 1  # False Negative
            elif true != cls and pred != cls:
                TN += 1  # True Negative

        # Calculate precision and recall
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0

        print(f"Class '{cls}':")
        print(f"  True Positives (TP): {TP}")
        print(f"  False Positives (FP): {FP}")
        print(f"  False Negatives (FN): {FN}")
        print(f"  True Negatives (TN): {TN}")
        print(f"  Precision: {precision:.2f}")
        print(f"  Recall: {recall:.2f}\n")

classes = ["positive", "negative", "neutral"]

# Call the function
confusion_metric(y_test, predictions, classes)


Class 'positive':
  True Positives (TP): 130921
  False Positives (FP): 15615
  False Negatives (FN): 1993
  True Negatives (TN): 6906
  Precision: 0.89
  Recall: 0.99

Class 'negative':
  True Positives (TP): 3308
  False Positives (FP): 1641
  False Negatives (FN): 5649
  True Negatives (TN): 144837
  Precision: 0.67
  Recall: 0.37

Class 'neutral':
  True Positives (TP): 1898
  False Positives (FP): 2052
  False Negatives (FN): 11666
  True Negatives (TN): 139819
  Precision: 0.48
  Recall: 0.14



##### Linear Regression

In [69]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [70]:
# preprocess text, remove punctuation, upper case, stop words
def preprocess_text(text):
    text = text.lower()
    text= ''.join(c for c in text if c not in string.punctuation)
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

review_Hawaii_feature['text'] = review_Hawaii_feature['text'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review_Hawaii_feature['text'] = review_Hawaii_feature['text'].apply(preprocess_text)


In [71]:
vectorizer = CountVectorizer(max_features=2000)  # Convert text to bag-of-words representation
X = vectorizer.fit_transform(review_Hawaii_feature["text"])

y = review_Hawaii_feature["rating"]

In [72]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict 
y_pred = model.predict(X_test)
mse_BOW_LinearRegression = MSE(y_test, y_pred)
print("Mean Squared Error:", mse_BOW_LinearRegression)

Mean Squared Error: 0.6281887058220773


In [73]:
# Get word weights from the model
weights = model.coef_
vocab = vectorizer.get_feature_names_out()
word_weights = dict(zip(vocab, weights))

# Display top 10 words with the highest weights
sorted_words = sorted(word_weights.items(), key=lambda x: x[1], reverse=True)
print("Top 10 words with highest weights:")
for word, weight in sorted_words[:10]:
    print(f"{word}: {weight:.4f}")

print("\nTop 10 words with lowest weights:")
for word, weight in sorted_words[-10:]:
    print(f"{word}: {weight:.4f}")

Top 10 words with highest weights:
awsome: 0.4336
reasonably: 0.3961
awesome: 0.3270
excellent: 0.2964
heaven: 0.2899
onolicious: 0.2879
breathtaking: 0.2837
best: 0.2835
beautiful: 0.2766
breath: 0.2763

Top 10 words with lowest weights:
mediocre: -0.7756
dirty: -0.8091
awful: -0.8451
overpriced: -0.8502
sucks: -0.9931
poor: -1.0007
rude: -1.0034
terrible: -1.0250
horrible: -1.1107
worst: -1.1709


#### 2. TFIDF

##### TFIDF - LinearSVC

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report, accuracy_score

In [144]:
X = review_Hawaii_feature['text']
y = review_Hawaii_feature['rating']

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [77]:
tfidf = TfidfVectorizer(sublinear_tf=True, max_features=2000, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [80]:
linear_svr = LinearSVC()
linear_svr.fit(X_train_tfidf, y_train)

In [81]:
y_pred_train = linear_svr.predict(X_train_tfidf)
y_pred_test = linear_svr.predict(X_test_tfidf)

In [82]:
train_mse = MSE(y_train, y_pred_train)
test_mse = MSE(y_test, y_pred_test)

print(f"Train MSE: {train_mse:.2f}")
print(f"Test MSE: {test_mse:.2f}")

mse_linearsvc_tfidf = test_mse

Train MSE: 0.82
Test MSE: 0.82


In [83]:
print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print("Testing Accuracy:", accuracy_score(y_test, y_pred_test))
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_pred_test))

Training Accuracy: 0.6704131476391219
Testing Accuracy: 0.6687039598546016

Classification Report on Test Data:
              precision    recall  f1-score   support

           1       0.52      0.41      0.46      4527
           2       0.37      0.01      0.02      4430
           3       0.45      0.19      0.27     13564
           4       0.45      0.17      0.25     35525
           5       0.71      0.96      0.81     97389

    accuracy                           0.67    155435
   macro avg       0.50      0.35      0.36    155435
weighted avg       0.61      0.67      0.60    155435



##### TFIDF - Ridge Regression(Redundant)

In [84]:
import nltk
nltk.download('all')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\79250\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\79250\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\79250\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\79250\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\79250\AppData\Roaming\nltk_data...
[

In [85]:
tfidf = TfidfVectorizer(sublinear_tf=True, analyzer='word', max_features=2000, tokenizer=word_tokenize, stop_words='english')

In [86]:
MR_train_X_tfidf, MR_test_X_tfidf, MR_train_Y_tfidf, MR_test_Y_tfidf = train_test_split(X, y, test_size=0.2, random_state=200)
# MR_train_X_tfidf = MR_train_X_tfidf
# MR_test_X_tfidf = MR_test_X_tfidf

In [87]:
MR_train_vector_tfidf = tfidf.fit_transform(MR_train_X_tfidf).toarray()
MR_test_vector_tfidf = tfidf.transform(MR_test_X_tfidf).toarray()
vocabulary_tfidf_train = tfidf.vocabulary_



In [88]:
len(MR_train_vector_tfidf)

621739

In [89]:
alpha = [0.1, 0.5, 1 , 5, 10, 50, 100]

In [None]:
best_MSE = None
best_alpha = None
for a in alpha:
    clf = linear_model.Ridge(a, fit_intercept=False) # MSE + 1.0 l2
    clf.fit(MR_train_vector_tfidf, MR_train_Y_tfidf)
    predictions_tfidf = clf.predict(MR_test_vector_tfidf).clip(1, 5)
    mse = MSE(MR_test_Y_tfidf, predictions_tfidf)
    print(f"Alpha: {a}, MSE: {mse}")
    if best_alpha is None or mse < best_MSE:
        best_alpha = a
        best_MSE = mse

##### TFIDF - Linear Regression

In [90]:
from sklearn.linear_model import LinearRegression

In [91]:
linear_model = LinearRegression()
linear_model.fit(MR_train_vector_tfidf, MR_train_Y_tfidf)

In [92]:
linear_predictions = linear_model.predict(MR_test_vector_tfidf).clip(1, 5)
mse_linear_reg_tfidf = MSE(MR_test_Y_tfidf, linear_predictions)

print(f"Linear Regression MSE: {mse_linear_reg_tfidf}")

Linear Regression MSE: 0.5592757216596443


##### TFIDF - SGDRegressor

In [93]:
from sklearn.linear_model import SGDRegressor

sgd_model = SGDRegressor(max_iter=1000, tol=1e-3)
sgd_model.fit(MR_train_vector_tfidf, MR_train_Y_tfidf)

In [94]:
sgd_model_predictions = sgd_model.predict(MR_test_vector_tfidf).clip(1, 5)
mse_sgd_tfidf = MSE(MR_test_Y_tfidf, sgd_model_predictions)

print(f"SGD Regression MSE: {mse_sgd_tfidf}")

SGD Regression MSE: 0.5840505112486768


#### 3. Latent Factor Model

##### Baseline Model

In [95]:
from surprise import Dataset, Reader
from surprise import SVDpp
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise import BaselineOnly

In [96]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(review_Hawaii_feature[['user_id', 'gmap_id', 'rating']], reader)

In [97]:
trainset, testset = train_test_split(data, test_size=0.2)

In [98]:
baseline_model = BaselineOnly()
baseline_model.fit(trainset)

Estimating biases using als...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x1f759759450>

In [99]:
baseline_predictions = baseline_model.test(testset)
mse_baseline_LF = MSE([d.r_ui for d in baseline_predictions], [d.est for d in baseline_predictions])

print(f"Baseline MSE: {mse_baseline_LF}")

Baseline MSE: 0.7369060770317046


##### SVD++

In [159]:
param_grid = {
    'n_factors': [20, 50, 100],
    'lr_all': [0.01, 0.1, 0.5],
    'reg_all': [0.05, 0.1, 0.2]
}

In [160]:
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=3, n_jobs=-1, joblib_verbose=2)

In [161]:
gs.fit(data)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed: 11.7min finished


In [162]:
# Train SVD++ using optimal parameters
best_params = gs.best_params['rmse']
model = SVDpp(n_factors=best_params['n_factors'], lr_all=best_params['lr_all'], reg_all=best_params['reg_all'])
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x1fbe1ab8590>

In [163]:
predictions = model.test(testset)
mse_best_SVDpp = MSE([p.r_ui for p in predictions], [p.est for p in predictions])

print(f"Best parameters: {best_params}")
print(f"Test MSE: {mse_best_SVDpp}")

Best parameters: {'n_factors': 100, 'lr_all': 0.01, 'reg_all': 0.1}
Test MSE: 0.7195678523950972


#### Bi-GRU

##### Random Initialization

In [116]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [117]:
# Step 1: Preprocessing
max_words = 20000  # Vocabulary size
max_len = 100      # Maximum sequence length

# Tokenizing text
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

In [145]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y, test_size=0.2, random_state=42)

In [146]:
# Step 2: Load GloVe Embeddings
embedding_dim = 100
embedding_index = {}

In [147]:
with open('glove.6B.100d.txt', 'r', encoding='utf-8') as file:
    for line in file:
        values = line.split()
        word = values[0]
        coef = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coef

In [148]:
# Create embedding matrix
word_index = tokenizer.word_index
num_words = min(max_words, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embedding_dim))

In [149]:
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [None]:
# Step 3: Build Bi-GRU Model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=num_words, output_dim=embedding_dim, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)),
    tf.keras.layers.Dense(1, activation='linear')
])



In [None]:
model.compile(optimizer='adam', 
              loss='mean_squared_error',
              metrics=['mae'])

In [152]:
# Step 4: Train the Model
history = model.fit(X_train, y_train,
                    epochs=1,
                    batch_size=32,
                    validation_data=(X_test, y_test))

[1m19430/19430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1236s[0m 63ms/step - loss: 0.6102 - mae: 0.5480 - val_loss: 0.4517 - val_mae: 0.5043


In [153]:
bi_gur_predictions = model.predict(X_test).flatten()

[1m4858/4858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 14ms/step


In [155]:
mse_bi_gru = MSE(y_test, bi_gur_predictions)

print(f"Bi-GRU MSE: {mse_bi_gru}")

Bi-GRU MSE: 0.4516746387910339


##### With GloVe Embeddings

In [174]:
model_gloVe = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=num_words, 
        output_dim=embedding_dim, 
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=True
    ),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True)),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)),
    tf.keras.layers.Dense(1, activation='linear')
])

In [175]:
model_gloVe.compile(optimizer='adam', 
              loss='mean_squared_error',
              metrics=['mae'])

In [176]:
history_gloVe = model_gloVe.fit(X_train, y_train,
                    epochs=1,
                    batch_size=32,
                    validation_data=(X_test, y_test))

[1m19430/19430[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1249s[0m 64ms/step - loss: 0.5729 - mae: 0.5332 - val_loss: 0.4310 - val_mae: 0.4749


In [177]:
bigur_glove_predictions = model_gloVe.predict(X_test).flatten()

[1m4858/4858[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 15ms/step


In [178]:
bigur_glove_mse = MSE(y_test, bigur_glove_predictions)

print(f"Bi-GRU with GloVe MSE: {bigur_glove_mse}")

Bi-GRU with GloVe MSE: 0.43103880629324587
