In [1]:
import gzip
import math
import numpy as np
import random
import pandas as pd
import sklearn
import string
from collections import defaultdict
from nltk.stem.porter import *
from sklearn import linear_model
from gensim.models import Word2Vec
import dateutil
from scipy.sparse import lil_matrix # To build sparse feature matrices, if you like
import requests
import json
from sklearn.model_selection import train_test_split

In [2]:
# Run this cell to download the data file to local disk.
# URL of the gzipped JSON file
url = "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/review-Hawaii_10.json.gz"

# Local path the gzipped content is written to
local_file = "review-Hawaii.json.gz"

# Stream the download in chunks so the whole archive is never held in memory.
# requests applies no timeout by default, so one is set explicitly; without it
# a stalled connection would block the notebook indefinitely.
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()  # Raise an error if the download fails
    with open(local_file, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):  # Adjust chunk size as needed
            f.write(chunk)

In [212]:
# Read the gzipped JSON-lines file: every line holds one review object
local_file = "review-Hawaii.json.gz"
with gzip.open(local_file, "rt", encoding="utf-8") as f:  # "rt" = text mode
    dataset_review_Hawaii = [json.loads(line) for line in f]

# Report the number of parsed reviews as a sanity check
print(f"Loaded {len(dataset_review_Hawaii)} reviews.")

In [137]:
# Drop reviews that have no text. None is compared by identity (`is not None`),
# which is the idiomatic and unambiguous test.
review_Hawaii_clean = pd.DataFrame(
    [review for review in dataset_review_Hawaii if review['text'] is not None]
)
# Keep only reviews whose text starts with a word character.
# NOTE(review): `\w` is Unicode-aware in Python regexes, so this filter does not
# by itself restrict the data to English despite the `_eng` suffix -- confirm
# the intended language filter.
review_Hawaii_clean_eng = review_Hawaii_clean[review_Hawaii_clean['text'].str.match(r'\w')]
review_Hawaii_clean_eng

#### Data Analysis

In [138]:
# Keep only the columns used for modelling. An explicit copy makes this an
# independent frame, so later column assignments do not operate on a slice of
# review_Hawaii_clean_eng (which raises SettingWithCopyWarning).
review_Hawaii_feature = review_Hawaii_clean_eng[['user_id', 'gmap_id', 'text', 'rating']].copy()
review_Hawaii_feature.head(100)

In [139]:
# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible
trainData, testData = train_test_split(
    review_Hawaii_feature,
    test_size=0.2,
    random_state=42,
)

# Renumber the rows of each split from 0
trainData = trainData.reset_index(drop=True)
testData = testData.reset_index(drop=True)

### 1. Similarity-based prediction: rating from user_id and gmap_id

In [140]:
# Lookup tables built from the training split only
usersPerItem = defaultdict(set)     # item -> users who rated it
itemsPerUser = defaultdict(set)     # user -> items they rated
reviewsPerUser = defaultdict(list)  # user -> that user's review rows
reviewsPerItem = defaultdict(list)  # item -> that item's review rows
ratingDict = {}                     # (user, item) -> rating

for _, d in trainData.iterrows():
    user = d['user_id']
    item = d['gmap_id']
    ratingDict[(user, item)] = d['rating']
    reviewsPerItem[item].append(d)
    reviewsPerUser[user].append(d)
    itemsPerUser[user].add(item)
    usersPerItem[item].add(user)

In [141]:
def MSE(y_true, y_pred):
    """Return the mean of the squared differences between paired values.

    The two iterables are walked in lockstep with zip(), so any surplus
    elements of the longer one are ignored. Empty input raises
    ZeroDivisionError.
    """
    total = 0
    n = 0
    for actual, predicted in zip(y_true, y_pred):
        total += (actual - predicted) ** 2
        n += 1
    return total / n

In [142]:
def Jaccard(s1, s2):
    """Return |s1 & s2| / |s1 | s2| for two sets, or 0 when both are empty."""
    union_size = len(s1.union(s2))
    if union_size == 0:
        return 0
    return len(s1.intersection(s2)) / union_size

In [143]:
def Cosine(i1, i2):
    """Cosine similarity between two items based on their training ratings.

    The numerator sums the rating products over users who rated both items;
    each norm runs over all users who rated the respective item. Returns 0
    when either norm is zero.
    """
    raters1 = usersPerItem[i1]
    raters2 = usersPerItem[i2]
    shared = raters1.intersection(raters2)

    dot = sum(ratingDict[(u, i1)] * ratingDict[(u, i2)] for u in shared)
    sq_norm1 = sum(ratingDict[(u, i1)] ** 2 for u in raters1)
    sq_norm2 = sum(ratingDict[(u, i2)] ** 2 for u in raters2)

    denom = math.sqrt(sq_norm1) * math.sqrt(sq_norm2)
    return dot / denom if denom != 0 else 0

In [144]:
# Global mean training rating; used as the fallback prediction.
# Computed with the vectorised column mean instead of a Python-level
# iterrows() pass over the whole frame.
ratingMean = trainData['rating'].mean()

In [145]:
# Mean training rating per user
userAverages = {}
for u, rated_items in itemsPerUser.items():
    user_ratings = [ratingDict[(u, i)] for i in rated_items]
    userAverages[u] = sum(user_ratings) / len(user_ratings)

# Mean training rating per item
itemAverages = {}
for i, raters in usersPerItem.items():
    item_ratings = [ratingDict[(u, i)] for u in raters]
    itemAverages[i] = sum(item_ratings) / len(item_ratings)

##### 1. Predicting ratings with item-item Jaccard similarity

In [146]:
def predictRating_Jaccard(user,item):
    """Predict `user`'s rating of `item` from the other items the user rated.

    Every other item the user rated in training contributes its deviation
    from that item's average rating, weighted by the Jaccard similarity
    between the sets of users who rated the two items. The weighted mean
    deviation is added to `item`'s average rating and clipped to [1, 5].

    Returns the global mean `ratingMean` when no rated item has positive
    similarity to `item`; this covers unseen users and unseen items.
    """
    # Test membership before indexing: reviewsPerUser and usersPerItem are
    # defaultdicts, so indexing an unseen key would silently insert an empty
    # entry and leave them out of sync with itemAverages / userAverages.
    # Both cases produce no positive similarity, hence the same fallback.
    if user not in reviewsPerUser or item not in usersPerItem:
        return ratingMean

    item_users = usersPerItem[item]
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['gmap_id']
        if i2 == item: continue
        ratings.append(d['rating'] - itemAverages[i2])
        similarities.append(Jaccard(item_users, usersPerItem[i2]))

    total_similarity = sum(similarities)
    if total_similarity > 0:
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        ratingPrediction = itemAverages[item] + sum(weightedRatings) / total_similarity
        return max(1, min(5, ratingPrediction))
    # User hasn't rated any similar items
    return ratingMean

In [147]:
# Predict a rating for every (user, item) pair of the test split
simPredictions = [
    predictRating_Jaccard(user_id, gmap_id)
    for user_id, gmap_id in zip(testData['user_id'], testData['gmap_id'])
]

In [148]:
# Ground-truth ratings of the test split, in row order
true_rating = testData['rating'].tolist()
mse_jaccard_user = MSE(simPredictions, true_rating)

print("Jaccard User-based MSE: ", mse_jaccard_user)

##### 2. Predicting ratings with item-item Cosine similarity

In [149]:
# Rating prediction weighted by the Cosine similarity between items
def predictRating_cos(user,item):
    """Predict `user`'s rating of `item` from the other items the user rated.

    Every other item the user rated in training contributes its deviation
    from that item's average rating, weighted by Cosine(item, other_item).
    The weighted mean deviation is added to `item`'s average rating and
    clipped to [1, 5].

    Returns the global mean `ratingMean` when the similarities do not sum to
    a positive value; this covers unseen users and unseen items.
    """
    # Test membership before indexing: reviewsPerUser and usersPerItem are
    # defaultdicts, so looking up an unseen key (here or inside Cosine) would
    # silently insert an empty entry. Both cases yield no positive similarity,
    # hence the same fallback.
    if user not in reviewsPerUser or item not in usersPerItem:
        return ratingMean

    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['gmap_id']
        if i2 == item: continue
        ratings.append(d['rating'] - itemAverages[i2])
        similarities.append(Cosine(item,i2))

    total_similarity = sum(similarities)
    if total_similarity > 0:
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        ratingPrediction = itemAverages[item] + sum(weightedRatings) / total_similarity
        return max(1, min(5, ratingPrediction))
    # User hasn't rated any similar items
    return ratingMean

In [150]:
# Predict a rating for every (user, item) pair of the test split
simPredictions = [
    predictRating_cos(user_id, gmap_id)
    for user_id, gmap_id in zip(testData['user_id'], testData['gmap_id'])
]

In [151]:
# Ground-truth ratings of the test split, in row order
true_rating = testData['rating'].tolist()
mse_consin_user = MSE(simPredictions, true_rating)

print("Cosine User-based MSE: ", mse_consin_user)

##### 3. Predicting ratings with exponent-weighted item-item Jaccard similarity

In [152]:
# Rating prediction weighted by Jaccard similarity raised to a power
def predictRating_Jaccard_weight(user,item, weight):
    """Predict `user`'s rating of `item` using Jaccard(...) ** weight as weights.

    Every other item the user rated in training contributes its deviation
    from that item's average rating, weighted by the Jaccard similarity of
    the two items' user sets raised to the power `weight`. The weighted mean
    deviation is added to `item`'s average rating and clipped to [1, 5].

    Returns the global mean `ratingMean` for unseen users or items, and
    whenever the weights do not sum to a positive value.
    """
    # Test membership before indexing: reviewsPerUser and usersPerItem are
    # defaultdicts, so indexing an unseen key would silently insert an empty
    # entry. An unseen item also has no entry in itemAverages, which would
    # raise KeyError below when weight == 0 (pow(0, 0) == 1).
    if user not in reviewsPerUser or item not in usersPerItem:
        return ratingMean

    item_users = usersPerItem[item]
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['gmap_id']
        if i2 == item: continue
        ratings.append(d['rating'] - itemAverages[i2])
        similarities.append(pow(Jaccard(item_users, usersPerItem[i2]), weight))

    total_similarity = sum(similarities)
    if total_similarity > 0:
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        ratingPrediction = itemAverages[item] + sum(weightedRatings) / total_similarity
        return max(1, min(5, ratingPrediction))
    # User hasn't rated any similar items
    return ratingMean

In [153]:
# Candidate exponents applied to the Jaccard similarity
weights = [0.1, 0.2, 0.3, 0.4, 0.5]
best_weight = None
best_MSE = None

# The ground-truth ratings do not depend on the weight, so extract them once
true_rating = [d['rating'] for _, d in testData.iterrows()]

# NOTE(review): the weight is selected on the test split, so the reported best
# MSE is optimistic -- consider tuning on a separate validation split.
for weight in weights:
    simPredictions = [predictRating_Jaccard_weight(d['user_id'], d['gmap_id'], weight) for _, d in testData.iterrows()]
    mse = MSE(simPredictions, true_rating)  # computed once per weight and reused
    print(f"Weight: {weight}, MSE: {mse}")
    if best_MSE is None or mse < best_MSE:
        best_weight = weight
        best_MSE = mse

print(f"Best weight: {best_weight}")

mse_best_jaccard = best_MSE

### 2. text mining - text predict rating

#### 1. Bag-of-words models

In [154]:
import nltk
from nltk.corpus import stopwords

In [155]:
# Fetch the NLTK stop-word corpus, then keep the English list as a set so that
# the per-token membership tests in later cells are constant-time
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [156]:
# Count word frequencies over all reviews: lower-cased, punctuation stripped,
# English stop words skipped
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('english'))
wordCount = defaultdict(int)

for review_text in review_Hawaii_feature['text']:
    cleaned = ''.join(ch for ch in review_text.lower() if ch not in punctuation)
    for token in cleaned.split():
        if token in stop_words:
            continue
        wordCount[token] += 1

len(wordCount)

In [157]:
# Rank (frequency, word) pairs in descending order and keep the 1000 most
# frequent words as the vocabulary
count = sorted(((freq, word) for word, freq in wordCount.items()), reverse=True)
words = [word for _, word in count[:1000]]
words

In [158]:
# Vocabulary word -> column index in the feature vector, plus a set view of the vocabulary
wordId = {word: idx for idx, word in enumerate(words)}
wordSet = set(words)

##### Ridge Regression

###### using MSE to evaluate

In [159]:
# Recount word frequencies from scratch. Incrementing the already-populated
# `wordCount` would add a second pass on top of the earlier count, doubling
# every frequency (and growing it again each time this cell is re-run).
wordCount = defaultdict(int)
for d in review_Hawaii_feature['text']:
    r = ''.join([c for c in d.lower() if not c in punctuation])
    for w in r.split():
        if w not in stop_words:
            wordCount[w] += 1

In [160]:
def feature(datum):
    """Build the bag-of-words feature vector for one review text.

    The text is lower-cased and stripped of punctuation; each remaining token
    that is not a stop word and belongs to the vocabulary increments the
    count at its `wordId` index. A constant 1 is appended as the last
    element to act as the offset (bias) term.

    Returns a list of len(words) + 1 numbers.
    """
    feat = [0] * len(words)
    r = ''.join([c for c in datum.lower() if not c in punctuation])
    for w in r.split():
        # Look the token up in the wordId dict (constant time) instead of
        # scanning the `words` list for every token; both hold the same
        # vocabulary.
        if w not in stop_words and w in wordId:
            feat[wordId[w]] += 1

    feat.append(1)
    return feat

In [161]:
# Feature matrix (one bag-of-words vector per review) and the rating targets
X = list(map(feature, review_Hawaii_feature['text']))
y = list(review_Hawaii_feature['rating'])

In [162]:
# 80/20 train/test split of the bag-of-words features; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [163]:
# Ridge regression: squared error plus an L2 penalty with alpha = 1.0.
# No separate intercept is fitted (fit_intercept=False).
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
theta = clf.coef_
predictions

In [164]:
# Test-set MSE of the bag-of-words ridge model (predictions are not clipped to [1, 5])
mse_bg_ridge = MSE(y_test, predictions)
mse_bg_ridge

###### Try a different evaluation method (confusion matrix)

In [165]:
# Map ratings to sentiment classes: >= 4 positive, <= 2 negative, otherwise neutral.
# The labels are derived from the original rating column instead of rewriting
# `y` element by element: the in-place version is not idempotent and fails
# with a TypeError on a second run, when `y` already holds strings.
y = [
    'positive' if rating >= 4 else 'negative' if rating <= 2 else 'neutral'
    for rating in review_Hawaii_feature['rating']
]

In [166]:
# Re-split so that the targets are the sentiment labels
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Multiclass logistic regression on the bag-of-words features
clf = linear_model.LogisticRegression()
clf.fit(X_train, y_train)
theta = clf.coef_
predictions = clf.predict(X_test)

# Accuracy: fraction of test reviews whose predicted label matches the true one
correct = predictions == y_test
np.mean(correct)

In [167]:
def confusion_metric(y_true, y_pred, classes):
    """Print one-vs-rest confusion counts, precision and recall per class.

    Args:
        y_true: iterable of ground-truth labels.
        y_pred: iterable of predicted labels, aligned with y_true.
        classes: labels to report on; each one is treated in turn as the
            positive class against all other labels.

    Returns:
        None. The report is written to stdout.
    """
    # Pair the labels once so that the inputs can be re-scanned for every
    # class even when they are one-shot iterables.
    pairs = list(zip(y_true, y_pred))

    for cls in classes:
        TP = FP = FN = TN = 0  # Initialize counts for this class

        for true, pred in pairs:
            if true == cls and pred == cls:
                TP += 1  # True Positive
            elif true != cls and pred == cls:
                FP += 1  # False Positive
            elif true == cls and pred != cls:
                FN += 1  # False Negative
            elif true != cls and pred != cls:
                TN += 1  # True Negative

        # Precision and recall default to 0 when their denominator is empty
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0

        print(f"Class '{cls}':")
        print(f"  True Positives (TP): {TP}")
        print(f"  False Positives (FP): {FP}")
        print(f"  False Negatives (FN): {FN}")
        print(f"  True Negatives (TN): {TN}")
        print(f"  Precision: {precision:.2f}")
        print(f"  Recall: {recall:.2f}\n")

# Sentiment labels to report on, one one-vs-rest breakdown per label
classes = ["positive", "negative", "neutral"]

# Print the per-class report for the logistic-regression test predictions
confusion_metric(y_test, predictions, classes)


##### Linear Regression

In [168]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [169]:
# Normalise a review: lower-case, strip punctuation, drop stop words
def preprocess_text(text):
    """Return `text` lower-cased, without punctuation and without stop words.

    Tokens are split on whitespace and re-joined with single spaces.
    """
    lowered = text.lower()
    no_punct = ''.join(ch for ch in lowered if ch not in string.punctuation)
    kept = [token for token in no_punct.split() if token not in stop_words]
    return ' '.join(kept)

# Replace the text column through assign(), which returns a new DataFrame,
# instead of assigning into review_Hawaii_feature in place: that frame is
# created as a column slice of review_Hawaii_clean_eng, and in-place column
# assignment on such a slice raises SettingWithCopyWarning.
review_Hawaii_feature = review_Hawaii_feature.assign(
    text=review_Hawaii_feature['text'].apply(preprocess_text)
)

In [170]:
# Bag-of-words counts limited to 2000 features.
# NOTE(review): the vectorizer is fitted on all reviews before the train/test
# split below, so the vocabulary is learned from test text as well.
vectorizer = CountVectorizer(max_features=2000)  # Convert text to bag-of-words representation
X = vectorizer.fit_transform(review_Hawaii_feature["text"])

# Regression target: the rating column
y = review_Hawaii_feature["rating"]

In [171]:
# 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit ordinary least squares on the bag-of-words counts
model = LinearRegression().fit(X_train, y_train)

# Score the held-out reviews
y_pred = model.predict(X_test)
mse_BOW_LinearRegression = MSE(y_test, y_pred)
print("Mean Squared Error:", mse_BOW_LinearRegression)

In [172]:
# Pair every vocabulary term with its learned regression coefficient
weights = model.coef_
vocab = vectorizer.get_feature_names_out()
word_weights = dict(zip(vocab, weights))

# Rank the terms from the largest to the smallest coefficient
sorted_words = sorted(word_weights.items(), key=lambda pair: pair[1], reverse=True)

# Print the ten largest and the ten smallest coefficients
for heading, ranked in (
    ("Top 10 words with highest weights:", sorted_words[:10]),
    ("\nTop 10 words with lowest weights:", sorted_words[-10:]),
):
    print(heading)
    for word, weight in ranked:
        print(f"{word}: {weight:.4f}")

#### 2. TFIDF

##### TFIDF - LinearSVC

In [173]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import classification_report, accuracy_score

In [174]:
# Inputs are the review texts, targets the ratings
X = review_Hawaii_feature['text']
y = review_Hawaii_feature['rating']

In [175]:
# 80/20 split of the texts; the TF-IDF vectorizer is fitted on the training part only
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [176]:
# TF-IDF with sublinear term-frequency scaling, limited to 2000 features and
# using scikit-learn's built-in English stop-word list
tfidf = TfidfVectorizer(sublinear_tf=True, max_features=2000, stop_words='english')
# Learn the vocabulary and IDF weights on the training texts only ...
X_train_tfidf = tfidf.fit_transform(X_train)
# ... and reuse them unchanged for the test texts
X_test_tfidf = tfidf.transform(X_test)

In [177]:
# NOTE(review): despite the `linear_svr` name this is LinearSVC, a classifier;
# it treats each distinct rating value as a class label.
linear_svr = LinearSVC()
linear_svr.fit(X_train_tfidf, y_train)

In [178]:
# Predicted rating classes for the training and the test texts
y_pred_train = linear_svr.predict(X_train_tfidf)
y_pred_test = linear_svr.predict(X_test_tfidf)

In [179]:
# Squared error between the true ratings and the predicted rating classes
train_mse = MSE(y_train, y_pred_train)
test_mse = MSE(y_test, y_pred_test)

print(f"Train MSE: {train_mse:.2f}")
print(f"Test MSE: {test_mse:.2f}")

# Keep the test score under a model-specific name
mse_linearsvc_tfidf = test_mse

In [180]:
# Classification view of the same predictions: exact-match accuracy and the
# per-class precision/recall report from scikit-learn
print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print("Testing Accuracy:", accuracy_score(y_test, y_pred_test))
print("\nClassification Report on Test Data:")
print(classification_report(y_test, y_pred_test))

##### TFIDF - Ridge Regression(Redundant)

In [181]:
import nltk
# word_tokenize only needs the Punkt tokenizer models. nltk.download('all')
# would fetch the entire NLTK data collection, which is slow and unnecessary.
nltk.download('punkt')
nltk.download('punkt_tab')  # resource name looked up by newer NLTK releases
from nltk.tokenize import word_tokenize

In [182]:
# Rebinds `tfidf` to a new vectorizer that tokenizes with NLTK's word_tokenize;
# the matrices produced by the previous vectorizer are not affected.
tfidf = TfidfVectorizer(sublinear_tf=True, analyzer='word', max_features=2000, tokenizer=word_tokenize, stop_words='english')

In [None]:
# Separate 80/20 split (seed 200) used by the TF-IDF regression models below
MR_train_X_tfidf, MR_test_X_tfidf, MR_train_Y_tfidf, MR_test_Y_tfidf = train_test_split(X, y, test_size=0.2, random_state=200)
# The split texts are used as returned; no further per-split processing is applied.

In [184]:
# Fit the vectorizer on the training texts and transform both splits.
# NOTE(review): .toarray() converts the sparse TF-IDF matrices to dense arrays,
# which can use a lot of memory on a large corpus -- confirm it is required.
MR_train_vector_tfidf = tfidf.fit_transform(MR_train_X_tfidf).toarray()
MR_test_vector_tfidf = tfidf.transform(MR_test_X_tfidf).toarray()
# Term -> column index mapping learned from the training texts
vocabulary_tfidf_train = tfidf.vocabulary_

In [185]:
# Display the dense TF-IDF test matrix as a quick visual check
MR_test_vector_tfidf

In [186]:
# Candidate L2 regularisation strengths for the ridge sweep
alpha = [
    0.1, 0.5,
    1, 5,
    10, 50,
    100,
]

In [187]:
# Fit one ridge model per candidate alpha and remember the alpha with the
# lowest test MSE; predictions are clipped to the 1-5 rating range
best_alpha, best_MSE = None, None
for a in alpha:
    clf = linear_model.Ridge(alpha=a, fit_intercept=False)  # squared error + a * L2 penalty
    clf.fit(MR_train_vector_tfidf, MR_train_Y_tfidf)
    predictions_tfidf = clf.predict(MR_test_vector_tfidf).clip(1, 5)
    mse = MSE(MR_test_Y_tfidf, predictions_tfidf)
    print(f"Alpha: {a}, MSE: {mse}")
    is_first = best_alpha is None
    if is_first or mse < best_MSE:
        best_alpha, best_MSE = a, mse

##### TFIDF - Linear Regression

In [188]:
from sklearn.linear_model import LinearRegression

In [189]:
linear_model = LinearRegression()
linear_model.fit(MR_train_vector_tfidf, MR_train_Y_tfidf)

In [190]:
linear_predictions = linear_model.predict(MR_test_vector_tfidf).clip(1, 5)
mse_linear_reg_tfidf = MSE(MR_test_Y_tfidf, linear_predictions)

print(f"Linear Regression MSE: {mse_linear_reg_tfidf}")

##### TFIDF - SGDRegressor

In [191]:
from sklearn.linear_model import SGDRegressor

# Stochastic gradient descent shuffles the training data, so fix the seed to
# make the fitted model and its MSE reproducible across runs.
sgd_model = SGDRegressor(max_iter=1000, tol=1e-3, random_state=42)
sgd_model.fit(MR_train_vector_tfidf, MR_train_Y_tfidf)

In [192]:
# Predict with the SGD model and clip to the valid 1-5 rating range
raw_sgd_predictions = sgd_model.predict(MR_test_vector_tfidf)
sgd_model_predictions = raw_sgd_predictions.clip(1, 5)
mse_sgd_tfidf = MSE(MR_test_Y_tfidf, sgd_model_predictions)

print(f"SGD Regression MSE: {mse_sgd_tfidf}")

#### 3. Latent Factor Model

##### Baseline Model

In [193]:
from surprise import Dataset, Reader
from surprise import SVDpp
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise import BaselineOnly

In [194]:
# Wrap the (user, item, rating) triples in a Surprise dataset; ratings range from 1 to 5
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(review_Hawaii_feature[['user_id', 'gmap_id', 'rating']], reader)

In [195]:
# Surprise's train_test_split (imported in the cell above). A fixed
# random_state makes the 80/20 split, and every MSE computed from it,
# reproducible across runs.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [196]:
# Surprise's BaselineOnly algorithm with default settings, fitted on the training set
baseline_model = BaselineOnly()
baseline_model.fit(trainset)

In [197]:
# Score the baseline on the held-out set: r_ui is the true rating, est the estimate
baseline_predictions = baseline_model.test(testset)
baseline_actual = [pred.r_ui for pred in baseline_predictions]
baseline_estimated = [pred.est for pred in baseline_predictions]
mse_baseline_LF = MSE(baseline_actual, baseline_estimated)

print(f"Baseline MSE: {mse_baseline_LF}")

##### SVD++

In [198]:
# Hyper-parameter grid for SVD++: latent dimensions, learning rate, regularisation
param_grid = dict(
    n_factors=[20, 50, 100],
    lr_all=[0.01, 0.1, 0.5],
    reg_all=[0.05, 0.1, 0.2],
)

In [199]:
# 3-fold cross-validated grid search over param_grid, scored by RMSE, using all CPU cores
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse'], cv=3, n_jobs=-1, joblib_verbose=2)

In [200]:
# NOTE(review): the search runs on the full dataset, which includes the rows
# held out in `testset`, so the selected parameters have seen the test data.
gs.fit(data)

In [201]:
# Refit SVD++ on the training set with the RMSE-optimal grid-search parameters
best_params = gs.best_params['rmse']
svdpp_kwargs = {
    name: best_params[name]
    for name in ('n_factors', 'lr_all', 'reg_all')
}
model = SVDpp(**svdpp_kwargs)
model.fit(trainset)

In [202]:
# Score the tuned SVD++ model on the held-out set
predictions = model.test(testset)
svdpp_actual = [p.r_ui for p in predictions]
svdpp_estimated = [p.est for p in predictions]
mse_best_SVDpp = MSE(svdpp_actual, svdpp_estimated)

print(f"Best parameters: {best_params}")
print(f"Test MSE: {mse_best_SVDpp}")

#### 4. Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Import scikit-learn's splitter under an explicit alias: the bare name
# `train_test_split` was rebound to surprise.model_selection.train_test_split
# in the latent-factor section, and that function does not accept (X, y).
from sklearn.model_selection import train_test_split as sk_train_test_split

X = review_Hawaii_feature['text']
y = review_Hawaii_feature['rating']
X_train, X_test, y_train, y_test = sk_train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Random forest of 100 trees with a fixed seed, fitted on the TF-IDF training
# matrix built in the LinearSVC section.
# NOTE(review): X_train_tfidf is reused from that earlier cell; it lines up
# with y_train only while both come from the same split of the same data --
# confirm.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)

In [None]:
# Make predictions on the test set. The forest was fitted on TF-IDF vectors
# (X_train_tfidf), so it must be given the matching TF-IDF test matrix;
# X_test holds raw review strings, which the model cannot consume.
y_pred = rf_model.predict(X_test_tfidf)

In [None]:
# Test-set MSE of the random-forest regressor
mse_RFR = MSE(y_test, y_pred)