In [None]:
import gzip
import math
import numpy
import random
import pandas as pd
import sklearn
import string
from collections import defaultdict
from nltk.stem.porter import *
from sklearn import linear_model
from gensim.models import Word2Vec
import dateutil
from scipy.sparse import lil_matrix # To build sparse feature matrices, if you like
import requests
import json
from sklearn.model_selection import train_test_split

In [None]:
# Run this cell to download the data file to the local working directory.
# URL of the gzipped JSON file
url = "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/review-Hawaii_10.json.gz"

# Local path where the gzipped content is saved
local_file = "review-Hawaii.json.gz"

# Stream the download in chunks so the archive is never held in memory at once.
# The timeout bounds the connection attempt and each wait for more data;
# without it a stalled server would block this cell indefinitely.
with requests.get(url, stream=True, timeout=60) as response:
    response.raise_for_status()  # Raise an error if the download fails
    with open(local_file, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):  # Adjust chunk size as needed
            f.write(chunk)

In [133]:
# Read the gzipped file as UTF-8 text and parse one JSON object per line.
local_file = "review-Hawaii.json.gz"
with gzip.open(local_file, "rt", encoding="utf-8") as f:
    dataset_review_Hawaii = [json.loads(line) for line in f]

# Report how many records were parsed as a quick sanity check.
print(f"Loaded {len(dataset_review_Hawaii)} reviews.")

Loaded 1504347 reviews.


In [134]:
# Drop reviews that have no text, then load the remainder into a DataFrame.
reviews_with_text = [i for i in dataset_review_Hawaii if i['text'] is not None]
review_Hawaii_clean = pd.DataFrame(reviews_with_text)

# Keep only rows whose text begins with a word character.
# NOTE(review): `\w` is Unicode-aware in Python 3, so this pattern alone does not
# restrict the data to English despite the `_eng` suffix -- confirm the intent.
review_Hawaii_clean_eng = review_Hawaii_clean[review_Hawaii_clean['text'].str.match(r'\w')]
review_Hawaii_clean_eng

Unnamed: 0,user_id,name,time,rating,text,pics,resp,gmap_id
0,113965417079576625433,manuel grimaldo,1591839903487,5,Great new upgrade,,,0x7c00159b5b1b1d25:0x8d2d85d4a758290e
1,109623613356773809039,Vicki Kach,1579559747146,5,So pleased to find Dr. Mike! He’s the real de...,,,0x7c006de89f2d86e1:0x23d998532e9317a6
2,105786704025048642479,Jessica Clopton,1545530647643,1,"The doctor is extremely creepy. First of all, ...",,,0x7c006de89f2d86e1:0x23d998532e9317a6
3,117458106933327014012,Robin Hanlin,1561877267351,5,As a former R.N. was looking for big shoes to ...,,"{'time': 1561923354957, 'text': 'Thank you so ...",0x7c006de89f2d86e1:0x23d998532e9317a6
4,108985244966294061730,Connie Mark,1580241584528,5,Great place! Doctor helped my body pains.,,,0x7c006de89f2d86e1:0x23d998532e9317a6
...,...,...,...,...,...,...,...,...
852568,107984868534067220088,Joshua Collier,1530714878067,2,Wish they would let you explore the area more ...,,,0x7953b4a4114e37f7:0x374b5a1f84f48a1a
852569,110628723873286096539,Stefano Parvoli,1528753144886,5,Amazing,,,0x7953b4a4114e37f7:0x374b5a1f84f48a1a
852570,107169846833534902263,Christine Lominario,1519620209920,5,Majestic 😊,,,0x7953b4a4114e37f7:0x374b5a1f84f48a1a
852571,101666345935879309455,Allana Kate,1517440381978,5,Amazing,[{'url': ['https://lh5.googleusercontent.com/p...,,0x7953b4a4114e37f7:0x374b5a1f84f48a1a


In [135]:
# Select the identifier, review-text and rating columns.
feature_columns = ['user_id', 'gmap_id', 'text', 'rating']
review_Hawaii_feature = review_Hawaii_clean_eng.loc[:, feature_columns]
review_Hawaii_feature.head(100)

Unnamed: 0,user_id,gmap_id,text,rating
0,113965417079576625433,0x7c00159b5b1b1d25:0x8d2d85d4a758290e,Great new upgrade,5
1,109623613356773809039,0x7c006de89f2d86e1:0x23d998532e9317a6,So pleased to find Dr. Mike! He’s the real de...,5
2,105786704025048642479,0x7c006de89f2d86e1:0x23d998532e9317a6,"The doctor is extremely creepy. First of all, ...",1
3,117458106933327014012,0x7c006de89f2d86e1:0x23d998532e9317a6,As a former R.N. was looking for big shoes to ...,5
4,108985244966294061730,0x7c006de89f2d86e1:0x23d998532e9317a6,Great place! Doctor helped my body pains.,5
...,...,...,...,...
98,113954467402806825801,0x795406d3728f9b1b:0x236996c8f711cda8,Great place,3
99,117257970158561722599,0x795406d3728f9b1b:0x236996c8f711cda8,Incredible,5
100,117345116162370485994,0x7c0015d64cd48c6f:0x4cac932764bd2fac,This Kitty Cafe was MEOWzing! There were so ma...,5
101,118397406534237711570,0x7c0015d64cd48c6f:0x4cac932764bd2fac,"Friendly owners, fun thematic decor, and lots ...",5


In [None]:
# Hold out 10% of review_Hawaii_feature for testing; the fixed random_state
# makes the split repeatable. Each part is given a fresh 0..n-1 index.
trainData, testData = (
    part.reset_index(drop=True)
    for part in train_test_split(review_Hawaii_feature, test_size=0.1, random_state=42)
)

### 1. Similarity-based model: predict rating from user_id and gmap_id

In [137]:
# Lookup tables built from the training split only.
usersPerItem = defaultdict(set)     # gmap_id -> set of user_ids that reviewed it
itemsPerUser = defaultdict(set)     # user_id -> set of gmap_ids that user reviewed
reviewsPerUser = defaultdict(list)  # user_id -> list of that user's review rows
reviewsPerItem = defaultdict(list)  # gmap_id -> list of that place's review rows
ratingDict = {}                     # (user_id, gmap_id) -> rating; a repeated pair keeps the last one seen

for _, d in trainData.iterrows():
    user = d['user_id']
    item = d['gmap_id']
    ratingDict[(user, item)] = d['rating']
    reviewsPerItem[item].append(d)
    reviewsPerUser[user].append(d)
    itemsPerUser[user].add(item)
    usersPerItem[item].add(user)

In [138]:
def MSE(y_true, y_pred):
    """Return the mean squared error between two sequences of numbers.

    The sequences are paired element by element with zip(), so any extra
    trailing elements of the longer one are ignored. An empty pairing raises
    ZeroDivisionError.
    """
    squared_errors = []
    for actual, predicted in zip(y_true, y_pred):
        squared_errors.append((actual - predicted) ** 2)
    return sum(squared_errors) / len(squared_errors)

In [139]:
def Jaccard(s1, s2):
    """Return the Jaccard similarity |s1 & s2| / |s1 | s2| of two sets.

    Returns 0 when both sets are empty, so the union size is never used as a
    zero divisor.
    """
    union_size = len(s1.union(s2))
    if union_size == 0:
        return 0
    return len(s1.intersection(s2)) / union_size

In [140]:
def Cosine(i1, i2):
    """Return the cosine similarity between two items from their ratings.

    The numerator sums rating products over users present in both
    usersPerItem[i1] and usersPerItem[i2]; each norm is taken over every user
    of the respective item. Ratings are read from ratingDict keyed by
    (user, item). Returns 0 when either norm is zero.
    """
    raters1 = usersPerItem[i1]
    raters2 = usersPerItem[i2]
    dot = sum(ratingDict[(u, i1)] * ratingDict[(u, i2)]
              for u in raters1.intersection(raters2))
    sq_norm1 = sum(ratingDict[(u, i1)] ** 2 for u in raters1)
    sq_norm2 = sum(ratingDict[(u, i2)] ** 2 for u in raters2)
    norm_product = math.sqrt(sq_norm1) * math.sqrt(sq_norm2)
    if norm_product == 0:
        return 0
    return dot / norm_product

In [141]:
# Global mean training rating, used as the fallback prediction.
# Computed on the column directly instead of materialising a Series per row
# with iterrows(); float() keeps the result a plain Python float.
ratingMean = float(trainData['rating'].mean())

In [142]:
# Mean rating per user, taken over the distinct items in itemsPerUser[user].
userAverages = {
    user: sum(ratingDict[(user, item)] for item in items) / len(items)
    for user, items in itemsPerUser.items()
}

# Mean rating per item, taken over the distinct users in usersPerItem[item].
itemAverages = {
    item: sum(ratingDict[(user, item)] for user in users) / len(users)
    for item, users in usersPerItem.items()
}

In [143]:
def predictRating(user, item):
    """Predict the rating `user` would give `item` from the user's other reviews.

    For every other item i2 the user reviewed in training, the deviation of
    that rating from itemAverages[i2] is weighted by the Jaccard similarity
    between the user sets of `item` and i2. The weighted mean deviation is
    added to itemAverages[item] and the result is clamped to [1, 5].

    Returns ratingMean when the similarities sum to zero, which includes an
    unseen user, an unseen item, and a user with no overlapping items.
    """
    # Use .get() rather than [] for the query keys: the lookup tables are
    # defaultdicts, and indexing an unseen user or item would insert an empty
    # entry into the training data as a side effect of predicting.
    target_users = usersPerItem.get(item, set())
    ratings = []
    similarities = []
    for d in reviewsPerUser.get(user, ()):
        i2 = d['gmap_id']
        if i2 == item:
            continue
        ratings.append(d['rating'] - itemAverages[i2])
        similarities.append(Jaccard(target_users, usersPerItem[i2]))

    total_similarity = sum(similarities)
    if total_similarity > 0:
        weighted_sum = sum(r * s for r, s in zip(ratings, similarities))
        ratingPrediction = itemAverages[item] + weighted_sum / total_similarity
        return max(1, min(5, ratingPrediction))

    # No positively similar item was rated by this user.
    return ratingMean

In [144]:
# Spot-check the predictor on the second row of the test split.
sample_row = testData.iloc[1]
u, i = sample_row['user_id'], sample_row['gmap_id']
predictRating(u, i)

5

In [145]:
# for _, d in testData.iterrows():
#     u, i = d['user_id'], d['gmap_id']
#     predictRating(u, i)
#     d['rating']

In [146]:
# Predict a rating for every (user_id, gmap_id) pair in the test split, in row order.
# Zipping the two columns avoids building a Series per row as iterrows() does.
simPredictions = [
    predictRating(user_id, gmap_id)
    for user_id, gmap_id in zip(testData['user_id'], testData['gmap_id'])
]

In [147]:
# Ground-truth ratings in test-split row order, read straight from the column.
true_rating = testData['rating'].tolist()
# Arguments follow the declared order MSE(y_true, y_pred); the value is the
# same either way because the squared error is symmetric.
MSE(true_rating, simPredictions)

0.7860738293802693

### 2. Text mining: predict rating from review text

In [148]:
# Text Mining