In [36]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
import string
from sklearn import linear_model
import json
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords


In [3]:
def parse(path):
    """Yield one decoded JSON object per line of a gzip-compressed file.

    Parameters
    ----------
    path : str or path-like
        Location of a gzip file containing one JSON document per line.

    Yields
    ------
    object
        The value produced by ``json.loads`` for each non-blank line.

    Notes
    -----
    The file is opened inside a ``with`` block so the handle is released
    when the generator is exhausted, closed, or garbage-collected, instead
    of being left open as a bare ``gzip.open`` call would.
    """
    with gzip.open(path, 'rb') as g:
        for l in g:
            # A blank line (e.g. a trailing newline) is not valid JSON; skip it.
            if not l.strip():
                continue
            yield json.loads(l)

In [4]:
# Read every review record from the gzip file into a list.
users_reviews = list(parse("review-Hawaii_10.json.gz"))

In [5]:
# Read every business metadata record from the gzip file into a list.
businesses = list(parse("meta-Hawaii.json.gz"))

In [6]:
def MSE(predictions, labels):
    """Return the mean squared error between predictions and labels.

    Parameters
    ----------
    predictions, labels : iterable of numbers
        Paired values; element ``i`` of one is compared with element ``i``
        of the other.

    Returns
    -------
    number
        ``sum((p - y) ** 2) / n`` over the ``n`` pairs.

    Raises
    ------
    ValueError
        If the two inputs differ in length (``zip`` would otherwise drop
        the surplus silently and report a misleading score), or if they
        are empty (the mean is undefined).
    """
    # Materialise so that lengths can be compared even for generators.
    predictions = list(predictions)
    labels = list(labels)
    if len(predictions) != len(labels):
        raise ValueError(
            f"predictions and labels differ in length: "
            f"{len(predictions)} != {len(labels)}"
        )
    if not predictions:
        raise ValueError("MSE is undefined for empty input")
    differences = [(x - y) ** 2 for x, y in zip(predictions, labels)]
    return sum(differences) / len(differences)

In [8]:
# Per-user average rating.
# user_ratings maps user_id -> running {'total_rating', 'count'}; reviews
# whose user_id or rating is None are left out of the tally.
user_ratings = {}

for record in users_reviews:
    uid = record.get('user_id')
    stars = record.get('rating')
    if uid is None or stars is None:
        continue
    tally = user_ratings.setdefault(uid, {'total_rating': 0, 'count': 0})
    tally['total_rating'] += stars
    tally['count'] += 1

user_avg_ratings = {}
for uid, tally in user_ratings.items():
    if tally['count'] > 0:
        user_avg_ratings[uid] = tally['total_rating'] / tally['count']

# Print the first five entries as a quick sanity check.
for user_id in list(user_avg_ratings)[:5]:
    print(f"User ID: {user_id}, Average Rating: {user_avg_ratings[user_id]:.2f}")

User ID: 113965417079576625433, Average Rating: 4.91
User ID: 116655819137293331166, Average Rating: 4.73
User ID: 100834119994550070853, Average Rating: 4.94
User ID: 103207214144482097315, Average Rating: 4.64
User ID: 108526171163172578599, Average Rating: 3.48


In [9]:
# Business image count: for each gmap_id, the summed length of the 'pics'
# field over all of its reviews. Reviews with a falsy gmap_id or a
# falsy/empty 'pics' value contribute nothing.
image_counts = {}

for record in users_reviews:
    place = record.get('gmap_id')
    photos = record.get('pics')
    if not (place and photos):
        continue
    image_counts[place] = image_counts.get(place, 0) + len(photos)

In [10]:
# For each gmap_id, count the reviews whose 'resp' field is truthy.
response_counts = {}

for record in users_reviews:
    place = record.get('gmap_id')
    reply = record.get('resp')
    if not (place and reply):
        continue
    response_counts[place] = response_counts.get(place, 0) + 1

In [11]:
# Keep only reviews that carry all four fields, trimmed down to those fields.
review_fields = ('user_id', 'gmap_id', 'rating', 'text')
users_data = []
for d in users_reviews:
    if all(key in d for key in review_fields):
        users_data.append({key: d[key] for key in review_fields})

# Business metadata keyed by gmap_id; a missing value defaults to 0.
business_dict = {}
for b in businesses:
    if 'gmap_id' in b:
        business_dict[b['gmap_id']] = {
            'avg_rating': b.get('avg_rating', 0),
            'num_of_reviews': b.get('num_of_reviews', 0),
        }

# Join: a review is kept only when its business has a metadata entry.
dataset = [
    {**row, **business_dict[row['gmap_id']]}
    for row in users_data
    if row['gmap_id'] in business_dict
]

In [12]:
# Add features: attach the user's average rating and the business-level
# response / image counts (0 when the business has none recorded).
for entry in dataset:
    place = entry.get('gmap_id')
    entry.update(
        user_avg_rating=user_avg_ratings[entry.get('user_id')],
        response_count=response_counts.get(place, 0),
        image_counts=image_counts.get(place, 0),
    )
    

In [28]:
# Split dataset: first 90% of each list for training, the remainder for
# testing. The split is positional; nothing is shuffled.
dataset_cut = int(len(dataset) * 0.9)
train_data, test_data = dataset[:dataset_cut], dataset[dataset_cut:]

reviews_cut = int(len(users_reviews) * 0.9)
train_users_reviews = users_reviews[:reviews_cut]
test_users_reviews = users_reviews[reviews_cut:]

In [38]:
import nltk

In [39]:
# Download the NLTK 'stopwords' corpus that stopwords.words('english') reads below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/bensonjian/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [40]:
# Word frequencies over the training reviews.
# Text is lowercased, punctuation characters are removed, English stopwords
# are dropped; no stemming is applied.
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('english'))
wordCount = defaultdict(int)
for d in train_data:
    text = d['text']
    if not text:
        continue
    cleaned = ''.join(ch for ch in text.lower() if ch not in punctuation)
    for token in cleaned.split():
        if token not in stop_words:
            wordCount[token] += 1
len(wordCount)

311649

In [41]:
# Rank (count, word) pairs from most to least frequent and keep the
# 1000 highest-ranked words as the vocabulary.
counts = sorted(((n, w) for w, n in wordCount.items()), reverse=True)
words = [w for _, w in counts[:1000]]

In [42]:
# Display the 100 most frequent vocabulary words.
words[:100]

['great',
 'good',
 'food',
 'place',
 'service',
 'nice',
 'original',
 'google',
 'translated',
 'best',
 'friendly',
 'staff',
 'get',
 'go',
 'time',
 'like',
 'one',
 'beach',
 'beautiful',
 'delicious',
 'love',
 'always',
 'also',
 'really',
 'parking',
 'amazing',
 'awesome',
 'little',
 'people',
 'well',
 'back',
 'would',
 'prices',
 'store',
 'excellent',
 'island',
 'see',
 'clean',
 'dont',
 'got',
 'lot',
 'location',
 'even',
 'worth',
 'price',
 'eat',
 'much',
 'view',
 'local',
 'small',
 'restaurant',
 'fun',
 'coffee',
 'experience',
 'day',
 'lots',
 'pretty',
 'fresh',
 'area',
 'us',
 'recommend',
 'many',
 'everything',
 'take',
 'went',
 'better',
 'wait',
 'hawaii',
 'fish',
 'order',
 'make',
 'definitely',
 'way',
 'selection',
 'shop',
 'fast',
 'around',
 'water',
 'need',
 'come',
 'super',
 'right',
 'helpful',
 'chicken',
 'try',
 'park',
 'big',
 'visit',
 'views',
 'bit',
 'find',
 'menu',
 'spot',
 'want',
 'first',
 'long',
 'atmosphere',
 'drinks'

In [43]:
# Map each vocabulary word to its position in `words`, and keep a set of
# the same words.
wordID = {w: i for i, w in enumerate(words)}
wordSet = set(words)

In [57]:
def feature(datum):
    """Build the bag-of-words feature vector for one review.

    Parameters
    ----------
    datum : dict
        Record with a 'text' entry; a falsy value (None, '') yields an
        all-zero count vector.

    Returns
    -------
    list of int
        ``len(words)`` counts, one per vocabulary word at the index given
        by ``wordID``, followed by a constant 1 used as the offset term.

    Notes
    -----
    Text is lowercased, characters in ``punctuation`` are removed and
    tokens in ``stop_words`` are skipped, matching how the vocabulary was
    counted.
    """
    feat = [0] * len(words)
    text = datum['text']
    if text:
        cleaned = ''.join(c for c in text.lower() if c not in punctuation)
        for w in cleaned.split():
            if w in stop_words:
                continue
            # One O(1) dict lookup instead of scanning the `words` list for
            # every token; wordID holds exactly the vocabulary words.
            idx = wordID.get(w)
            if idx is not None:
                feat[idx] += 1
    feat.append(1)  # offset
    return feat

In [58]:
# Features and labels are taken from the SAME records so row i of X always
# pairs with element i of y. `dataset` is a filtered join of the reviews,
# so slices of `users_reviews` only line up with it if no review was
# dropped by the filters.
train_X = [feature(d) for d in train_data]
train_y = [d['rating'] for d in train_data]
test_X = [feature(d) for d in test_data]
test_y = [d['rating'] for d in test_data]

In [59]:
# Regularized (ridge) regression with alpha = 1.0. The intercept is not
# fitted separately; each feature vector ends with a constant 1 instead.
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False)
clf.fit(train_X, train_y)
theta = clf.coef_
train_predictions, test_predictions = clf.predict(train_X), clf.predict(test_X)

In [60]:
# Mean squared error on the training and held-out splits.
train_mse, test_mse = MSE(train_predictions, train_y), MSE(test_predictions, test_y)
print(f'train mse: {train_mse}', f'test mse: {test_mse}', sep='\n')

train mse: 0.7099192239329473
test mse: 0.6224833976559787


In [61]:
# Pair each vocabulary word with its coefficient and order the pairs from
# lowest to highest weight. zip stops at the end of `words`, so any
# coefficient beyond the vocabulary is not included.
wordSort = sorted(zip(theta, words))

In [62]:
# Display the 100 (coefficient, word) pairs with the lowest coefficients.
wordSort[:100]

[(-1.327276586668271, 'worst'),
 (-1.1900794552149985, 'horrible'),
 (-1.1091567561356912, 'terrible'),
 (-1.105245298123043, 'rude'),
 (-1.084665176556934, 'poor'),
 (-0.9498497118404947, 'overpriced'),
 (-0.838752728986792, 'dirty'),
 (-0.7252086338199372, 'homeless'),
 (-0.7175400282410713, 'slow'),
 (-0.5421720821710071, 'ok'),
 (-0.5116224837929986, 'sad'),
 (-0.49177798928451977, 'expensive'),
 (-0.47869827880830723, 'average'),
 (-0.46923581532118736, 'okay'),
 (-0.46166266619972285, 'closed'),
 (-0.40780496865091137, 'priced'),
 (-0.40114313289268966, 'nothing'),
 (-0.4004991815248283, 'dry'),
 (-0.38043864858591114, 'bad'),
 (-0.37642970981428686, 'sorry'),
 (-0.36285863860615264, 'waited'),
 (-0.36106212436378265, 'salty'),
 (-0.3452652029334908, 'disappointed'),
 (-0.3094833250172318, 'empty'),
 (-0.3064028190804877, 'needs'),
 (-0.291613708838876, 'money'),
 (-0.29041736393532014, 'tourist'),
 (-0.28953233755898594, 'pricey'),
 (-0.28097528016852086, 'loud'),
 (-0.277822811