In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
import json
from nltk.stem import PorterStemmer
from sklearn import linear_model
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb

In [2]:
def parse(path):
  """Lazily yield one decoded JSON object per line of a gzipped JSON-lines file.

  Args:
    path: Filesystem path of the gzip-compressed file to read.

  Yields:
    The value produced by ``json.loads`` for each non-blank line, in file order.

  Raises:
    OSError: If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager closes the gzip handle when the generator is exhausted
  # or discarded; the previous version left the file open.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # Skip whitespace-only lines (e.g. a trailing newline) instead of crashing.
      if not l.strip():
        continue
      yield json.loads(l)

In [3]:
# Materialize both generators so the records can be iterated several times below.
users_reviews = list(parse("review-Hawaii_10.json.gz"))
businesses = list(parse("meta-Hawaii.json.gz"))

### Feature Engineering

In [4]:
# user average rating
# user_ratings keeps a running sum and count of ratings for every user.
user_ratings = {}

for review in users_reviews:
    reviewer, stars = review.get('user_id'), review.get('rating')
    if reviewer is None or stars is None:
        continue  # a record without a user or a rating cannot contribute
    totals = user_ratings.setdefault(reviewer, {'total_rating': 0, 'count': 0})
    totals['total_rating'] += stars
    totals['count'] += 1

# Turn the running totals into a per-user mean.
user_avg_ratings = {}
for reviewer, totals in user_ratings.items():
    if totals['count'] > 0:
        user_avg_ratings[reviewer] = totals['total_rating'] / totals['count']

# Print the first five users as a quick sanity check.
for user_id in list(user_avg_ratings)[:5]:
    print(f"User ID: {user_id}, Average Rating: {user_avg_ratings[user_id]:.2f}")

User ID: 113965417079576625433, Average Rating: 4.91
User ID: 116655819137293331166, Average Rating: 4.73
User ID: 100834119994550070853, Average Rating: 4.94
User ID: 103207214144482097315, Average Rating: 4.64
User ID: 108526171163172578599, Average Rating: 3.48


In [5]:
# business image count: total number of review pictures per gmap_id
image_counts = {}

for review in users_reviews:
    gmap_id = review.get('gmap_id')
    images = review.get('pics')

    # Skip reviews with no business id or with no (or an empty) picture list.
    if not (gmap_id and images):
        continue
    image_counts[gmap_id] = image_counts.get(gmap_id, 0) + len(images)

In [6]:
# Number of reviews per gmap_id that carry a truthy 'resp' field.
response_counts = {}

for review in users_reviews:
    gmap_id = review.get('gmap_id')
    response = review.get('resp')

    # Only count reviews that have both a business id and a response.
    if not (gmap_id and response):
        continue
    response_counts[gmap_id] = response_counts.get(gmap_id, 0) + 1

In [7]:
# Slim copy of every review that has all four fields used downstream.
users_data = []
for d in users_reviews:
    if all(key in d for key in ('user_id', 'gmap_id', 'rating', 'text')):
        users_data.append({'user_id': d['user_id'], 'gmap_id': d['gmap_id'],
                           'rating': d['rating'], 'text': d['text']})

# gmap_id -> business-level stats; a missing stat defaults to 0.
business_dict = {}
for d in businesses:
    if 'gmap_id' not in d:
        continue
    business_dict[d['gmap_id']] = {'avg_rating': d.get('avg_rating', 0),
                                   'num_of_reviews': d.get('num_of_reviews', 0)}

In [8]:
# Reviews that carry every key needed to build a feature row and a label.
labelled_reviews = [
    d for d in users_reviews
    if all(key in d for key in ('user_id', 'gmap_id', 'rating', 'text'))
]

# One feature dict per labelled review, joining user-level and business-level stats.
# NOTE(review): user_avg_rating is computed over all reviews, including the one
# being predicted and those that later land in the test split — confirm intended.
features = [
    {
        'user_id': d['user_id'],
        'gmap_id': d['gmap_id'],
        'text': d['text'],
        'user_avg_rating': user_avg_ratings[d['user_id']],
        'response_count': response_counts.get(d['gmap_id'], 0),
        'image_counts': image_counts.get(d['gmap_id'], 0),
        'bus_avg_rating': business_dict[d['gmap_id']]['avg_rating'],
        'num_of_reviews': business_dict[d['gmap_id']]['num_of_reviews'],
    }
    for d in labelled_reviews
]
# Target values, aligned index-for-index with `features`.
labels = [d['rating'] for d in labelled_reviews]
    

In [9]:
# Shuffle features and labels together (fixed seed) so the pairs stay aligned.
random.seed(12345)
combined = [pair for pair in zip(features, labels)]
random.shuffle(combined)
shuffled_features, shuffled_labels = zip(*combined)
# Slice-assign so the existing list objects are updated in place.
features[:] = shuffled_features
labels[:] = shuffled_labels

In [10]:
# Display the first record after shuffling to eyeball the feature dict's keys and values.
features[0]

{'user_id': '118019878707665188743',
 'gmap_id': '0x7eaad5b76010bf61:0xd3730662460d3b4f',
 'text': 'Absolutely Beautiful! Stunning one of a kind views and Amazing eats! Enjoy!',
 'user_avg_rating': 5.0,
 'response_count': 3,
 'image_counts': 24,
 'bus_avg_rating': 4.2,
 'num_of_reviews': 326}

In [11]:
def MSE(predictions, labels):
    """Return the mean squared error between paired predictions and labels.

    Pairs are formed with ``zip``, so any extra items in the longer input are
    ignored. Raises ``ZeroDivisionError`` when there are no pairs.
    """
    squared_errors = []
    for predicted, actual in zip(predictions, labels):
        squared_errors.append((predicted - actual) ** 2)
    return sum(squared_errors) / len(squared_errors)

In [12]:
def MAE(predictions, labels):
    """Return the mean absolute error between paired predictions and labels.

    Pairs are formed with ``zip``, so any extra items in the longer input are
    ignored. Raises ``ZeroDivisionError`` when there are no pairs.
    """
    absolute_errors = []
    for predicted, actual in zip(predictions, labels):
        absolute_errors.append(abs(predicted - actual))
    return sum(absolute_errors) / len(absolute_errors)

In [13]:
# split dataset: first 90% of the shuffled rows for training, the rest for testing
split_index = int(len(features) * 0.9)
train_features, test_features = features[:split_index], features[split_index:]
train_labels, test_labels = labels[:split_index], labels[split_index:]

### Bag of Words

In [14]:
# lowercase, punctuation removed, stop words dropped, no stemming
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stop_words = set(stopwords.words('english'))

# Count token frequencies over the training reviews only.
for d in train_features:
    text = d['text']
    if not text:
        continue  # missing or empty review text
    cleaned = ''.join(ch for ch in text.lower() if ch not in punctuation)
    for token in cleaned.split():
        if token not in stop_words:
            wordCount[token] += 1

len(wordCount)

314214

In [15]:
# Rank (count, word) pairs from most to least frequent; ties fall back to the word itself.
counts = sorted(((wordCount[w], w) for w in wordCount), reverse=True)
# Vocabulary: the 1000 most frequent words.
words = [word for _, word in counts[:1000]]

#### Sentiment analysis

In [16]:
# word -> column index in the bag-of-words vector
wordID = {word: index for index, word in enumerate(words)}
# Set view of the vocabulary.
wordSet = set(words)

In [17]:
def feature(datum):
    """Build the bag-of-words count vector for one review.

    Relies on the module-level ``words`` (vocabulary list), ``wordID``
    (word -> column index), ``punctuation`` and ``stop_words``.

    Args:
        datum: Mapping with a ``'text'`` entry; a falsy value (``None`` or
            ``''``) yields an all-zero count vector.

    Returns:
        A list of ``len(words) + 1`` numbers: one count per vocabulary word,
        followed by a constant ``1`` offset term.
    """
    feat = [0] * len(words)
    text = datum['text']
    if text:
        r = ''.join(c for c in text.lower() if c not in punctuation)
        for w in r.split():
            if w in stop_words:
                continue
            # O(1) dict lookup; the previous `w in words` scanned the whole
            # vocabulary list for every token.
            idx = wordID.get(w)
            if idx is not None:
                feat[idx] += 1
    feat.append(1)  # offset
    return feat

In [18]:
# Bag-of-words design matrices and targets for the ridge model.
train_X = list(map(feature, train_features))
test_X = list(map(feature, test_features))
train_y, test_y = train_labels, test_labels

In [19]:
# regularized regression
# The intercept is disabled because feature() already appends a constant 1 column.
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False)
clf.fit(train_X, train_y)
# One coefficient per vocabulary word, plus a final one for the offset column.
theta = clf.coef_
train_predictions, test_predictions = clf.predict(train_X), clf.predict(test_X)

In [20]:
# Mean squared error of the ridge model on both splits.
train_mse, test_mse = MSE(train_predictions, train_y), MSE(test_predictions, test_y)
print(f'train mse: {train_mse}', f'test mse: {test_mse}', sep='\n')

train mse: 0.700975159049973
test mse: 0.702234452610242


In [21]:
# Mean absolute error of the ridge model on both splits.
train_mae, test_mae = MAE(train_predictions, train_y), MAE(test_predictions, test_y)
print(f'train mae: {train_mae}', f'test mae: {test_mae}', sep='\n')

train mae: 0.6593291216053075
test mae: 0.6587721013810098


In [40]:
# (coefficient, word) pairs in ascending coefficient order; theta[:-1] drops the offset term.
wordSort = sorted(zip(theta[:-1], words))

In [43]:
# word -> ridge coefficient (the final offset coefficient is excluded).
sentimentDict = {word: weight for word, weight in zip(words, theta[:-1])}

In [44]:
# Spot-check the coefficient stored for one word (KeyError if 'great' is not in the vocabulary).
sentimentDict['great']

0.2248446743831719

In [45]:
def sentiment(datum):
    """Sum the per-word sentiment weights of a review's text.

    The text is lowercased and stripped of ``punctuation``; each remaining
    whitespace-separated token contributes ``sentimentDict[token]`` when it is
    present and 0 otherwise. Returns 0 when ``datum['text']`` is falsy.
    """
    text = datum['text']
    if not text:
        return 0

    cleaned = ''.join(ch for ch in text.lower() if ch not in punctuation)
    sentimentScore = 0
    for token in cleaned.split():
        sentimentScore += sentimentDict.get(token, 0)
    return sentimentScore
    

## GradientBoostingRegressor

In [46]:
# Row layout: [offset, user_avg_rating, bus_avg_rating, num_of_reviews, sentiment score]
numeric_keys = ('user_avg_rating', 'bus_avg_rating', 'num_of_reviews')
train_X = [[1, *(d[k] for k in numeric_keys), sentiment(d)] for d in train_features]
test_X = [[1, *(d[k] for k in numeric_keys), sentiment(d)] for d in test_features]

train_y, test_y = train_labels, test_labels

In [47]:
# Gradient boosting on the numeric + sentiment features; scored by test-set MSE.
gb_params = {'n_estimators': 100, 'learning_rate': 1, 'random_state': 42}
gb_regressor = GradientBoostingRegressor(**gb_params)
gb_regressor.fit(train_X, train_y)

predictions = gb_regressor.predict(test_X)
mse = mean_squared_error(y_true=test_y, y_pred=predictions)
print(f'MSE: {mse}')


MSE: 0.46866556114440194


In [48]:
# Same rows as before with the business image count added ahead of the sentiment score.
numeric_keys = ('user_avg_rating', 'bus_avg_rating', 'num_of_reviews', 'image_counts')
train_X = [[1, *(d[k] for k in numeric_keys), sentiment(d)] for d in train_features]
test_X = [[1, *(d[k] for k in numeric_keys), sentiment(d)] for d in test_features]

train_y, test_y = train_labels, test_labels

In [49]:
# Refit gradient boosting with the same hyperparameters on the current train_X.
gb_regressor = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=1,
    random_state=42,
)
gb_regressor.fit(train_X, train_y)

# Score on the held-out split.
predictions = gb_regressor.predict(test_X)
mse = mean_squared_error(y_true=test_y, y_pred=predictions)
print(f'MSE: {mse}')


MSE: 0.4685015124463457


## XGBoost

In [50]:
# Row layout: [offset, user_avg_rating, bus_avg_rating, num_of_reviews, image_counts, sentiment score]
numeric_keys = ('user_avg_rating', 'bus_avg_rating', 'num_of_reviews', 'image_counts')
train_X = [[1, *(d[k] for k in numeric_keys), sentiment(d)] for d in train_features]
test_X = [[1, *(d[k] for k in numeric_keys), sentiment(d)] for d in test_features]

train_y, test_y = train_labels, test_labels

In [51]:
# XGBoost regressor on the current train_X; scored by test-set MSE.
xg_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=7,
    random_state=42,
)
xg_model.fit(train_X, train_y)
predictions = xg_model.predict(test_X)

mse = mean_squared_error(y_true=test_y, y_pred=predictions)
print(f'MSE: {mse}')

MSE: 0.4659466604257726
