In [57]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
import numpy as np
import string
import random
import json
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from textblob import TextBlob
import nltk


In [58]:
# Fetch the 'vader_lexicon' resource through NLTK's downloader.
# NOTE(review): presumably needed by the SentimentIntensityAnalyzer created later — confirm.
nltk.download('vader_lexicon')


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/zhengz/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [59]:
def parse(path):
  """Yield one decoded JSON object per non-blank line of a gzip-compressed file.

  Args:
    path: Location of a gzip file holding one JSON document per line.

  Yields:
    The value produced by json.loads for each non-blank line, in file order.

  Raises:
    OSError: If the file cannot be opened or read as gzip data.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager guarantees the handle is closed once the generator is
  # exhausted or discarded, instead of leaking an open file per call.
  with gzip.open(path, 'rt', encoding='utf-8') as g:
    for l in g:
      # Skip blank lines (e.g. a trailing newline) rather than failing on them.
      if l.strip():
        yield json.loads(l)

In [60]:
# Materialise both gzip/JSON-lines inputs fully in memory: reviews first, then business metadata.
users_reviews = list(parse("review-Hawaii_10.json.gz"))
businesses = list(parse("meta-Hawaii.json.gz"))

### Feature Engineering

In [61]:
# Per-user mean rating, accumulated over every entry in users_reviews.
user_ratings = {}

for entry in users_reviews:
    uid = entry.get('user_id')
    stars = entry.get('rating')

    # Reviews without a user id or a rating contribute nothing.
    if uid is None or stars is None:
        continue

    tally = user_ratings.setdefault(uid, {'total_rating': 0, 'count': 0})
    tally['total_rating'] += stars
    tally['count'] += 1

# Turn the running totals into averages (users with a zero count are excluded).
user_avg_ratings = {}
for uid, tally in user_ratings.items():
    if tally['count'] > 0:
        user_avg_ratings[uid] = tally['total_rating'] / tally['count']

# Show the first five users as a sanity check.
for uid in list(user_avg_ratings)[:5]:
    print(f"User ID: {uid}, Average Rating: {user_avg_ratings[uid]:.2f}")

User ID: 113965417079576625433, Average Rating: 4.91
User ID: 116655819137293331166, Average Rating: 4.73
User ID: 100834119994550070853, Average Rating: 4.94
User ID: 103207214144482097315, Average Rating: 4.64
User ID: 108526171163172578599, Average Rating: 3.48


In [62]:
# Per-business picture count: sum of len(review['pics']) over that gmap_id's reviews.
image_counts = {}

for entry in users_reviews:
    place_id = entry.get('gmap_id')
    pictures = entry.get('pics')

    # Ignore reviews lacking a business id or carrying no pictures.
    if not (place_id and pictures):
        continue

    image_counts[place_id] = image_counts.get(place_id, 0) + len(pictures)

In [63]:
# Per-business count of reviews that carry a truthy 'resp' field.
response_counts = {}

for entry in users_reviews:
    place_id = entry.get('gmap_id')
    reply = entry.get('resp')

    # Only reviews with both a business id and a response are counted.
    if not (place_id and reply):
        continue

    response_counts[place_id] = response_counts.get(place_id, 0) + 1

In [71]:
# Keys a review must contain to be kept; also the fields copied into users_data.
required_keys = ('user_id', 'gmap_id', 'rating', 'text')

users_data = []
for d in users_reviews:
    if all(key in d for key in required_keys):
        users_data.append({key: d[key] for key in required_keys})

# Business lookup keyed by gmap_id; missing metadata fields default to 0.
business_dict = {}
for biz in businesses:
    if 'gmap_id' not in biz:
        continue
    business_dict[biz['gmap_id']] = {
        'avg_rating': biz.get('avg_rating', 0),
        'num_of_reviews': biz.get('num_of_reviews', 0),
    }

#### Sentiment analysis

In [65]:
from nltk.sentiment import SentimentIntensityAnalyzer
# Shared analyzer instance; get_sentiment_score reads it through the module-level name `sid`.
# NOTE(review): presumably relies on the 'vader_lexicon' resource downloaded earlier — confirm.
sid = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    """Return the 'compound' entry of sid.polarity_scores(text), or 0 when text is None."""
    if text is not None:
        return sid.polarity_scores(text)['compound']
    # Missing review text is treated as neutral.
    return 0

In [66]:
from textblob import TextBlob

def get_textblob_score(text):
    """Return TextBlob(text).sentiment.polarity, or 0 when text is None."""
    if text is not None:
        return TextBlob(text).sentiment.polarity
    # Missing review text is treated as neutral.
    return 0

In [105]:
# Build one feature dict and one label per review that has all four required keys.
# Both lists are filled in the same pass so they stay index-aligned.
#
# NOTE(review): user_avg_ratings is computed over every review in users_reviews,
# including the review being predicted and rows that later land in the test
# split, so 'user_avg_rating' leaks the target — confirm this is intended.
# NOTE(review): the direct lookups below raise KeyError if a gmap_id is absent
# from business_dict or a user_id is absent from user_avg_ratings.
# NOTE(review): 'response_count' is stored here but not included in the model
# matrices built further down.
features = []
labels = []

for d in users_reviews:
    if not ('user_id' in d and 'gmap_id' in d and 'rating' in d and 'text' in d):
        continue

    uid = d['user_id']
    gid = d['gmap_id']

    features.append({
        'user_id': uid,
        'gmap_id': gid,
        'user_avg_rating': user_avg_ratings[uid],
        'response_count': response_counts.get(gid, 0),
        'image_counts': image_counts.get(gid, 0),
        'bus_avg_rating': business_dict[gid]['avg_rating'],
        'num_of_reviews': business_dict[gid]['num_of_reviews'],
        'sentiment_score': get_sentiment_score(d['text']),
    })
    labels.append(d['rating'])

In [106]:
# Display the first feature dict to spot-check the engineered fields.
features[0]

{'user_id': '113965417079576625433',
 'gmap_id': '0x7c00159b5b1b1d25:0x8d2d85d4a758290e',
 'user_avg_rating': 4.909090909090909,
 'response_count': 0,
 'image_counts': 0,
 'bus_avg_rating': 4.1,
 'num_of_reviews': 18,
 'sentiment_score': 0.6249}

In [107]:
# Seeded shuffle that keeps each feature dict paired with its label.
# The lists are overwritten in place, so re-running this cell reshuffles the
# already-shuffled data and yields a different order than a fresh run.
random.seed(12345)
combined = list(zip(features, labels))
random.shuffle(combined)

shuffled_features, shuffled_labels = zip(*combined)
features[:] = shuffled_features
labels[:] = shuffled_labels

In [108]:
# Hold out the last 10% of the shuffled rows for testing.
split_index = int(0.9 * len(features))

train_features, test_features = features[:split_index], features[split_index:]
train_labels, test_labels = labels[:split_index], labels[split_index:]

### GradientBoostingRegressor

In [109]:
# Feature dict fields used as model inputs, in column order; each row is
# prefixed with a constant 1.
feature_columns = ('user_avg_rating', 'bus_avg_rating', 'num_of_reviews',
                   'image_counts', 'sentiment_score')

train_X = [[1] + [row[col] for col in feature_columns] for row in train_features]
test_X = [[1] + [row[col] for col in feature_columns] for row in test_features]

train_y, test_y = train_labels, test_labels

In [110]:
# Fit a gradient-boosted regressor on the training matrix and report test MSE.
# NOTE(review): learning_rate=1 looks unusually high for boosting — confirm it is intended.
gb_params = {'n_estimators': 100, 'learning_rate': 1, 'random_state': 42}
gb_regressor = GradientBoostingRegressor(**gb_params)
gb_regressor.fit(train_X, train_y)

predictions = gb_regressor.predict(test_X)
mse = mean_squared_error(test_y, predictions)

print(f'MSE: {mse}')


MSE: 0.492730813318984


### XGBoost

In [111]:
# Rebuild the design matrices for this section: a constant 1 followed by the
# five numeric feature fields, in the same column order for train and test.
train_X, test_X = [], []
for rows, matrix in ((train_features, train_X), (test_features, test_X)):
    for d in rows:
        matrix.append([
            1,
            d['user_avg_rating'],
            d['bus_avg_rating'],
            d['num_of_reviews'],
            d['image_counts'],
            d['sentiment_score'],
        ])

train_y, test_y = train_labels, test_labels

In [112]:
# Fit an XGBoost regressor, then compare MSE on the training and held-out rows.
xg_params = {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 8, 'random_state': 42}
xg_model = xgb.XGBRegressor(**xg_params)
xg_model.fit(train_X, train_y)

# Score the test rows first, then the training rows.
test_predictions = xg_model.predict(test_X)
train_predictions = xg_model.predict(train_X)

train_mse = mean_squared_error(train_y, train_predictions)
test_mse = mean_squared_error(test_y, test_predictions)

print(f'Train MSE: {train_mse}')
print(f'Test MSE: {test_mse}')

Train MSE: 0.48445039205570006
Test MSE: 0.4930644673293598


In [113]:
# Draw five test rows at random and print predicted vs. actual rating.
random_indices = random.sample(range(len(test_X)), 5)

sample_test_X = []
sample_test_y = []
for idx in random_indices:
    sample_test_X.append(test_X[idx])
    sample_test_y.append(test_y[idx])

sample_predictions = xg_model.predict(sample_test_X)

# NOTE(review): the number printed after "User" is the row's position in
# test_X, not a user id.
for idx, prediction, actual in zip(random_indices, sample_predictions, sample_test_y):
    print(f"User {idx} - Predicted Rating: {prediction:.2f}, Actual Rating: {actual}")



User 88290 - Predicted Rating: 4.30, Actual Rating: 3
User 119566 - Predicted Rating: 2.91, Actual Rating: 4
User 59088 - Predicted Rating: 3.62, Actual Rating: 5
User 109021 - Predicted Rating: 3.55, Actual Rating: 5
User 44880 - Predicted Rating: 4.54, Actual Rating: 5
