In [63]:
import gzip
from collections import defaultdict
import scipy.optimize
import numpy as np
import string
import random
import json
from nltk.stem import PorterStemmer
from sklearn import linear_model
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [64]:
# Download the NLTK 'stopwords' corpus (nothing is re-fetched when it is already up to date).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhengz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [65]:
def parse(path):
  """Yield one decoded JSON object per non-blank line of a gzip-compressed file.

  Args:
    path: Location of a gzip file holding one JSON document per line.

  Yields:
    The object produced by ``json.loads`` for each non-blank line, in file order.

  Raises:
    OSError: If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager closes the handle as soon as the generator is
  # exhausted or closed, so the file is not left open until garbage collection.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # Tolerate blank lines (e.g. a trailing newline) instead of failing on them.
      if not l.strip():
        continue
      yield json.loads(l)

In [66]:
# Read both gzip-compressed files fully into memory, one parsed record per line.
users_reviews = list(parse("review-Hawaii_10.json.gz"))
businesses = list(parse("meta-Hawaii.json.gz"))

## Linear Regression

### Feature Engineering

In [67]:
# Average rating per user.
# user_ratings maps user_id -> {'total_rating': sum of ratings, 'count': number of ratings}.
user_ratings = {}

for entry in users_reviews:
    uid = entry.get('user_id')
    stars = entry.get('rating')

    # Reviews lacking either a user id or a rating do not contribute.
    if uid is None or stars is None:
        continue

    totals = user_ratings.setdefault(uid, {'total_rating': 0, 'count': 0})
    totals['total_rating'] += stars
    totals['count'] += 1

# Mean rating per user; the count guard keeps the division safe.
user_avg_ratings = {}
for uid, totals in user_ratings.items():
    if totals['count'] > 0:
        user_avg_ratings[uid] = totals['total_rating'] / totals['count']

# Print the first five users as a quick look at the result.
for user_id in list(user_avg_ratings)[:5]:
    print(f"User ID: {user_id}, Average Rating: {user_avg_ratings[user_id]:.2f}")

User ID: 113965417079576625433, Average Rating: 4.91
User ID: 116655819137293331166, Average Rating: 4.73
User ID: 100834119994550070853, Average Rating: 4.94
User ID: 103207214144482097315, Average Rating: 4.64
User ID: 108526171163172578599, Average Rating: 3.48


In [68]:
# Business image count: for each gmap_id, the summed length of the 'pics'
# lists found on its reviews.
image_counts = {}

for entry in users_reviews:
    place_id = entry.get('gmap_id')
    photos = entry.get('pics')

    # Skip reviews without a business id or without any pictures.
    if not (place_id and photos):
        continue

    image_counts[place_id] = image_counts.get(place_id, 0) + len(photos)

In [69]:
# For each gmap_id, the number of reviews that carry a non-empty 'resp' field.
response_counts = {}

for entry in users_reviews:
    place_id = entry.get('gmap_id')
    reply = entry.get('resp')

    if place_id and reply:
        response_counts[place_id] = response_counts.get(place_id, 0) + 1

In [70]:
# Slim copy of every review that has all four fields below (in this key order).
_review_fields = ('user_id', 'gmap_id', 'rating', 'text')
users_data = [
    {field: review[field] for field in _review_fields}
    for review in users_reviews
    if all(field in review for field in _review_fields)
]

# Business lookup keyed by gmap_id; missing statistics default to 0.
# When a gmap_id appears more than once, the last record wins.
business_dict = {}
for business in businesses:
    if 'gmap_id' in business:
        business_dict[business['gmap_id']] = {
            'avg_rating': business.get('avg_rating', 0),
            'num_of_reviews': business.get('num_of_reviews', 0),
        }

### Bag of Words

#### Sentiment analysis

In [71]:
from textblob import TextBlob

def get_sentiment_score(text):
    """Return the TextBlob sentiment polarity of ``text``.

    A missing review (``text is None``) is scored as 0, i.e. neutral.
    """
    return 0 if text is None else TextBlob(text).sentiment.polarity


In [77]:
# Select the usable reviews once so that features and labels are built from
# exactly the same rows, in the same order.
_required_keys = ('user_id', 'gmap_id', 'rating', 'text')
_usable_reviews = [r for r in users_reviews if all(k in r for k in _required_keys)]

# NOTE(review): user_avg_ratings is computed over every review, so the
# 'user_avg_rating' feature includes the rating being predicted (and the rows
# later held out for testing) — confirm this leakage is acceptable.
features = [
    {
        'user_id': r['user_id'],
        'gmap_id': r['gmap_id'],
        'user_avg_rating': user_avg_ratings[r['user_id']],
        'response_count': response_counts.get(r['gmap_id'], 0),
        'image_counts': image_counts.get(r['gmap_id'], 0),
        'bus_avg_rating': business_dict[r['gmap_id']]['avg_rating'],
        'num_of_reviews': business_dict[r['gmap_id']]['num_of_reviews'],
        'sentiment_score': get_sentiment_score(r['text']),  # TextBlob polarity of the review text
    }
    for r in _usable_reviews
]

# Regression target: the star rating of each usable review.
labels = [r['rating'] for r in _usable_reviews]

In [78]:
# Shuffle features and labels together under a fixed seed so pairs stay aligned.
# NOTE: both lists are overwritten in place, so running this cell a second time
# shuffles the already-shuffled data and produces a different order.
random.seed(12345)
combined = [(feature_row, label) for feature_row, label in zip(features, labels)]
random.shuffle(combined)
shuffled_features, shuffled_labels = zip(*combined)
features[:] = shuffled_features
labels[:] = shuffled_labels

In [79]:
# 90% / 10% train/test split over the shuffled rows.
split_index = int(len(features) * 0.9)
train_features, test_features = features[:split_index], features[split_index:]
train_labels, test_labels = labels[:split_index], labels[split_index:]

In [80]:
# Display the first training record to check the layout of a feature dictionary.
train_features[0]

{'user_id': '118019878707665188743',
 'gmap_id': '0x7eaad5b76010bf61:0xd3730662460d3b4f',
 'user_avg_rating': 5.0,
 'response_count': 3,
 'image_counts': 24,
 'bus_avg_rating': 4.2,
 'num_of_reviews': 326,
 'sentiment_score': 0.67}

## Random Forest model

In [None]:
# Columns fed to the random forest; a constant 1 is prepended to every row.
rf_columns = ('user_avg_rating', 'bus_avg_rating', 'num_of_reviews', 'sentiment_score')

train_X = [[1] + [row[col] for col in rf_columns] for row in train_features]
test_X = [[1] + [row[col] for col in rf_columns] for row in test_features]
train_y, test_y = train_labels, test_labels

# Fit on the training rows.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(train_X, train_y)

# Score the held-out rows with mean squared error.
predictions = rf.predict(test_X)
mse = mean_squared_error(test_y, predictions)
print(f'MSE: {mse}')

## Gradient Boosting Regressor

In [82]:
# Design matrix for the boosted model: a constant 1 followed by these columns.
boost_columns = ('user_avg_rating', 'bus_avg_rating', 'num_of_reviews',
                 'image_counts', 'sentiment_score')

train_X = [[1] + [row[col] for col in boost_columns] for row in train_features]
test_X = [[1] + [row[col] for col in boost_columns] for row in test_features]

train_y, test_y = train_labels, test_labels

In [83]:
# NOTE(review): learning_rate=1 is much larger than the value used for the
# XGBoost model in this notebook (0.1) — confirm it is intentional.
gb_params = {'n_estimators': 100, 'learning_rate': 1, 'random_state': 42}
gb_regressor = GradientBoostingRegressor(**gb_params)
gb_regressor.fit(train_X, train_y)

# Mean squared error on the held-out rows.
predictions = gb_regressor.predict(test_X)
mse = mean_squared_error(test_y, predictions)
print(f'MSE: {mse}')


MSE: 0.4989093267226508


## XGBoost

In [85]:
def _to_rows(records):
    """Convert feature dicts into numeric rows: a constant 1 followed by the five model columns."""
    return [
        [1, rec['user_avg_rating'], rec['bus_avg_rating'], rec['num_of_reviews'],
         rec['image_counts'], rec['sentiment_score']]
        for rec in records
    ]


train_X = _to_rows(train_features)
test_X = _to_rows(test_features)

train_y, test_y = train_labels, test_labels

In [87]:
# XGBoost regressor with fixed hyperparameters, scored by test-set MSE.
xgb_params = {'n_estimators': 100, 'learning_rate': 0.1, 'max_depth': 8, 'random_state': 42}
xg_model = xgb.XGBRegressor(**xgb_params)
xg_model.fit(train_X, train_y)

predictions = xg_model.predict(test_X)
mse = mean_squared_error(test_y, predictions)
print(f'MSE: {mse}')

MSE: 0.49861906865976696


In [None]:
# Base estimator handed to the grid search.
xg_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=7, random_state=42)

# Hyperparameter grid: only colsample_bytree varies; the other entries are
# pinned to a single value.
parameters = {
    'n_estimators': [100],
    'learning_rate': [0.1],
    'max_depth': [7],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8],
}

# 3-fold cross-validation, selecting the candidate with the lowest MSE.
grid_search = GridSearchCV(estimator=xg_model, param_grid=parameters, cv=3, scoring='neg_mean_squared_error')
grid_search.fit(train_X, train_y)

# Best parameters
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# GridSearchCV refits the winning configuration on the full training set
# (refit=True is the default), so reuse that fitted model instead of
# training the same model a second time.
xg_model = grid_search.best_estimator_

# Predictions on the test set
predictions = xg_model.predict(test_X)

mse = mean_squared_error(test_y, predictions)
print(f'MSE: {mse}')

# Feature importance, one value per column of train_X
importance = xg_model.feature_importances_
print("Feature importances:", importance)


Best parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}
MSE: 0.5673280302040854
Feature importances: [0.         0.7582232  0.21936944 0.00863487 0.01377245]
