In [1]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy as np
import string
import random
import string
import json
from nltk.stem import PorterStemmer
from sklearn import linear_model

In [2]:
def parse(path):
  g = gzip.open(path, 'r')
  for l in g:
    yield json.loads(l)

In [3]:
users_reviews = [l for l in parse("review-Hawaii_10.json.gz")]
businesses = [l for l in parse("meta-Hawaii.json.gz")]

In [44]:
unique_categories = set()

for business in businesses:
    categories = business.get('category')    
    if categories:
        unique_categories.update(categories)

unique_categories_list = list(unique_categories)

print(unique_categories_list)


['Party equipment rental service', 'Tea exporter', 'Video production service', 'Business park', 'Country club', 'Screen printing supply store', 'Computer security service', 'Massage therapist', 'Electric vehicle charging station', 'Scout home', 'Grocery store', 'Insurance agency', 'Cattle farm', 'Meeting planning service', 'Electronics store', 'Garage door supplier', 'Transmission shop', 'Airbrushing service', 'Indian restaurant', 'Dessert restaurant', 'Local history museum', 'Aromatherapy service', 'Office services', 'Hostel', 'Alternative fuel station', 'Flea market', 'Make-up artist', 'Tunnel', 'Garden furniture shop', 'Glassware wholesaler', 'Alcoholism treatment program', 'Ceramics wholesaler', 'Heating equipment supplier', 'Literacy program', 'Psychic', 'Breakfast restaurant', 'Swimming instructor', 'Beautician', 'Office supply store', 'Herbalist', 'Meditation center', 'Parasailing ride service', 'House cleaning service', 'Pet store', 'Motor vehicle dealer', 'Nut store', 'Cattery

### Feature Engineering

In [4]:
#user avg rating
user_ratings = {}

for review in users_reviews:
    user_id = review.get('user_id')
    rating = review.get('rating')

    if user_id is not None and rating is not None:
        if user_id not in user_ratings:
            user_ratings[user_id] = {'total_rating': 0, 'count': 0}
        
        user_ratings[user_id]['total_rating'] += rating
        user_ratings[user_id]['count'] += 1

user_avg_ratings = {user_id: data['total_rating'] / data['count'] 
                    for user_id, data in user_ratings.items() if data['count'] > 0}

for user_id in list(user_avg_ratings.keys())[:5]:
    print(f"User ID: {user_id}, Average Rating: {user_avg_ratings[user_id]:.2f}")

User ID: 113965417079576625433, Average Rating: 4.91
User ID: 116655819137293331166, Average Rating: 4.73
User ID: 100834119994550070853, Average Rating: 4.94
User ID: 103207214144482097315, Average Rating: 4.64
User ID: 108526171163172578599, Average Rating: 3.48


In [5]:
#businuess img count
image_counts = {}

for review in users_reviews:
    gmap_id = review.get('gmap_id')
    images = review.get('pics')

    if gmap_id and images:
        num_images = len(images)
        if gmap_id in image_counts:
            image_counts[gmap_id] += num_images
        else:
            image_counts[gmap_id] = num_images

In [6]:
response_counts = {}

for review in users_reviews:
    gmap_id = review.get('gmap_id')
    response = review.get('resp')

    if gmap_id and response:
        if gmap_id in response_counts:
            response_counts[gmap_id] += 1
        else:
            response_counts[gmap_id] = 1

In [7]:
users_data = [{'user_id': d['user_id'], 'gmap_id': d['gmap_id'], 'rating': d['rating'],'text': d['text']} 
              for d in users_reviews if 'user_id' in d and 'gmap_id' in d and 'rating' in d and 'text' in d]

business_dict = {d['gmap_id']: {'avg_rating': d.get('avg_rating', 0), 
                                'num_of_reviews': d.get('num_of_reviews', 0)}
                 for d in businesses if 'gmap_id' in d}

In [8]:
features = [{'user_id': d['user_id'], 'gmap_id': d['gmap_id'], 'text': d['text'], 
             'user_avg_rating': user_avg_ratings[d['user_id']], 
             'response_count': response_counts.get(d['gmap_id'], 0), 
             'image_counts': image_counts.get(d['gmap_id'], 0),
             'bus_avg_rating': business_dict[d['gmap_id']]['avg_rating'], 
             'num_of_reviews': business_dict[d['gmap_id']]['num_of_reviews']}
            for d in users_reviews if 'user_id' in d and 'gmap_id' in d and 'rating' in d and 'text' in d]
labels = [d['rating'] for d in users_reviews if 'user_id' in d and 'gmap_id' in d and 'rating' in d and 'text' in d]
    

In [9]:
random.seed(12345)
combined = list(zip(features, labels))
random.shuffle(combined)
features[:], labels[:] = zip(*combined)

## Linear Regression

In [10]:
features[0]

{'user_id': '118019878707665188743',
 'gmap_id': '0x7eaad5b76010bf61:0xd3730662460d3b4f',
 'text': 'Absolutely Beautiful! Stunning one of a kind views and Amazing eats! Enjoy!',
 'user_avg_rating': 5.0,
 'response_count': 3,
 'image_counts': 24,
 'bus_avg_rating': 4.2,
 'num_of_reviews': 326}

In [11]:
def MSE(predictions, labels):
    differences = [(x - y) ** 2 for x, y in zip(predictions, labels)]
    return sum(differences) / len(differences)

In [12]:
def MAE(predictions, labels):
    differences = [abs(x - y) for x, y in zip(predictions, labels)]
    return sum(differences) / len(differences)

In [13]:
# split dataset
split_index = int(len(features) * 0.9)
train_features = features[:split_index]
test_features = features[split_index:]
train_labels = labels[:split_index]
test_labels = labels[split_index:]

### Baseline Model 1: avg_rating

In [14]:
# extract features
train_X = [[1, d['bus_avg_rating']] for d in train_features]
test_X = [[1, d['bus_avg_rating']] for d in test_features]
train_y = train_labels
test_y = test_labels

In [15]:
# train model
model = linear_model.LinearRegression(fit_intercept=False)
model.fit(train_X, train_y)
predictions = model.predict(test_X)

In [16]:
mse = MSE(predictions, test_y)
mse

0.7849375126098747

### Baseline Model 2: user_avg_rating

In [17]:
train_X = [[1, d['user_avg_rating']] for d in train_features]
test_X = [[1, d['user_avg_rating']] for d in test_features]
train_y = train_labels
test_y = test_labels

In [18]:
# train model
model = linear_model.LinearRegression(fit_intercept=False)
model.fit(train_X, train_y)
predictions = model.predict(test_X)

In [19]:
mse = MSE(predictions, test_y)
mse

0.6205286094204722

### Model 3: avg_rating, user_avg_rating

In [20]:
train_X = [[1, d['user_avg_rating'], d['bus_avg_rating']] for d in train_features]
test_X = [[1, d['user_avg_rating'], d['bus_avg_rating']] for d in test_features]
train_y = train_labels
test_y = test_labels

In [21]:
# train model
model = linear_model.LinearRegression(fit_intercept=False)
model.fit(train_X, train_y)
predictions = model.predict(test_X)

In [22]:
mse = MSE(predictions, test_y)
mse

0.5779212048040041

### Model 4: avg_rating, user_avg_rating, num_of_reviews

In [23]:
train_X = [[1, d['user_avg_rating'], d['bus_avg_rating'], d['num_of_reviews']] for d in train_features]
test_X = [[1, d['user_avg_rating'], d['bus_avg_rating'], d['num_of_reviews']] for d in test_features]
train_y = train_labels
test_y = test_labels

In [24]:
# train model
model = linear_model.LinearRegression(fit_intercept=False)
model.fit(train_X, train_y)
predictions = model.predict(test_X)

In [25]:
mse = MSE(predictions, test_y)
mse

0.5779095125296335

### Model 5

In [26]:
train_X = [[1, d['user_avg_rating'], d['bus_avg_rating'], d['num_of_reviews'], d['image_counts']] for d in train_features]
test_X = [[1, d['user_avg_rating'], d['bus_avg_rating'], d['num_of_reviews'], d['image_counts']] for d in test_features]
train_y = train_labels
test_y = test_labels

In [27]:
# train model
model = linear_model.LinearRegression(fit_intercept=False)
model.fit(train_X, train_y)
predictions = model.predict(test_X)

In [28]:
mse = MSE(predictions, test_y)
mse

0.5779080405709329