In [28]:
import gzip
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
import string
from sklearn import linear_model
import json
import pandas as pd

In [29]:
def parse(path):
  """Lazily yield one decoded JSON object per line of a gzip-compressed file.

  Args:
    path: Location of a gzip file containing one JSON document per line.

  Yields:
    The object produced by ``json.loads`` for each line, in file order.

  Raises:
    OSError: If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: If a line is not valid JSON.
  """
  # The context manager closes the handle once the generator is exhausted,
  # closed, or garbage-collected; the bare gzip.open() used before left it open.
  with gzip.open(path, 'r') as g:
    for l in g:
      yield json.loads(l)

In [30]:
# Materialise every review record from the gzip'd JSON-lines file into a list.
users_reviews = list(parse("review-Hawaii_10.json.gz"))



In [31]:
# Show how many reviews were loaded, then display the first record as a sample.
print(
    f"users_reviews dataset length: {len(users_reviews)}",
    "users_reviews dataset example:",
    sep="\n",
)
users_reviews[0]

users_reviews dataset length: 1504347
users_reviews dataset example:


{'user_id': '113965417079576625433',
 'name': 'manuel grimaldo',
 'time': 1591839903487,
 'rating': 5,
 'text': 'Great new upgrade',
 'pics': None,
 'resp': None,
 'gmap_id': '0x7c00159b5b1b1d25:0x8d2d85d4a758290e'}

In [32]:
# Materialise every business metadata record from the gzip'd JSON-lines file.
businesses = list(parse("meta-Hawaii.json.gz"))

In [33]:
# Show how many businesses were loaded, then display the first record as a sample.
print(
    f"businesses dataset length: {len(businesses)}",
    "businesses dataset example:",
    sep="\n",
)
businesses[0]

businesses dataset length: 21507
businesses dataset example:


{'name': 'Hale Pops',
 'address': 'Hale Pops, 55-370 Kamehameha Hwy, Laie, HI 96762',
 'gmap_id': '0x7c00456eecad3111:0x8217f9600c51f33',
 'description': None,
 'latitude': 21.637795699999998,
 'longitude': -157.9207142,
 'category': ['Restaurant'],
 'avg_rating': 4.4,
 'num_of_reviews': 18,
 'price': None,
 'hours': [['Thursday', '11AM–8PM'],
  ['Friday', '11AM–8PM'],
  ['Saturday', '11AM–8PM'],
  ['Sunday', 'Closed'],
  ['Monday', '11AM–8PM'],
  ['Tuesday', '11AM–8PM'],
  ['Wednesday', '11AM–8PM']],
 'MISC': {'Service options': ['Outdoor seating', 'Takeout', 'Delivery'],
  'Popular for': ['Lunch', 'Solo dining'],
  'Accessibility': ['Wheelchair accessible entrance'],
  'Offerings': ['Comfort food', 'Quick bite'],
  'Amenities': ['Good for kids'],
  'Atmosphere': ['Casual'],
  'Crowd': ['Groups', 'Tourists'],
  'Payments': ['NFC mobile payments']},
 'state': 'Closed ⋅ Opens 11AM',
 'relative_results': ['0x7c00451360f80cf1:0x930291a38bab3132',
  '0x7c00457322571d57:0xe3974e89bbfc41ce',

In [34]:
# Map each business (gmap_id) to the list of user ids that reviewed it,
# one entry per review in file order.
business_user = defaultdict(list)
for review in users_reviews:
    reviewers = business_user[review['gmap_id']]
    reviewers.append(review['user_id'])

# Report how many distinct businesses appear in the reviews.
print(f"number of business: {len(business_user)}")

number of business: 11686


In [35]:
# Collect the gmap_id of every record in the business metadata, in file order.
business_id = [record['gmap_id'] for record in businesses]

In [36]:
# Check that every business referenced by a review also exists in the
# business metadata; print each one that does not.
# business_id is a list, so `b not in business_id` rescans it for every
# reviewed business. Build a set once so each membership test is O(1).
known_business_ids = set(business_id)
for b in business_user:
    if b not in known_business_ids:
        print(f"business {b} not in business dataset")

In [49]:
# Per-user average rating.
# user_ratings accumulates a running rating total and review count per user.
user_ratings = {}

for review in users_reviews:
    uid = review.get('user_id')
    stars = review.get('rating')

    # Skip reviews that lack either the user id or the rating.
    if uid is None or stars is None:
        continue

    totals = user_ratings.setdefault(uid, {'total_rating': 0, 'count': 0})
    totals['total_rating'] += stars
    totals['count'] += 1

# Turn the accumulated totals into a mean rating per user.
user_avg_ratings = {}
for uid, totals in user_ratings.items():
    if totals['count'] > 0:
        user_avg_ratings[uid] = totals['total_rating'] / totals['count']

# Preview the first five users in insertion order.
for user_id in list(user_avg_ratings)[:5]:
    print(f"User ID: {user_id}, Average Rating: {user_avg_ratings[user_id]:.2f}")


User ID: 113965417079576625433, Average Rating: 4.91
User ID: 116655819137293331166, Average Rating: 4.73
User ID: 100834119994550070853, Average Rating: 4.94
User ID: 103207214144482097315, Average Rating: 4.64
User ID: 108526171163172578599, Average Rating: 3.48


In [45]:
# Keep only the review fields used downstream; a review missing any of these
# keys is dropped.
review_fields = ('user_id', 'gmap_id', 'rating', 'text')
users_data = []
for d in users_reviews:
    if all(field in d for field in review_fields):
        users_data.append({field: d[field] for field in review_fields})

# Index the business metadata by gmap_id, defaulting absent stats to 0.
business_dict = {}
for biz in businesses:
    if 'gmap_id' in biz:
        business_dict[biz['gmap_id']] = {
            'avg_rating': biz.get('avg_rating', 0),
            'num_of_reviews': biz.get('num_of_reviews', 0),
        }

# Join each review with its business stats; reviews whose gmap_id has no
# metadata entry are left out.
dataset = [
    {**user_review, **business_dict[user_review['gmap_id']]}
    for user_review in users_data
    if user_review['gmap_id'] in business_dict
]

In [53]:
# Attach the reviewer's average rating to every merged record (mutates the
# dicts in `dataset` in place); users without a computed average get None.
for entry in dataset:
    entry['user_avg_rating'] = user_avg_ratings.get(entry.get('user_id'))

In [52]:
# Preview only the first few merged records: a bare `dataset` would render the
# entire list into the cell output and bloat the notebook.
dataset[:5]

[{'user_id': '113965417079576625433',
  'gmap_id': '0x7c00159b5b1b1d25:0x8d2d85d4a758290e',
  'rating': 5,
  'text': 'Great new upgrade',
  'avg_rating': 4.1,
  'num_of_reviews': 18,
  'user_avg_rating': 4.909090909090909},
 {'user_id': '116655819137293331166',
  'gmap_id': '0x7c00159b5b1b1d25:0x8d2d85d4a758290e',
  'rating': 5,
  'text': None,
  'avg_rating': 4.1,
  'num_of_reviews': 18,
  'user_avg_rating': 4.7272727272727275},
 {'user_id': '100834119994550070853',
  'gmap_id': '0x7c00159b5b1b1d25:0x8d2d85d4a758290e',
  'rating': 5,
  'text': None,
  'avg_rating': 4.1,
  'num_of_reviews': 18,
  'user_avg_rating': 4.9411764705882355},
 {'user_id': '103207214144482097315',
  'gmap_id': '0x7c00159b5b1b1d25:0x8d2d85d4a758290e',
  'rating': 5,
  'text': None,
  'avg_rating': 4.1,
  'num_of_reviews': 18,
  'user_avg_rating': 4.642857142857143},
 {'user_id': '108526171163172578599',
  'gmap_id': '0x7c00159b5b1b1d25:0x8d2d85d4a758290e',
  'rating': 3,
  'text': None,
  'avg_rating': 4.1,
  '