In [1]:
import json
import gzip
from collections import defaultdict
import model

In [2]:
reviews = []

for file in ["Data/review-SanDiego_10_file1.json.gz", "Data/review-SanDiego_10_file2.json.gz", 
             "Data/review-SanDiego_10_file3.json.gz", "Data/review-SanDiego_10_file4.json.gz"]:
    with gzip.open(file, 'rt', encoding='utf-8') as f:
        for line in f:
            reviews.append(json.loads(line))

meta = {}

with gzip.open("Data/meta-SanDiego.json.gz", 'rt', encoding='utf-8') as f:
    for line in f:
        business = json.loads(line)
        meta[business['gmap_id']] = business

In [3]:
reviews_valid, meta_valid = model.create_validation_set(reviews, meta, val_size=1000)

In [4]:
pos, neg  = model.generate_samples(reviews_valid, meta_valid)

pos, neg

([('102919413961325598675', '0x80dc74e6dc658689:0xcf50c0b390e567b0'),
  ('102919413961325598675', '0x80dc7588a897ed91:0x190ac94f6ebb8c76'),
  ('102919413961325598675', '0x80dbf9816722ece1:0xb1d8be908ba2136a'),
  ('111563416890720790125', '0x80dc742efd23ae27:0x94afbc015ffd9ee4'),
  ('114180658218860354409', '0x80d95443306b700b:0x8179a3cf69bb34'),
  ('114180658218860354409', '0x80d95443011ea917:0x9b36271cc218e2c1'),
  ('114180658218860354409', '0x80deaa0d3006e687:0x2a4cd6f62cb364f9'),
  ('114180658218860354409', '0x80d955111f988f03:0xc10e1dfc72418890'),
  ('114180658218860354409', '0x80d9535f6919f2c9:0x6bdc09d49a437f66'),
  ('114180658218860354409', '0x80deaa6c2fbf09c1:0xcba3213e23bf5f73'),
  ('114180658218860354409', '0x80d954fa348bcbc7:0x3fb6c6ffd57fa669'),
  ('114180658218860354409', '0x80d957112bdf5145:0xa73d3f816eda78c9'),
  ('117732534023563110456', '0x80dbff84f1f812dd:0xc7686efaa1bba2fd'),
  ('117732534023563110456', '0x80dc00d32cd783f5:0x8bdee39ec9bf5d98'),
  ('117732534023563110

In [5]:
train_samples, test_samples = model.train_test_split(pos, neg, test_size=0.2)

In [11]:
sorted_businesses = model.baseline_model(train_samples)

In [7]:
total_revs = sum(count for count, business_id in sorted_businesses)
total_revs

4111

In [12]:
baseline_preds = model.baseline_predict(sorted_businesses, total_revs)

In [9]:
results = model.evaluate_model(test_samples, baseline_preds)
results

{'accuracy': 0.48765133171912833,
 'precision': 0.4926739926739927,
 'recall': 0.2561904761904762,
 'f1_score': 0.33709273182957394}