In [1]:
import json
import gzip
from collections import defaultdict
import model

In [2]:
reviews = []

for file in ["Data/review-SanDiego_10_file1.json.gz", "Data/review-SanDiego_10_file2.json.gz", 
             "Data/review-SanDiego_10_file3.json.gz", "Data/review-SanDiego_10_file4.json.gz"]:
    with gzip.open(file, 'rt', encoding='utf-8') as f:
        for line in f:
            reviews.append(json.loads(line))

meta = {}

with gzip.open("Data/meta-SanDiego.json.gz", 'rt', encoding='utf-8') as f:
    for line in f:
        business = json.loads(line)
        meta[business['gmap_id']] = business

In [3]:
reviews_valid, meta_valid = model.create_validation_set(reviews, meta, val_size=1000)

In [4]:
pos, neg  = model.generate_samples(reviews_valid, meta_valid)

pos, neg

([('117545163737989763945', '0x80dbf6ff21462c27:0xca468907bf08fb19'),
  ('117545163737989763945', '0x80dbf71fedc39153:0xea9c3510bf4a5318'),
  ('117545163737989763945', '0x80dbf6f8cdec0699:0xdaa21bfd599b4865'),
  ('117545163737989763945', '0x80dc08097258fa3b:0xa959c77d2da1e84c'),
  ('117545163737989763945', '0x80dbf71dab974a2d:0x445ee0784ccc48da'),
  ('117545163737989763945', '0x80dc089c417dbc61:0x58e07bfd27dccf95'),
  ('117545163737989763945', '0x80dc09bf78e8a8df:0x12e2f1c12447350e'),
  ('117545163737989763945', '0x80dbfb797d106457:0xa8d8f7e808efc4f9'),
  ('117545163737989763945', '0x80dbf70174f2ece5:0xb916e5906e8dbe75'),
  ('117545163737989763945', '0x80dbfa0bb8c938a1:0xde1f7c8c20ee98aa'),
  ('117545163737989763945', '0x80dbf71e49b7377d:0xc23985b6dbc556d8'),
  ('117545163737989763945', '0x80dbf6f8b185c341:0x59c58efad9fc4c41'),
  ('117545163737989763945', '0x80dbfeb2ebedafbd:0x89f21028c7fd4a11'),
  ('110019310122628612590', '0x80d954f9bd29e99b:0xcc5d097c01e34f5e'),
  ('1100193101226286

In [5]:
train_samples, test_samples = model.train_test_split(pos, neg, test_size=0.2)

In [6]:
sorted_businesses = model.baseline_model(train_samples)

In [7]:
total_revs = sum(count for count, business_id in sorted_businesses)
total_revs

4482

In [9]:
baseline_preds = model.baseline_predict(sorted_businesses, total_revs)
baseline_preds

{'0x14e3945d3c17643f:0x8fa3e9b264c0499f',
 '0x80d945469ac54e83:0x6efd1327bedc5e8',
 '0x80d9456c25eb41c3:0x933f3348ec07d34a',
 '0x80d9458460790555:0xcc8a65ccc4ac1fed',
 '0x80d9459752c948e9:0x8d13dc8d9b929acc',
 '0x80d94655112bbf71:0x7f80772227c8371e',
 '0x80d947006e133bfb:0x49624d6b4c4bc868',
 '0x80d948afd4ff2457:0xda8c9780419ebaf2',
 '0x80d948b472aa6959:0xc7a4ab5905352013',
 '0x80d949032204517b:0xf45274f72060fb28',
 '0x80d949105ccf1611:0x2286841c42441d6c',
 '0x80d94919efdcaf3b:0xeb2fd55e8ab4c71f',
 '0x80d9491a8fbdfd37:0x46330c8512c5c157',
 '0x80d9491ab03aad91:0xc98a20c23a513ff5',
 '0x80d9491ae4309a9f:0xed994bee4d4a1ee5',
 '0x80d9491ca6f809ed:0x18a3f179d5a7da98',
 '0x80d94924b32b0c13:0x4c1bd9c806be0d7e',
 '0x80d9493b5a02e613:0x5044d2090cba9c83',
 '0x80d94ba1c5f6d7ad:0x80228e4e5482def3',
 '0x80d94c10c4af2b6b:0x57c0893aa992b6aa',
 '0x80d94c18c246ae33:0xcd54f52d06fad88e',
 '0x80d94c1cacf0fb2f:0x7c8e162bad15093e',
 '0x80d94c229bbb1cd3:0xae528744f7d32b45',
 '0x80d94c2a86224f13:0xe2a3e67a1b6f

In [10]:
results = model.evaluate_model(test_samples, baseline_preds)
results

{'accuracy': 0.5082258781680747,
 'precision': 0.5315985130111525,
 'recall': 0.25087719298245614,
 'f1_score': 0.34088200238379024}