In [1]:
import json
import gzip
from collections import defaultdict
import model

In [2]:
# Review shards, read in this order. Each file is opened as gzip-compressed
# UTF-8 text and parsed one JSON document per line.
review_paths = [
    "Data/review-SanDiego_10_file1.json.gz",
    "Data/review-SanDiego_10_file2.json.gz",
    "Data/review-SanDiego_10_file3.json.gz",
    "Data/review-SanDiego_10_file4.json.gz",
]

# All shards are concatenated into one flat list of parsed review records.
reviews = []
for review_path in review_paths:
    with gzip.open(review_path, 'rt', encoding='utf-8') as shard:
        reviews.extend(json.loads(record) for record in shard)

# Business metadata keyed by each record's 'gmap_id'; if the same id appears
# more than once, the last record read wins.
meta = {}
with gzip.open("Data/meta-SanDiego.json.gz", 'rt', encoding='utf-8') as meta_file:
    for record in meta_file:
        business = json.loads(record)
        meta[business['gmap_id']] = business

In [3]:
# Derive the validation-stage reviews and metadata from the full load.
# NOTE(review): val_size=1000 presumably bounds the size of the validation
# subset — confirm in model.create_validation_set, and check whether the
# selection is seeded if it is random.
reviews_valid, meta_valid = model.create_validation_set(
    reviews,
    meta,
    val_size=1000,
)

In [4]:
# Produce the two sample collections from the validation subset.
# NOTE(review): presumably `pos` holds observed (user, business) pairs and
# `neg` holds pairs that were not observed — confirm in
# model.generate_samples.
pos, neg = model.generate_samples(
    reviews_valid,
    meta_valid,
)

# Echo both collections as this cell's output.
pos, neg

([('109072880444157001594', '0x80deaae66a57ce09:0xa4b7d84047ea47c6'),
  ('109072880444157001594', '0x80dbf4f2df4676e1:0xae09fc3ae5cd0e9f'),
  ('109072880444157001594', '0x80dc0b7ddd14e7e1:0x2cfc2dfe73527e91'),
  ('109072880444157001594', '0x80d9544957270c53:0xa686588cc04aa86e'),
  ('109072880444157001594', '0x80dc7550cd1b70dd:0x171db8ee3b08bf78'),
  ('118442983960814447725', '0x80dbfa53f7cfcf25:0xe4f080d41345780d'),
  ('118442983960814447725', '0x80dbf0ccfffd09c7:0xd70882c34b673ecc'),
  ('118442983960814447725', '0x80dbfb029ff5325b:0x9b68da4eb6255026'),
  ('118442983960814447725', '0x80dbfa4911addcf1:0x8a34b70181788ed6'),
  ('118442983960814447725', '0x80dbfbb75540fe01:0x653e2d7c44e7419b'),
  ('118442983960814447725', '0x80dbf91dd5aef0f9:0x4a85d75c5555ecee'),
  ('110758441263482634614', '0x80d956bb7de4da19:0x7fa17a13fe4cc392'),
  ('110758441263482634614', '0x80d9536c99dcb5f9:0x1f96508e00fdf956'),
  ('110758441263482634614', '0x80d95368d152ed6f:0xf513aa5799587cb9'),
  ('1107584412634826

In [5]:
# Partition the samples into a training part and a test part; test_size=0.2
# is forwarded to model.train_test_split.
# NOTE(review): no seed is passed here — if the split is randomized, the
# metrics further down may differ between runs. Confirm.
train_samples, test_samples = model.train_test_split(
    pos,
    neg,
    test_size=0.2,
)

In [6]:
# Run the baseline on the training samples only. The result is later
# iterated as (count, business_id) pairs when the total is computed.
# NOTE(review): the name suggests the pairs are ordered by count — confirm
# the ordering in model.baseline_model.
sorted_businesses = model.baseline_model(train_samples)

In [7]:
# Add up the first item (the count) of every (count, business_id) pair; the
# business id itself is not needed for the total.
total_revs = sum(
    n_reviews for n_reviews, _business_id in sorted_businesses
)
total_revs

4237

In [8]:
# Ask the baseline for its predictions, passing the per-business counts and
# their overall total. The echoed result renders as a set of business id
# strings.
# NOTE(review): how total_revs is used to choose the cutoff is not visible
# here — see model.baseline_predict.
baseline_preds = model.baseline_predict(
    sorted_businesses,
    total_revs,
)
baseline_preds

{'0x80c34d37ddbda5a3:0x6e3676994dd3d964',
 '0x80d945113d04cf6b:0xc5e0029031a73e16',
 '0x80d945140cd99c2d:0x3e0b8349dd462b48',
 '0x80d945554d95832b:0xd07f9f2240ff0bf6',
 '0x80d945697e456b61:0x15b1a6717370956',
 '0x80d9456b4c9cb8b3:0xc7cc87b476af8e20',
 '0x80d9456c25eb41c3:0x933f3348ec07d34a',
 '0x80d94599398a629b:0x258b7a1bfe220a3',
 '0x80d94599bf53895b:0xf1e089c954c8122',
 '0x80d945a492505c6f:0xdbc398a0717fea3d',
 '0x80d946546b07a1eb:0x39b9a66ae3c10619',
 '0x80d947008e26d763:0x9a54fdcb42402aaf',
 '0x80d949032204517b:0xf45274f72060fb28',
 '0x80d94904bedfb033:0xd23456d7acc183cb',
 '0x80d949105ccf1611:0x2286841c42441d6c',
 '0x80d949170eccf4b9:0xabf8c1d0c36389fb',
 '0x80d949186a4d0891:0x99726399a28bb1fc',
 '0x80d9491ab03aad91:0xc98a20c23a513ff5',
 '0x80d9491ae430911f:0x8f264113429b4d58',
 '0x80d9491b162da4d9:0x9429f0ea72843ec1',
 '0x80d9491b2dfaf271:0xf3df331aedaa60c8',
 '0x80d9493f36fe9b7b:0x17efd252db607d4a',
 '0x80d9494cf5470263:0xa4bf571bec2fe39b',
 '0x80d9496587cdcd59:0xe790de98dc41ff

In [9]:
# Score the baseline predictions against the held-out test samples. The
# echoed dict reports accuracy, precision, recall and f1_score.
results = model.evaluate_model(
    test_samples,
    baseline_preds,
)
results

{'accuracy': 0.5061147695202258,
 'precision': 0.5258126195028681,
 'recall': 0.255338904363974,
 'f1_score': 0.34375}