In [1]:
import json
import gzip
from collections import defaultdict
import model

In [2]:
# Review shards, read in this order; every line of every shard is one JSON record.
review_shards = [
    "Data/review-SanDiego_10_file1.json.gz",
    "Data/review-SanDiego_10_file2.json.gz",
    "Data/review-SanDiego_10_file3.json.gz",
    "Data/review-SanDiego_10_file4.json.gz",
]

# Flat list of decoded review records across all shards.
reviews = []
for shard_path in review_shards:
    with gzip.open(shard_path, 'rt', encoding='utf-8') as shard:
        reviews.extend(map(json.loads, shard))

# Business metadata indexed by its 'gmap_id'; a later record with the same id
# replaces an earlier one.
meta = {}
with gzip.open("Data/meta-SanDiego.json.gz", 'rt', encoding='utf-8') as meta_file:
    for business in map(json.loads, meta_file):
        meta[business['gmap_id']] = business

In [3]:
# Derive the validation subset (reviews plus matching metadata) that the sampling
# cell below consumes; val_size=1000 is forwarded to model.create_validation_set.
# NOTE(review): what val_size counts (reviews, users, businesses?) and whether the
# selection is random/seeded is decided inside model.py, not visible here — confirm.
reviews_valid, meta_valid = model.create_validation_set(reviews, meta, val_size=1000)

In [4]:
# Produce the two sample lists that are split into train/test in the next cell.
# Per the captured output, `pos` holds pairs of ID strings; presumably these are
# (user id, business gmap_id) and `neg` has the same shape — TODO confirm in model.py.
pos, neg  = model.generate_samples(reviews_valid, meta_valid)

# NOTE(review): this echoes both complete lists into the cell output; a slice
# (e.g. the first few items) or the lengths would keep the notebook readable.
pos, neg

([('117363266248892446964', '0x80d956c45741b69b:0xf3de1da10520495'),
  ('117363266248892446964', '0x80d9567ffdf3b981:0x93bb6be8f54a6134'),
  ('117363266248892446964', '0x80deaafb41844337:0x534b9d731a3eaa30'),
  ('117363266248892446964', '0x80d9554d74c40dc7:0xbbd7fee1c69df229'),
  ('117363266248892446964', '0x80d9566ffe107c77:0xabd64b7e0dad2198'),
  ('117363266248892446964', '0x80dc094c589ba2f3:0x56e3d02c99a0899e'),
  ('117363266248892446964', '0x80d954dce0489d89:0x1403001b95dc979d'),
  ('117363266248892446964', '0x80deaa0ebe751ce7:0x35fb05c30e4a4d39'),
  ('117363266248892446964', '0x80d95519a9fe69b3:0x609cd8d90a9cf6ab'),
  ('117363266248892446964', '0x80d956a44f631b8f:0xa959313559e63d5b'),
  ('117363266248892446964', '0x80d9576cc09702a1:0xc1cf0b4251c1b8b1'),
  ('117363266248892446964', '0x80d956976bcd72bb:0x7d52891225874b37'),
  ('117363266248892446964', '0x80deaa39e756ae7f:0xa3295058eb08d09'),
  ('117363266248892446964', '0x80d956e681bad859:0x3f79d11474ec6e5b'),
  ('115067976074036939

In [5]:
# Split the positive/negative samples into train and test portions; test_size=0.2
# is passed straight through to model.train_test_split.
# NOTE(review): no seed is supplied here — if the split is randomized, the metrics
# reported further down may not reproduce between runs; confirm in model.py.
train_samples, test_samples = model.train_test_split(pos, neg, test_size=0.2)

In [6]:
# Fit the baseline on the training samples only. The result is iterated in the next
# cell as (count, business_id) pairs.
# presumably ordered by count, going by the variable name — verify in model.py
sorted_businesses = model.baseline_model(train_samples)

In [7]:
# Add up the count component of every (count, business_id) pair; the id is not needed here.
total_revs = sum(n_reviews for n_reviews, _ in sorted_businesses)
# Echo the total so it shows up as the cell's output.
total_revs

4748

In [8]:
# Turn the baseline's (count, business_id) pairs and their overall total into the
# prediction object that the evaluation cell below consumes.
# NOTE(review): the selection rule (e.g. how total_revs is used as a cutoff) lives
# in model.baseline_predict and is not visible here — confirm.
baseline_preds = model.baseline_predict(sorted_businesses, total_revs)

In [9]:
# Score the baseline predictions against the held-out test samples. Per the captured
# output, the result is a dict with 'accuracy', 'precision', 'recall' and 'f1_score'.
results = model.evaluate_model(test_samples, baseline_preds)
# Echo the metrics dict as the cell's output.
results

{'accuracy': 0.5174296514069718,
 'precision': 0.5481611208406305,
 'recall': 0.2599667774086379,
 'f1_score': 0.35267605633802823}

In [11]:
# Predict on the test samples with the collaborative-filtering model, then score the
# predictions with the tuple-based evaluator and print the metrics.
# NOTE(review): `user_businesses` is not defined by any cell shown in this notebook, and
# the execution counter jumps from In [9] to In [11] — the cell that built it appears to
# have been removed, so this cell will raise NameError on a fresh Restart & Run All.
# Restore the cell that constructs `user_businesses` before this one.
# NOTE(review): similarity_threshold=0.1 is passed through unchanged; its exact meaning
# is defined in model.collaborative_filtering_predict — confirm.
preds_collab = model.collaborative_filtering_predict(test_samples, user_businesses, similarity_threshold=0.1)
results_collab = model.evaluate_model_tuples(test_samples, preds_collab)
print("Collaborative filtering:", results_collab)

Collaborative filtering: {'accuracy': 0.5665686686266275, 'precision': 0.9942528735632183, 'recall': 0.143687707641196, 'f1_score': 0.251088534107402}
