In [1]:
import json
import gzip
from collections import defaultdict
import model

In [2]:
# Review shards, read in this order. Each file is gzip-compressed text with one
# JSON document per line.
review_shards = [
    "Data/review-SanDiego_10_file1.json.gz",
    "Data/review-SanDiego_10_file2.json.gz",
    "Data/review-SanDiego_10_file3.json.gz",
    "Data/review-SanDiego_10_file4.json.gz",
]

# Flat list of every parsed review record across all shards.
reviews = []
for shard_path in review_shards:
    with gzip.open(shard_path, 'rt', encoding='utf-8') as shard:
        reviews.extend(json.loads(raw) for raw in shard)

# Business metadata indexed by its 'gmap_id' field. If the same id appears on
# more than one line, the last record read wins.
meta = {}
with gzip.open("Data/meta-SanDiego.json.gz", 'rt', encoding='utf-8') as meta_file:
    for raw in meta_file:
        record = json.loads(raw)
        meta[record['gmap_id']] = record

In [3]:
# Derive the working subset of reviews and business metadata used by every later
# cell, via the `model` module imported at the top of the notebook.
# val_size=1000 presumably bounds the size of that subset; what it counts
# (reviews, users or businesses) is not visible here - confirm in model.py.
# NOTE(review): no seed is passed; if the helper samples randomly, the outputs
# recorded further down will not reproduce on a fresh kernel.
reviews_valid, meta_valid = model.create_validation_set(reviews, meta, val_size=1000)

In [4]:
# Produce the two sample groups that are split into train/test in the next cell.
# presumably `pos` are observed user/business interactions and `neg` are sampled
# non-interactions - verify against model.generate_samples.
pos, neg  = model.generate_samples(reviews_valid, meta_valid)

# Left disabled: uncomment to display both sample groups.
# pos, neg

In [5]:
# Split the positive and negative samples into a training and a test portion
# using the splitter defined on the imported `model` module.
# test_size=0.2 presumably holds out 20% of the samples - confirm in model.py.
# NOTE(review): no seed is passed, so the split may differ between runs.
train_samples, test_samples = model.train_test_split(pos, neg, test_size=0.2)

In [6]:
# Index: user_id -> set of gmap_ids that user has a review for in reviews_valid.
# defaultdict(set) creates the empty set the first time a user is seen.
# NOTE(review): this is built from *all* of reviews_valid and is later handed to
# the collaborative-filtering predictors together with test_samples. If the test
# pairs come from these same reviews, the index already contains them - confirm
# the predictors in model.py exclude the pair being scored.
user_businesses = defaultdict(set)

for review in reviews_valid:
    reviewer, place = review['user_id'], review['gmap_id']
    user_businesses[reviewer].add(place)



In [7]:
# Fit the baseline on the training samples only.
# The result is consumed below as an iterable of (count, business_id) pairs;
# presumably ordered by count, as the name suggests - verify in model.baseline_model.
sorted_businesses = model.baseline_model(train_samples)

In [8]:
# Grand total of the per-business counts: every entry of sorted_businesses
# unpacks as (count, business_id) and only the count contributes to the sum.
total_revs = sum(n_reviews for n_reviews, _gmap_id in sorted_businesses)
total_revs  # bare last expression so the notebook displays the total

4262

In [9]:
# Turn the baseline ranking plus the overall count total into predictions.
# NOTE(review): how total_revs is used (e.g. as a popularity cutoff) is not
# visible in this notebook - see model.baseline_predict.
baseline_preds = model.baseline_predict(sorted_businesses, total_revs)

In [10]:
# Score the baseline predictions against the held-out test samples.
# The displayed dict has the keys accuracy, precision, recall and f1_score.
results = model.evaluate_baseline_model(test_samples, baseline_preds)
results

{'accuracy': 0.5030388031790556,
 'precision': 0.5187713310580204,
 'recall': 0.280184331797235,
 'f1_score': 0.36385397965290245}

In [15]:
# Predict the test samples with collaborative filtering over the
# user -> businesses index, then score them with the filtering evaluator.
# similarity_threshold=0.01 is presumably a minimum user-similarity cutoff -
# confirm its exact meaning in model.collaborative_filtering_predict.
# NOTE(review): user_businesses was built from every review in reviews_valid,
# which may include the pairs in test_samples; unless the predictor ignores the
# pair being scored, these metrics are optimistic.
preds_collab = model.collaborative_filtering_predict(test_samples, user_businesses, similarity_threshold=0.01)
results_collab = model.evaluate_filtering_model(test_samples, preds_collab)
print("Collaborative filtering:", results_collab)

Collaborative filtering: {'accuracy': 0.6919121084618981, 'precision': 0.9192913385826772, 'recall': 0.4304147465437788, 'f1_score': 0.5863151286880102}


In [13]:
# Run the location-aware collaborative filter with its distance step switched
# off. The recorded output matches the plain collaborative-filtering cell to
# four decimals, so this serves as a check that the two entry points agree
# when location is not used.
# min_similar_reviews=1 is presumably the minimum number of supporting reviews
# from similar users - confirm in model.collaborative_filtering_with_location.
preds_no_location = model.collaborative_filtering_with_location(
    test_samples,
    user_businesses,
    meta_valid,
    similarity_threshold=0.01,
    min_similar_reviews=1,
    use_distance=False  # Disable distance filtering
)
# Same evaluator as the plain collaborative-filtering cell, so the numbers are comparable.
results_no_location = model.evaluate_filtering_model(test_samples, preds_no_location)

# Report each metric rounded to four decimal places.
print("Collaborative Filtering (NO location):")
print(f"  Accuracy:  {results_no_location['accuracy']:.4f}")
print(f"  Precision: {results_no_location['precision']:.4f}")
print(f"  Recall:    {results_no_location['recall']:.4f}")
print(f"  F1 Score:  {results_no_location['f1_score']:.4f}")

Collaborative Filtering (NO location):
  Accuracy:  0.6919
  Precision: 0.9193
  Recall:    0.4304
  F1 Score:  0.5863
