In [2]:
import pandas as pd
import numpy as np
import gzip
import json
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import geopandas as gpd
from collections import defaultdict

# CSE 158 Assignment 2

### Issac, Rushi, Ty, Noah

This project analyzes Google Local business data to predict whether a user will review a business. We use exploratory data analysis (EDA) to understand patterns in user behavior, business characteristics, and review distributions. We build and evaluate multiple models, including baseline popularity-based approaches and collaborative filtering, to predict user-business review interactions. The models are evaluated using standard classification metrics (accuracy, precision, recall, F1-score) to identify the most effective approach for this prediction task.

## Table of Contents

- **Data**
- **EDA**
- **Selecting Topics**
- **Modeling**
- **Evaluation**
- **Conclusion**

## Data

We'll begin by loading the San Diego business metadata and review files (gzip-compressed JSON lines stored under `Data/`), and then take a look at one sample record from each.

In [4]:
def _read_json_lines(path):
    """Read a gzip-compressed JSON-lines file and return its records.

    Parameters
    ----------
    path : str
        Path to a gzip file holding one JSON document per line (UTF-8 text).

    Returns
    -------
    list
        One parsed object per line, in file order.
    """
    with gzip.open(path, "rt", encoding="utf-8") as f:
        return [json.loads(line) for line in f]


# Upper bounds applied to review coordinates: only points with
# longitude < MAX_LONGITUDE and latitude < MAX_LATITUDE are kept.
# NOTE(review): presumably this trims outliers far from the San Diego
# area -- confirm the intended bounding box.
MAX_LONGITUDE = -116.5
MAX_LATITUDE = 33.8

# Business metadata records.
meta_sd = _read_json_lines("Data/meta-SanDiego.json.gz")

# Reviews are split across files 1-4; concatenate them in file order.
review_sd = []
for i in range(1, 5):
    review_sd.extend(_read_json_lines(f"Data/review-SanDiego_10_file{i}.json.gz"))

# gmap_id -> (latitude, longitude) lookup built from the metadata.
location_dict = {b['gmap_id']: (b['latitude'], b['longitude']) for b in meta_sd}

# One (latitude, longitude) pair per review, index-aligned with review_sd.
# A review whose gmap_id is absent from the metadata raises KeyError here.
review_coords = [location_dict[review['gmap_id']] for review in review_sd]

# Keep only the coordinates that fall inside the bounds defined above.
filtered_review_locations = [
    (lat, lon) for lat, lon in review_coords
    if lon < MAX_LONGITUDE and lat < MAX_LATITUDE
]

# Parallel lists of latitudes and longitudes for the filtered reviews.
# Built with comprehensions so an empty filter result yields two empty lists.
lats = [lat for lat, _ in filtered_review_locations]
longs = [lon for _, lon in filtered_review_locations]

In [5]:
# Display the first business metadata record to see which fields are available.
meta_sd[0]

{'name': 'HDR',
 'address': 'HDR, 591 Camino De La Reina suite 300, San Diego, CA 92108',
 'gmap_id': '0x80dbffc64011f711:0x6d7970e6968f3f92',
 'description': None,
 'latitude': 32.765555,
 'longitude': -117.16167399999999,
 'category': ['Engineering consultant',
  'Architecture firm',
  'Civil engineering company',
  'Construction company',
  'Environmental engineer'],
 'avg_rating': 5,
 'num_of_reviews': 3,
 'price': None,
 'hours': [['Thursday', '8AM–5PM'],
  ['Friday', '8AM–5PM'],
  ['Saturday', 'Closed'],
  ['Sunday', 'Closed'],
  ['Monday', '8AM–5PM'],
  ['Tuesday', '8AM–5PM'],
  ['Wednesday', '8AM–5PM']],
 'MISC': {'Accessibility': ['Wheelchair accessible entrance']},
 'state': 'Open ⋅ Closes 5PM',
 'relative_results': ['0x80dbff98570bd5a1:0x53a6df3b4d73da85',
  '0x80d9538e0c2e60a7:0xfb635fe02b1e06a7',
  '0x80dc06d36b327ac1:0x39bab4445058dded',
  '0x80dbfe4ca26f4b73:0x1bda90d940f9cbc8',
  '0x80dbffb6cf816d2f:0x25a53967cb9bc656'],
 'url': 'https://www.google.com/maps/place//data=

In [6]:
# Display the first review record to see which fields are available.
review_sd[0]

{'user_id': '102919413961325598675',
 'name': 'Humberto Garcia Jr.',
 'time': 1569188767987,
 'rating': 4,
 'text': 'Definitely something worth trying. If you’ve never had Hard Kombucha you need to come here. It’s very different and the inside is beautiful. They have a variety of games and there is plenty of seating.',
 'pics': [{'url': ['https://lh5.googleusercontent.com/p/AF1QipOQvLGEgw3N8Q6ZuF-6-bTuJsJyc5w_hRqhcDmK=w150-h150-k-no-p']},
  {'url': ['https://lh5.googleusercontent.com/p/AF1QipNpJMOyc4I4x1ZtgV8OJ6zEdSyyRO34Y9Ug2GIT=w150-h150-k-no-p']},
  {'url': ['https://lh5.googleusercontent.com/p/AF1QipNbBgiCUpVcEA8yn_ZqpfyTnK5OLxBRyIAdnLA2=w150-h150-k-no-p']},
  {'url': ['https://lh5.googleusercontent.com/p/AF1QipNToM7NgDO_WrCWM0L0sX9kpYIQryoxdEpUP8kT=w150-h150-k-no-p']}],
 'resp': None,
 'gmap_id': '0x80dc7588a897ed91:0x190ac94f6ebb8c76'}