In [2]:
import ast
import json

import numpy as np
import pandas as pd

In [3]:
# Path configuration is kept apart from the loading code so that editing it
# does not force the dataframes to be reloaded.
fDefPath = 'reviews/yelpReviews/yelp_academic_dataset_'

# Short aliases for the dataset names, so they only need changing in one place.
BS, CH, TI, RW, US = 'business', 'checkin', 'tip', 'review', 'user'

# Full path of each dataset's JSON file: <prefix><dataset name>.json
bspath, chpath, tipath, rwpath, uspath = (
    f'{fDefPath}{dataset}.json' for dataset in (BS, CH, TI, RW, US)
)


In [13]:
# Load the business dataset. lines=True makes pandas read the file as one
# JSON record per line.
bsdf = pd.read_json(bspath, lines=True)

In [5]:
# Keep only businesses that list categories, lower-case the category text and
# split the ", "-separated string into a list of categories per business.
bsdf = (
    bsdf.dropna(subset=["categories"])
    .assign(categories=lambda frame: frame["categories"].str.lower())
    .assign(categoryList=lambda frame: frame["categories"].str.split(", "))
)

# Flatten to one row per (business, category) pair and count each category.
# The text was lower-cased before splitting, so no second pass is needed here.
allCategories = bsdf["categoryList"].explode()
categoryCounts = allCategories.value_counts()

# Preview common categories
#print(categoryCounts.head(50))

In [9]:
# Look at which businesses are tagged "american (new)" to understand what that category covers
#bsdf[bsdf["categories"].str.contains(r"american \(new\)", na=False)].name.head(50)


In [6]:
# Hand-maintained mapping (for now) from a broad category label to the set of
# lower-case category names that belong to it.
# Entry order matters: matchBroadCategory walks this dict in insertion order
# and returns the first label whose set contains one of a business's categories.
categoryMap = {
    # 'food' is deliberately left out: it was too broad and was catching
    # grocery stores.
    'restaurants': {
        'restaurants', 'sandwiches', 'american (new)', 'american (traditional)',
        'pizza', 'bakeries', 'desserts', 'cafes', 'breakfast & brunch',
    },
    'retail': {
        'shopping', 'home & garden', 'fashion', 'real estate', 'grocery',
        'department stores', 'electronics', 'toys',
    },
    'beauty': {
        'beauty & spas', 'hair salons', 'nail salons', 'massage', 'skin care',
        'makeup artists',
    },
    'auto': {'automotive', 'auto repair'},
    'services': {'home services', 'local services', 'event planning & services'},
    'fitness': {
        'active life', 'fitness & instruction', 'gyms', 'yoga', 'martial arts',
    },
    'health': {'health & medical', 'doctors', 'dentists'},
    'pets': {'veterinarians', 'pet services', 'pets'},
    'nightlife': {'nightlife', 'bars', 'lounges', 'dance clubs', 'karaoke'},
    'travel': {'hotels & travel', 'hotels', 'tours'},
    'religious': {'religious organizations', 'synagogues'},
    'entertainment': {
        'festivals', 'arts & entertainment', 'television', 'mass media',
        'museums', 'music venues', 'cinema', 'theaters',
    },
    'finance': {'banks & credit unions', 'financial services'},
    'legal': {'dui law', 'lawyers'},
    'government': {'public services & government', 'libraries'},
    'education': {'education'},
    'professional services': {'employment agencies', 'professional services'},
}


In [7]:
def matchBroadCategory(catList):
    """Return the broad category label for a business's categories.

    Parameters
    ----------
    catList : iterable of str
        The category names of one business.

    Returns
    -------
    str
        The first label in ``categoryMap`` (in the map's own order) whose
        keyword set contains at least one entry of ``catList``; ``"other"``
        when no label matches.
    """
    matchingLabels = (
        label
        for label, members in categoryMap.items()
        if not members.isdisjoint(catList)
    )
    return next(matchingLabels, "other")

# Label every business with the broad category picked by matchBroadCategory.
bsdf["broadCategory"] = bsdf["categoryList"].map(matchBroadCategory)


In [8]:
# Write each business's name, category list and broad category to CSV.
fileName = 'files/categorized.csv'
bsdf.to_csv(fileName, columns=['name', 'categoryList', 'broadCategory'])

In [9]:
# Number of reviews to look at. Only the first `sample_size` lines of the
# review file are parsed; each line is one JSON object.
sample_size = 10000
with open(rwpath, 'r', encoding='utf-8') as reviewFile:
    # zip() stops once range() is exhausted, which caps the lines parsed.
    reviews = [json.loads(row) for _, row in zip(range(sample_size), reviewFile)]

rdf = pd.DataFrame(reviews)
rdf.columns

Index(['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny',
       'cool', 'text', 'date'],
      dtype='object')

In [43]:
# Read the business dataset again from disk. This rebinds `bsdf` to the
# freshly loaded frame, so columns added to the previous `bsdf`
# (categoryList, broadCategory) are not present on it.
bsdf = pd.read_json(bspath, lines=True)
# Attribute processing starts from the businesses that have attributes at all.
attDf = bsdf[bsdf['attributes'].notna()].copy()

In [68]:
# Flatten the business `attributes` dicts into a table with one column per
# attribute. Businesses without attributes are dropped first.
attDf = bsdf.dropna(subset=['attributes']).copy()
attDf = attDf.attributes
attDf = pd.json_normalize(attDf, sep = '_')

# Some values are dict literals stored as text (e.g. "{'garage': False, ...}").
# When a column holds at least one such value, parse every string in it.
# ast.literal_eval only evaluates Python literals; it never executes code.
for col in attDf.columns:
    if attDf[col].apply(lambda x: isinstance(x, str) and x.strip().startswith('{')).any():
        attDf[col] = attDf[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Find columns that hold dicts. Every value is checked (not only the first
# non-null one), so an all-null column is simply skipped instead of raising
# IndexError, and no per-column copy of the whole frame is made.
nested_cols = [
    col for col in attDf.columns
    if attDf[col].apply(lambda x: isinstance(x, dict)).any()
]

# Flatten each nested dict into new "<column>_<key>" columns
for col in nested_cols:
    # Non-dict cells (None/NaN) become empty records, giving an all-null row
    # regardless of how json_normalize would treat them.
    records = [x if isinstance(x, dict) else {} for x in attDf[col]]
    nested_df = pd.json_normalize(records)
    # Re-use attDf's index so join() pairs rows by position explicitly.
    nested_df.index = attDf.index
    nested_df.columns = [f"{col}_{subcol}" for subcol in nested_df.columns]
    attDf = attDf.drop(columns=[col]).join(nested_df)

# print(attDf.dtypes)

In [69]:
# Save the flattened attribute table to CSV.
attDf.to_csv('files/att.csv')

In [71]:
# Print the raw (un-flattened) `attributes` column of bsdf for reference.
print(bsdf.attributes)

0                             {'ByAppointmentOnly': 'True'}
1                    {'BusinessAcceptsCreditCards': 'True'}
2         {'BikeParking': 'True', 'BusinessAcceptsCredit...
3         {'RestaurantsDelivery': 'False', 'OutdoorSeati...
4         {'BusinessAcceptsCreditCards': 'True', 'Wheelc...
                                ...                        
150341    {'ByAppointmentOnly': 'False', 'RestaurantsPri...
150342    {'BusinessAcceptsCreditCards': 'True', 'Restau...
150343    {'RestaurantsPriceRange2': '1', 'BusinessAccep...
150344    {'BusinessParking': '{'garage': False, 'street...
150345    {'WheelchairAccessible': 'True', 'BusinessAcce...
Name: attributes, Length: 150346, dtype: object
