In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import json
import numpy as np

## Load Business Category Data

In [2]:
# Get top-level business categories
with open("categories.json") as json_file:
    categories = json.load(json_file)

# Dictionary of category keys to the full strings that appear in the dataset
category_dict = {category["alias"]:category["title"] for category in categories}
category_dict_reverse = {category["title"]:category["alias"] for category in categories}

# Categories that we are interested in
#categories_of_interest = ['active', 'arts', 'beautysvc', 'food', 'hotelstravel', 'nightlife', 'restaurants', 'shopping']
categories_of_interest_raw = [category["alias"] for category in categories if category["parents"] == []]
categories_of_interest = [category_dict[cat] for cat in categories_of_interest_raw]

# print(categories_of_interest)
# print(categories_of_interest_raw)

## Business Data

### Load Dataset

In [3]:
businesses = pd.read_csv("yelp_academic_dataset_business.csv")

businesses.head(3)

Unnamed: 0,city,review_count,name,business_id,longitude,state,stars,address,latitude,metroarea,categories
0,Tucson,22,Target,tUFrWirKiKi_TAnsVWINQQ,-110.880452,AZ,3.5,5255 E Broadway Blvd,32.223236,Tucson,"Department Stores, Shopping, Fashion, Home & G..."
1,Philadelphia,80,St Honore Pastries,MTSW4McQd7CbVtyjqoe9mw,-75.155564,PA,4.0,935 Race St,39.955505,Philadelphia,"Restaurants, Food, Bubble Tea, Coffee & Tea, B..."
2,Nashville,10,Sonic Drive-In,bBDDEgkFA1Otx9Lfe7BZUQ,-86.76817,TN,1.5,2312 Dickerson Pike,36.208102,Nashville,"Ice Cream & Frozen Yogurt, Fast Food, Burgers,..."


### Process Categories

In [4]:
# Create indicator columns for each business category (a business may belong to multiple categories)
businesses_categories = businesses.copy()

for category_str in categories_of_interest:
    colname = "is_" + category_dict_reverse[category_str]
    businesses_categories[colname] = businesses_categories["categories"].str.contains(category_str).fillna(False)
# businesses_categories.head(3)

## Counts Data (all metro areas combined) and num_businesses

### counts_df

In [5]:
metros = ['Indianapolis', 'Philadelphia', 'Tucson', 'Tampa', 'Nashville']
v2 = True

counts_df = pd.DataFrame()
# counts_df_norm = pd.DataFrame()

for metro in metros:
    if v2:
        suffix = "_v2.csv"
    else:
        suffix = ".csv"
        
    #counts_df_metro = pd.read_csv("output/" + metro + "_counts_final" + suffix)
    counts_df_metro = pd.read_csv("output_urbcomp/" + metro + "_counts_final" + suffix)
    counts_df_metro["metroarea"] = metro
    counts_df = counts_df.append(counts_df_metro)
    
#     counts_df_norm_metro = pd.read_csv("output/" + metro + "_counts_norm_final" + suffix)
#     counts_df_norm_metro["metroarea"] = metro
#     counts_df_norm = counts_df_norm.append(counts_df_norm_metro)

# counts_df.head()
#counts_df.to_csv("test/all_counts_final_v2.csv", index=False)
counts_df.to_csv("final_output_urbcomp/all_counts_final_v2.csv", index=False)
# counts_df_norm.to_csv("test/all_counts_norm_final_v2.csv", index=False)

  counts_df = counts_df.append(counts_df_metro)
  counts_df = counts_df.append(counts_df_metro)
  counts_df = counts_df.append(counts_df_metro)
  counts_df = counts_df.append(counts_df_metro)
  counts_df = counts_df.append(counts_df_metro)


#### num_businesses

In [6]:
categories_of_interest_small = ['active', 'arts', 'beautysvc', 'food', 'hotelstravel', 'nightlife', 'restaurants', 'shopping']
counts_df_mod_noprofs = counts_df.copy()
for col in ["is_"+col for col in categories_of_interest_small if col != "restaurants"]:
    counts_df_mod_noprofs[col] = np.logical_and(counts_df_mod_noprofs[col] == True, counts_df_mod_noprofs["is_restaurants"] == False)
cols = ["is_"+col for col in categories_of_interest_small]
#len(counts_df_mod[counts_df_mod[cols].sum(axis=1) > 1]) / len(counts_df_mod)

df_rows = []
# categories_of_interest_small = ['active', 'arts', 'beautysvc', 'food', 'hotelstravel', 'nightlife', 'restaurants', 'shopping']

cols = [counts_df.columns[0]] + list(counts_df.columns[3:45])
# print(cols)

for category in categories_of_interest_small:
    df_row = [category]
    q = "is_" + category
    counts_category = counts_df_mod_noprofs.query(q)
    num_bus_in_cat = len(counts_category)
    counts_df_temp = counts_category[cols[1:]]
    #print(np.count_nonzero(counts_df_temp))
    df_row.append(num_bus_in_cat)
    df_row.extend(np.count_nonzero(counts_df_temp, axis=0))
    df_rows.append(df_row)
    
num_businesses_df = pd.DataFrame(df_rows,columns = ["category", "num_businesses"] + cols[1:])
    
# num_businesses_df

#num_businesses_df.to_csv("test/num_businesses.csv", index=False)
num_businesses_df.to_csv("final_output_urbcomp/num_businesses.csv", index=False)

### counts_df_norm and table

#### Initial Data Prep

In [7]:
metros = ['Indianapolis', 'Philadelphia', 'Tucson', 'Tampa', 'Nashville']
v2 = True

# counts_df = pd.DataFrame()
counts_df_norm = pd.DataFrame()

for metro in metros:
    if v2:
        suffix = "_v2.csv"
    else:
        suffix = ".csv"
        
#     counts_df_metro = pd.read_csv("output/" + metro + "_counts_final" + suffix)
#     counts_df_metro["metroarea"] = metro
#     counts_df = counts_df.append(counts_df_metro)
    
    #counts_df_norm_metro = pd.read_csv("output/" + metro + "_counts_norm_final" + suffix)
    counts_df_norm_metro = pd.read_csv("output_urbcomp/" + metro + "_counts_norm_final" + suffix)
    counts_df_norm_metro["metroarea"] = metro
    counts_df_norm = counts_df_norm.append(counts_df_norm_metro)

# counts_df.head()
# counts_df.to_csv("test/all_counts_final_v2.csv", index=False)
# counts_df_norm.to_csv("test/all_counts_norm_final_v2.csv", index=False)

  counts_df_norm = counts_df_norm.append(counts_df_norm_metro)
  counts_df_norm = counts_df_norm.append(counts_df_norm_metro)
  counts_df_norm = counts_df_norm.append(counts_df_norm_metro)
  counts_df_norm = counts_df_norm.append(counts_df_norm_metro)
  counts_df_norm = counts_df_norm.append(counts_df_norm_metro)


#### Table

In [8]:
metros = ['Indianapolis', 'Philadelphia', 'Tucson', 'Tampa', 'Nashville']
metros = sorted(metros)  # , reverse=True)
#metros.append("Total")
#metros.reverse()

# print(metros)

In [9]:
cols = list(counts_df_norm.columns[3:41])
# cols.remove("professor")
# len(cols)#.remove('professor')
# print(cols)

In [10]:
# counts_df_norm = pd.read_csv("output/" + "all" + "_counts_norm_final_v2.csv")
df = pd.DataFrame(counts_df_norm[cols].mean().sort_values(ascending=False))
#df

for metro in metros:
    #counts_df_norm_metro = pd.read_csv("output/" + metro + "_counts_norm_final_v2.csv")
    counts_df_norm_metro = pd.read_csv("output_urbcomp/" + metro + "_counts_norm_final_v2.csv")
    df[metro] = pd.DataFrame(counts_df_norm_metro[cols].mean().sort_values(ascending=False))
df = df.reset_index()
#df
df = df.rename(columns={"index": "Word", 0: "Total"})
df

Unnamed: 0,Word,Total,Indianapolis,Nashville,Philadelphia,Tampa,Tucson
0,friend,34.988299,31.336934,33.895771,43.438119,28.340934,27.906681
1,husband,27.773026,32.489204,31.862854,20.134221,30.226029,33.750999
2,wife,18.665601,22.731776,21.112884,13.476549,19.260892,23.352402
3,boyfriend,13.348852,10.847938,11.254804,17.89656,11.875833,9.230805
4,daughter,8.651353,9.547171,7.093802,6.405125,10.188148,12.4856
5,mother,8.617618,8.901978,7.347718,8.089661,8.382314,10.865036
6,son,6.952659,7.501825,5.924803,4.793205,8.590135,10.328332
7,girlfriend,5.473339,3.701642,5.020835,7.141178,5.343709,4.009599
8,child,5.389661,6.787073,4.451083,3.527282,6.47418,7.874863
9,sister,4.452665,4.557279,4.120597,4.826076,3.759701,4.564911


In [11]:
df.to_latex(buf="output_urbcomp/table.txt", float_format="%.2f", index=False)
#df.to_latex(buf="output/table.txt", float_format="%.2f", index=False)

  df.to_latex(buf="output_urbcomp/table.txt", float_format="%.2f", index=False)


#### counts_df_norm file

In [12]:
categories_of_interest = ['active', 'arts', 'beautysvc', 'food', 'hotelstravel', 'nightlife', 'restaurants', 'shopping']
q = " or ".join(["is_"+cat for cat in categories_of_interest])
# "(" + q + ") and is_health == False and is_homeservices == False"

In [13]:
final_table = counts_df_norm.query(q)

for col in ["is_"+col for col in categories_of_interest if col != "restaurants"]:
    final_table[col] = np.logical_and(final_table[col] == True, final_table["is_restaurants"] == False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_table[col] = np.logical_and(final_table[col] == True, final_table["is_restaurants"] == False)


In [14]:
#final_table.to_csv("test/all_counts_norm_final_v2.csv", index=False)
final_table.to_csv("final_output_urbcomp/all_counts_norm_final_v2.csv", index=False)

#### counts_df_norm_formaps file

In [15]:
#counts_df_norm.head()

cols_of_interest = ['business_id', 'num_reviews', 'num_relationship_words',
                    'family', 'romantic', 'friendship', 'professional',
                    'name', 'categories', 'stars', 'review_count',
                    'address', 'city', 'state', 'longitude', 'latitude', 'metroarea']

cols_of_interest.extend(["is_"+cat for cat in categories_of_interest])
# print(cols_of_interest)

In [16]:
import scipy.stats as stats
counts_df_norm_fewcols = final_table[cols_of_interest]


# Calculate p-values and confidence levels for the z-scores
for col in ['family', 'romantic', 'friendship', 'professional']:
    zscore_col = f"{col}_zscore"
    counts_df_norm_fewcols[zscore_col] = (
        counts_df_norm_fewcols[col] - counts_df_norm_fewcols[col].mean()) / counts_df_norm_fewcols[col].std()
    counts_df_norm_fewcols[f"{col}_pvalue"] = stats.norm.sf(
        abs(counts_df_norm_fewcols[zscore_col])) * 2
    counts_df_norm_fewcols[f"{col}_confidence"] = (
        1 - counts_df_norm_fewcols[f"{col}_pvalue"]) * 100

# Print the confidence levels
# for col in ['family', 'romantic', 'friendship', 'professional']:
#     print(
#         f"{col} confidence level: {counts_df_norm_fewcols[f'{col}_confidence'].mean():.2f}%")
    
print(counts_df_norm_fewcols.head(3))

# businesses_renamed = businesses.rename(columns={"business": "business_id"})
# counts_df_norm_merged = pd.merge(counts_df_norm_fewcols, businesses_renamed, how="inner", on="business_id")
#counts_df_norm_fewcols.to_csv("test/all_counts_norm_fewcols_formaps.csv", index=False)
counts_df_norm_fewcols.to_csv("final_output_urbcomp/all_counts_norm_fewcols_formaps.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  counts_df_norm_fewcols[zscore_col] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  counts_df_norm_fewcols[f"{col}_pvalue"] = stats.norm.sf(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  counts_df_norm_fewcols[f"{col}_confidence"] = (
A value is trying to be set on a copy of a slice from a DataF

              business_id  num_reviews  num_relationship_words     family  \
0  EQ-TZ2eeD_E0BHuvoaeG5Q         1273              194.029851  37.706206   
1  cPepkJeRMtHapc_b2Oe_dw           34              176.470588  29.411765   
2  oJ4ik-4PZe6gexxW-tSmsw          335              244.776119  32.835821   

     romantic  friendship  professional                     name  \
0   84.838963   69.128044      2.356638                Milktooth   
1   88.235294   58.823529      0.000000  Naked Tchopstix Express   
2  140.298507   65.671642      5.970149     Broad Ripple Brewpub   

                                          categories  stars  ...  \
0  Beer, Wine & Spirits, Cafes, Coffee & Tea, Res...    4.0  ...   
1      Restaurants, Food, Poke, Hawaiian, Sushi Bars    3.5  ...   
2  Nightlife, Food, Pizza, Brewpubs, Restaurants,...    3.5  ...   

   family_confidence romantic_zscore romantic_pvalue romantic_confidence  \
0          12.311857        0.236686        0.812901           18.709