# Text Mining

## Load Packages

In [49]:
# Import packages to use later
import pandas as pd
import nltk
import time
import json

## Process Data

In [50]:
# Metro area names; this list is not referenced elsewhere in the visible cells.
metros = ['Indianapolis', 'Philadelphia', 'Tucson', 'Tampa', 'Nashville']

# Run configuration: which metro to process and whether to read the "_v2" counts file.
metro = 'Nashville'
v2 = True

# The v2 flag only changes the file-name suffix of the per-business counts CSV.
# (An "output/" directory was used previously instead of "output_urbcomp/".)
relationship_df_raw = pd.read_csv(
    "output_urbcomp/" + metro + ("_counts_v2.csv" if v2 else "_counts.csv")
)

In [51]:
# Preview the first five rows of the raw per-business counts.
relationship_df_raw.head(5)

Unnamed: 0,business_id,num_reviews,num_relationship_words,aunt,bff,boo,boss,boyfriend,brother,child,...,partner,professor,relationship,siblings,sister,son,spouse,teacher,uncle,wife
0,ltBBYdNzkeKdCNPDAsxwAA,1158,216,0,0,0,0,25,2,3,...,1,0,0,0,4,5,0,0,0,27
1,8QnuWGVNBhzyYXGSeRdi4g,18,3,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,Zx7n8mdt8OzLRXVzolXNhQ,1328,322,0,0,0,0,32,3,2,...,1,0,1,0,12,10,1,0,0,35
3,eaJCpC6IhYphj7bwCDHTwQ,202,16,0,0,0,0,0,0,0,...,0,0,0,0,2,1,0,0,0,4
4,oQ5CPRt0R3AzFvcjNOqB1w,954,189,1,0,0,0,29,1,4,...,0,0,0,0,8,4,2,0,0,23


In [52]:
# Put relationships into bins ("romantic", "family", etc.) for aggregation.
# Each non-blank line of the file is parsed as "<bin>:<word>,<word>,...";
# all text is lower-cased on read.
with open("relationships/Relationships_ATUS_custom_v2_binned_update.txt") as f:
    relationships_binned_raw = [line.strip().lower() for line in f]

# Maps bin name -> list of relationship words belonging to that bin.
relationships_binned = dict()
for line in relationships_binned_raw:
    if not line:
        # A blank line (e.g. a trailing newline at the end of the file) defines
        # no bin; skipping it avoids a ValueError when unpacking the split below.
        continue
    # Split on the first colon only, so the unpacking always yields two parts.
    rel_type, rel_list = line.split(":", 1)
    # Strip stray whitespace around names and drop empty entries so every word
    # can be used directly as a column name later on.
    relationships_binned[rel_type.strip()] = [
        word.strip() for word in rel_list.split(",") if word.strip()
    ]
print(relationships_binned)

{'family': ['child', 'daughter', 'son', 'parent', 'mother', 'father', 'brother', 'sister', 'siblings', 'aunt', 'uncle', 'niece', 'nephew', 'cousin', 'grandchild', 'grandmother', 'grandfather', 'grandparents'], 'romantic': ['partner', 'relationship', 'date', 'boo', 'fiancee', 'girlfriend', 'boyfriend', 'spouse', 'husband', 'wife'], 'friendship': ['bff', 'friend', 'housemate', 'neighbor'], 'professional': ['classmate', 'professor', 'teacher', 'coworker', 'client', 'boss']}


In [53]:
# Add one total column per relationship bin: the row-wise sum of the counts of
# every word assigned to that bin. Work on a copy so the raw frame stays intact.
relationship_df = relationship_df_raw.copy()

for bin_name, bin_words in relationships_binned.items():
    bin_counts = relationship_df.loc[:, bin_words]
    relationship_df[bin_name] = bin_counts.sum(axis="columns")

relationship_df.head(3)

Unnamed: 0,business_id,num_reviews,num_relationship_words,aunt,bff,boo,boss,boyfriend,brother,child,...,sister,son,spouse,teacher,uncle,wife,family,romantic,friendship,professional
0,ltBBYdNzkeKdCNPDAsxwAA,1158,216,0,0,0,0,25,2,3,...,4,5,0,0,0,27,32,116,64,4
1,8QnuWGVNBhzyYXGSeRdi4g,18,3,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,2,0,0
2,Zx7n8mdt8OzLRXVzolXNhQ,1328,322,0,0,0,0,32,3,2,...,12,10,1,0,0,35,57,153,108,4


In [54]:
# Compute normalized counts: every count column is expressed per
# PER_N_REVIEWS reviews, for businesses with at least MIN_REVIEWS reviews.
MIN_REVIEWS = 30
PER_N_REVIEWS = 1000

# Filter first, then take an explicit copy: the column assignment below then
# writes to an independent frame instead of a slice of relationship_df, which
# avoids pandas' SettingWithCopyWarning.
relationship_df_norm = relationship_df[relationship_df["num_reviews"] >= MIN_REVIEWS].copy()

# Everything after the first two columns is scaled. This selection is
# positional; in the displayed frame the first two columns are business_id and
# num_reviews, so num_reviews itself is left as a raw count.
count_cols = relationship_df_norm.columns[2:]
relationship_df_norm[count_cols] = (
    relationship_df_norm[count_cols].div(relationship_df_norm["num_reviews"], axis=0)
    * PER_N_REVIEWS
)

relationship_df_norm.head(3)

Unnamed: 0,business_id,num_reviews,num_relationship_words,aunt,bff,boo,boss,boyfriend,brother,child,...,sister,son,spouse,teacher,uncle,wife,family,romantic,friendship,professional
0,ltBBYdNzkeKdCNPDAsxwAA,1158,186.528497,0.0,0.0,0.0,0.0,21.588946,1.727116,2.590674,...,3.454231,4.317789,0.0,0.0,0.0,23.316062,27.633851,100.172712,55.267703,3.454231
2,Zx7n8mdt8OzLRXVzolXNhQ,1328,242.46988,0.0,0.0,0.0,0.0,24.096386,2.259036,1.506024,...,9.036145,7.53012,0.753012,0.0,0.0,26.355422,42.921687,115.210843,81.325301,3.012048
3,eaJCpC6IhYphj7bwCDHTwQ,202,79.207921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.90099,4.950495,0.0,0.0,0.0,19.80198,14.851485,29.70297,29.70297,4.950495


In [55]:
# For each relationship bin, add a "<bin>_pct" column holding the bin's share of
# all relationship words counted for the business (a fraction, not multiplied by 100).
# NOTE(review): a row with zero relationship words would presumably yield NaN here.
total_words = relationship_df["num_relationship_words"]
for bin_name in relationships_binned:
    relationship_df[bin_name + "_pct"] = relationship_df[bin_name] / total_words

relationship_df.head(3)

Unnamed: 0,business_id,num_reviews,num_relationship_words,aunt,bff,boo,boss,boyfriend,brother,child,...,uncle,wife,family,romantic,friendship,professional,family_pct,romantic_pct,friendship_pct,professional_pct
0,ltBBYdNzkeKdCNPDAsxwAA,1158,216,0,0,0,0,25,2,3,...,0,27,32,116,64,4,0.148148,0.537037,0.296296,0.018519
1,8QnuWGVNBhzyYXGSeRdi4g,18,3,0,0,0,0,0,0,0,...,0,0,1,2,0,0,0.333333,0.666667,0.0,0.0
2,Zx7n8mdt8OzLRXVzolXNhQ,1328,322,0,0,0,0,32,3,2,...,0,35,57,153,108,4,0.177019,0.475155,0.335404,0.012422


### Businesses

In [56]:
# Load the category definitions; each entry is read for its "alias", "title"
# and "parents" fields.
with open("categories.json") as json_file:
    categories = json.load(json_file)

# Two lookup tables built in a single pass:
#   category_dict          alias -> title (the full string that appears in the dataset)
#   category_dict_reverse  title -> alias
category_dict = {}
category_dict_reverse = {}
for entry in categories:
    category_dict[entry["alias"]] = entry["title"]
    category_dict_reverse[entry["title"]] = entry["alias"]

# Categories that we are interested in: the top-level ones, i.e. entries whose
# "parents" list is empty. (A hand-picked subset was used previously:
# 'active', 'arts', 'beautysvc', 'food', 'hotelstravel', 'localflavor',
# 'nightlife', 'restaurants', 'shopping'.)
categories_of_interest_raw = [entry["alias"] for entry in categories if entry["parents"] == []]
categories_of_interest = [category_dict[alias] for alias in categories_of_interest_raw]

# Titles first, then the matching aliases, one list per line.
print(categories_of_interest, categories_of_interest_raw, sep="\n")

['Active Life', 'Arts & Entertainment', 'Automotive', 'Beauty & Spas', 'Bicycles', 'Education', 'Event Planning & Services', 'Financial Services', 'Food', 'Health & Medical', 'Home Services', 'Hotels & Travel', 'Local Flavor', 'Local Services', 'Mass Media', 'Nightlife', 'Pets', 'Professional Services', 'Public Services & Government', 'Religious Organizations', 'Restaurants', 'Shopping']
['active', 'arts', 'auto', 'beautysvc', 'bicycles', 'education', 'eventservices', 'financialservices', 'food', 'health', 'homeservices', 'hotelstravel', 'localflavor', 'localservices', 'massmedia', 'nightlife', 'pets', 'professional', 'publicservicesgovt', 'religiousorgs', 'restaurants', 'shopping']


In [57]:
# Create indicator columns for each business category (a business may belong to multiple categories).
# (A pre-cleaned file, "yelp_academic_dataset_business_clean.csv", was used previously.)
businesses = pd.read_csv("yelp_academic_dataset_business.csv")
# Make sure the key column is called "business_id"; rename leaves the frame
# unchanged when there is no column named "business".
businesses = businesses.rename(columns={"business": "business_id"})

for category_str in categories_of_interest:
    colname = "is_" + category_dict_reverse[category_str]
    # regex=False: match the title as literal text rather than as a regular
    #   expression, so characters in a title can never be read as a pattern.
    # na=False: rows with a missing "categories" value become False directly,
    #   giving a plain boolean column without a separate fillna step.
    # NOTE(review): this is a substring test, so a title also matches any longer
    #   category name that contains it (e.g. "Food" inside another category's
    #   name) -- confirm that is the intended behaviour.
    businesses[colname] = businesses["categories"].str.contains(category_str, regex=False, na=False)
businesses.head()

Unnamed: 0,city,review_count,name,business_id,longitude,state,stars,address,latitude,metroarea,...,is_localflavor,is_localservices,is_massmedia,is_nightlife,is_pets,is_professional,is_publicservicesgovt,is_religiousorgs,is_restaurants,is_shopping
0,Tucson,22,Target,tUFrWirKiKi_TAnsVWINQQ,-110.880452,AZ,3.5,5255 E Broadway Blvd,32.223236,Tucson,...,False,False,False,False,False,False,False,False,False,True
1,Philadelphia,80,St Honore Pastries,MTSW4McQd7CbVtyjqoe9mw,-75.155564,PA,4.0,935 Race St,39.955505,Philadelphia,...,False,False,False,False,False,False,False,False,True,False
2,Nashville,10,Sonic Drive-In,bBDDEgkFA1Otx9Lfe7BZUQ,-86.76817,TN,1.5,2312 Dickerson Pike,36.208102,Nashville,...,False,False,False,False,False,False,False,False,True,False
3,Indianapolis,28,Denny's,il_Ro8jwPlHresjw9EGmBg,-86.127217,IN,2.5,8901 US 31 S,39.637133,Indianapolis,...,False,False,False,False,False,False,False,False,True,False
4,Philadelphia,245,Tuna Bar,MUTTqe8uqyMdBl186RmNeA,-75.143226,PA,4.0,205 Race St,39.953949,Philadelphia,...,False,False,False,False,False,False,False,False,True,False


## Final Merged Counts Dataframes

In [58]:
# Attach business attributes and category indicators to the raw counts.
# Inner join: only business_ids present in both frames are kept.
relationship_df_final = relationship_df.merge(businesses, how="inner", on="business_id")
relationship_df_final.head(3)

Unnamed: 0,business_id,num_reviews,num_relationship_words,aunt,bff,boo,boss,boyfriend,brother,child,...,is_localflavor,is_localservices,is_massmedia,is_nightlife,is_pets,is_professional,is_publicservicesgovt,is_religiousorgs,is_restaurants,is_shopping
0,ltBBYdNzkeKdCNPDAsxwAA,1158,216,0,0,0,0,25,2,3,...,False,False,False,True,False,False,False,False,True,False
1,8QnuWGVNBhzyYXGSeRdi4g,18,3,0,0,0,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
2,Zx7n8mdt8OzLRXVzolXNhQ,1328,322,0,0,0,0,32,3,2,...,False,False,False,False,False,False,False,False,True,False


In [59]:
# Attach business attributes and category indicators to the normalized counts.
# Inner join: only business_ids present in both frames are kept.
relationship_df_norm_final = relationship_df_norm.merge(businesses, how="inner", on="business_id")
relationship_df_norm_final.head(3)

Unnamed: 0,business_id,num_reviews,num_relationship_words,aunt,bff,boo,boss,boyfriend,brother,child,...,is_localflavor,is_localservices,is_massmedia,is_nightlife,is_pets,is_professional,is_publicservicesgovt,is_religiousorgs,is_restaurants,is_shopping
0,ltBBYdNzkeKdCNPDAsxwAA,1158,186.528497,0.0,0.0,0.0,0.0,21.588946,1.727116,2.590674,...,False,False,False,True,False,False,False,False,True,False
1,Zx7n8mdt8OzLRXVzolXNhQ,1328,242.46988,0.0,0.0,0.0,0.0,24.096386,2.259036,1.506024,...,False,False,False,False,False,False,False,False,True,False
2,eaJCpC6IhYphj7bwCDHTwQ,202,79.207921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,True


In [60]:
# Write final CSV files (without the index column) to "output_urbcomp/".
# The v2 flag only selects the file-name suffixes; an "output/" directory was
# used previously instead.
if v2:
    counts_suffix, norm_suffix = "_counts_final_v2.csv", "_counts_norm_final_v2.csv"
else:
    counts_suffix, norm_suffix = "_counts_final.csv", "_counts_norm_final.csv"

for frame, suffix in (
    (relationship_df_final, counts_suffix),
    (relationship_df_norm_final, norm_suffix),
):
    frame.to_csv("output_urbcomp/" + metro + suffix, index=False)