In [1]:
import time
import pandas as pd
import numpy as np
from columnExpander import DictColumnExpander, ListColumnExpander

In [2]:
# Load the business records; `lines=True` means the file holds one JSON
# object per line rather than a single JSON array.
biz_df = pd.read_json("./data/business.json", lines=True)

# Pull "RestaurantsPriceRange2" out of each row's `attributes` value.
# Rows whose `attributes` is not a dict, and dicts that lack the key,
# both map to NaN so the column has a single missing-value marker.
biz_df["price"] = biz_df["attributes"].apply(
    lambda attrs: attrs.get("RestaurantsPriceRange2", np.nan)
    if isinstance(attrs, dict)
    else np.nan
)

In [3]:
# Work on the first 120 rows only, to keep experimentation fast.
df = biz_df.iloc[:120]

### Using `DictColumnExpander` to extract the `attributes` column

In [4]:
# Expander configured for the `attributes` column; it is fitted and applied
# to the frame in the next cell via `fit_transform`.
dce = DictColumnExpander("attributes")

In [5]:
# Measure elapsed time with a monotonic, high-resolution clock so the reading
# is not affected by system clock adjustments during the run.
start = time.perf_counter()

# Fit the expander on the subset and transform it in one step;
# verbose=True prints the progress lines shown in the cell output.
all_attrs = dce.fit_transform(df, verbose=True)

print(round(time.perf_counter() - start, 2), "seconds")

Starting transformation:
0.00% Completed
10.00% Completed
20.00% Completed
30.00% Completed
40.00% Completed
50.00% Completed
60.00% Completed
70.00% Completed
80.00% Completed
90.00% Completed
Transformation Complete
0.07 seconds


In [6]:
# Convert the transformed attributes to a dense array and label each column
# with the feature names reported by the fitted expander.
attr_matrix = all_attrs.toarray()
attr_columns = dce.get_feature_names()
attr_dummies = pd.DataFrame(attr_matrix, columns=attr_columns)

# Preview the first rows.
attr_dummies.head()

Unnamed: 0,GoodForKids_False,GoodForKids_True,RestaurantsReservations_False,RestaurantsReservations_True,GoodForMeal_latenight,GoodForMeal_dessert,GoodForMeal_breakfast,GoodForMeal_brunch,GoodForMeal_lunch,GoodForMeal_dinner,...,Smoking_outdoor,AgesAllowed_allages,HairSpecializesIn_kids,HairSpecializesIn_africanamerican,HairSpecializesIn_perms,HairSpecializesIn_coloring,HairSpecializesIn_extensions,HairSpecializesIn_curly,HairSpecializesIn_asian,HairSpecializesIn_straightperms
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Using `ListColumnExpander` to extract the `categories` column

In [7]:
# `ListColumnExpander` comes from the import cell at the top of the notebook,
# so no import is needed here.
# Expander configured for the `categories` column.
lce = ListColumnExpander("categories")

In [8]:
# Fit the categories expander on the subset and transform it in one step;
# verbose=True prints the progress lines shown in the cell output.
# NOTE(review): the result is later densified with `.toarray()`, so it is
# presumably a sparse matrix — confirm against columnExpander.
all_cats = lce.fit_transform(df, verbose = True)

Starting transformation:
0.00% Completed
9.91% Completed
19.81% Completed
29.72% Completed
39.62% Completed
49.53% Completed
59.43% Completed
69.34% Completed
79.25% Completed
89.15% Completed
99.06% Completed
Transformation Complete


In [10]:
# Convert the transformed categories to a dense array and label each column
# with the feature names reported by the fitted expander.
cat_matrix = all_cats.toarray()
cat_columns = lce.get_feature_names()
cat_dummies = pd.DataFrame(cat_matrix, columns=cat_columns)

# Preview the first rows.
cat_dummies.head()

Unnamed: 0,categories_Beauty & Spas,categories_Convenience Stores,categories_Hookah Bars,categories_Naturopathic/Holistic,categories_Chicken Wings,categories_Art Classes,categories_Books,categories_Department Stores,categories_Middle Eastern,categories_Lawn Services,...,categories_Specialty Food,categories_Tex-Mex,categories_Gyms,categories_Vegan,categories_Arts & Entertainment,categories_Financial Services,categories_Florists,categories_Internet Service Providers,categories_Hotels & Travel,categories_Cupcakes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Combine the attribute and category indicator frames side by side,
# matching rows on their index (inner join, the merge default).
total_dummies = attr_dummies.merge(
    cat_dummies,
    how="inner",
    left_index=True,
    right_index=True,
)

# Preview the first rows.
total_dummies.head()

Unnamed: 0,GoodForKids_False,GoodForKids_True,RestaurantsReservations_False,RestaurantsReservations_True,GoodForMeal_latenight,GoodForMeal_dessert,GoodForMeal_breakfast,GoodForMeal_brunch,GoodForMeal_lunch,GoodForMeal_dinner,...,categories_Specialty Food,categories_Tex-Mex,categories_Gyms,categories_Vegan,categories_Arts & Entertainment,categories_Financial Services,categories_Florists,categories_Internet Service Providers,categories_Hotels & Travel,categories_Cupcakes
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
