In [1]:
import json
import pandas as pd
import re
from IPython.utils.text import marquee

pd.options.display.max_colwidth = 100
pd.options.display.max_rows = 1000

In [2]:
def ToExcel(df, filename):
    with pd.ExcelWriter(filename, engine='xlsxwriter', engine_kwargs={'options':{'strings_to_numbers': True}}) as writer:
        df.to_excel(writer, sheet_name='Sheet1', startrow=1, header=False, index=False)
        (max_row, max_col) = df.shape
        column_settings = [{'header' : c} for c in df.columns]
        writer.sheets['Sheet1'].add_table(0, 0, max_row, max_col-1, {'columns' : column_settings})

# Read MinnesotaStateFair JSON file

In [3]:
with open('./data/MinnesotaStateFair-2023.json', 'rb') as f:
    js = json.loads(f.read())

In [4]:
sorted(list(js.keys()))

['_entertainerWords',
 '_vendorWords',
 'days',
 'entertainmentCategories',
 'entertainmentEntertainers',
 'entertainmentEvents',
 'entertainmentTickets',
 'locations',
 'newAttractions',
 'nonFairEvents',
 'sponsorEntertainers',
 'sponsorLocations',
 'sponsors',
 'vendorCategories',
 'vendorNewThisYear',
 'vendors',
 'versions']

In [38]:
tables = [
    'days', 
    'entertainmentCategories', 'entertainmentEntertainers', 'entertainmentEvents',
    'locations', 'newAttractions',     
    'sponsors', 'sponsorLocations',
    'vendors', 'vendorCategories', 'vendorNewThisYear', 
]

for table in tables:
    df = pd.DataFrame().from_dict(js[table], orient='columns')
    df.to_csv(f'./csv/{table}.csv', index=False)
    ToExcel(df, (f'./excel/{table}.xlsx'))

# Events

In [6]:
df_entertainment_events = pd.read_csv('./csv/entertainmentEvents.csv', parse_dates=['datetime_start', 'datetime_end'])#, dtype={'vendor_id': 'object'})
display(df_entertainment_events.head(1))

Unnamed: 0,id_event,id_entertainer,id_location,slug,datetime_start,datetime_end,tour_name,billing_order,ticket_url,seating_description,all_day_flag,name,type,latitude,longitude,location_path,location
0,2,1,115,duran-duran,2023-08-31 19:00:00,NaT,,1,https://event.etix.com/ticket/p/48181933/,,0,DURAN DURAN: FUTURE PAST with special guests Bastille and Nile Rodgers & CHIC,grandstand,44.981997,-93.173279,Grandstand Building ➜ Grandstand Stage,Grandstand Stage


In [7]:
df_entertainment_events.isna().sum()

id_event                  0
id_entertainer            0
id_location               0
slug                      1
datetime_start            0
datetime_end           1902
tour_name              3827
billing_order             0
ticket_url             3819
seating_description    3828
all_day_flag              0
name                      0
type                      0
latitude                138
longitude               138
location_path          2510
location                  0
dtype: int64

In [8]:
df_entertainment_events[df_entertainment_events['latitude'].isna()].head()

Unnamed: 0,id_event,id_entertainer,id_location,slug,datetime_start,datetime_end,tour_name,billing_order,ticket_url,seating_description,all_day_flag,name,type,latitude,longitude,location_path,location
1509,2064,460,1,daily-parade-marching-bands-1,2023-08-24 14:00:00,NaT,,0,,,0,Daily Parade - Marching Bands,schedule,,,,Fairgrounds
1510,2066,461,1,daily-parade-marching-bands-2,2023-08-25 14:00:00,NaT,,0,,,0,Daily Parade - Marching Bands,schedule,,,,Fairgrounds
1511,2068,462,1,daily-parade-marching-bands-3,2023-08-26 14:00:00,NaT,,0,,,0,Daily Parade - Marching Bands,schedule,,,,Fairgrounds
1512,2070,463,1,daily-parade-marching-bands-4,2023-08-27 14:00:00,NaT,,0,,,0,Daily Parade - Marching Bands,schedule,,,,Fairgrounds
1513,2072,464,1,daily-parade-marching-bands-5,2023-08-28 14:00:00,NaT,,0,,,0,Daily Parade - Marching Bands,schedule,,,,Fairgrounds


In [10]:
for col in ['type', 'all_day_flag', 'name']:
    print(marquee(col))
    display(df_entertainment_events[col].value_counts().head(10))

************************************ type ************************************


type
schedule      3816
grandstand      12
Name: count, dtype: int64

******************************** all_day_flag ********************************


all_day_flag
0    2892
1     936
Name: count, dtype: int64

************************************ name ************************************


name
Fish Pond Talks                     83
Moo Booth: Video Presentations      66
Moo Booth: Milking Demonstration    59
Looking Inside the Hive             59
Surgery Suite                       48
Timberworks Lumberjack Show         48
Harvesting Honey                    47
4-H State Arts-In Musical           38
Thank a Farmer Magic Show           36
All-Star Stunt Dogs Splash          36
Name: count, dtype: int64

In [11]:
df_entertainment_events[df_entertainment_events['all_day_flag'] == 1]

Unnamed: 0,id_event,id_entertainer,id_location,slug,datetime_start,datetime_end,tour_name,billing_order,ticket_url,seating_description,all_day_flag,name,type,latitude,longitude,location_path,location
868,1136,183,330,air-quality-what-you-can-do,2023-08-24 09:00:00,2023-08-24 21:00:00,,0,,,1,"Air Quality, What You Can Do",schedule,44.984882,-93.167999,,Eco Experience
869,1138,183,330,air-quality-what-you-can-do,2023-08-25 09:00:00,2023-08-25 21:00:00,,0,,,1,"Air Quality, What You Can Do",schedule,44.984882,-93.167999,,Eco Experience
870,1139,183,330,air-quality-what-you-can-do,2023-08-26 09:00:00,2023-08-26 21:00:00,,0,,,1,"Air Quality, What You Can Do",schedule,44.984882,-93.167999,,Eco Experience
871,1140,183,330,air-quality-what-you-can-do,2023-08-27 09:00:00,2023-08-27 21:00:00,,0,,,1,"Air Quality, What You Can Do",schedule,44.984882,-93.167999,,Eco Experience
872,1141,183,330,air-quality-what-you-can-do,2023-08-28 09:00:00,2023-08-28 21:00:00,,0,,,1,"Air Quality, What You Can Do",schedule,44.984882,-93.167999,,Eco Experience
873,1142,183,330,air-quality-what-you-can-do,2023-08-29 09:00:00,2023-08-29 21:00:00,,0,,,1,"Air Quality, What You Can Do",schedule,44.984882,-93.167999,,Eco Experience
874,1143,183,330,air-quality-what-you-can-do,2023-08-30 09:00:00,2023-08-30 21:00:00,,0,,,1,"Air Quality, What You Can Do",schedule,44.984882,-93.167999,,Eco Experience
875,1144,183,330,air-quality-what-you-can-do,2023-08-31 09:00:00,2023-08-31 21:00:00,,0,,,1,"Air Quality, What You Can Do",schedule,44.984882,-93.167999,,Eco Experience
876,1145,183,330,air-quality-what-you-can-do,2023-09-01 09:00:00,2023-09-01 21:00:00,,0,,,1,"Air Quality, What You Can Do",schedule,44.984882,-93.167999,,Eco Experience
877,1146,183,330,air-quality-what-you-can-do,2023-09-02 09:00:00,2023-09-02 21:00:00,,0,,,1,"Air Quality, What You Can Do",schedule,44.984882,-93.167999,,Eco Experience


In [12]:
len(df_entertainment_events['name'].unique())

906

In [13]:
df_entertainment_events.head(1)

Unnamed: 0,id_event,id_entertainer,id_location,slug,datetime_start,datetime_end,tour_name,billing_order,ticket_url,seating_description,all_day_flag,name,type,latitude,longitude,location_path,location
0,2,1,115,duran-duran,2023-08-31 19:00:00,NaT,,1,https://event.etix.com/ticket/p/48181933/,,0,DURAN DURAN: FUTURE PAST with special guests Bastille and Nile Rodgers & CHIC,grandstand,44.981997,-93.173279,Grandstand Building ➜ Grandstand Stage,Grandstand Stage


In [14]:
df_entertainment_events['id_event'].value_counts()

id_event
2       1
3377    1
3364    1
3365    1
3366    1
       ..
1716    1
1718    1
1720    1
1722    1
5137    1
Name: count, Length: 3828, dtype: int64

In [15]:
df_events = (
    df_entertainment_events
    .assign(date = lambda df_: df_['datetime_start'].dt.date)
    .assign(time = lambda df_: df_['datetime_start'].dt.time)
    .drop(columns=['id_entertainer', 'slug', 'tour_name', 'billing_order', 'seating_description', 'type', 'location_path']) #, 'datetime_start', 'datetime_end'])
    .rename(columns={'id_event': 'id', 'all_day_flag': 'isAllDay'})
    .astype({'id': 'object', 'id_location': 'object', 'isAllDay': 'boolean'})
)
df_events.dtypes

id                        object
id_location               object
datetime_start    datetime64[ns]
datetime_end      datetime64[ns]
ticket_url                object
isAllDay                 boolean
name                      object
latitude                 float64
longitude                float64
location                  object
date                      object
time                      object
dtype: object

In [16]:
df_events.groupby(['name', 'date'])['time'].agg(times=list).reset_index()

Unnamed: 0,name,date,times
0,204th Army Band Brass Quintet,2023-08-28,"[12:00:00, 13:00:00, 14:00:00]"
1,204th Army Band Brass Quintet,2023-08-29,"[12:00:00, 13:00:00, 14:00:00]"
2,50th Annual MSF Amateur Talent Contest,2023-08-24,[18:00:00]
3,50th Annual MSF Amateur Talent Contest,2023-08-25,[18:00:00]
4,50th Annual MSF Amateur Talent Contest,2023-08-26,[18:00:00]
5,50th Annual MSF Amateur Talent Contest,2023-08-27,[18:00:00]
6,50th Annual MSF Amateur Talent Contest,2023-08-28,[18:00:00]
7,50th Annual MSF Amateur Talent Contest,2023-08-29,[18:00:00]
8,50th Annual MSF Amateur Talent Contest,2023-08-30,[18:00:00]
9,50th Annual MSF Amateur Talent Contest,2023-08-31,[18:00:00]


In [16]:
df = df_events.query('name == "Timberworks Lumberjack Show"').sort_values('datetime_start')
df

Unnamed: 0,id,id_location,datetime_start,datetime_end,ticket_url,isAllDay,name,latitude,longitude,location,date,time
424,572,140,2023-08-24 12:00:00,NaT,,False,Timberworks Lumberjack Show,44.985378,-93.168707,The North Woods Stage,2023-08-24,12:00:00
425,573,140,2023-08-24 15:00:00,NaT,,False,Timberworks Lumberjack Show,44.985378,-93.168707,The North Woods Stage,2023-08-24,15:00:00
426,574,140,2023-08-24 17:30:00,NaT,,False,Timberworks Lumberjack Show,44.985378,-93.168707,The North Woods Stage,2023-08-24,17:30:00
449,597,140,2023-08-24 19:00:00,NaT,,False,Timberworks Lumberjack Show,44.985378,-93.168707,The North Woods Stage,2023-08-24,19:00:00
427,575,140,2023-08-25 12:00:00,NaT,,False,Timberworks Lumberjack Show,44.985378,-93.168707,The North Woods Stage,2023-08-25,12:00:00
438,586,140,2023-08-25 15:00:00,NaT,,False,Timberworks Lumberjack Show,44.985378,-93.168707,The North Woods Stage,2023-08-25,15:00:00
450,598,140,2023-08-25 17:30:00,NaT,,False,Timberworks Lumberjack Show,44.985378,-93.168707,The North Woods Stage,2023-08-25,17:30:00
461,610,140,2023-08-25 19:00:00,NaT,,False,Timberworks Lumberjack Show,44.985378,-93.168707,The North Woods Stage,2023-08-25,19:00:00
428,576,140,2023-08-26 12:00:00,NaT,,False,Timberworks Lumberjack Show,44.985378,-93.168707,The North Woods Stage,2023-08-26,12:00:00
439,587,140,2023-08-26 15:00:00,NaT,,False,Timberworks Lumberjack Show,44.985378,-93.168707,The North Woods Stage,2023-08-26,15:00:00


In [17]:
len(df_events.groupby(['name', 'location'])['id'].count().reset_index())

915

In [18]:
len(df_events.groupby(['name', 'location'])['id'].count().reset_index().drop_duplicates(subset=['name']))

906

In [19]:
print(df.groupby(['name', 'date'])['time'].agg(times=list).reset_index().to_json(orient='records', indent=2))


[
  {
    "name":"Timberworks Lumberjack Show",
    "date":1692835200000,
    "times":[
      "12:00:00",
      "15:00:00",
      "17:30:00",
      "19:00:00"
    ]
  },
  {
    "name":"Timberworks Lumberjack Show",
    "date":1692921600000,
    "times":[
      "12:00:00",
      "15:00:00",
      "17:30:00",
      "19:00:00"
    ]
  },
  {
    "name":"Timberworks Lumberjack Show",
    "date":1693008000000,
    "times":[
      "12:00:00",
      "15:00:00",
      "17:30:00",
      "19:00:00"
    ]
  },
  {
    "name":"Timberworks Lumberjack Show",
    "date":1693094400000,
    "times":[
      "12:00:00",
      "15:00:00",
      "17:30:00",
      "19:00:00"
    ]
  },
  {
    "name":"Timberworks Lumberjack Show",
    "date":1693180800000,
    "times":[
      "12:00:00",
      "15:00:00",
      "17:30:00",
      "19:00:00"
    ]
  },
  {
    "name":"Timberworks Lumberjack Show",
    "date":1693267200000,
    "times":[
      "12:00:00",
      "15:00:00",
      "17:30:00",
      "19:00:00"
 

In [21]:
print(dd.set_index('name').to_json(orient='table', indent=2))

NameError: name 'dd' is not defined

In [22]:
df_events = (
    df_entertainment_events
    .query('name == "Timberworks Lumberjack Show"')
    .drop(columns=['id_entertainer', 'slug', 'tour_name', 'billing_order', 'seating_description', 'type', 'location_path'])
    .rename(columns={'id_event': 'id', 'all_day_flag': 'isAllDay'})
    .astype({'id': 'object', 'id_location': 'object', 'isAllDay': 'boolean'})
    .assign(date = lambda df_:df_['datetime_start'].dt.date)
    .assign(time = lambda df_:df_['datetime_start'].dt.time)
)

#print(df_events.shape)
#display(df_events.sample(20))
    

df_events = (
    df_events
#    .merge(df_events.groupby(['name'])['datetime_start'].agg(datetime_starts=list).reset_index(), how='left', on='name')
    .merge(df_events.groupby(['name', 'date'])['time'].agg(times=list).reset_index(), how='left')
#    .drop_duplicates(subset=['name'])
    .drop(columns=['datetime_start', 'datetime_end', 'time'])
)

#print(df_events.shape)

#display(df_events.sample(20))
#display(df_events.dtypes)


location_columns = ['location', 'latitude', 'longitude']
df_event_locations = ( 
    df_events[['id_location'] + location_columns]
    .rename(columns={'id_location' : 'id'})
    .drop_duplicates(subset=['id'])
)
    
                               
df_events = (
    df_events.drop(columns=location_columns)
)

#display(df_events.head())
#print(df_events.shape)

#display(df_event_locations.head())
#print(df_event_locations.shape)

print(len(df_events))
df_events

48


Unnamed: 0,id,id_location,ticket_url,isAllDay,name,date,times
0,572,140,,False,Timberworks Lumberjack Show,2023-08-24,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"
1,573,140,,False,Timberworks Lumberjack Show,2023-08-24,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"
2,574,140,,False,Timberworks Lumberjack Show,2023-08-24,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"
3,575,140,,False,Timberworks Lumberjack Show,2023-08-25,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"
4,576,140,,False,Timberworks Lumberjack Show,2023-08-26,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"
5,577,140,,False,Timberworks Lumberjack Show,2023-08-27,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"
6,578,140,,False,Timberworks Lumberjack Show,2023-08-28,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"
7,579,140,,False,Timberworks Lumberjack Show,2023-08-29,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"
8,580,140,,False,Timberworks Lumberjack Show,2023-08-30,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"
9,581,140,,False,Timberworks Lumberjack Show,2023-08-31,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"


In [23]:
df_events.groupby(['name', 'date'])['times'].agg(ttt=list).reset_index() # .sample(50)

Unnamed: 0,name,date,ttt
0,Timberworks Lumberjack Show,2023-08-24,"[[12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, ..."
1,Timberworks Lumberjack Show,2023-08-25,"[[12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, ..."
2,Timberworks Lumberjack Show,2023-08-26,"[[12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, ..."
3,Timberworks Lumberjack Show,2023-08-27,"[[12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, ..."
4,Timberworks Lumberjack Show,2023-08-28,"[[12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, ..."
5,Timberworks Lumberjack Show,2023-08-29,"[[12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, ..."
6,Timberworks Lumberjack Show,2023-08-30,"[[12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, ..."
7,Timberworks Lumberjack Show,2023-08-31,"[[12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, ..."
8,Timberworks Lumberjack Show,2023-09-01,"[[12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, ..."
9,Timberworks Lumberjack Show,2023-09-02,"[[12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, 15:00:00, 17:30:00, 19:00:00], [12:00:00, ..."


In [25]:
df_events

Unnamed: 0,id,id_location,ticket_url,isAllDay,name,date,times
0,572,140,,False,Timberworks Lumberjack Show,2023-08-24,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"
1,573,140,,False,Timberworks Lumberjack Show,2023-08-24,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"
2,574,140,,False,Timberworks Lumberjack Show,2023-08-24,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"
3,575,140,,False,Timberworks Lumberjack Show,2023-08-25,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"
4,576,140,,False,Timberworks Lumberjack Show,2023-08-26,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"
5,577,140,,False,Timberworks Lumberjack Show,2023-08-27,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"
6,578,140,,False,Timberworks Lumberjack Show,2023-08-28,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"
7,579,140,,False,Timberworks Lumberjack Show,2023-08-29,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"
8,580,140,,False,Timberworks Lumberjack Show,2023-08-30,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"
9,581,140,,False,Timberworks Lumberjack Show,2023-08-31,"[12:00:00, 15:00:00, 17:30:00, 19:00:00]"


In [24]:
df_events.name.value_counts()

name
Timberworks Lumberjack Show    48
Name: count, dtype: int64

In [25]:
df_events[df_events['location'] == "O'Gara's at the Fair"]

KeyError: 'location'

# Vendors New This Year

In [39]:
df_new_vendors = pd.read_csv('./csv/vendorNewThisYear.csv', dtype={'vendor_id': 'object'})
display(df_new_vendors.head(1))

columns = ['vendor_id', 'name', 'excerpt', 'description']
df_new_vendors = df_new_vendors[columns]
df_new_vendors.head(1)

Unnamed: 0,id,vendor_id,account_code,exhibitor_id,site_num,branch_num,type,sort_order,name,slug,excerpt,description,directions,notes
0,1,5146.1,5146,20887,1,1,food,1,Al Taco Baba,al-taco-baba,"Traditional hummus, harissa barbacoa, corn, queso fresco, chili dust, shatta (hot sauce), crema,...","Traditional hummus, harissa barbacoa, corn, queso fresco, chili dust, shatta (hot sauce), crema,...",,


Unnamed: 0,vendor_id,name,excerpt,description
0,5146.1,Al Taco Baba,"Traditional hummus, harissa barbacoa, corn, queso fresco, chili dust, shatta (hot sauce), crema,...","Traditional hummus, harissa barbacoa, corn, queso fresco, chili dust, shatta (hot sauce), crema,..."


In [40]:
df_new_vendors.dtypes

vendor_id      object
name           object
excerpt        object
description    object
dtype: object

In [41]:
df_new_vendors.columns

Index(['vendor_id', 'name', 'excerpt', 'description'], dtype='object')

In [42]:
df_new_vendors[df_new_vendors['excerpt'] != df_new_vendors['description']]

Unnamed: 0,vendor_id,name,excerpt,description
37,2764.1,New Vendor,"MomoDosa serves official new food Chicken Momo With Tomato Chutney (blend of ground chicken, cab...","MomoDosa serves official new food Chicken Momo With Tomato Chutney (blend of ground chicken, cab..."
38,8386.1,New Vendor,"Peachey’s Baking Company serves Amish doughnuts made on-site using traditional Amish recipes, to...","Peachey’s Baking Company serves Amish doughnuts made on-site using old Amish recipes, topped wit..."


# Vendor Categories

In [43]:
df_vendor_categories = pd.read_csv('./csv/vendorCategories.csv')
df_vendor_categories

Unnamed: 0,id_cat,category,has_merchandise,has_food
0,9,Media,1,0
1,21,Political,1,0
2,40,Beer & Wine,0,1
3,41,New Vendor,1,1
4,42,Location Change,0,1
5,61,On-A-Stick,0,1
6,63,Open Early,0,1
7,70,Blue Ribbon Bargain Book,1,1
8,71,"Opening Day, Aug. 24",1,1
9,72,"Seniors Day, Aug. 28 & 31",1,1


# Vendors

In [44]:
df_vendors = pd.read_csv('./csv/vendors.csv', dtype={'id':'object'})
df_vendors.query('name == "Mazda"')

Unnamed: 0,id,exhibitor_id,account_code,lic_num,account_branch_id,license_branch_id,site_num,branch_num,name,slug,...,alcoholic_beverages,license_text,products,keywords,directions,latitude,longitude,is_food,categories,promotions
738,4631.1,20702,4631,4631,4631-1,4631-1,1,1,Mazda,4631.1,...,,"Promotion and display of Mazda vehicles, photo ops and giveaways. Sponsor of the Nightly Firewor...","Promotion and display of Mazda vehicles, photo ops and giveaways. Sponsor of the Nightly Firewor...",,Outside southwest corner of the Grandstand,44.98106,-93.174351,0,"[1, 10]",[]


In [45]:
df_vendors['name'].value_counts().head()

name
Pronto Pups                   8
Dandy Souvenirs               8
About a Foot Long Hot Dog     5
HomeTown Mobility Rental      5
Bridge n' Barrel Root Beer    4
Name: count, dtype: int64

In [34]:
print(df_vendors.shape)
print(df_vendors.dtypes)

(973, 22)
id                         object
exhibitor_id                int64
account_code                int64
lic_num                     int64
account_branch_id          object
license_branch_id          object
site_num                    int64
branch_num                  int64
name                       object
slug                      float64
description                object
additional_information     object
alcoholic_beverages        object
license_text               object
products                   object
keywords                   object
directions                 object
latitude                  float64
longitude                 float64
is_food                     int64
categories                 object
promotions                 object
dtype: object


In [46]:
len(df_vendors['name'].unique())

918

In [47]:
# description - not useful
df_vendors['description'].unique()[:20]

array(['Open 9 a.m. to 10 p.m. (9 p.m. Labor Day)\nAccepts credit cards',
       nan,
       'Open 9 a.m. to 9 p.m. (8 p.m. Labor Day)\nAccepts credit cards\nVegan options (Irish Apple Tipsy Pie)\nGluten-friendly options (Minty Magic & Irish Pecan tarts; please confirm with vendor)',
       'Open 7 a.m. to 9 p.m.\nAccepts credit cards\nVegan options (non-dairy milk upon request)\nGluten-friendly options (banana chocolate chip muffin, scotcharoo bar; please confirm with vendor)',
       'Open 7 a.m. to 10:30 p.m. (9 p.m. Labor Day)\nAccepts credit cards',
       "Proud sponsor of Schell's Stage\nOpen 7 a.m. to 10:30 p.m. (9 p.m. Labor Day)\nAccepts credit cards",
       'Open 9 a.m. to 11 p.m. (9 p.m. Labor Day)\nAccepts credit cards\nVegan options (falafel)\nVegetarian options (tirokroketes, feta bites, french fries)',
       'Open 7 a.m. to 9 p.m.\nAccepts credit cards',
       'Open 9 a.m. to 10 p.m. (9 p.m. Labor Day)\nCash only',
       'Open 7:30 a.m. to 10:30 p.m. (9 p.m. Labor D

In [48]:
# new_this_year - not useful
df_vendors['new_this_year'].unique()

KeyError: 'new_this_year'

In [49]:
# promotions - not really needed
df_vendors['promotions'].unique()[:20]

array(['[]',
       "{'Blue Ribbon Bargain Book': [{'offer': '$3 off one regular-priced hand pie (Regularly $9)', 'category': 'Blue Ribbon Bargain Book'}], 'Giveaways (Free)': [{'offer': 'Buttons with purchase', 'category': 'Giveaways (Free)'}]}",
       "{'Blue Ribbon Bargain Book': [{'offer': '$2 off one sno cone (Regularly $5)', 'category': 'Blue Ribbon Bargain Book'}]}",
       "{'Blue Ribbon Bargain Book': [{'offer': '$2 off one order of popcorn shrimp (Regularly $6) OR $1 off one small fresh-squeezed (shaken, not stirred) lemonade (Regularly $5)', 'category': 'Blue Ribbon Bargain Book'}]}",
       "{'Blue Ribbon Bargain Book': [{'offer': '$3 off one Official Minnesota State Fair Commemorative Art Poster (Regularly $10)', 'category': 'Blue Ribbon Bargain Book'}]}",
       "{'Blue Ribbon Bargain Book': [{'offer': '$3 off one Island Lemonade with Hawaiian POG\\njuice (passion fruit, orange and guava)\\n(Regularly $9)\\n', 'category': 'Blue Ribbon Bargain Book'}]}",
       "{'Prize D

In [50]:
cols = ['id', 'name', 'alcoholic_beverages', 'license_text', 'keywords', 'directions', 'latitude', 'longitude', 'is_food', 'categories']
df_vendors_filter_columns = (
    df_vendors
    .assign(name = lambda df_:df_['name'].str.strip())
    .loc[:, cols]
    .fillna('')
)

print(df_vendors_filter_columns.shape)
df_vendors_filter_columns.head(5)

(973, 10)


Unnamed: 0,id,name,alcoholic_beverages,license_text,keywords,directions,latitude,longitude,is_food,categories
0,1023.1,Pronto Pups,,"bottled water, Coca-Cola, Diet Coke, Sprite, Pronto Pup on-a-stick (6"" flour-battered deep-fried...","prontopup, pronto-pup, pronto pup, prnto pup, pronto up, waterbottle, bottle, coke, cola, soda, ...",West side of Underwood St. between Dan Patch & Carnes avenues,44.980559,-93.170795,1,"[2, 34, 37, 61]"
1,1286.1,West Indies Soul Food,,"Turmeric Ginger Lemon Surprise (fresh ginger, turmeric syrup, dash of bitters, Original Caribbea...","ice tea, icetea, icedtea, iced-tea, ice-tea, sweettea, sweet-tea, waterbottle, bottle, ice tea, ...","At the International Bazaar, south wall",44.978029,-93.169055,1,"[2, 34, 38, 61]"
2,1288.1,Sara's Tipsy Pies,,"Tipsy Pecan Tart (gluten-friendly pie infused with alcohol), coffee, Minty Magic Tart (gluten-fr...","saras, sarah's, tipsypie, sarastipsy, tipsee, tispy, breakfast, gluten-free, gluten free, new, w...","In the Food Building, northwest wall",44.98048,-93.169951,1,"[2, 34, 70, 76]"
3,1328.1,The Anchor Coffee House,,"gluten-free banana chocolate chip muffins, cold brew coffee, nitro cold brew coffee, Gluten frie...","breakfast, gluten free, gluten-free",West side of Underwood St. between Dan Patch and Carnes avenues,44.980515,-93.17077,1,"[2, 63]"
4,1416.1,LuLu's Public House,Arnold Palmer Spiked Raspberry Slushy\nBack Channel Edna's IPA\nBauhaus Nah Blonde (Non-Alcoholi...,"beer, Schell's Oktoberfest Infused Brat (Oktoberfest infused fried onions, brat bun), Southern C...","lulus, lu lu, looloos, loo loo, lu-lu, breakfast, new, walleye tacos, all beef, allbeef, tater t...","At West End Market, south of Schilling Amphitheater",44.981353,-93.177652,1,"[2, 34, 35, 38, 39, 40, 61, 63]"


In [51]:
df_vendors_filter_columns.dtypes

id                     object
name                   object
alcoholic_beverages    object
license_text           object
keywords               object
directions             object
latitude               object
longitude              object
is_food                 int64
categories             object
dtype: object

In [52]:
# 0 = not food, don't know what the other values represent
df_vendors_filter_columns['is_food'].value_counts()

is_food
0    698
1    275
Name: count, dtype: int64

In [53]:
df_vendors_filter_columns.query('is_food == 5')

Unnamed: 0,id,name,alcoholic_beverages,license_text,keywords,directions,latitude,longitude,is_food,categories


In [54]:
df_vendors_filter_columns.query('is_food == 2').head(10)

Unnamed: 0,id,name,alcoholic_beverages,license_text,keywords,directions,latitude,longitude,is_food,categories


In [55]:
df_vendors_filter_columns['categories'].value_counts()

categories
[1, 38]                                96
[1]                                    81
[1, 34, 38]                            78
[1, 34]                                52
[3, 1]                                 48
[2, 34]                                39
[4, 1, 38]                             29
[1, 8]                                 26
[2, 34, 61]                            24
[2]                                    19
[1, 41]                                18
[2, 34, 70]                            17
[2, 34, 63]                            15
[2, 61]                                13
[1, 8, 34]                             12
[1, 34, 38, 70]                        11
[2, 63]                                11
[3, 1, 10]                             11
[1, 70]                                 9
[3, 1, 38]                              9
[2, 34, 63, 70]                         8
[1, 42]                                 8
[4, 1]                                  8
[2, 70]                

In [56]:
# category 41 : New Food/Flavor
mask = df_vendors_filter_columns['categories'].apply(lambda x: '41' in x)
print(df_vendors_filter_columns[mask].shape)
df_vendors_filter_columns[mask]

(71, 10)


Unnamed: 0,id,name,alcoholic_beverages,license_text,keywords,directions,latitude,longitude,is_food,categories
384,2764.1,Midtown Global Market's MomoDosa,,"Chicken Momo With Tomato Chutney (ground chicken, cabbage, onion, ginger, steamed in a dough wra...","new, waterbottle, bottle, waterbottle, bottle","In the Taste of the Midtown Global Market booth at the International Bazaar, east wall",44.978206,-93.168495,1,"[2, 34, 35, 41]"
510,3089.2,The Perfect Pickle,,"pickles on-a-stick, deep-fried pickles (Cajun style, optional cream cheese), Pepsi, Diet Pepsi, ...","deep fried pickles, soda, pop, cola, soda, pop, cola, diet soda, diet pop, sierra mist, lemon, l...",West side of Underwood Street at Lee Avenue,44.985687,-93.17073,1,"[2, 41, 61]"
824,5733.3,Bridge n' Barrel Root Beer,,"Coca-Cola, Diet Coke, Sprite, bottled water, Lift Bridge Root Beer, Lift Bridge Black Cherry Sod...","bridge and barrel, brdige n barrel, rootbeer, barrell, barel, new, coke, cola, soda, pop, coke, ...",North side of Dan Patch Ave. between Underwood and Nelson streets,44.98133,-93.171214,1,"[2, 39, 41, 70]"
825,5733.4,Bridge n' Barrel Root Beer,,"Coca-Cola, Diet Coke, Sprite, bottled water, Lift Bridge Root Beer, Lift Bridge Black Cherry Sod...","bridge and barrel, brdige n barrel, rootbeer, barrell, barel, new, coke, cola, soda, pop, coke, ...",Northwest corner of Judson Avenue and Underwood Street,44.978778,-93.170758,1,"[2, 39, 41, 70]"
826,5741.1,Mickman Brothers,,full service landscape design and installation company,,"In the Home Improvement Building, east side",44.981876,-93.16832,0,"[3, 1, 22, 41]"
866,6041.1,Bandstand Concessions,Barrel Theory Rider Request\nCannon River Feisty B Red\nCannon River Gunflint White\nCoors Light...,"beer, Minnesota wine, nachos, popcorn, peanuts, bottled water, Facepunch Pretzels, candy (Trolli...","new, waterbottle, bottle, soda, pop, cola, soda, pop, cola, diet soda, diet pop, sierra mist, le...",Inside the Grandstand Concert Venue,44.98196,-93.172812,1,"[2, 34, 38, 40, 41]"
872,6202.1,Wow Fudge,,"bottled water, handmade copper kettle gourmet fudge in 70+ flavors","new, waterbottle, bottle, waterbottle, bottle","Located in the Creative Activities Annex, south wall",44.981973,-93.167568,1,"[2, 41]"
873,6288.1,Mattress Firm,,"Promotion and sale of Tempur/Sealy, Simmons/Serta, King Coil, and Purple mattresses, bases, and ...",,,44.984217,-93.168888,0,"[1, 10, 41]"
874,6292.1,Black Tie Caramel,,"caramel sauces, caramel candies, caramel crunch",,In Warner Coliseum,44.978233,-93.17436,0,"[1, 34, 41]"
875,6295.1,Resurrected Journals,,"bookmarks, book tote bags, book journals, book earrings, book coasters",,"In the Grandstand, upper level, east center section",44.98141,-93.173227,0,"[1, 41]"


In [57]:
# license_text - used for list of foods
vendor_id = '364.1'
df = df_vendors_filter_columns.query('id == @vendor_id')
display(df)

s = df['license_text'].iloc[0]
print(s)

re.split(',\s*(?![^()]*\))', s)

Unnamed: 0,id,name,alcoholic_beverages,license_text,keywords,directions,latitude,longitude,is_food,categories


IndexError: single positional indexer is out-of-bounds

In [58]:
# license_text - used for list of foods
vendor_id = '809.1'
df = df_vendors_filter_columns.query('id == @vendor_id')
display(df)

s = df['license_text'].iloc[0]
print(s)

re.split(',\s*(?![^()]*\))', s)

Unnamed: 0,id,name,alcoholic_beverages,license_text,keywords,directions,latitude,longitude,is_food,categories


IndexError: single positional indexer is out-of-bounds

# Drinks

In [59]:
s = df_vendors_filter_columns['alcoholic_beverages'].str.len().value_counts()
display(s)

alcoholic_beverages
0       943
405       2
393       2
648       1
763       1
234       1
907       1
410       1
436       1
646       1
930       1
1371      1
343       1
179       1
573       1
495       1
510       1
438       1
975       1
536       1
693       1
364       1
974       1
781       1
408       1
734       1
500       1
413       1
245       1
Name: count, dtype: int64

In [60]:
s = df_vendors_filter_columns['alcoholic_beverages'].str.len()
df_drink_vendors = df_vendors_filter_columns[s!=0]

print(df_drink_vendors.shape)
display(df_drink_vendors.head())

(30, 10)


Unnamed: 0,id,name,alcoholic_beverages,license_text,keywords,directions,latitude,longitude,is_food,categories
4,1416.1,LuLu's Public House,Arnold Palmer Spiked Raspberry Slushy\nBack Channel Edna's IPA\nBauhaus Nah Blonde (Non-Alcoholi...,"beer, Schell's Oktoberfest Infused Brat (Oktoberfest infused fried onions, brat bun), Southern C...","lulus, lu lu, looloos, loo loo, lu-lu, breakfast, new, walleye tacos, all beef, allbeef, tater t...","At West End Market, south of Schilling Amphitheater",44.981353,-93.177652,1,"[2, 34, 35, 38, 39, 40, 61, 63]"
5,1416.2,Schell's Booth,Grain Belt Blu\nGrain Belt Blu Frozen Topper\nGrain Belt Nordeast\nGrain Belt Premium\nSchell's ...,"beer, Schell's beer","\nshells, schels, schells, shells, schels, schells, shells, schels, schells","At West End Market, west section",44.981348,-93.177962,1,"[2, 38, 39, 40]"
7,1457.1,Dino's Gyros,Athletic Lite Non-Alcoholic Beer\nBent Brewstillery Greek Cookie (NEW)\nBomba Bull Frose All Day...,"gyros, spicy gyros, meal combos (french fries, Greek fries, soft drinks), chicken gyros, Pepsi, ...","dinos, giros, gyro, soda, pop, cola, soda, pop, sierra mist, lemon, lime, soda, pop, soda, pop, ...",North side of Carnes Ave. between Nelson & Underwood streets,44.980053,-93.171069,1,"[2, 34, 38, 40]"
49,1872.1,Andy's Grille,Bent Paddle Summer Luv'n Orange IPA\nBig Wood Caramel Apple Craft Seltzer\nBig Wood Chocolate Ch...,"hamburgers, cheeseburgers, Philly Fries (Sidewinder fries, Philly steak meat, sautéed onions, gr...","andys, breakfast, grill, rootbeer, soda, pop, coke, cola, soda, pop, waterbottle, bottle, coke,...",South side of Carnes Ave. between Chambers & Clough streets,44.979865,-93.174245,1,"[2, 34, 38, 39, 40, 63]"
78,1968.1,The Blue Barn,"B.F.F. Hazy (A collaboration among Bauhaus, Forgotten Star and The Freehouse) (NEW)\nFour Daught...","Blue Cheese & Corn Fritz, Chicken in the Waffle (homemade-style waffle shell, crispy chicken ten...","bluebarn, blu barn, blueb arn, blue barn, breakfast, new, coke, cola, soda, pop, diet coca-cola,...","At West End Market, south of the History & Heritage Center",44.981047,-93.176232,1,"[2, 34, 39, 40, 61, 63]"


In [61]:
# of locations that serve alcoholic beverages
len(df_drink_vendors['name'].unique())

30

In [62]:
alcoholic_beverages = df_drink_vendors['alcoholic_beverages'].str.split('\x0b').explode()
print(len(alcoholic_beverages.unique()))

alcoholic_beverages.sample(25)

28


155    Arnold Palmer Spiked Slushy\nBald Man Island in the Sun IPA (NEW)\nBell's Two Hearted Ale\nBudwe...
924    Busch Light\nCoors Light\nFirestone Walker Mind Haze IPA\nGray Duck Bomba Juice Hard Seltzer\nKo...
551    Bauhaus Brew Labs x Animales BBQ Co. Sun Seared Grilled Lemon Blonde Ale (NEW)\nBauhaus Brew Lab...
358    Castle Danger Cream Ale\nLeinenkugel's Berry Weiss\nLeinenkugel's Honey Lemon Light\nLeinenkugel...
321    Cannon River Gunflint Red\nCannon River Minnesota St. Pepin\nCastle Danger Blood Orange Cream Al...
866    Barrel Theory Rider Request\nCannon River Feisty B Red\nCannon River Gunflint White\nCoors Light...
709    Arnold Palmer Spiked\nBent Brewstillery Cheers to Cherry Pie\nBent Brewstillery Hideaway Sunset\...
49     Bent Paddle Summer Luv'n Orange IPA\nBig Wood Caramel Apple Craft Seltzer\nBig Wood Chocolate Ch...
235    Bell's Two Hearted Ale\nBlue Moon\nBurning Brothers Pyro (gluten-free) \nCoors Light\nDeschutes ...
121    Castle Danger Cream Ale\nCoors

In [None]:
alcoholic_beverages[alcoholic_beverages.str.startswith('Big Wood Chocolate Chip Cookie Beer')]

In [None]:
alcoholic_beverages[alcoholic_beverages.str.startswith('Castle')]

In [76]:
dd = df_vendors_filter_columns[df_vendors_filter_columns['alcoholic_beverages'].str.len() != 0]
len(dd)


30

In [78]:
df_drinks = None
df_drinks = (
    df_vendors_filter_columns[df_vendors_filter_columns['alcoholic_beverages'].str.len() != 0]

    # .assign(drinkName      = lambda df_:df_['alcoholic_beverages'].str.split('\x0b')).explode('drinkName')
    .assign(drinkName      = lambda df_:df_['alcoholic_beverages'].str.split('\n')).explode('drinkName')
    .assign(isNew          = lambda df_:df_['drinkName'].str.contains('NEW'))
    .assign(isOnlyAtFair   = lambda df_:df_['drinkName'].str.contains('ONLY AT THE FAIR'))

    .assign(drinkName       = lambda df_:df_['drinkName'].str.replace('(NEW)', '', regex=False))
    .assign(drinkName       = lambda df_:df_['drinkName'].str.replace('(ONLY AT THE FAIR)', '', regex=False))
    .assign(drinkName       = lambda df_:df_['drinkName'].str.replace('’', "'", regex=False))
             
    .assign(drinkName       = lambda df_:df_['drinkName'].str.strip())
             
    .drop(columns=['alcoholic_beverages', 'license_text', 'keywords', 'is_food', 'categories'])
)

len(df_drinks)

610

In [80]:
df_drinks = None
df_drinks = (
    df_vendors_filter_columns[df_vendors_filter_columns['alcoholic_beverages'].str.len() != 0]

    # .assign(drinkName      = lambda df_:df_['alcoholic_beverages'].str.split('\x0b')).explode('drinkName')
    .assign(drinkName      = lambda df_:df_['alcoholic_beverages'].str.split('\n')).explode('drinkName')
    .assign(isNew          = lambda df_:df_['drinkName'].str.contains('NEW'))
    .assign(isOnlyAtFair   = lambda df_:df_['drinkName'].str.contains('ONLY AT THE FAIR'))

    .assign(drinkName       = lambda df_:df_['drinkName'].str.replace('(NEW)', '', regex=False))
    .assign(drinkName       = lambda df_:df_['drinkName'].str.replace('(ONLY AT THE FAIR)', '', regex=False))
    .assign(drinkName       = lambda df_:df_['drinkName'].str.replace('’', "'", regex=False))
             
    .assign(drinkName       = lambda df_:df_['drinkName'].str.strip())
             
    .drop(columns=['alcoholic_beverages', 'license_text', 'keywords', 'is_food', 'categories'])
)

drinks_to_exclude = []
drinks_to_exclude += ['Blue Moon', 'Bud Light', 'Budweiser', 'Busch Light', 'Coors Light', 'Corona']
drinks_to_exclude += ['Dos Equis', 'Pabst Blue Ribbon', 'Pacifico', 'Heineken', 'Guinness', 'Stella Artois', 'Miller Lite', 'Michelob Golden Draft Light']
cond1 = df_drinks['isNew'] == True
cond2 = df_drinks['isOnlyAtFair'] == True
cond3 = df_drinks.loc[:, 'drinkName'].str.contains(r'\b(?:{})\b'.format('|'.join(drinks_to_exclude)))
df_drinks = df_drinks[cond1 | cond2 | ~cond3] 
df_drinks = df_drinks.merge(df_drinks.groupby('drinkName')['id'].agg(vendorIDs = list).reset_index(), how='left', on='drinkName')


with open('./json/db.json', 'w') as json_file:
    
    # ---------- DRINKS ----------
    df1 = (df_drinks
           .drop_duplicates(subset='drinkName')[['drinkName', 'isNew', 'isOnlyAtFair', 'vendorIDs']]
           .reset_index()
           .rename(columns={'index': 'id', 'drinkName': 'name'})
           .assign(id = lambda df_:df_['id'].add(1000).astype(str))
          )
    #df1.to_json('./json/drinks.json', orient='records')
    
    # ---------- FOODS ----------
    df2 = (df_new_vendors[['vendor_id', 'name', 'excerpt', 'description']]
           .reset_index()
           .assign(vendor_id = lambda df_:df_['vendor_id'].str.split())
           .rename(columns={'index': 'id', 'vendor_id': 'vendorIDs'})
           .assign(id = lambda df_:df_['id'].add(2000).astype(str))
        )
    
    # ---------- VENDORS ----------
    df3 = df_vendors_filter_columns[['id', 'name', 'directions', 'latitude', 'longitude']]
    #df3.to_json('./json/drink_vendors.json', orient='records')

    
    df_item_to_vendors = pd.concat([df1, df2])[['id', 'vendorIDs']]
    df_vendor_to_items = df_item_to_vendors.explode('vendorIDs').groupby('vendorIDs')['id'].agg(itemIDs=list).reset_index().rename(columns={'vendorIDs':'id'})

    df1 = df1.drop(columns=['vendorIDs'])
    df2 = df2.drop(columns=['vendorIDs'])

    js = {
            'drinks'        : json.loads(df1.to_json(orient='records')),
            'foods'         : json.loads(df2.to_json(orient='records')),
            'vendors'       : json.loads(df3.to_json(orient='records')),
            'itemToVendors' : json.loads(df_item_to_vendors.to_json(orient='records')),
            'vendorToItems' : json.loads(df_vendor_to_items.to_json(orient='records'))
         }
    json.dump(js, json_file, indent=4)

In [81]:
df_item_to_vendors.head()

Unnamed: 0,id,vendorIDs
0,1000,[1416.1]
1,1001,[1416.1]
2,1002,[1416.1]
3,1003,[1416.1]
4,1004,[1416.1]


In [65]:
#df_drinks = df_drinks.merge(df_drinks.groupby('drinkName')['id'].agg(vendorIDs = list).reset_index(), how='left', on='drinkName')

df_item_to_vendors.explode('vendorIDs').groupby('vendorIDs')['itemID'].agg(itemIDs=list).reset_index().rename(columns={'vendorIDs':'vendorID')
                                                                                                    
                                                                                            

SyntaxError: closing parenthesis ')' does not match opening parenthesis '{' (3375327869.py, line 3)

In [66]:
df = (
    df_item_to_vendors.head()
    .assign(vendorID= lambda df_:df_['vendorIDs'].explode('vendorID'))
)
df

Unnamed: 0,id,vendorIDs,vendorID
0,1000,[1416.1],1416.1
1,1001,[1416.2],1416.2
2,1002,[1457.1],1457.1
3,1003,[1872.1],1872.1
4,1004,[1968.1],1968.1


In [67]:
df_drinks = None
df_drinks = (
    df_vendors_filter_columns[df_vendors_filter_columns['alcoholic_beverages'].str.len() != 0].head()

     .assign(drinkName      = lambda df_:df_['alcoholic_beverages'].str.split('\x0b')).explode('drinkName')
#     .assign(isNew          = lambda df_:df_['drinkName'].str.contains('NEW'))
#     .assign(isOnlyAtFair   = lambda df_:df_['drinkName'].str.contains('ONLY AT THE FAIR'))

#     .assign(drinkName       = lambda df_:df_['drinkName'].str.replace('(NEW)', '', regex=False))
#     .assign(drinkName       = lambda df_:df_['drinkName'].str.replace('(ONLY AT THE FAIR)', '', regex=False))
#     .assign(drinkName       = lambda df_:df_['drinkName'].str.replace('’', "'", regex=False))
             
#     .assign(drinkName       = lambda df_:df_['drinkName'].str.strip())
             
#     .drop(columns=['alcoholic_beverages', 'license_text', 'keywords', 'is_food', 'categories'])
)
df_drinks

Unnamed: 0,id,name,alcoholic_beverages,license_text,keywords,directions,latitude,longitude,is_food,categories,drinkName
4,1416.1,LuLu's Public House,Arnold Palmer Spiked Raspberry Slushy\nBack Channel Edna's IPA\nBauhaus Nah Blonde (Non-Alcoholi...,"beer, Schell's Oktoberfest Infused Brat (Oktoberfest infused fried onions, brat bun), Southern C...","lulus, lu lu, looloos, loo loo, lu-lu, breakfast, new, walleye tacos, all beef, allbeef, tater t...","At West End Market, south of Schilling Amphitheater",44.981353,-93.177652,1,"[2, 34, 35, 38, 39, 40, 61, 63]",Arnold Palmer Spiked Raspberry Slushy\nBack Channel Edna's IPA\nBauhaus Nah Blonde (Non-Alcoholi...
5,1416.2,Schell's Booth,Grain Belt Blu\nGrain Belt Blu Frozen Topper\nGrain Belt Nordeast\nGrain Belt Premium\nSchell's ...,"beer, Schell's beer","\nshells, schels, schells, shells, schels, schells, shells, schels, schells","At West End Market, west section",44.981348,-93.177962,1,"[2, 38, 39, 40]",Grain Belt Blu\nGrain Belt Blu Frozen Topper\nGrain Belt Nordeast\nGrain Belt Premium\nSchell's ...
7,1457.1,Dino's Gyros,Athletic Lite Non-Alcoholic Beer\nBent Brewstillery Greek Cookie (NEW)\nBomba Bull Frose All Day...,"gyros, spicy gyros, meal combos (french fries, Greek fries, soft drinks), chicken gyros, Pepsi, ...","dinos, giros, gyro, soda, pop, cola, soda, pop, sierra mist, lemon, lime, soda, pop, soda, pop, ...",North side of Carnes Ave. between Nelson & Underwood streets,44.980053,-93.171069,1,"[2, 34, 38, 40]",Athletic Lite Non-Alcoholic Beer\nBent Brewstillery Greek Cookie (NEW)\nBomba Bull Frose All Day...
49,1872.1,Andy's Grille,Bent Paddle Summer Luv'n Orange IPA\nBig Wood Caramel Apple Craft Seltzer\nBig Wood Chocolate Ch...,"hamburgers, cheeseburgers, Philly Fries (Sidewinder fries, Philly steak meat, sautéed onions, gr...","andys, breakfast, grill, rootbeer, soda, pop, coke, cola, soda, pop, waterbottle, bottle, coke,...",South side of Carnes Ave. between Chambers & Clough streets,44.979865,-93.174245,1,"[2, 34, 38, 39, 40, 63]",Bent Paddle Summer Luv'n Orange IPA\nBig Wood Caramel Apple Craft Seltzer\nBig Wood Chocolate Ch...
78,1968.1,The Blue Barn,"B.F.F. Hazy (A collaboration among Bauhaus, Forgotten Star and The Freehouse) (NEW)\nFour Daught...","Blue Cheese & Corn Fritz, Chicken in the Waffle (homemade-style waffle shell, crispy chicken ten...","bluebarn, blu barn, blueb arn, blue barn, breakfast, new, coke, cola, soda, pop, diet coca-cola,...","At West End Market, south of the History & Heritage Center",44.981047,-93.176232,1,"[2, 34, 39, 40, 61, 63]","B.F.F. Hazy (A collaboration among Bauhaus, Forgotten Star and The Freehouse) (NEW)\nFour Daught..."


In [None]:
df1.head() 

In [None]:
df2.head()

In [None]:
df3.head()

In [None]:
df_vendors_filter_columns.head()

In [None]:
df_new_vendors.head()

In [None]:
df_drinks.head()

In [None]:
df_drinks.merge(df_drinks.groupby('drinkName')['id'].agg(vendor_ids = list).reset_index(), how='left', on='drinkName')

In [None]:
df_drinks.sample(20)

In [None]:
df_drinks['drinkName'].value_counts().head()

In [None]:
df_drinks.query('drinkName == "Grain Belt Premium"')

In [None]:
df_drinks.groupby('drinkName')['id'].agg(vendor_ids=list)#.reset_index().query('drinkName == "Grain Belt Premium"')

In [68]:
# Watch out for trailing spaces
df_drinks.query('drinkName == "Castle Danger Orange Cream Ale"')

Unnamed: 0,id,name,alcoholic_beverages,license_text,keywords,directions,latitude,longitude,is_food,categories,drinkName


In [69]:
# Watch out for trailing spaces - this should be fixed by includeing the space when replacing (NEW) and (ONLY AT THE FAIR)
df_drinks.query('drinkName == "Castle Danger Orange Cream Ale "')

Unnamed: 0,id,name,alcoholic_beverages,license_text,keywords,directions,latitude,longitude,is_food,categories,drinkName


In [70]:
df_drinks.query('drinkName == "Big Wood Chocolate Chip Cookie Beer"')

Unnamed: 0,id,name,alcoholic_beverages,license_text,keywords,directions,latitude,longitude,is_food,categories,drinkName


In [71]:
df_drinks.query('drinkName == "Big Wood Chocolate Chip Cookie Beer "')

Unnamed: 0,id,name,alcoholic_beverages,license_text,keywords,directions,latitude,longitude,is_food,categories,drinkName


In [72]:
cond = df_drinks['drinkName'].str.startswith('Castle')
df_drinks[cond]

Unnamed: 0,id,name,alcoholic_beverages,license_text,keywords,directions,latitude,longitude,is_food,categories,drinkName


In [73]:
df_drinks[['isNew', 'isOnlyAtFair']].sum()

KeyError: "None of [Index(['isNew', 'isOnlyAtFair'], dtype='object')] are in the [columns]"

In [None]:
# https://www.mnstatefair.org/general-info/specialty-brews-and-beverages/

In [74]:
df_drinks[df_drinks['isOnlyAtFair']].drop_duplicates('drinkName')['drinkName'].sort_values()

KeyError: 'isOnlyAtFair'

In [None]:
df_drinks[df_drinks['isNew']].drop_duplicates('drinkName')['drinkName'].sort_values()

In [None]:
df_drinks.query('(isNew ==True)').drop_duplicates(subset='drinkName').sort_values('drinkName')

# Foods

In [None]:
exclude = ['bottled water', 'Diet Coke', 'Coca-Cola', 'Sprite', 'Pepsi', 'Diet Pepsi', 'Milk', 'Coffee', 'Beer', 'Root Beer', 'Sierra Mist', 'Dr Pepper', 'Iced Tea', 'Mountain Dew', 'Mello Yello',
           'Diet Mountain Dew', 'Powerade', 'Orange Juice', 'Pink Lemonade', 'Cherry Coke', 'Lemonade', 'Fresh-squeezed lemonade', '7-UP', 'Fanta Orange', 'Barq\'s Root Beer',
           'Orange soda', 'Hot Chocolate', '', 'Gatorade', 'Orange Drink', 'Juice', 'Mug Root Beer', 'Chocolate Milk', 'Brewed malt beverages', 'Glow sticks', 'Cheese corn',
           'Bacon Cheeseburgers', 'Hamburgers', 'Hot dogs', 'ice cream cones', 'Foot long hot dogs', 'Caramel corn', 'root beer floats', 'Floats', 'Orange floats', 'Nachos',
           'popcorn', 'Cheeseburgers', 'French Fries', 'sundaes', 'Ice Cream', 'Minnesota wine', 'Grape soda', 'Eggs', 'Lattes', 'Corn Dogs', 'Corn dogs - 12"', 'espresso', 'Sausage',
           'Sparkling water', 'Cappuccino', 'Fruit Punch', 'BBQ chips', 'dinners (beans, coleslaw)', 'Iced coffee', "Schell's beer", 'Red Bull slushes', 'honey sriracha sauce'
          ]
df_foods = (
    df_vendors_filter_columns.query('is_food != 0')
    #df_vendors_filter_columns.query('is_food == 2')
    .assign(food=     lambda df_:df_['license_text'].str.split(',s*(?![^()]*\))')).explode('food')  
    .assign(food=     lambda df_:df_['food'].str.strip())
    #.drop(columns=['alcoholic_beverages'])
    .drop(columns=['license_text', 'keywords', 'is_food'])
    #.drop(columns=['categories'])
)

cond = df_foods['categories'].apply(lambda x: '41' in x)
df_foods = df_foods[cond]

cond = ~df_foods['food'].isin(exclude)
df_foods = df_foods[cond]

print(df_foods.shape)    
    
#['food'].str.strip() #.value_counts().drop(labels=exclude).head(50)
#foods= df_food.assign(food=df_food['license_text'].str.split(',s*(?![^()]*\))')).explode('food')['food'].str.strip().reset_index(drop=True)
#s = foods.str.len().sort_values(ascending=False).index
#foods.reindex(s).head(250).sort_values()

df_foods.sample(20)

In [None]:
cond1 = df_vendors_filter_columns['is_food'] != 0
cond2 = df_vendors_filter_columns['categories'].apply(lambda x: '41' in x)
df_new_food = df_vendors_filter_columns[cond1 & cond2]
s = df_new_food.assign(food=df_food['license_text'].str.split(',s*(?![^()]*\))')).explode('food')['food'].str.strip().value_counts()#.drop(labels=exclude)
#s[s==1].head(50)

In [None]:
df_vendors_filter_columns.sort_values('name').iloc[0]['name']

In [None]:
df.query('name == "Andy\'s Grille"')['alcoholic_beverages'].iloc[0].split('\x0b')

In [None]:
name = "LuLu's Public House"
df_vendors.query('name == @name')['alcoholic_beverages'].iloc[0].split('\x0b')

In [None]:
s = df_vendors.query('name == @name')['license_text'].iloc[0]
re.split(',\s*(?![^()]*\))', s)

In [None]:
name = 'Solem Concessions Cheese Curds and Mini Donuts'
s = df_vendors.query('name == @name')['license_text'].iloc[0] 
#s.split(', ')  

re.split(',\s*(?![^()]*\))', s)

# don't split on commas within ()  !!

In [None]:
df_vendors.query('name == @name')#.loc[:,['latitude', 'longitude']]

In [None]:
df_vendors['license_text'] = df_vendors['license_text'].fillna('')
cond = df_vendors['license_text'].str.contains('donut')
df_vendors[cond]

In [None]:
df_vendors['new_this_year'].unique()

In [None]:
df_vendors['alcoholic_beverages'] = df_vendors['alcoholic_beverages'].fillna('')

cond = df_vendors['alcoholic_beverages'].str.contains('NEW')
df_vendors[cond]

In [None]:
names = ['The Blue Barn', 'LuLu\'s Public House', 'Fluffy\'s Hand Cut Donuts', 'Minnesota Wine Country', 'O\'Gara\'s at the Fair', 'Pronto Pups', 'The Hangar', 'Turkey to Go']
markers = df_vendors.query('name in @names').loc[:,['name', 'latitude', 'longitude']]

In [None]:
colors = ['red', 'blue', 'green', 'purple', 'orange', 'pink',
          'darkred', 'darkblue', 'darkgreen', 'darkpurple',
          'lightred', 'lightgray',
          'beige', 'cadetblue', 'white', 'gray', 'black' ]

In [None]:
#https://getbootstrap.com/docs/3.3/components/

import folium
m = folium.Map(location=[44.98106, -93.174351], zoom_start=17)

for marker in markers.itertuples():
    color = colors[names.index(marker.name)]
    folium.Marker(
        location=[marker.latitude, marker.longitude],
        popup=marker.name,
        icon=folium.Icon(color=color, icon="star-empty"),    #  heart  glass  star  cutlery  star   music     tar-empty
    ).add_to(m)

m

In [None]:
# (NEW)  (ONLY AT THE FAIR)  Seltzer
df_drinks['is_new'] = df_drinks['drink'].str.contains('NEW')
df_drinks['is_only_at_fair'] = df_drinks['drink'].str.contains('ONLY AT THE FAIR')
df_drinks['is_seltzer'] = df_drinks['drink'].str.contains('Seltzer')

df_drinks['drink'] = df_drinks['drink'].str.replace('(NEW)', '', regex=False)
df_drinks['drink'] = df_drinks['drink'].str.replace('(ONLY AT THE FAIR)', '', regex=False)
df_drinks['drink'] = df_drinks['drink'].str.strip()

print(df_drinks.shape)
df_drinks.head()

df_drinks = df_drinks.drop(columns=['alcoholic_beverages', 'license_text', 'keywords', 'is_food', 'categories'])

print(df_drinks.shape)
#df_drinks.head()
df_drinks.sample(50)
