# Venue Visit EDA

In [0]:
import pyspark.sql.functions as f
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [0]:
# for clear plotting on Macs
# ('retina' asks the inline backend for high-resolution figures, so plots stay
# sharp on HiDPI screens)
%config InlineBackend.figure_format='retina'

In [0]:
# read in the visit data
# (no format is passed to load(), so Spark uses its configured default data source)
visit=spark.read.load('/user/hive/warehouse/mfour_gold_cx.db/panelist_venue_visit/')
# last expression of the cell: echoes the list of column names
visit.columns

In [0]:
# Print the column names and data types of the visit table.
visit.printSchema()

In [0]:
# Slice `utc_arrived_date` into the string keys used for grouping and plotting.
# The substring offsets assume the value renders as "YYYY-MM-DD" — TODO confirm.
visit = (
    visit
    # characters 1-4: arrival year
    .withColumn('arrive_year', f.substring('utc_arrived_date', 1, 4))
    # characters 6-10: month-day pair, used to overlay different years on one axis
    .withColumn('arrive_month_date', f.substring('utc_arrived_date', 6, 5))
    # characters 1-7: year-month pair
    .withColumn('arrive_year_month', f.substring('utc_arrived_date', 1, 7))
)
visit.display(15)

In [0]:
# Number of visits per arrival year, most recent year first.
visits_per_year = visit.groupBy('arrive_year').count()
visits_per_year.orderBy(f.col('arrive_year').desc()).show()

In [0]:
# Keep only visits whose arrival year is 2018, 2019 or 2020.
visit_2018_to_2020 = visit.where(f.col("arrive_year").isin("2018", "2019", "2020"))

In [0]:
# Row count of the 2018-2020 subset (count() is an action, so this runs a Spark job).
visit_2018_to_2020.count()

In [0]:
from pyspark.sql import Window

# Rank venue categories by number of visits in 2019-2020 and keep the 30 busiest.
# Ties on `count` are broken alphabetically so the selection is reproducible
# every time this lazy DataFrame is re-evaluated.
top30_cat = (
    visit.filter(f.col("arrive_year").isin("2019", "2020"))
    .groupBy('venue_category')
    .count()
    .orderBy(f.col('count').desc(), f.col('venue_category').asc())
    .limit(30)
)

# add a 0-based rank column to df
# row_number() yields consecutive values, which the `id BETWEEN a AND b` slicing
# in the following cells depends on. monotonically_increasing_id() only promises
# unique, increasing ids and can leave gaps when rows span several partitions.
rank_window = Window.orderBy(f.col('count').desc(), f.col('venue_category').asc())
top30_cat = top30_cat.withColumn("id", f.row_number().over(rank_window) - 1)

In [0]:
# Show the top-30 categories with their visit counts and `id` index column.
top30_cat.display()

In [0]:
# Pull the ranked category names to the driver once and split them into six
# groups of five (ranks 1-5, 6-10, ...), one group per subplot further down.
# Slicing the ordered list does not require `id` to be exactly 0..29 and
# replaces six separate Spark jobs with a single one.
top30_names = [
    row["venue_category"]
    for row in top30_cat.orderBy(f.col("count").desc(), f.col("id").asc()).collect()
]
cat_1, cat_2, cat_3, cat_4, cat_5, cat_6 = (
    top30_names[start:start + 5] for start in range(0, 30, 5)
)


In [0]:
# Restrict to the two comparison years. `visit_2019_2020` is referenced by the
# rest of the notebook but is not assigned in any earlier cell, so build it here.
visit_2019_2020 = visit.filter(f.col("arrive_year").isin("2019", "2020"))

# Daily visit counts per venue category, oldest day first.
cat_count = visit_2019_2020.groupBy('utc_arrived_date', 'venue_category').count().orderBy('utc_arrived_date', ascending=True)
# Keep only the rows belonging to one of the top-30 categories.
df = cat_count.filter(cat_count["venue_category"].isin(top30_cat.select("venue_category").rdd.flatMap(list).collect()))
df.display(25)

In [0]:
# # test if the filter was correctly executed 
# df.groupBy('venue_category').count().orderBy('count', ascending=True).display()

In [0]:
# Split the daily counts into one Spark DataFrame per group of five categories.
category_groups = (cat_1, cat_2, cat_3, cat_4, cat_5, cat_6)
df1, df2, df3, df4, df5, df6 = [
    df.filter(df["venue_category"].isin(group)) for group in category_groups
]

# Bring each group to the driver as a pandas DataFrame for plotting.
df_pd1, df_pd2, df_pd3, df_pd4, df_pd5, df_pd6 = [
    spark_frame.toPandas() for spark_frame in (df1, df2, df3, df4, df5, df6)
]

In [0]:
# Draw a 3x2 grid of line charts: one panel per group of five categories,
# one line per category, daily visit count on the y axis.
plt.figure(figsize=(20, 20))

panel_frames = (df_pd1, df_pd2, df_pd3, df_pd4, df_pd5, df_pd6)
for panel_no, panel_df in enumerate(panel_frames, start=1):
    plt.subplot(3, 2, panel_no)
    sns.lineplot(
        x="utc_arrived_date",
        y="count",
        hue="venue_category",
        style="venue_category",
        linewidth=1,
        data=panel_df,
    )

plt.show()


### Categories that exhibit a clear drop during lockdown:
- fast food restaurant
- grocery stores
- American restaurant
- big box stores
- bar
- hotels
- gym / fitness center
- clothing stores

# Investigate the anomalous changes in selected categories
(one-month, two-month, and three-month periods)

## One-month time period (2019.3.15-2019.4.15 & 2020.3.15-2020.4.15)

#### Parent category = Food, Nightlife Spot, Arts & Entertainment, Event

In [0]:
# Execute the shared categorization notebook in this session. The cells below
# use a `parent_categorization` DataFrame (columns `category`, `level_0_name`),
# which is presumably defined by that notebook — TODO confirm.
%run "/Shared/duke_university/data processing/Venue Parent Categorization"

In [0]:
# create lists of category names
def _child_categories(parent_name):
    """Return the `category` values whose `level_0_name` equals *parent_name*."""
    matching = parent_categorization.filter(f.col("level_0_name") == parent_name)
    return matching.select("category").rdd.flatMap(lambda row: row).collect()


food_cat = _child_categories("Food")
night_spot_cat = _child_categories("Nightlife Spot")
art_entertainment_cat = _child_categories("Arts & Entertainment")
event_cat = _child_categories("Event")


In [0]:
# One panel per parent category: daily visit counts for 15 Mar - 15 Apr,
# 2019 (blue) against 2020 (red), aligned on the month-day key.
plt.figure(figsize=(20,18))
cat_list = [food_cat, night_spot_cat, art_entertainment_cat, event_cat]
# Parent names in the same order as cat_list. Each panel is titled with the
# parent name: cat[0] is merely the first child category of the parent (and
# raises IndexError if a parent has no children).
cat_names = ["Food", "Nightlife Spot", "Arts & Entertainment", "Event"]
for i, (cat_name, cat) in enumerate(zip(cat_names, cat_list), start=1):
    df1 = visit_2019_2020.filter((f.col("venue_category").isin(cat)) & (f.col("utc_arrived_date").between("2019-03-15", "2019-04-15"))).groupBy('utc_arrived_date', 'arrive_month_date').count().orderBy('arrive_month_date', ascending=True).toPandas()
    df2 = visit_2019_2020.filter((f.col("venue_category").isin(cat)) & (f.col("utc_arrived_date").between("2020-03-15", "2020-04-15"))).groupBy('utc_arrived_date', 'arrive_month_date').count().orderBy('arrive_month_date', ascending=True).toPandas()
    ax = plt.subplot(2,2,i)
    ax.set_title(f"Category: {cat_name}")
    sns.lineplot(x = "arrive_month_date", y = "count", linewidth=1.5, data = df1, color='b', label="2019.3.15-2019.4.15", ax=ax)
    sns.lineplot(x = "arrive_month_date", y = "count", linewidth=1.5, data = df2, color='r', label="2020.3.15-2020.4.15", ax=ax)
    # plt.* calls act on the current axes, i.e. the subplot created above
    plt.xticks(rotation="vertical")
    plt.legend()
plt.show()

In [0]:
# visit_2019_2020.filter(f.col("venue_category_2")=="Event").display()

#### Check for outdoor categories
- level 0: Outdoors & Recreation
- level 1: Big Box Store, Drugstore, Grocery Store, Market, Organic Grocery, Pharmacy, Supermarket

In [0]:
# Child categories filed under the "Outdoors & Recreation" parent.
outdoor_rows = parent_categorization.filter(f.col("level_0_name") == "Outdoors & Recreation").select("category")
outdoor_cat = outdoor_rows.rdd.flatMap(lambda row: row).collect()

In [0]:
# Inspect the rows of `visit_2019_2020` whose primary category is one of the outdoor categories.
visit_2019_2020.filter(f.col("venue_category").isin(outdoor_cat)).display()

panelist_venue_visit_sk,panelist_id,utc_arrived_date,utc_arrived_at,local_arrived_at,utc_departed_at,local_departed_at,local_arrivaled_date_sk,local_departed_date_sk,Time_of_the_day,Duration,confidence,venue_id,venue_name,venue_category,venue_category_2,venue_category_3,venue_category_4,venue_category_5,venue_chain_name,venue_chain_name_2,lat,lng,timezone,address,city,state,postal_code,country,dma_name,msa_name,visit_permutation_hash,invocation_id,loaded_at,loaded_by,updated_at,updated_by,arrive_year,arrive_month_date


In [0]:
# level 0: Outdoors & Recreation
# Daily visit counts for the outdoor categories over 15 Mar - 15 Apr,
# 2019 against 2020, aligned on the month-day key.
outdoor_visits = visit_2019_2020.filter(f.col("venue_category").isin(outdoor_cat))
df1 = (
    outdoor_visits
    .filter(f.col("utc_arrived_date").between("2019-03-15", "2019-04-15"))
    .groupBy('utc_arrived_date', 'arrive_month_date')
    .count()
    .orderBy('arrive_month_date', ascending=True)
    .toPandas()
)
df2 = (
    outdoor_visits
    .filter(f.col("utc_arrived_date").between("2020-03-15", "2020-04-15"))
    .groupBy('utc_arrived_date', 'arrive_month_date')
    .count()
    .orderBy('arrive_month_date', ascending=True)
    .toPandas()
)

plt.figure(figsize=(12,8))
yearly_series = (
    (df1, 'b', "2019.3.15-2019.4.15"),
    (df2, 'r', "2020.3.15-2020.4.15"),
)
for series_df, line_colour, series_label in yearly_series:
    sns.lineplot(x="arrive_month_date", y="count", linewidth=1.5, data=series_df, color=line_colour, label=series_label)
plt.xticks(rotation = 45)
plt.title("Level 0: Outdoors & Recreation", fontsize=16)
plt.legend()
plt.show()

In [0]:
# Level 1: Big Box Store, Drugstore, Grocery Store, Market, Organic Grocery, Pharmacy, Supermarket
# One panel per level-1 category. A visit counts when either its primary or its
# secondary venue category equals the level-1 name.
plt.figure(figsize=(20,30))
level1_list = ["Big Box Store", "Drugstore", "Grocery Store", "Market", "Organic Grocery", "Pharmacy", "Supermarket"]
for i, cat in enumerate(level1_list, start=1):
    matches_cat = (f.col("venue_category_2") == cat) | (f.col("venue_category") == cat)
    in_2019 = f.col("utc_arrived_date").between("2019-03-15", "2019-04-15")
    in_2020 = f.col("utc_arrived_date").between("2020-03-15", "2020-04-15")
    df1 = visit_2019_2020.filter(matches_cat & in_2019).groupBy('utc_arrived_date', 'arrive_month_date').count().orderBy('arrive_month_date', ascending=True).toPandas()
    df2 = visit_2019_2020.filter(matches_cat & in_2020).groupBy('utc_arrived_date', 'arrive_month_date').count().orderBy('arrive_month_date', ascending=True).toPandas()
    panel = plt.subplot(4,2,i)
    panel.set_title(f"Category: {cat}")
    sns.lineplot(x="arrive_month_date", y="count", linewidth=1.5, data=df1, color='b', label="2019.3.15-2019.4.15")
    sns.lineplot(x="arrive_month_date", y="count", linewidth=1.5, data=df2, color='r', label="2020.3.15-2020.4.15")
    plt.xticks(rotation="vertical")
    plt.legend()
plt.show()

## Three-month time period (2019.3.15-2019.6.15 & 2020.3.15-2020.6.15)

#### Parent category = Food, Nightlife Spot, Arts & Entertainment, Event

In [0]:
# One panel per parent category: daily visit counts for 15 Mar - 15 Jun,
# 2019 (blue) against 2020 (red), aligned on the month-day key.
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
plt.figure(figsize=(20, 18))
# Parent names in the same order as the lists in cat_list
# ([food_cat, night_spot_cat, art_entertainment_cat, event_cat]). Each panel is
# titled with the parent name: cat[0] is merely the first child category of the
# parent (and raises IndexError if a parent has no children).
parent_names = ["Food", "Nightlife Spot", "Arts & Entertainment", "Event"]
for i, (parent_name, cat) in enumerate(zip(parent_names, cat_list), start=1):
    df1 = visit_2019_2020.filter((f.col("venue_category").isin(cat)) & (f.col("utc_arrived_date").between("2019-03-15", "2019-06-15"))).groupBy('utc_arrived_date', 'arrive_month_date').count().orderBy('arrive_month_date', ascending=True).toPandas()
    df2 = visit_2019_2020.filter((f.col("venue_category").isin(cat)) & (f.col("utc_arrived_date").between("2020-03-15", "2020-06-15"))).groupBy('utc_arrived_date', 'arrive_month_date').count().orderBy('arrive_month_date', ascending=True).toPandas()
    ax = plt.subplot(2,2,i)
    ax.set_title(f"Category: {parent_name}", fontsize=16)
    sns.lineplot(x = "arrive_month_date", y = "count", linewidth=1.5, data = df1, color='b', label="2019.3.15-2019.6.15", ax=ax)
    sns.lineplot(x = "arrive_month_date", y = "count", linewidth=1.5, data = df2, color='r', label="2020.3.15-2020.6.15", ax=ax)
    # about three months of daily labels would overlap, so cap the number of x ticks
    ax.xaxis.set_major_locator(ticker.MaxNLocator(4))
    plt.legend()
plt.tight_layout()
plt.show()

#### Check for outdoor categories
- level 0: Outdoors & Recreation
- level 1: Big Box Store, Drugstore, Grocery Store, Market, Organic Grocery, Pharmacy, Supermarket

In [0]:
# level 0: Outdoors & Recreation
# Daily visit counts for the outdoor categories over 15 Mar - 15 Jun,
# 2019 against 2020, aligned on the month-day key.
outdoor_visits = visit_2019_2020.filter(f.col("venue_category").isin(outdoor_cat))
df1 = (
    outdoor_visits
    .filter(f.col("utc_arrived_date").between("2019-03-15", "2019-06-15"))
    .groupBy('utc_arrived_date', 'arrive_month_date')
    .count()
    .orderBy('arrive_month_date', ascending=True)
    .toPandas()
)
df2 = (
    outdoor_visits
    .filter(f.col("utc_arrived_date").between("2020-03-15", "2020-06-15"))
    .groupBy('utc_arrived_date', 'arrive_month_date')
    .count()
    .orderBy('arrive_month_date', ascending=True)
    .toPandas()
)

fig, ax = plt.subplots(figsize=(12,8))
yearly_series = (
    (df1, 'b', "2019.3.15-2019.6.15"),
    (df2, 'r', "2020.3.15-2020.6.15"),
)
for series_df, line_colour, series_label in yearly_series:
    sns.lineplot(x="arrive_month_date", y="count", linewidth=1.5, data=series_df, color=line_colour, label=series_label, ax=ax)
# limit the number of x ticks so the daily labels stay readable
ax.xaxis.set_major_locator(ticker.MaxNLocator(4))
plt.title("Level 0: Outdoors & Recreation", fontsize=16)
plt.legend()
plt.show()

In [0]:
# Level 1: Big Box Store, Drugstore, Grocery Store, Market, Organic Grocery, Pharmacy, Supermarket
# One panel per level-1 category over 15 Mar - 15 Jun. A visit counts when either
# its primary or its secondary venue category equals the level-1 name.
plt.figure(figsize=(20,30))
level1_list = ["Big Box Store", "Drugstore", "Grocery Store", "Market", "Organic Grocery", "Pharmacy", "Supermarket"]
for i, cat in enumerate(level1_list, start=1):
    matches_cat = (f.col("venue_category_2") == cat) | (f.col("venue_category") == cat)
    in_2019 = f.col("utc_arrived_date").between("2019-03-15", "2019-06-15")
    in_2020 = f.col("utc_arrived_date").between("2020-03-15", "2020-06-15")
    df1 = visit_2019_2020.filter(matches_cat & in_2019).groupBy('utc_arrived_date', 'arrive_month_date').count().orderBy('arrive_month_date', ascending=True).toPandas()
    df2 = visit_2019_2020.filter(matches_cat & in_2020).groupBy('utc_arrived_date', 'arrive_month_date').count().orderBy('arrive_month_date', ascending=True).toPandas()
    ax = plt.subplot(4,2,i)
    ax.set_title(f"Level 1 Category: {cat}", fontsize=16)
    sns.lineplot(x="arrive_month_date", y="count", linewidth=1.5, data=df1, color='b', label="2019.3.15-2019.6.15", ax=ax)
    sns.lineplot(x="arrive_month_date", y="count", linewidth=1.5, data=df2, color='r', label="2020.3.15-2020.6.15", ax=ax)
    # limit the number of x ticks so the daily labels stay readable
    ax.xaxis.set_major_locator(ticker.MaxNLocator(4))
    ax.legend()
plt.tight_layout()
plt.show()

# Update Sept 24: For every category, compute a daily visit ratio (visits to the category / all visits) and plot it for 2019 and 2020

#### Parent category = Food, Nightlife Spot, Arts & Entertainment, Event

In [0]:
# Execute the shared categorization notebook again, presumably so this section
# can be run on its own. The cells below read `parent_categorization`, which
# that notebook is expected to define — TODO confirm.
%run "/Shared/duke_university/data processing/Venue Parent Categorization"

In [0]:
# create lists of category names
def _child_categories(parent_name):
    """Return the `category` values whose `level_0_name` equals *parent_name*."""
    matching = parent_categorization.filter(f.col("level_0_name") == parent_name)
    return matching.select("category").rdd.flatMap(lambda row: row).collect()


food_cat = _child_categories("Food")
night_spot_cat = _child_categories("Nightlife Spot")
art_entertainment_cat = _child_categories("Arts & Entertainment")
event_cat = _child_categories("Event")
outdoor_cat = _child_categories("Outdoors & Recreation")
college_cat = _child_categories("College & University")
professional_cat = _child_categories("Professional & Other Places")
residence_cat = _child_categories("Residence")
shop_service_cat = _child_categories("Shop & Service")
transportation_cat = _child_categories("Travel & Transportation")


######
# Child categories of the four parents that are pooled together as "others".
other_parents = ["College & University", "Professional & Other Places", "Residence", "Shop & Service"]
others = parent_categorization.filter(f.col("level_0_name").isin(other_parents)).select("category").rdd.flatMap(lambda row: row).collect()


In [0]:
# Keep only visits whose arrival year is 2018, 2019 or 2020.
visit_2018_to_2020 = visit.where(f.col("arrive_year").isin("2018", "2019", "2020"))

In [0]:
# add a column of parent category onto visit_2018_to_2020 df
# Left join keeps visits whose category has no entry in the lookup; for those
# rows parent_category is null.
category_match = visit_2018_to_2020.venue_category == parent_categorization.category
visit_2018_to_2020 = (
    visit_2018_to_2020
    .join(parent_categorization, category_match, "left")
    .drop("category", "level_1_name")
    .withColumnRenamed("level_0_name", "parent_category")
)
#visit_2018_to_2020.display()


In [0]:
# List the column names of the joined table (e.g. to check that `parent_category` is present).
visit_2018_to_2020.columns

### Add a region column to the DataFrame

In [0]:
# Five-region grouping of US states by two-letter postal code.
# VA and WV belong to the Southeast; they were previously listed under
# southwest, which skewed both regional series.
midwest = ["IL", "IN", "IA", "KS", "MI", "MN", "MO", "NE", "ND", "OH", "SD", "WI"]
northeast = ["CT", "DE", "ME", "MD", "MA", "NH", "NJ", "NY", "PA", "RI", "VT"]
southeast = ["AL", "AR", "FL", "GA", "KY", "LA", "MS", "NC", "SC", "TN", "VA", "WV"]
southwest = ["AZ", "NM", "OK", "TX"]
west = ["AK", "CA", "CO", "HI", "ID", "MT", "NV", "OR", "UT", "WA", "WY"]

# Map each visit's state to its region. West is matched explicitly so that
# rows with a missing or unlisted state (null, DC, territories, non-US values)
# are labelled "other" instead of being silently counted as "west".
visit_2019_2020 = visit_2019_2020.withColumn(
    "region",
    f.when(f.col("state").isin(midwest), "midwest")
    .when(f.col("state").isin(northeast), "northeast")
    .when(f.col("state").isin(southeast), "southeast")
    .when(f.col("state").isin(southwest), "southwest")
    .when(f.col("state").isin(west), "west")
    .otherwise("other"),
)


visit_2019_2020.display()

In [0]:
# test food_cat
# Daily visit counts per region for 2019-2020: once for Food venues only
# (cnt_for_cat) and once for all venues (cnt_for_all).
ratio_keys = ['arrive_year', 'utc_arrived_date', 'arrive_month_date', 'region']
visits_in_scope = visit_2019_2020.filter(f.col("arrive_year").isin([2019, 2020]))
df1 = (
    visits_in_scope
    .filter(f.col("venue_category").isin(food_cat))
    .groupBy(*ratio_keys)
    .count()
    .withColumnRenamed("count", "cnt_for_cat")
    .orderBy('arrive_month_date', ascending=True)
)
df11 = (
    visits_in_scope
    .groupBy(*ratio_keys)
    .count()
    .withColumnRenamed("count", "cnt_for_all")
    .orderBy('arrive_month_date', ascending=True)
)

In [0]:
# Pair each day/region's Food count with the all-venue count and compute the
# share of visits going to Food venues, rounded to four decimals.
join_keys = ["arrive_year", "utc_arrived_date", "arrive_month_date", "region"]
food = df1.join(df11, on=join_keys).orderBy('arrive_year', 'arrive_month_date', ascending=True)
ratio_expr = f.round(f.col("cnt_for_cat") / f.col("cnt_for_all"), 4)
food = food.withColumn("visit_ratio", ratio_expr)
food.display()

In [0]:
import matplotlib.ticker as ticker

# Category: food
# Daily share of visits going to Food venues, per region:
#   visit_ratio = visits to the category / all visits, for the same day and region.
ratio_keys = ['arrive_year', 'utc_arrived_date', 'arrive_month_date', 'region']
# arrive_year is a string column (built with substring), so compare it with strings.
visits_in_scope = visit_2019_2020.filter(f.col("arrive_year").isin("2019", "2020"))
# No sorting before the join: the join does not preserve order, so only the
# final orderBy matters.
df1 = visits_in_scope.filter(f.col("venue_category").isin(food_cat)).groupBy(*ratio_keys).count().withColumnRenamed("count", "cnt_for_cat")
df11 = visits_in_scope.groupBy(*ratio_keys).count().withColumnRenamed("count", "cnt_for_all")

# Inner join: days/regions without any visit to the category are left out.
food = df1.join(df11, on=ratio_keys).orderBy('arrive_year', 'arrive_month_date', ascending=True)
food = food.withColumn("visit_ratio", f.round(f.col("cnt_for_cat") / f.col("cnt_for_all"), 4))

# Collect the aggregated result once. `food` is lazy, so calling toPandas()
# inside the loop would re-run both aggregations and the join for every
# region/year pair (ten times).
food_pd = food.toPandas()

fig = plt.figure(figsize=(20,18))
for i, reg in enumerate(["west", "southeast", "northeast", "midwest", "southwest"], start=1):
    region_pd = food_pd[food_pd["region"] == reg].sort_values("arrive_month_date")
    df_2019 = region_pd[region_pd["arrive_year"] == "2019"]
    df_2020 = region_pd[region_pd["arrive_year"] == "2020"]
    ax = plt.subplot(3,2,i)
    ax.set_title(f"Region: {reg}")
    sns.lineplot(x = "arrive_month_date", y = "visit_ratio", linewidth=1.5, data = df_2019, color='b', label="Year 2019", ax=ax)
    sns.lineplot(x = "arrive_month_date", y = "visit_ratio", linewidth=1.5, data = df_2020, color='r', label="Year 2020", ax=ax)
    # a full year of daily labels would overlap, so cap the number of x ticks
    ax.xaxis.set_major_locator(ticker.MaxNLocator(12))
fig.suptitle('Category: Food', fontsize=24)
plt.show()

In [0]:
# night spot
# Daily share of visits going to Nightlife Spot venues, per region:
#   visit_ratio = visits to the category / all visits, for the same day and region.
ratio_keys = ['arrive_year', 'utc_arrived_date', 'arrive_month_date', 'region']
# arrive_year is a string column (built with substring), so compare it with strings.
visits_in_scope = visit_2019_2020.filter(f.col("arrive_year").isin("2019", "2020"))
# No sorting before the join: the join does not preserve order, so only the
# final orderBy matters.
df1 = visits_in_scope.filter(f.col("venue_category").isin(night_spot_cat)).groupBy(*ratio_keys).count().withColumnRenamed("count", "cnt_for_cat")
df11 = visits_in_scope.groupBy(*ratio_keys).count().withColumnRenamed("count", "cnt_for_all")

# Inner join: days/regions without any visit to the category are left out.
night_spot = df1.join(df11, on=ratio_keys).orderBy('arrive_year', 'arrive_month_date', ascending=True)
night_spot = night_spot.withColumn("visit_ratio", f.round(f.col("cnt_for_cat") / f.col("cnt_for_all"), 4))

# Collect the aggregated result once. `night_spot` is lazy, so calling
# toPandas() inside the loop would re-run both aggregations and the join for
# every region/year pair (ten times).
night_spot_pd = night_spot.toPandas()

fig = plt.figure(figsize=(20,18))
for i, reg in enumerate(["west", "southeast", "northeast", "midwest", "southwest"], start=1):
    region_pd = night_spot_pd[night_spot_pd["region"] == reg].sort_values("arrive_month_date")
    df_2019 = region_pd[region_pd["arrive_year"] == "2019"]
    df_2020 = region_pd[region_pd["arrive_year"] == "2020"]
    ax = plt.subplot(3,2,i)
    ax.set_title(f"Region: {reg}")
    sns.lineplot(x = "arrive_month_date", y = "visit_ratio", linewidth=1.5, data = df_2019, color='b', label="Year 2019", ax=ax)
    sns.lineplot(x = "arrive_month_date", y = "visit_ratio", linewidth=1.5, data = df_2020, color='r', label="Year 2020", ax=ax)
    # a full year of daily labels would overlap, so cap the number of x ticks
    ax.xaxis.set_major_locator(ticker.MaxNLocator(12))
fig.suptitle('Category: Night Spot', fontsize=24)
plt.show()

In [0]:
# Art entertainment
# Daily share of visits going to Arts & Entertainment venues, per region:
#   visit_ratio = visits to the category / all visits, for the same day and region.
ratio_keys = ['arrive_year', 'utc_arrived_date', 'arrive_month_date', 'region']
# arrive_year is a string column (built with substring), so compare it with strings.
visits_in_scope = visit_2019_2020.filter(f.col("arrive_year").isin("2019", "2020"))
# No sorting before the join: the join does not preserve order, so only the
# final orderBy matters.
df1 = visits_in_scope.filter(f.col("venue_category").isin(art_entertainment_cat)).groupBy(*ratio_keys).count().withColumnRenamed("count", "cnt_for_cat")
df11 = visits_in_scope.groupBy(*ratio_keys).count().withColumnRenamed("count", "cnt_for_all")

# Inner join: days/regions without any visit to the category are left out.
art_entertainment = df1.join(df11, on=ratio_keys).orderBy('arrive_year', 'arrive_month_date', ascending=True)
art_entertainment = art_entertainment.withColumn("visit_ratio", f.round(f.col("cnt_for_cat") / f.col("cnt_for_all"), 4))

# Collect the aggregated result once. `art_entertainment` is lazy, so calling
# toPandas() inside the loop would re-run both aggregations and the join for
# every region/year pair (ten times).
art_entertainment_pd = art_entertainment.toPandas()

fig = plt.figure(figsize=(20,18))
for i, reg in enumerate(["west", "southeast", "northeast", "midwest", "southwest"], start=1):
    region_pd = art_entertainment_pd[art_entertainment_pd["region"] == reg].sort_values("arrive_month_date")
    df_2019 = region_pd[region_pd["arrive_year"] == "2019"]
    df_2020 = region_pd[region_pd["arrive_year"] == "2020"]
    ax = plt.subplot(3,2,i)
    ax.set_title(f"Region: {reg}")
    sns.lineplot(x = "arrive_month_date", y = "visit_ratio", linewidth=1.5, data = df_2019, color='b', label="Year 2019", ax=ax)
    sns.lineplot(x = "arrive_month_date", y = "visit_ratio", linewidth=1.5, data = df_2020, color='r', label="Year 2020", ax=ax)
    # a full year of daily labels would overlap, so cap the number of x ticks
    ax.xaxis.set_major_locator(ticker.MaxNLocator(12))
fig.suptitle('Category: Art & Entertainment', fontsize=28)
plt.show()

In [0]:
# Event
# Daily share of visits going to Event venues, per region:
#   visit_ratio = visits to the category / all visits, for the same day and region.
ratio_keys = ['arrive_year', 'utc_arrived_date', 'arrive_month_date', 'region']
# arrive_year is a string column (built with substring), so compare it with strings.
visits_in_scope = visit_2019_2020.filter(f.col("arrive_year").isin("2019", "2020"))
# No sorting before the join: the join does not preserve order, so only the
# final orderBy matters.
df1 = visits_in_scope.filter(f.col("venue_category").isin(event_cat)).groupBy(*ratio_keys).count().withColumnRenamed("count", "cnt_for_cat")
df11 = visits_in_scope.groupBy(*ratio_keys).count().withColumnRenamed("count", "cnt_for_all")

# Inner join: days/regions without any visit to the category are left out.
event = df1.join(df11, on=ratio_keys).orderBy('arrive_year', 'arrive_month_date', ascending=True)
event = event.withColumn("visit_ratio", f.round(f.col("cnt_for_cat") / f.col("cnt_for_all"), 4))

# Collect the aggregated result once. `event` is lazy, so calling toPandas()
# inside the loop would re-run both aggregations and the join for every
# region/year pair (ten times).
event_pd = event.toPandas()

fig = plt.figure(figsize=(20,18))
for i, reg in enumerate(["west", "southeast", "northeast", "midwest", "southwest"], start=1):
    region_pd = event_pd[event_pd["region"] == reg].sort_values("arrive_month_date")
    df_2019 = region_pd[region_pd["arrive_year"] == "2019"]
    df_2020 = region_pd[region_pd["arrive_year"] == "2020"]
    ax = plt.subplot(3,2,i)
    ax.set_title(f"Region: {reg}")
    sns.lineplot(x = "arrive_month_date", y = "visit_ratio", linewidth=1.5, data = df_2019, color='b', label="Year 2019", ax=ax)
    sns.lineplot(x = "arrive_month_date", y = "visit_ratio", linewidth=1.5, data = df_2020, color='r', label="Year 2020", ax=ax)
    # a full year of daily labels would overlap, so cap the number of x ticks
    ax.xaxis.set_major_locator(ticker.MaxNLocator(12))
fig.suptitle('Category: Event', fontsize=24)
plt.show()