In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488490 sha256=554c0fda96d55ac9b21eeb80d065e9002195fb331ea123bee665cbdb32651e61
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
# import dependencies
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [None]:
# Create (or reuse) the Spark session used later to read and aggregate the travel data.
spark = (
  SparkSession.builder
  .appName("Merge DFs")
  .config("spark.executor.memory", "4g")
  .config("spark.executor.cores", "4")
  .getOrCreate()
)

In [None]:
# Mount Google Drive at /content/drive; every input file below is read from under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Preprocessing
This notebook merges the individual data sources into one unified dataset that is shared across our models. First, we preprocess each DataFrame so that they all use consistent `City` and `Date` columns. Then we merge the DataFrames.

### ZHVI

In [None]:
# Load the processed ZHVI table, discard the saved index column, and turn the
# 'YYYY-MM' strings in Date into first-of-month timestamps.
zhvi_df = (
  pd.read_csv('/content/drive/MyDrive/MADS Capstone Team 23/Data/processed/processed_zhvi_df.csv')
  .drop(columns=['Unnamed: 0'])
  .assign(Date=lambda frame: pd.to_datetime(frame['Date'] + '-01'))
)
zhvi_df.head()

Unnamed: 0,City,Date,ZHVI
0,New York,2010-01-01,393767.004762
1,Los Angeles,2010-01-01,418680.256759
2,Chicago,2010-01-01,201888.6066
3,Dallas,2010-01-01,150742.550106
4,Seattle,2010-01-01,333469.123267


### Crime Data

In [None]:
# (file-name prefix, display name) pairs: the first element selects a city's crime CSVs,
# the second is written into the City column.
cities = [('chicago','Chicago'), ('dallas', "Dallas"), ('losangeles', 'Los Angeles'), ('newyork', 'New York'), ('seattle', 'Seattle')]

In [None]:
# Build one crime frame per city and concatenate them once at the end
# (concatenating inside the loop re-copies the accumulated frame on every pass).
crime_frames = []

for city, city_name in cities:
  print(f"processing {city}-{city_name}")

  # Load the monthly property- and violent-crime counts for this city
  city_property_crime_df = pd.read_csv(f'/content/drive/MyDrive/MADS Capstone Team 23/Data/processed/Crime/{city}_property_crimes_by_month.csv')
  city_violent_crime_df = pd.read_csv(f'/content/drive/MyDrive/MADS Capstone Team 23/Data/processed/Crime/{city}_violent_crimes_by_month.csv')

  # Give the shared count column a distinct name in each frame so both survive the merge
  city_property_crime_df = city_property_crime_df.rename(columns={"NumberOfIncident": "PropertyCrimeIncident"})
  city_violent_crime_df = city_violent_crime_df.rename(columns={"NumberOfIncident": "ViolentCrimeIncident"})

  # Inner join on Date: only months present in both files are kept
  city_crime_df = pd.merge(city_property_crime_df, city_violent_crime_df, on='Date')

  # Tag every row with the display name used as the City join key
  city_crime_df['City'] = city_name

  crime_frames.append(city_crime_df)

crime_df = pd.concat(crime_frames, ignore_index=True)

# Date arrives as a 'YYYY-MM' string; anchor it to the first day of the month
crime_df['Date'] = pd.to_datetime(crime_df['Date'] + '-01')
crime_df.head()

processing chicago-Chicago
processing dallas-Dallas
processing losangeles-Los Angeles
processing newyork-New York
processing seattle-Seattle


Unnamed: 0,Date,PropertyCrimeIncident,ViolentCrimeIncident,City
0,2001-01-01,18521.0,11172.0,Chicago
1,2001-02-01,16068.0,10088.0,Chicago
2,2001-03-01,18915.0,12906.0,Chicago
3,2001-04-01,19099.0,13392.0,Chicago
4,2001-05-01,19794.0,14403.0,Chicago


### New Home Permit

In [None]:
# Reshape the permits table from one column per month to one row per (City, Date),
# then convert the month labels to first-of-month timestamps.
home_permit_df = (
  pd.read_csv('/content/drive/MyDrive/MADS Capstone Team 23/Data/processed/NewHomeBuilt/newhome_permits_2010_2024.csv')
  .melt(id_vars=['city'], var_name='Date', value_name='NewHomePermits')
  .rename(columns={'city': 'City'})
  .assign(Date=lambda frame: pd.to_datetime(frame['Date'] + '-01'))
)
home_permit_df.head()

Unnamed: 0,City,Date,NewHomePermits
0,New York,2010-01-01,874.0
1,Chicago,2010-01-01,345.0
2,Los Angeles,2010-01-01,433.0
3,Dallas,2010-01-01,1565.0
4,Seattle,2010-01-01,1190.0


### Population Migration

In [None]:
def process_and_interpolate(df, city_name, start='2010-01-01', end='2022-12-01'):
  """Expand a yearly table to monthly rows by linear interpolation.

  Args:
    df: DataFrame with a 'Year' column (parsed with format '%Y') and numeric
      value columns. The input frame is not modified.
    city_name: Value written to the 'City' column of every output row.
    start: First month of the output range (month-start frequency).
    end: Last month of the output range.

  Returns:
    A new DataFrame with one row per month between `start` and `end`:
    'Date' as a 'YYYY-MM' string, the interpolated value columns rounded and
    cast to int, and 'City'.

  Raises:
    ValueError: from the integer cast if values are still missing after
      interpolation (for example, months before the first available year).
  """
  # Work on a copy so the caller's frame keeps its 'Year' column and index
  yearly = df.copy()
  yearly['Year'] = pd.to_datetime(yearly['Year'], format='%Y')
  yearly = yearly.set_index('Year')

  # Target index: the first day of every month in the requested range
  monthly_index = pd.date_range(start=start, end=end, freq='MS')

  # Yearly rows land on their January; the months in between start out as NaN
  # and are filled linearly. Months after the last known value repeat it.
  df_monthly = yearly.reindex(monthly_index).interpolate(method='linear')

  df_monthly = df_monthly.round(0).astype(int)

  df_monthly['City'] = city_name

  # Name the index explicitly so the resulting column is 'Date'
  df_monthly = df_monthly.rename_axis('Date').reset_index()

  df_monthly['Date'] = df_monthly['Date'].dt.strftime('%Y-%m')

  return df_monthly

In [None]:
# (file-name token, display name) pairs for the population CSVs.
# NOTE: this rebinds `cities`, which the crime cell above iterates with lowercase file prefixes;
# re-running that cell after this one would look for differently named crime files.
cities = [('Chicago','Chicago'), ('Dallas', "Dallas"), ('LosAngeles', 'Los Angeles'), ('NewYork', 'New York'), ('Seattle', 'Seattle')]

In [None]:
# Interpolate each city's yearly population file to monthly rows, then stack the cities.
population_frames = []

for city, city_name in cities:
  print(f"processing {city}-{city_name}")
  yearly_population = pd.read_csv(f'/content/drive/MyDrive/MADS Capstone Team 23/Data/processed/Population/Population_{city}_2010_2022.csv')
  population_frames.append(process_and_interpolate(yearly_population, city_name))

population_migration_df = pd.concat(population_frames, ignore_index=True)
# The helper returns Date as 'YYYY-MM'; convert it to a first-of-month timestamp
population_migration_df['Date'] = pd.to_datetime(population_migration_df['Date'] + '-01')
population_migration_df.head()

processing Chicago-Chicago
processing Dallas-Dallas
processing LosAngeles-Los Angeles
processing NewYork-New York
processing Seattle-Seattle


Unnamed: 0,Date,Population_In_Housing_Units,Owner_Occupied_Population,Renter-Occupied-Population,OOP_Moved_From_Different_County,OOP_Moved_From_Different_State,OOP_Moved_From_Abroad,ROP_Moved_From_Different_County,ROP_Moved_From_Different_State,ROP_Moved_From_Abroad,City
0,2010-01-01,9206524,6353379,2853145,63533,44473,12706,91300,97006,34237,Chicago
1,2010-02-01,9208651,6351028,2857623,62982,44984,13228,92170,97885,34291,Chicago
2,2010-03-01,9210778,6348678,2862100,62432,45494,13751,93040,98764,34344,Chicago
3,2010-04-01,9212906,6346328,2866578,61881,46005,14273,93910,99643,34398,Chicago
4,2010-05-01,9215033,6343977,2871056,61331,46516,14796,94780,100522,34452,Chicago


### Travel Data

In [None]:
# Absolute path to the shared data folder, matching the '/content/drive/...' root used by the
# pandas loaders above, so reads do not depend on the notebook's current working directory.
shared_drive = '/content/drive/MyDrive/MADS Capstone Team 23/Data'
trns_path = f'{shared_drive}/processed/travel_data/travel.csv'

# Read the travel CSV with Spark (header row, inferred column types) and expose it to
# Spark SQL under the view name "travel".
travel = (
  spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv(trns_path)
)
travel.createOrReplaceTempView("travel")

In [None]:
# Pivot the travel view to one row per (month, metro): each output column sums the unique
# inbound/outbound counts for one domestic/international x cargo/passenger combination.
query = """
  SELECT
    MAKE_DATE(YEAR, MONTH, 1) AS Date,
    METRO AS City,
    SUM(IF(INTL_OR_DOM = 'Dom' AND PASSNGR_OR_CARGO = 'Cargo', UNIQ_OUTBOUND_CNT, 0)) AS cargo_domestic_outbound,
    SUM(IF(INTL_OR_DOM = 'Dom' AND PASSNGR_OR_CARGO = 'Cargo', UNIQ_INBOUND_CNT, 0)) AS cargo_domestic_inbound,
    SUM(IF(INTL_OR_DOM = 'Dom' AND PASSNGR_OR_CARGO = 'Passenger', UNIQ_OUTBOUND_CNT, 0)) AS passenger_domestic_outbound,
    SUM(IF(INTL_OR_DOM = 'Dom' AND PASSNGR_OR_CARGO = 'Passenger', UNIQ_INBOUND_CNT, 0)) AS passenger_domestic_inbound,
    SUM(IF(INTL_OR_DOM = 'Intl' AND PASSNGR_OR_CARGO = 'Cargo', UNIQ_OUTBOUND_CNT, 0)) AS cargo_intl_outbound,
    SUM(IF(INTL_OR_DOM = 'Intl' AND PASSNGR_OR_CARGO = 'Cargo', UNIQ_INBOUND_CNT, 0)) AS cargo_intl_inbound,
    SUM(IF(INTL_OR_DOM = 'Intl' AND PASSNGR_OR_CARGO = 'Passenger', UNIQ_OUTBOUND_CNT, 0)) AS passenger_intl_outbound,
    SUM(IF(INTL_OR_DOM = 'Intl' AND PASSNGR_OR_CARGO = 'Passenger', UNIQ_INBOUND_CNT, 0)) AS passenger_intl_inbound
  FROM travel
  GROUP BY 1,2
  ORDER BY 1,2
"""

# Run the aggregation in Spark, then bring the result into pandas
results = spark.sql(query)
travel_df = results.toPandas()
# Convert Date to a pandas datetime so it can be joined with the other frames
travel_df['Date'] = pd.to_datetime(travel_df['Date'])
travel_df.head()

Unnamed: 0,Date,City,cargo_domestic_outbound,cargo_domestic_inbound,passenger_domestic_outbound,passenger_domestic_inbound,cargo_intl_outbound,cargo_intl_inbound,passenger_intl_outbound,passenger_intl_inbound
0,2000-01-01,Chicago,1.0,0.0,142.0,136.0,26.0,22.0,78.0,75.0
1,2000-01-01,Dallas,1.0,0.0,132.0,136.0,8.0,5.0,38.0,36.0
2,2000-01-01,Los Angeles,3.0,2.0,98.0,91.0,28.0,25.0,90.0,83.0
3,2000-01-01,New York,3.0,0.0,114.0,110.0,44.0,42.0,136.0,137.0
4,2000-01-01,Seattle,1.0,0.0,104.0,105.0,9.0,4.0,25.0,25.0


### Politics Data

In [None]:
# Load the metro politics table, treat missing values as 0, build a first-of-month Date from
# the year/month columns, and rename the metro column to the shared City key.
trns_path = f'{shared_drive}/processed/congress_data_files/metro_politics.csv'
congress_df = (
  pd.read_csv(trns_path)
  .fillna(0)
  .assign(Date=lambda frame: pd.to_datetime(frame['spons_legis_intro_yr'].astype(str) + '-' +
                                            frame['spons_legis_intro_mnth'].astype(str) + '-01'))
  .drop(columns=['spons_legis_intro_mnth', 'spons_legis_intro_yr'])
  .rename(columns={'metro': 'City'})
)
congress_df.head()

Unnamed: 0,City,dem_senate_cnt,rep_senate_cnt,dem_house_cnt,rep_house_cnt,other_senate_cnt,other_house_cnt,dem_spons_legis_cnt,rep_spons_legis_cnt,other_spons_legis_cnt,dem_cospons_legis_cnt,rep_cospons_legis_cnt,other_cospons_legis_cnt,Date
0,Chicago,1,1,9,7,0,0,12.0,1.0,0.0,54,19,0,1999-01-01
1,Dallas,0,2,3,6,0,0,6.0,14.0,0.0,29,36,0,1999-01-01
2,Los Angeles,2,0,10,14,0,0,20.0,21.0,0.0,56,39,0,1999-01-01
3,New York,2,0,22,8,0,0,97.0,29.0,0.0,81,41,0,1999-01-01
4,Seattle,1,1,4,2,0,0,0.0,0.0,0.0,28,26,0,1999-01-01


### Climate Data

In [None]:
# Load the city weather table, derive a first-of-month Date from yearmonth, drop the station
# metadata columns, and rename the name column to the shared City key.
trns_path = f'{shared_drive}/processed/climate_data/city_weather'
weather_df = (
  pd.read_csv(trns_path)
  .assign(Date=lambda frame: pd.to_datetime(frame['yearmonth'] + '-01'))
  .drop(columns=['Unnamed: 0', 'station', 'lat', 'long', 'elevation',
                 'station_nm', 'yearmonth'])
  .rename(columns={'name': 'City'})
)
weather_df.head()

Unnamed: 0,City,days_with_thunderstorms,precipitation,percent_of_possible_sunshine,relative_humidity_avg,relative_humidity_min,relative_humidity_max,snowfall,temp_avg,temp_max,temp_min,total_sunshine,Date
0,Seattle,,3.77,,,,,,40.3,45.2,35.3,,2000-01-01
1,Seattle,1.0,5.26,,,,,,43.7,49.9,37.5,,2000-02-01
2,Seattle,,2.83,,,,,,44.5,50.9,38.0,,2000-03-01
3,Seattle,,1.48,,,,,,50.9,58.8,43.0,,2000-04-01
4,Seattle,,3.27,,,,,,53.8,60.7,47.0,,2000-05-01


### HUD

In [None]:
trns_path = f'{shared_drive}/processed/hud_40th_percentiles/hud_data.json'
hud_df = pd.read_json(trns_path, orient='records', lines=True)

# HUD metro-area names -> the short city names used as the join key everywhere else
city_name_map = {
    'Seattle-Tacoma-Bellevue, WA Metro Area':'Seattle',
    'Chicago-Naperville-Elgin, IL-IN-WI Metro Area':'Chicago',
    'Los Angeles-Long Beach-Anaheim, CA Metro Area': 'Los Angeles',
    'New York-Newark-Jersey City, NY-NJ-PA Metro Area': 'New York',
    'Dallas-Fort Worth-Arlington, TX Metro Area':'Dallas'
}

# Metro areas missing from the map become NaN and are filtered out on the next line
hud_df['City'] = hud_df['metro_name'].map(city_name_map)
# Take an explicit copy so the column assignments below act on an independent frame
# rather than on a slice of the unfiltered data
hud_df = hud_df[hud_df['City'].isin(city_name_map.values())].copy()
#technically compiled in May but since represents whole year call Jan
hud_df['Date'] = pd.to_datetime(hud_df['year'].astype(str) + '-01-01')
# City and Date stay as the last two columns; the merge cell relies on that layout
hud_df = hud_df.drop(columns=['metro_name', 'metro_code', 'year'])
hud_df.head()


Unnamed: 0,percentile,studio,1_bedroom,2_bedroom,3_bedroom,4_bedroom,City,Date
50,40.0,517.916308,624.719827,746.86894,937.264145,1047.851746,Chicago,2000-01-01
51,41.25,572.106821,688.615109,824.3932,1034.881676,1156.255843,Chicago,2001-01-01
52,41.25,601.499685,724.267191,867.198749,1088.332595,1217.11636,Chicago,2002-01-01
53,41.25,625.91981,753.641114,902.320601,1132.149346,1265.827537,Chicago,2003-01-01
54,41.25,641.31708,771.879444,924.358024,1159.96946,1296.413014,Chicago,2004-01-01


### Stocks

In [None]:
trns_path = f'{shared_drive}/processed/HousingStocks/capstone_stocks.xlsx'
stock_dfs = {}

# Open the workbook in a context manager so the file handle is closed after parsing
with pd.ExcelFile(trns_path) as xl:
  # The first sheet is skipped; every remaining sheet name is used as a ticker.
  # NOTE(review): presumably the first sheet is a summary/cover sheet - confirm.
  tickers = xl.sheet_names[1:]
  for ticker in tickers:
    sheet_df = xl.parse(ticker)
    # Strip literal '*' characters from the column headers
    sheet_df.columns = sheet_df.columns.str.replace('*', '', regex=False)
    sheet_df = sheet_df.drop(columns=['Company Name', 'Ticker'])
    # Prefix every column except Date with the ticker so the sheets can be merged side by side
    sheet_df.columns = [f"{ticker}_{col}" if col != 'Date' else 'Date' for col in sheet_df.columns]
    sheet_df['Date'] = pd.to_datetime(sheet_df['Date'])
    stock_dfs[ticker] = sheet_df

# Outer-join all tickers on Date. Seeding from the first frame (instead of testing for an
# empty accumulator) keeps a first sheet that has headers but no rows.
ticker_frames = list(stock_dfs.values())
stock_df = ticker_frames[0] if ticker_frames else pd.DataFrame()
for ticker_frame in ticker_frames[1:]:
  stock_df = stock_df.merge(ticker_frame, on=['Date'], how='outer')

stock_df.head()

Unnamed: 0,Date,CBRE_Open,CBRE_High,CBRE_Low,CBRE_Close,CBRE_Adj Close,CBRE_Volume,HOUS_Open,HOUS_High,HOUS_Low,...,RMAX_Low,RMAX_Close,RMAX_Adj Close,RMAX_Volume,RDFN_Open,RDFN_High,RDFN_Low,RDFN_Close,RDFN_Adj Close,RDFN_Volume
0,2024-07-22,97.84,98.82,96.38,98.77,98.77,1319773.0,4.59,4.5963,4.38,...,9.02,9.42,9.42,157537.0,8.06,8.16,7.66,7.72,7.72,3585756.0
1,2024-07-01,88.77,99.56,85.74,98.77,98.77,19204200.0,3.31,4.88,3.15,...,7.86,9.42,9.42,4103000.0,6.0,8.81,5.49,7.72,7.72,79190500.0
2,2024-06-01,88.64,91.32,84.24,89.11,89.11,34845700.0,4.19,4.41,3.01,...,7.53,8.1,8.1,5118800.0,6.66,7.6,5.75,6.01,6.01,76716500.0
3,2024-05-01,86.94,93.4,85.09,88.07,88.07,38185700.0,4.87,5.71,4.03,...,6.96,8.09,8.09,6122600.0,5.6,8.54,5.49,6.44,6.44,106214900.0
4,2024-04-01,97.01,97.35,84.33,86.89,86.89,32536600.0,6.17,6.19,4.8,...,6.94,7.02,7.02,6145300.0,6.69,6.69,5.1,5.61,5.61,95930200.0


### Income

In [None]:
trns_path = f'{shared_drive}/processed/bls_income/BLS_income_data.json'
income_df = pd.read_json(trns_path, orient='records', lines=True)
#many titles to explore with  o-group == major, worth exploring if time allows
#o-group total added 2009 so using total sets low bound on year

# BLS area titles -> the short city names used as the join key everywhere else
city_name_map = {
    'Seattle-Tacoma-Bellevue, WA':'Seattle',
    'Chicago-Naperville-Elgin, IL-IN-WI':'Chicago',
    'Los Angeles-Long Beach-Anaheim, CA': 'Los Angeles',
    'New York-Newark-Jersey City, NY-NJ-PA': 'New York',
    'Dallas-Fort Worth-Arlington, TX':'Dallas'
}

# Keep only the 'total' occupation group, and take an explicit copy so the column
# assignments below act on an independent frame rather than on a slice
income_df = income_df[income_df['o_group'] == 'total'].copy()

# Areas missing from the map become NaN and are filtered out on the next line
income_df['City'] = income_df['area_title'].map(city_name_map)
income_df = income_df[income_df['City'].isin(city_name_map.values())].copy()

# Yearly figures are anchored to January 1st of their year
income_df['Date'] = pd.to_datetime(income_df['year'].astype(str) + '-01-01')

# City and Date stay as the last two columns; the merge cell relies on that layout
income_df = (
  income_df
  .drop(columns=['area', 'area_title', 'o_group', 'occ_title', 'year', 'occ_code'])
  .reset_index(drop=True)
)

income_df.head()

Unnamed: 0,tot_emp,a_mean,a_median,h_mean,h_median,City,Date
0,3962220,49010.0,36170.0,23.56,17.39,Los Angeles,2009-01-01
1,3669800,48910.0,36830.0,23.51,17.71,Chicago,2009-01-01
2,245880,45480.0,35590.0,21.87,17.11,New York,2009-01-01
3,2047600,46110.0,34560.0,22.17,16.61,Dallas,2009-01-01
4,1404710,53240.0,42750.0,25.6,20.55,Seattle,2009-01-01


# Merge DataFrames

In [None]:
# Size of the base frame; every merge below is a left join onto it.
zhvi_df.shape

(780, 3)

In [None]:
def _merge_yearly_by_city(monthly_df, yearly_df):
  """Left-join a yearly (City, Date) table onto a monthly one and forward-fill it per city.

  The yearly rows match only the months whose Date equals the yearly Date; the remaining
  months of each city are filled from that city's most recent earlier value. Filling is
  done within each city, so values never carry over from one city to the next. The row
  order and index of the merged frame are left as produced by the merge.
  """
  value_cols = [col for col in yearly_df.columns if col not in ('City', 'Date')]
  merged = monthly_df.merge(yearly_df, on=['City', 'Date'], how='left')
  if value_cols:
    # Sort so each city's rows are in date order for the fill; the assignment aligns on
    # the index, so `merged` itself keeps its original row order.
    merged[value_cols] = (
      merged.sort_values(by=['City', 'Date'])
      .groupby('City')[value_cols]
      .ffill()
    )
  return merged


# Left-join every monthly city-level source onto the ZHVI base frame
merged_df = zhvi_df
for city_month_df in (home_permit_df, population_migration_df, crime_df,
                      travel_df, congress_df, weather_df):
  merged_df = merged_df.merge(city_month_df, on=['City', 'Date'], how='left')

#null in stock data means stock did not exist at the time. setting value to 0
merged_df = merged_df.merge(stock_df, on=['Date'], how='left')
stock_cols = [col for col in stock_df.columns if col != 'Date']
# Assign the filled columns back instead of calling fillna(inplace=True) on a column
# selection, which may act on a temporary copy
merged_df[stock_cols] = merged_df[stock_cols].fillna(0)

#merge in HUD data fill forward yearly values for each city while preserving sort order
merged_df = _merge_yearly_by_city(merged_df, hud_df)

#merge in income data fill forward yearly values for each city while preserving sort order
merged_df = _merge_yearly_by_city(merged_df, income_df)

# Left joins must not add rows; a larger count means a source has duplicate join keys
assert len(merged_df) == len(zhvi_df), "a merge duplicated rows; check join keys for duplicates"

merged_df.shape

(780, 87)

In [None]:
# Preview the first rows of the merged frame.
merged_df.head()

Unnamed: 0,City,Date,ZHVI,NewHomePermits,Population_In_Housing_Units,Owner_Occupied_Population,Renter-Occupied-Population,OOP_Moved_From_Different_County,OOP_Moved_From_Different_State,OOP_Moved_From_Abroad,...,studio,1_bedroom,2_bedroom,3_bedroom,4_bedroom,tot_emp,a_mean,a_median,h_mean,h_median
0,New York,2010-01-01,393767.004762,874.0,18324602,10196421,8128181,112160,61178,30589,...,1079.002546,1200.045451,1371.690954,1714.454248,1913.721353,241590,46270.0,36100.0,22.25,17.36
1,Los Angeles,2010-01-01,418680.256759,433.0,12474432,6510008,5964424,52080,26040,26040,...,999.314979,1183.694503,1460.828359,1988.891365,2365.863015,3817570,50240.0,36790.0,24.16,17.69
2,Chicago,2010-01-01,201888.6066,345.0,9206524,6353379,2853145,63533,44473,12706,...,765.730804,876.33058,994.059652,1218.447222,1372.496303,3542180,49140.0,36950.0,23.62,17.77
3,Dallas,2010-01-01,150742.550106,1565.0,6216118,4052381,2163737,81047,32419,12157,...,640.810744,703.833695,830.204024,1085.019817,1327.825169,2001860,46860.0,35280.0,22.53,16.96
4,Seattle,2010-01-01,333469.123267,1190.0,3340958,2150701,1190257,25808,19356,10753,...,745.725796,854.419345,1035.655905,1473.043003,1768.440837,1346300,54610.0,44090.0,26.25,21.2


### Fill outstanding nulls

In [None]:
# Count missing values per column and show only the columns that still have some.
nulls = merged_df.isnull().sum()
nulls.loc[nulls > 0]

Unnamed: 0,0
dem_senate_cnt,20
rep_senate_cnt,20
dem_house_cnt,20
rep_house_cnt,20
other_senate_cnt,20
other_house_cnt,20
dem_spons_legis_cnt,20
rep_spons_legis_cnt,20
other_spons_legis_cnt,20
dem_cospons_legis_cnt,20


In [None]:
# fill days with thunderstorms & snowfall to 0 where null
# no 0's currently exist for storms implying that there is either some value or null
# snowfall is null for all LA records only, implying 0 snow
# (assign the result back: fillna(inplace=True) on a column selection may act on a temporary copy)
zero_fill_cols = ['days_with_thunderstorms', 'snowfall']
merged_df[zero_fill_cols] = merged_df[zero_fill_cols].fillna(0)

# sunshine fields are completely null...removing
# errors='ignore' keeps this cell re-runnable after the columns are already gone
merged_df = merged_df.drop(columns=['percent_of_possible_sunshine', 'total_sunshine'], errors='ignore')

# humidity was randomly not collected at some points...fill with the mean of the same city
for humidity_col in ['relative_humidity_avg', 'relative_humidity_min', 'relative_humidity_max']:
  merged_df[humidity_col] = (
    merged_df.groupby('City')[humidity_col]
    .transform(lambda values: values.fillna(values.mean()))
  )

In [None]:
# nulls in political data are seemingly random...will forwards fill
# Every column that still has nulls is forward-filled within its own city, in date order.
nulls = merged_df.isnull().sum()
null_cols = nulls[nulls > 0].index.tolist()
merged_df['Date'] = pd.to_datetime(merged_df['Date'])
merged_df = merged_df.sort_values(['Date', 'City'])
ffilled_by_city = merged_df.groupby(['City'])[null_cols].ffill()
merged_df[null_cols] = merged_df[null_cols].fillna(ffilled_by_city)

In [None]:
# Final check before writing the CSV: fail loudly if any column still contains nulls
# instead of relying on someone reading the displayed shape.
nulls = merged_df.isnull().sum()
remaining_nulls = nulls[nulls > 0]
assert remaining_nulls.empty, f"columns still contain nulls: {list(remaining_nulls.index)}"
remaining_nulls.shape

(0,)

# Write CSV

In [None]:
# Write the merged dataset to the shared drive; index=False keeps the row index out of the file.
trns_path = f'{shared_drive}/processed/merged.csv'
merged_df.to_csv(trns_path, index = False)