In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488490 sha256=4b9c46dd51c603ca1c2ba23bcc0f54d70e7894b1867f2d40ca171070ba0d1feb
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
# import dependencies
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [3]:
# Build (or reuse) the Spark session that the travel-data cells query.
spark = (
  SparkSession.builder
  .appName("Merge DFs")
  .config("spark.executor.memory", "4g")
  .config("spark.executor.cores", "4")
  .getOrCreate()
)

In [5]:
# Mount Google Drive at /content/drive; the data paths used throughout this
# notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Preprocessing
This notebook merges the individual data sources into a single dataset that is shared amongst our models. First, we preprocess each DataFrame so that they all use consistent `City` and `Date` keys. Then we merge the DataFrames.

### ZHVI

In [6]:
# Load the processed ZHVI table, discard the saved index column, and turn the
# 'YYYY-MM' strings into first-of-month timestamps.
zhvi_df = (
  pd.read_csv('/content/drive/MyDrive/MADS Capstone Team 23/Data/processed/processed_zhvi_df.csv')
  .drop(columns=['Unnamed: 0'])
  .assign(Date=lambda frame: pd.to_datetime(frame['Date'] + '-01'))
)
zhvi_df.head()

Unnamed: 0,City,Date,ZHVI
0,New York,2010-01-01,393767.004762
1,Los Angeles,2010-01-01,418680.256759
2,Chicago,2010-01-01,201888.6066
3,Dallas,2010-01-01,150742.550106
4,Seattle,2010-01-01,333469.123267


### Crime Data

In [7]:
# (file-name key, display name) pairs for the crime CSVs.
cities = [
  ('chicago', 'Chicago'),
  ('dallas', 'Dallas'),
  ('losangeles', 'Los Angeles'),
  ('newyork', 'New York'),
  ('seattle', 'Seattle'),
]

In [8]:
# Build one crime DataFrame holding monthly property and violent crime counts
# for every city.
# The per-city frames are collected in a list and concatenated once: growing a
# DataFrame with pd.concat inside the loop re-copies the accumulated rows on
# every pass, and seeding it with an empty DataFrame triggers a pandas
# FutureWarning about empty entries in concat.
city_crime_frames = []

for city, city_name in cities:
  print(f"processing {city}-{city_name}")

  # Load df from csv
  city_property_crime_df = pd.read_csv(f'/content/drive/MyDrive/MADS Capstone Team 23/Data/processed/Crime/{city}_property_crimes_by_month.csv')
  city_violent_crime_df = pd.read_csv(f'/content/drive/MyDrive/MADS Capstone Team 23/Data/processed/Crime/{city}_violent_crimes_by_month.csv')

  # Both files share the count column name; give each crime type its own
  city_property_crime_df = city_property_crime_df.rename(columns={"NumberOfIncident": "PropertyCrimeIncident"})
  city_violent_crime_df = city_violent_crime_df.rename(columns={"NumberOfIncident": "ViolentCrimeIncident"})

  # Inner join on Date: only months present in both files are kept.
  # Named city_crime_df (not merged_df) so re-running this cell cannot
  # overwrite the final merged dataset built later in the notebook.
  city_crime_df = pd.merge(city_property_crime_df, city_violent_crime_df, on='Date')

  # Add city column
  city_crime_df['City'] = city_name

  city_crime_frames.append(city_crime_df)

# Stack all cities and convert the 'YYYY-MM' strings to first-of-month timestamps
crime_df = pd.concat(city_crime_frames, ignore_index=True)
crime_df['Date'] = pd.to_datetime(crime_df.Date + '-01')
crime_df.head()

processing chicago-Chicago
processing dallas-Dallas
processing losangeles-Los Angeles
processing newyork-New York
processing seattle-Seattle


Unnamed: 0,Date,PropertyCrimeIncident,ViolentCrimeIncident,City
0,2001-01-01,18521.0,11172.0,Chicago
1,2001-02-01,16068.0,10088.0,Chicago
2,2001-03-01,18915.0,12906.0,Chicago
3,2001-04-01,19099.0,13392.0,Chicago
4,2001-05-01,19794.0,14403.0,Chicago


### New Home Permit

In [9]:
# Reshape the wide permits table (one column per month) into long form with
# one row per city and month, then parse the month labels as timestamps.
home_permit_df = (
  pd.read_csv('/content/drive/MyDrive/MADS Capstone Team 23/Data/processed/NewHomeBuilt/newhome_permits_2010_2024.csv')
  .melt(id_vars=['city'], var_name='Date', value_name='NewHomePermits')
  .rename(columns={'city': 'City'})
  .assign(Date=lambda frame: pd.to_datetime(frame['Date'] + '-01'))
)
home_permit_df.head()

Unnamed: 0,City,Date,NewHomePermits
0,New York,2010-01-01,874.0
1,Chicago,2010-01-01,345.0
2,Los Angeles,2010-01-01,433.0
3,Dallas,2010-01-01,1565.0
4,Seattle,2010-01-01,1190.0


### Population Migration

In [10]:
def process_and_interpolate(df, city_name, start='2010-01-01', end='2022-12-01'):
  """Expand yearly figures into a monthly series by linear interpolation.

  Args:
    df: DataFrame with a 'Year' column parseable with format '%Y'. Every
      other column is interpolated and cast to int, so it must be numeric.
      The input is not modified.
    city_name: Value written to the 'City' column of the result.
    start: First month of the output range (inclusive).
    end: Last month of the output range (inclusive).

  Returns:
    A new DataFrame with one row per month start from `start` to `end`:
    'Date' as a 'YYYY-MM' string, the interpolated columns rounded to int,
    and 'City'.

  Note:
    Linear interpolation only fills forward by default, so months before the
    first year present in `df` stay missing and the integer cast then fails.
    Keep `start` at or after the first year of data.
  """
  # Work on a copy: the original mutated the caller's frame (parsing and then
  # consuming 'Year'), so a second call on the same frame raised KeyError.
  yearly = df.copy()
  yearly['Year'] = pd.to_datetime(yearly['Year'], format='%Y')
  yearly = yearly.set_index('Year')

  # Month-start index covering the requested range
  monthly_index = pd.date_range(start=start, end=end, freq='MS')

  # Yearly rows land on their January; every other month starts out missing
  # and is filled by interpolating between the known values.
  df_monthly = yearly.reindex(monthly_index).interpolate(method='linear')

  df_monthly = df_monthly.round(0).astype(int)

  df_monthly['City'] = city_name

  # Name the index explicitly so the column is 'Date' after reset_index
  df_monthly = df_monthly.rename_axis('Date').reset_index()

  df_monthly['Date'] = df_monthly['Date'].dt.strftime('%Y-%m')

  return df_monthly

In [11]:
# (file-name key, display name) pairs for the population CSVs.
cities = [
  ('Chicago', 'Chicago'),
  ('Dallas', 'Dallas'),
  ('LosAngeles', 'Los Angeles'),
  ('NewYork', 'New York'),
  ('Seattle', 'Seattle'),
]

In [12]:
# Read each city's yearly population file, expand it to monthly rows with
# process_and_interpolate, and stack the per-city results.
all_cities_data = []

for city, city_name in cities:
  print(f"processing {city}-{city_name}")
  all_cities_data.append(
    process_and_interpolate(
      pd.read_csv(f'/content/drive/MyDrive/MADS Capstone Team 23/Data/processed/Population/Population_{city}_2010_2022.csv'),
      city_name,
    )
  )

# The helper returns 'YYYY-MM' strings; convert them to first-of-month timestamps
population_migration_df = pd.concat(all_cities_data, ignore_index=True).assign(
  Date=lambda frame: pd.to_datetime(frame['Date'] + '-01')
)
population_migration_df.head()

processing Chicago-Chicago
processing Dallas-Dallas
processing LosAngeles-Los Angeles
processing NewYork-New York
processing Seattle-Seattle


Unnamed: 0,Date,Population_In_Housing_Units,Owner_Occupied_Population,Renter-Occupied-Population,OOP_Moved_From_Different_County,OOP_Moved_From_Different_State,OOP_Moved_From_Abroad,ROP_Moved_From_Different_County,ROP_Moved_From_Different_State,ROP_Moved_From_Abroad,City
0,2010-01-01,9206524,6353379,2853145,63533,44473,12706,91300,97006,34237,Chicago
1,2010-02-01,9208651,6351028,2857623,62982,44984,13228,92170,97885,34291,Chicago
2,2010-03-01,9210778,6348678,2862100,62432,45494,13751,93040,98764,34344,Chicago
3,2010-04-01,9212906,6346328,2866578,61881,46005,14273,93910,99643,34398,Chicago
4,2010-05-01,9215033,6343977,2871056,61331,46516,14796,94780,100522,34452,Chicago


### Travel Data

In [13]:
# Root of the shared project data. Absolute, like the paths in the pandas
# cells above, so the reads and writes built from it do not depend on the
# kernel's working directory.
shared_drive = '/content/drive/MyDrive/MADS Capstone Team 23/Data'
trns_path = f'{shared_drive}/processed/travel_data/travel.csv'

# Load the travel CSV with Spark (header row, inferred column types) and
# register it as the "travel" view so it can be queried with SQL.
travel = spark.read.csv(trns_path, header=True, inferSchema=True)
travel.createOrReplaceTempView("travel")

In [14]:
# Aggregate the travel view to one row per month and metro, summing the
# inbound/outbound counts separately for each combination of
# domestic/international and cargo/passenger.
query = '''
  SELECT
    MAKE_DATE(YEAR, MONTH, 1) AS Date,
    METRO AS City,
    SUM(IF(INTL_OR_DOM = 'Dom' AND PASSNGR_OR_CARGO = 'Cargo', UNIQ_OUTBOUND_CNT, 0)) AS cargo_domestic_outbound,
    SUM(IF(INTL_OR_DOM = 'Dom' AND PASSNGR_OR_CARGO = 'Cargo', UNIQ_INBOUND_CNT, 0)) AS cargo_domestic_inbound,
    SUM(IF(INTL_OR_DOM = 'Dom' AND PASSNGR_OR_CARGO = 'Passenger', UNIQ_OUTBOUND_CNT, 0)) AS passenger_domestic_outbound,
    SUM(IF(INTL_OR_DOM = 'Dom' AND PASSNGR_OR_CARGO = 'Passenger', UNIQ_INBOUND_CNT, 0)) AS passenger_domestic_inbound,
    SUM(IF(INTL_OR_DOM = 'Intl' AND PASSNGR_OR_CARGO = 'Cargo', UNIQ_OUTBOUND_CNT, 0)) AS cargo_intl_outbound,
    SUM(IF(INTL_OR_DOM = 'Intl' AND PASSNGR_OR_CARGO = 'Cargo', UNIQ_INBOUND_CNT, 0)) AS cargo_intl_inbound,
    SUM(IF(INTL_OR_DOM = 'Intl' AND PASSNGR_OR_CARGO = 'Passenger', UNIQ_OUTBOUND_CNT, 0)) AS passenger_intl_outbound,
    SUM(IF(INTL_OR_DOM = 'Intl' AND PASSNGR_OR_CARGO = 'Passenger', UNIQ_INBOUND_CNT, 0)) AS passenger_intl_inbound
  FROM travel
  GROUP BY 1,2
  ORDER BY 1,2
'''

results = spark.sql(query)
# Pull the aggregate into pandas and make Date a pandas datetime column
travel_df = results.toPandas().assign(
  Date=lambda frame: pd.to_datetime(frame['Date'])
)
travel_df.head()

Unnamed: 0,Date,City,cargo_domestic_outbound,cargo_domestic_inbound,passenger_domestic_outbound,passenger_domestic_inbound,cargo_intl_outbound,cargo_intl_inbound,passenger_intl_outbound,passenger_intl_inbound
0,2000-01-01,Chicago,1.0,0.0,142.0,136.0,26.0,22.0,78.0,75.0
1,2000-01-01,Dallas,1.0,0.0,132.0,136.0,8.0,5.0,38.0,36.0
2,2000-01-01,Los Angeles,3.0,2.0,98.0,91.0,28.0,25.0,90.0,83.0
3,2000-01-01,New York,3.0,0.0,114.0,110.0,44.0,42.0,136.0,137.0
4,2000-01-01,Seattle,1.0,0.0,104.0,105.0,9.0,4.0,25.0,25.0


### Politics Data

In [15]:
# Load the politics table, treat missing counts as 0, build a first-of-month
# Date from the sponsorship year/month columns, and rename 'metro' to 'City'
# to match the other tables.
# Written as one chain without inplace mutation so the cell can be re-run
# safely; the stray mid-cell `congress_df` expression (a no-op) is removed.
trns_path = f'{shared_drive}/processed/congress_data_files/metro_politics.csv'
congress_df = (
  pd.read_csv(trns_path)
  .fillna(0)
  .assign(
    Date=lambda frame: pd.to_datetime(frame['spons_legis_intro_yr'].astype(str) + '-' +
                                      frame['spons_legis_intro_mnth'].astype(str) + '-01')
  )
  .drop(['spons_legis_intro_mnth', 'spons_legis_intro_yr'], axis = 1)
  .rename({'metro' : 'City'}, axis = 1)
)
congress_df.head()

Unnamed: 0,City,dem_senate_cnt,rep_senate_cnt,dem_house_cnt,rep_house_cnt,other_senate_cnt,other_house_cnt,dem_spons_legis_cnt,rep_spons_legis_cnt,other_spons_legis_cnt,dem_cospons_legis_cnt,rep_cospons_legis_cnt,other_cospons_legis_cnt,Date
0,Chicago,1,1,9,7,0,0,12.0,1.0,0.0,54,19,0,1999-01-01
1,Dallas,0,2,3,6,0,0,6.0,14.0,0.0,29,36,0,1999-01-01
2,Los Angeles,2,0,10,14,0,0,20.0,21.0,0.0,56,39,0,1999-01-01
3,New York,2,0,22,8,0,0,97.0,29.0,0.0,81,41,0,1999-01-01
4,Seattle,1,1,4,2,0,0,0.0,0.0,0.0,28,26,0,1999-01-01


### Climate Data

In [16]:
# Load the city weather table, derive a first-of-month Date from 'yearmonth',
# drop the station metadata columns, and rename 'name' to 'City'.
trns_path = f'{shared_drive}/processed/climate_data/city_weather'
weather_df = (
  pd.read_csv(trns_path)
  .assign(Date=lambda frame: pd.to_datetime(frame['yearmonth'] + '-01'))
  .drop(columns=['Unnamed: 0', 'station', 'lat', 'long', 'elevation',
                 'station_nm', 'yearmonth'])
  .rename(columns={'name' : 'City'})
)
weather_df.head()

Unnamed: 0,City,days_with_thunderstorms,precipitation,percent_of_possible_sunshine,relative_humidity_avg,relative_humidity_min,relative_humidity_max,snowfall,temp_avg,temp_max,temp_min,total_sunshine,Date
0,Seattle,,3.77,,,,,,40.3,45.2,35.3,,2000-01-01
1,Seattle,1.0,5.26,,,,,,43.7,49.9,37.5,,2000-02-01
2,Seattle,,2.83,,,,,,44.5,50.9,38.0,,2000-03-01
3,Seattle,,1.48,,,,,,50.9,58.8,43.0,,2000-04-01
4,Seattle,,3.27,,,,,,53.8,60.7,47.0,,2000-05-01


### HUD

# Merge DataFrames

In [17]:
# Row/column count of the base table, for comparison with the merged shape below.
zhvi_df.shape

(780, 3)

In [18]:
# Left-join every feature table onto the ZHVI rows by City and Date, so the
# result keeps the ZHVI observations as its base.
merged_df = zhvi_df
for feature_df in (home_permit_df, population_migration_df, crime_df,
                   travel_df, congress_df, weather_df):
  merged_df = merged_df.merge(feature_df, on = ['City', 'Date'], how = 'left')

# A duplicated (City, Date) key in any feature table would silently multiply
# rows in a left join, so fail loudly instead of relying on eyeballing shapes.
assert len(merged_df) == len(zhvi_df), "merge changed the row count; check the feature tables for duplicate City/Date keys"
merged_df.shape

(780, 46)

In [21]:
# Preview the first rows of the merged dataset.
merged_df.head()

Unnamed: 0,City,Date,ZHVI,NewHomePermits,Population_In_Housing_Units,Owner_Occupied_Population,Renter-Occupied-Population,OOP_Moved_From_Different_County,OOP_Moved_From_Different_State,OOP_Moved_From_Abroad,...,precipitation,percent_of_possible_sunshine,relative_humidity_avg,relative_humidity_min,relative_humidity_max,snowfall,temp_avg,temp_max,temp_min,total_sunshine
0,New York,2010-01-01,393767.004762,874.0,18324602,10196421,8128181,112160,61178,30589,...,2.09,,58.0,45.0,71.0,2.1,32.5,38.1,27.0,
1,Los Angeles,2010-01-01,418680.256759,433.0,12474432,6510008,5964424,52080,26040,26040,...,4.94,,57.0,36.0,77.0,,60.2,69.4,50.9,
2,Chicago,2010-01-01,201888.6066,345.0,9206524,6353379,2853145,63533,44473,12706,...,1.13,,77.0,66.0,87.0,9.1,21.9,27.4,16.4,
3,Dallas,2010-01-01,150742.550106,1565.0,6216118,4052381,2163737,81047,32419,12157,...,3.27,,67.0,46.0,86.0,0.0,44.9,53.9,36.0,
4,Seattle,2010-01-01,333469.123267,1190.0,3340958,2150701,1190257,25808,19356,10753,...,6.18,,76.0,62.0,91.0,0.0,47.0,51.4,42.5,


### Fill outstanding nulls

In [22]:
# Count missing values per column and show only the columns that have any.
nulls = merged_df.isna().sum()
nulls[nulls.gt(0)]

Unnamed: 0,0
dem_senate_cnt,20
rep_senate_cnt,20
dem_house_cnt,20
rep_house_cnt,20
other_senate_cnt,20
other_house_cnt,20
dem_spons_legis_cnt,20
rep_spons_legis_cnt,20
other_spons_legis_cnt,20
dem_cospons_legis_cnt,20


In [23]:
# fill days with thunderstorms & snowfall to 0 where null
# no 0's currently exist for storms implying that there is either some value or null
# snowfall is null for all LA records only, implying 0 snow
# The filled columns are assigned back to the frame: calling
# fillna(inplace=True) on a column pulled out of the frame is chained
# assignment, which does not update merged_df under pandas Copy-on-Write.
zero_fill_cols = ['days_with_thunderstorms', 'snowfall']
merged_df[zero_fill_cols] = merged_df[zero_fill_cols].fillna(0)
# sunshine fields are completely null...removing
# errors='ignore' keeps the cell re-runnable once the columns are already gone
merged_df = merged_df.drop(columns=['percent_of_possible_sunshine', 'total_sunshine'], errors='ignore')
# humidity was randomly not collected at some points...fill with that city's avg
humidity_cols = ['relative_humidity_avg', 'relative_humidity_min', 'relative_humidity_max']
merged_df[humidity_cols] = merged_df.groupby('City')[humidity_cols].transform(lambda x: x.fillna(x.mean()))

In [24]:
# nulls in political data are seemingly random...will forwards fill
# For every column that still has nulls, carry each city's previous value
# forward in date order.
nulls = merged_df.isna().sum()
null_cols = nulls[nulls > 0].index.tolist()
merged_df['Date'] = pd.to_datetime(merged_df['Date'])
merged_df = merged_df.sort_values(['Date', 'City'])
city_ffilled = merged_df.groupby(['City'])[null_cols].ffill()
merged_df[null_cols] = merged_df[null_cols].fillna(city_ffilled)

In [26]:
# Recount missing values per column; the shape of the filtered Series is the
# number of columns that still contain nulls.
nulls = merged_df.isnull().sum()
nulls[nulls > 0].shape

(0,)

# Write CSV

In [27]:
# Write the merged dataset to the shared drive, omitting the DataFrame index.
trns_path = f'{shared_drive}/processed/merged.csv'
merged_df.to_csv(trns_path, index = False)