# Data Processing

In [None]:
import os
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from datetime import datetime
from workalendar.usa import Illinois

# Year-specific sub-folder used for the inputs/outputs that are stored per year
# (the weather files in this notebook).
SUB_FOLDER = "2020"

# Both data directories live under the same root, relative to this notebook.
_DATA_ROOT = "../../data"
RAW_DATA_DIR = f"{_DATA_ROOT}/raw data"              # untouched source files
PROCESSED_DATA_DIR = f"{_DATA_ROOT}/processed data"  # cleaned outputs written below

# Illinois calendar object from workalendar (imported above).
# NOTE(review): `cal` is not referenced in the cells visible here — presumably
# it is used later for holiday / working-day lookups; confirm or remove.
cal = Illinois()

## Communities Data

In [10]:
# Load the raw per-community statistics table.
communities_csv = os.path.join(RAW_DATA_DIR, "chicago_communities_stat.csv")
comunities = pd.read_csv(communities_csv)

# Report how many rows were read, as a baseline for the cleaning steps.
print("Original record count:", len(comunities))

Original record count: 77


### Data Cleaning

In [11]:
# Columns to keep from the raw community statistics table; everything else is
# dropped. The thematic grouping in the comments below is inferred from the
# column names — confirm against the dataset's data dictionary.
community_columns = [
    # identifiers
    'OBJECTID', 'GEOID', 'GEOG',
    # population / age
    'TOT_POP', 'MED_AGE',
    # income / employment
    'MEDINC', 'INCPERCAP', 'UNEMP', 'IN_LBFRC',
    # housing
    'OWN_OCC_HU', 'MED_HV',
    # commuting
    'TOT_COMM', 'DROVE_AL', 'CARPOOL', 'TRANSIT', 'WALK_BIKE', 'COMM_OTHER', 'AGG_TT',
    # vehicle availability
    'NO_VEH', 'ONE_VEH', 'TWO_VEH', 'THREEOM_VEH',
    # internet access
    'INTERNET',
    # transit availability
    'TRANSIT_LOW_PCT', 'TRANSIT_MOD_PCT', 'TRANSIT_HIGH_PCT',
    # open-space access
    'pct_pop_access_4_acres_per_1k', 'pct_pop_access_10_acres_per_1k',
]

# Selecting with a list keeps the columns in exactly this order.
comunities = comunities[community_columns]

### Save Data

In [12]:
# DataFrame.to_csv does not create missing parent directories (it raises
# OSError), so make sure the output directory exists before writing.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

# index=False: the positional row index is not written to the file.
comunities.to_csv(os.path.join(PROCESSED_DATA_DIR, "cleaned_communities_data.csv"), index=False)

## ACS Data

In [13]:
# Load the raw ACS table.
acs_csv = os.path.join(RAW_DATA_DIR, "chicago_acs_data.csv")
acs = pd.read_csv(acs_csv)

# Row count before any filtering, for comparison with the cleaned count.
print("Original record count:", len(acs))

Original record count: 1332


### Data Cleaning

In [14]:
# Keep only rows with a non-negative MEDINC. Rows where MEDINC is missing (NaN)
# are dropped as well, because the comparison evaluates to False for them.
# NOTE(review): negative MEDINC values are presumably "no data" sentinels in
# the source — confirm against the data documentation.
has_valid_medinc = acs["MEDINC"] >= 0
acs = acs[has_valid_medinc]

print("Cleaned record count:", len(acs))

Cleaned record count: 1323


### Save Data

In [15]:
# DataFrame.to_csv does not create missing parent directories (it raises
# OSError), so make sure the output directory exists before writing. This also
# lets this section run on its own without the earlier save cells.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

# index=False: the (filtered, non-contiguous) row index is not written out.
acs.to_csv(os.path.join(PROCESSED_DATA_DIR, "cleaned_acs_data.csv"), index=False)

## Weather Data

In [16]:
# Weather data is stored per year, under the SUB_FOLDER directory.
weather_csv = os.path.join(RAW_DATA_DIR, SUB_FOLDER, "chicago_weather.csv")
weather = pd.read_csv(weather_csv)

print("Original record count:", len(weather))

Original record count: 113


### Manipulation

In [17]:
# Parse the Date column and keep only the calendar date (no time component).
weather["Date"] = pd.to_datetime(weather["Date"]).dt.date

# Weather variables that are combined into a single index (WSI).
weather_features = ["Avg_Wind_Speed_mps", "Precipitation_mm", "Snowfall_mm", "Snow_Depth_mm", "Avg_Temp_C"]
df_weather = weather[weather_features]

# The features are on different scales (m/s, mm, degrees C) and PCA is
# scale-sensitive, so bring every feature to zero mean / unit variance first.
scaler = StandardScaler()
df_weather_scaled = scaler.fit_transform(df_weather)

# Project the standardized features onto the first principal component only;
# that single score per row is stored as the WSI column.
# NOTE(review): the sign of a principal component carries no inherent meaning —
# confirm that a higher WSI corresponds to the intended direction.
pca = PCA(n_components=1)
weather["WSI"] = pca.fit_transform(df_weather_scaled)

# Preview the index next to the variables it was derived from.
print(weather[["WSI"] + weather_features].head())

        WSI  Avg_Wind_Speed_mps  Precipitation_mm  Snowfall_mm  Snow_Depth_mm  \
0 -1.211753                 5.8               0.0          0.0            0.0   
1 -1.447672                 3.1               0.0          0.0            0.0   
2 -1.253279                 3.4               1.5          0.0            0.0   
3 -0.295030                 6.7              13.7          0.0            0.0   
4 -0.931124                 4.3               9.4          0.0            0.0   

   Avg_Temp_C  
0        10.3  
1         6.7  
2         5.7  
3         8.9  
4         8.9  


### Save Data

In [18]:
# The weather output goes into a per-year sub-folder. DataFrame.to_csv does not
# create missing parent directories (it raises OSError), so create the folder
# first; exist_ok=True makes this a no-op when it is already there.
weather_output_dir = os.path.join(PROCESSED_DATA_DIR, SUB_FOLDER)
os.makedirs(weather_output_dir, exist_ok=True)

# index=False: the positional row index is not written to the file.
weather.to_csv(os.path.join(weather_output_dir, "cleaned_weather_data.csv"), index=False)

## Public Transportation

In [19]:
# Load the raw public transportation table.
public_transportation_csv = os.path.join(RAW_DATA_DIR, "chicago_public_transportation.csv")
public_transportation = pd.read_csv(public_transportation_csv)

print("Original record count:", len(public_transportation))

Original record count: 8828


### Manipulation

In [20]:
# Parse service_date and keep only the calendar date (no time component),
# mirroring how the weather "Date" column is handled.
service_timestamps = pd.to_datetime(public_transportation["service_date"])
public_transportation["service_date"] = service_timestamps.dt.date

### Save Data

In [21]:
# DataFrame.to_csv does not create missing parent directories (it raises
# OSError), so make sure the output directory exists before writing. This also
# lets this section run on its own without the earlier save cells.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

# index=False: the positional row index is not written to the file.
public_transportation.to_csv(os.path.join(PROCESSED_DATA_DIR, "cleaned_public_transportation_data.csv"), index=False)