In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [9]:
# Load the master dataset CSV into `df` (the path is relative to the working
# directory); the cells below read and reassign this frame.
df = pd.read_csv('master_disaster_resilience_data.csv')

In [10]:
# Print each column's name, non-null count and dtype to spot columns with
# missing values before cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 22 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   fips_code                             1000 non-null   int64  
 1   incident_start_date                   1000 non-null   object 
 2   incidenttype                          1000 non-null   object 
 3   is_major_disaster                     1000 non-null   int64  
 4   state_x                               1000 non-null   object 
 5   incident_year                         1000 non-null   int64  
 6   stabr                                 974 non-null    object 
 7   area_name_x                           974 non-null    object 
 8   year_x                                974 non-null    float64
 9   medhhinc                              974 non-null    float64
 10  povall                                974 non-null    float64
 11  state_y           

In [11]:
# Give the `_x`-suffixed columns clean names
# (the suffixes presumably come from an upstream merge — TODO confirm).
df = df.rename(
    {'state_x': 'state', 'area_name_x': 'area_name', 'year_x': 'data_year'},
    axis='columns',
)

In [12]:
# Pearson correlation between the two income columns, to check whether one of
# them is redundant.
correlation = df['medhhinc'].corr(
    other=df['median_household_income'],
    method='pearson',
)
print(f"Correlation between income columns: {correlation:.4f}")

Correlation between income columns: 0.9614


In [13]:
# Columns to remove from the frame: the `_y`-suffixed counterparts of columns
# that are kept, `stabr`, and the second income column (which correlates
# strongly with `medhhinc`, see the check above).
cols_to_drop = [
    'stabr',
    'state_y',
    'area_name_y',
    'year_y',
    'median_household_income',
]

In [14]:
# Drop only the listed columns that are actually present, so a column that is
# already gone (e.g. when this cell is re-run) does not raise.
df = df.drop(columns=[name for name in cols_to_drop if name in df.columns])

In [15]:
# Numeric columns whose missing values are median-imputed in the next step.
numeric_features = [
    'medhhinc',
    'povall',
    'unemployment_rate',
    'civilian_labor_force',
    'employed',
    'unemployed',
    'total_population',
    'total_population_norm',
]

In [16]:
# Handle missing values: fill each listed numeric column with its own median.
for col in numeric_features:
    if col not in df.columns:
        continue  # listed feature is absent from this frame; nothing to fill
    column_values = df[col]
    df[col] = column_values.fillna(column_values.median())

In [17]:
# Calculating the Priority Index (P): ratio of `povall` to `medhhinc`.
# A zero income is mapped to NaN instead of being replaced by 1. Dividing by a
# stand-in of 1 would make the ratio equal to raw `povall`, an artificial
# extreme that would dominate the min-max scaling applied to this column in the
# next cell. With NaN the ratio is left undefined for such rows (and no inf is
# produced by a division by zero).
df['vulnerability_ratio'] = df['povall'] / df['medhhinc'].replace(0, np.nan)

In [18]:
# Normalizing the Priority Index: min-max scale `vulnerability_ratio` and store
# the result as `priority_index`.
scaler = MinMaxScaler()
ratio_frame = df[['vulnerability_ratio']]  # double brackets: the scaler is given a 2-D selection
df['priority_index'] = scaler.fit(ratio_frame).transform(ratio_frame)

In [19]:
# One-hot encode `incidenttype` into `type_<value>` indicator columns; the
# result is a new frame, `df` itself is not modified.
df_final = pd.get_dummies(data=df, prefix='type', columns=['incidenttype'])

In [20]:
# Write the encoded frame to disk (path is relative to the working directory);
# index=False keeps the row index out of the file.
df_final.to_csv('final_preprocessed_training_data.csv', index=False)

In [21]:
# Preview the first rows of the encoded frame as a sanity check.
df_final.head()

Unnamed: 0,fips_code,incident_start_date,is_major_disaster,state,incident_year,area_name,data_year,medhhinc,povall,civilian_labor_force,...,vulnerability_ratio,priority_index,type_Biological,type_Fire,type_Flood,type_Hurricane,type_Severe Storm,type_Straight-Line Winds,type_Tornado,type_Winter Storm
0,41067,2024-08-08 00:00:00+00:00,0,OR,2024,Washington County,2023.0,103486.0,48317.0,332643.0,...,0.466894,0.031004,False,True,False,False,False,False,False,False
1,41031,2024-08-04 00:00:00+00:00,0,OR,2024,Jefferson County,2023.0,68419.0,3390.0,10286.0,...,0.049548,0.003232,False,True,False,False,False,False,False,False
2,41017,2024-08-02 00:00:00+00:00,0,OR,2024,Deschutes County,2023.0,95414.0,17944.0,104662.0,...,0.188065,0.012449,False,True,False,False,False,False,False,False
3,6000,2017-02-08 00:00:00+00:00,1,CA,2017,,,65813.0,5775.5,19564.0,...,0.087756,0.005775,False,False,False,False,True,False,False,False
4,1001,2015-12-23 00:00:00+00:00,1,AL,2015,Autauga County,2023.0,68857.0,7004.0,27000.0,...,0.101718,0.006704,False,False,False,False,True,False,False,False
