In [1]:
import pandas as pd

df_decl = pd.read_csv("data/us_disaster_declarations.csv")
df_events = pd.read_csv("data/us_disasters_m5.csv")

print(df_decl.shape)
print(df_events.shape)

df_decl.head()


FileNotFoundError: [Errno 2] No such file or directory: 'data/us_disaster_declarations.csv'

In [2]:
import pandas as pd

df_decl = pd.read_csv("../data/us_disaster_declarations.csv")
df_events = pd.read_csv("../data/us_disasters_m5.csv")

print("Declarations dataset shape:", df_decl.shape)
print("Events dataset shape:", df_events.shape)

df_decl.head()


Declarations dataset shape: (68485, 27)
Events dataset shape: (627, 14)


Unnamed: 0,fema_declaration_string,disaster_number,state,declaration_type,declaration_date,fy_declared,incident_type,declaration_title,ih_program_declared,ia_program_declared,...,place_code,designated_area,declaration_request_number,last_ia_filing_date,incident_id,region,designated_incident_types,last_refresh,hash,id
0,DR-1-GA,1,GA,DR,1953-05-02T00:00:00Z,1953,Tornado,Tornado,0,1,...,0,Statewide,53013,,53013,4,,2024-08-27T18:22:14Z,413ff808d79f08a6710f6b78f361d5a7de692711,8943dfcf-9786-4e51-8889-d62014034bb2
1,DR-2-TX,2,TX,DR,1953-05-15T00:00:00Z,1953,Tornado,Tornado & Heavy Rainfall,0,1,...,0,Statewide,53003,,53003,6,W,2024-08-27T18:22:14Z,8a8bc885c003cb873c201bb6a3a2771a6d84efb1,ff821327-6b90-4246-b19f-fff8c4b288a8
2,DR-3-LA,3,LA,DR,1953-05-29T00:00:00Z,1953,Flood,Flood,0,1,...,0,Statewide,53005,,53005,6,,2024-08-27T18:22:14Z,b6e6f19ae3c0d2383b7b873b8495bd2770f2ff9a,cd461e08-5ac9-4e70-8507-9c7a3cbff265
3,DR-4-MI,4,MI,DR,1953-06-02T00:00:00Z,1953,Tornado,Tornado,0,1,...,0,Statewide,53004,,53004,5,,2024-08-27T18:22:14Z,34f0061012c8069f145d56a3537cd327b7d4e49b,53be0c04-d2ae-42fb-b070-a01b0a50b7f6
4,DR-5-MT,5,MT,DR,1953-06-06T00:00:00Z,1953,Flood,Floods,0,1,...,0,Statewide,53006,,53006,8,,2024-08-27T18:22:14Z,3bdbec258e4640c3f02971dbc1f9dbc3ebbfc96a,4b3ed0ac-299b-49f0-80d4-9a2a6bacd5a4


In [3]:
df_decl.columns


Index(['fema_declaration_string', 'disaster_number', 'state',
       'declaration_type', 'declaration_date', 'fy_declared', 'incident_type',
       'declaration_title', 'ih_program_declared', 'ia_program_declared',
       'pa_program_declared', 'hm_program_declared', 'incident_begin_date',
       'incident_end_date', 'disaster_closeout_date', 'tribal_request', 'fips',
       'place_code', 'designated_area', 'declaration_request_number',
       'last_ia_filing_date', 'incident_id', 'region',
       'designated_incident_types', 'last_refresh', 'hash', 'id'],
      dtype='object')

In [4]:
df_events.columns


Index(['disaster_number', 'state', 'declaration_type', 'declaration_date',
       'incident_type', 'declaration_title', 'ih_program_declared',
       'ia_program_declared', 'pa_program_declared', 'hm_program_declared',
       'incident_begin_date', 'incident_end_date', 'fips', 'designated_area'],
      dtype='object')

In [5]:
df_decl.isnull().sum().sort_values(ascending=False).head(10)


last_ia_filing_date           49090
designated_incident_types     47812
disaster_closeout_date        16417
incident_end_date               522
hash                              0
last_refresh                      0
region                            0
incident_id                       0
declaration_request_number        0
designated_area                   0
dtype: int64

In [6]:
cols_to_drop = [
    "last_ia_filing_date",
    "designated_incident_types",
    "hash",
    "id"
]

df_decl_clean = df_decl.drop(columns=cols_to_drop)


In [7]:
date_cols = [
    "declaration_date",
    "incident_begin_date",
    "incident_end_date",
    "disaster_closeout_date"
]

for col in date_cols:
    df_decl_clean[col] = pd.to_datetime(df_decl_clean[col], errors="coerce")


In [8]:
df_decl_clean.dtypes


fema_declaration_string                    object
disaster_number                             int64
state                                      object
declaration_type                           object
declaration_date              datetime64[ns, UTC]
fy_declared                                 int64
incident_type                              object
declaration_title                          object
ih_program_declared                         int64
ia_program_declared                         int64
pa_program_declared                         int64
hm_program_declared                         int64
incident_begin_date           datetime64[ns, UTC]
incident_end_date             datetime64[ns, UTC]
disaster_closeout_date        datetime64[ns, UTC]
tribal_request                              int64
fips                                        int64
place_code                                  int64
designated_area                            object
declaration_request_number                  int64


In [9]:
df_decl_clean["disaster_duration_days"] = (
    df_decl_clean["incident_end_date"] - df_decl_clean["incident_begin_date"]
).dt.days


In [10]:
severity_cols = [
    "ih_program_declared",
    "ia_program_declared",
    "pa_program_declared",
    "hm_program_declared"
]

df_decl_clean["severity_score"] = df_decl_clean[severity_cols].sum(axis=1)


In [11]:
df_decl_clean["incident_year"] = df_decl_clean["incident_begin_date"].dt.year
df_decl_clean["incident_month"] = df_decl_clean["incident_begin_date"].dt.month
df_decl_clean["incident_quarter"] = df_decl_clean["incident_begin_date"].dt.quarter


In [12]:
ml_features = df_decl_clean[
    [
        "state",
        "region",
        "incident_type",
        "declaration_type",
        "severity_score",
        "disaster_duration_days",
        "incident_year",
        "incident_month",
        "incident_quarter",
    ]
]


In [13]:
ml_features.head()


Unnamed: 0,state,region,incident_type,declaration_type,severity_score,disaster_duration_days,incident_year,incident_month,incident_quarter
0,GA,4,Tornado,DR,3,0.0,1953,5,2
1,TX,6,Tornado,DR,3,0.0,1953,5,2
2,LA,6,Flood,DR,3,0.0,1953,5,2
3,MI,5,Tornado,DR,3,0.0,1953,6,2
4,MT,8,Flood,DR,3,0.0,1953,6,2
