# DATA WRANGLING: NCEI Data Files

## Sid Gurajala
## 11/18/2024

### Read In Files/Import Libraries

In [2]:
import pandas as pd
import os 

# Locate the project's data directory relative to the notebook's working directory.
cwd = os.getcwd()
data_dir = os.path.join(cwd, "../../data/")

# For each NCEI dataset, build the path to its raw CSV and load it right away.

# Drought severity index
drought_severity_file = os.path.join(
    data_dir, "raw/NCEI_drought_severity_index_data_raw.csv"
)
drought_severity_raw = pd.read_csv(drought_severity_file)

# Precipitation
precipitation_file = os.path.join(
    data_dir, "raw/NCEI_precipitation_data_raw.csv"
)
precipitation_raw = pd.read_csv(precipitation_file)

# Temperature
temperature_file = os.path.join(
    data_dir, "raw/NCEI_temperature_data_raw.csv"
)
temperature_raw = pd.read_csv(temperature_file)

### Filtering to Fire Season

First, we need to convert the `Date` column of each NCEI table to the pandas datetime type.

In [3]:
# Parse each table's Date column using the year+month pattern "%Y%m"
# so the .dt accessors can be used on it afterwards.
drought_severity_raw["Date"] = pd.to_datetime(
    drought_severity_raw["Date"], format='%Y%m'
)
temperature_raw["Date"] = pd.to_datetime(
    temperature_raw["Date"], format='%Y%m'
)
precipitation_raw["Date"] = pd.to_datetime(
    precipitation_raw["Date"], format='%Y%m'
)

Next, we filter to just the fire season, between May 1st and October 31st.

In [4]:
# Keep only fire-season rows: months May (5) through October (10), inclusive.
#
# Series.between is used instead of combining `>=` and `<=` with `&`.
# In Python, `&` binds more tightly than the comparison operators, so an
# expression written as `(month) >= 5 & (month <= 10)` is evaluated as
# `month >= (5 & (month <= 10))`. That is true for every valid month and
# therefore filters nothing out.
FIRE_SEASON_FIRST_MONTH = 5
FIRE_SEASON_LAST_MONTH = 10

# .copy() makes each filtered table an independent frame, so columns can be
# added to it later without pandas raising SettingWithCopyWarning.
drought_severity_filtered = drought_severity_raw[
    drought_severity_raw.Date.dt.month.between(
        FIRE_SEASON_FIRST_MONTH, FIRE_SEASON_LAST_MONTH
    )
].copy()
temperature_filtered = temperature_raw[
    temperature_raw.Date.dt.month.between(
        FIRE_SEASON_FIRST_MONTH, FIRE_SEASON_LAST_MONTH
    )
].copy()
precipitation_filtered = precipitation_raw[
    precipitation_raw.Date.dt.month.between(
        FIRE_SEASON_FIRST_MONTH, FIRE_SEASON_LAST_MONTH
    )
].copy()

### Aggregating the data by Year

Next we need to extract the year from the date.

In [5]:
# Add a "Year" column taken from each row's Date.
#
# DataFrame.assign returns a new frame rather than writing into the existing
# one. The filtered tables were produced by boolean indexing, and setting a
# column directly on such a subset can raise SettingWithCopyWarning.
drought_severity_filtered = drought_severity_filtered.assign(
    Year=lambda frame: frame["Date"].dt.year
)
temperature_filtered = temperature_filtered.assign(
    Year=lambda frame: frame["Date"].dt.year
)
precipitation_filtered = precipitation_filtered.assign(
    Year=lambda frame: frame["Date"].dt.year
)

Next, we aggregate by year, taking the mean of `Value` within each year.

In [6]:
# Collapse each table to one row per Year holding the mean of "Value".
# reset_index() turns the Year group key back into an ordinary column.
drought_severity_aggregated = (
    drought_severity_filtered
    .groupby("Year")["Value"]
    .mean()
    .reset_index()
)
temperature_aggregated = (
    temperature_filtered
    .groupby("Year")["Value"]
    .mean()
    .reset_index()
)
precipitation_aggregated = (
    precipitation_filtered
    .groupby("Year")["Value"]
    .mean()
    .reset_index()
)

### Joining the DataFrames

We need to make the column names more explicit before joining.

In [7]:
# Relabel the two columns of each yearly table (positionally) so that the
# averaged measure has a dataset-specific name before the tables are joined.
drought_severity_aggregated = drought_severity_aggregated.set_axis(
    ["Year", "avg_Drought_Severity_Index"], axis=1
)
temperature_aggregated = temperature_aggregated.set_axis(
    ["Year", "avg_Temperature"], axis=1
)
precipitation_aggregated = precipitation_aggregated.set_axis(
    ["Year", "avg_Precipitation"], axis=1
)

Next we join by year.

In [8]:
# Inner-join the three yearly tables on "Year", one pair at a time:
# drought severity with temperature first, then precipitation.
drought_and_temperature = drought_severity_aggregated.merge(
    temperature_aggregated, on="Year", how="inner"
)
ncei_final_df = drought_and_temperature.merge(
    precipitation_aggregated, on="Year", how="inner"
)

### Write Out Data

Finally, we write the joined table out to a CSV file.

In [9]:
# Save the joined table as CSV under the data directory, without the row index.
ncei_output_file = os.path.join(data_dir, "final/ncei_cleaned_final.csv")
ncei_final_df.to_csv(ncei_output_file, index=False)