In [1]:
import pandas as pd
import datetime
import numpy as np

In [2]:
# Loading Drought CSV file
drought_load = "../Resources/Drought_ByRegion_PercentArea_Categorical_2000-2023.csv"

# Reading the Drought data
drought_df = pd.read_csv(drought_load)
drought_df.head()

Unnamed: 0,MapDate,Region,None,D0,D1,D2,D3,D4,ValidStart,ValidEnd,StatisticFormatID
0,20230131,High Plains,16.96,18.32,31.46,16.13,9.55,7.57,1/31/2023,2/6/2023,2
1,20230131,West,18.89,20.56,31.72,22.45,6.24,0.14,1/31/2023,2/6/2023,2
2,20230124,High Plains,15.12,20.02,31.06,16.58,9.66,7.57,1/24/2023,1/30/2023,2
3,20230124,West,18.12,22.49,30.55,22.41,6.28,0.14,1/24/2023,1/30/2023,2
4,20230117,High Plains,13.84,20.16,30.92,17.04,10.38,7.66,1/17/2023,1/23/2023,2


In [3]:
# Data types
drought_df.dtypes

MapDate                int64
Region                object
None                 float64
D0                   float64
D1                   float64
D2                   float64
D3                   float64
D4                   float64
ValidStart            object
ValidEnd              object
StatisticFormatID      int64
dtype: object

In [4]:
# Check for null values
drought_df.isna().sum()

MapDate              0
Region               0
None                 0
D0                   0
D1                   0
D2                   0
D3                   0
D4                   0
ValidStart           0
ValidEnd             0
StatisticFormatID    0
dtype: int64

In [5]:
# Convert the 'MapDate' column to datetime 
drought_df['MapDate'] = pd.to_datetime(drought_df['MapDate'],format = "%Y%m%d")
drought_df.dtypes

MapDate              datetime64[ns]
Region                       object
None                        float64
D0                          float64
D1                          float64
D2                          float64
D3                          float64
D4                          float64
ValidStart                   object
ValidEnd                     object
StatisticFormatID             int64
dtype: object

In [6]:
# Check data table to confirm datetime conversion
drought_df

Unnamed: 0,MapDate,Region,None,D0,D1,D2,D3,D4,ValidStart,ValidEnd,StatisticFormatID
0,2023-01-31,High Plains,16.96,18.32,31.46,16.13,9.55,7.57,1/31/2023,2/6/2023,2
1,2023-01-31,West,18.89,20.56,31.72,22.45,6.24,0.14,1/31/2023,2/6/2023,2
2,2023-01-24,High Plains,15.12,20.02,31.06,16.58,9.66,7.57,1/24/2023,1/30/2023,2
3,2023-01-24,West,18.12,22.49,30.55,22.41,6.28,0.14,1/24/2023,1/30/2023,2
4,2023-01-17,High Plains,13.84,20.16,30.92,17.04,10.38,7.66,1/17/2023,1/23/2023,2
...,...,...,...,...,...,...,...,...,...,...,...
2405,2000-01-18,West,52.42,47.41,0.16,0.01,0.00,0.00,1/18/2000,1/24/2000,2
2406,2000-01-11,High Plains,42.85,43.68,5.07,8.40,0.00,0.00,1/11/2000,1/17/2000,2
2407,2000-01-11,West,55.28,44.55,0.16,0.01,0.00,0.00,1/11/2000,1/17/2000,2
2408,2000-01-04,High Plains,47.23,40.73,6.04,6.01,0.00,0.00,1/4/2000,1/10/2000,2


In [7]:
# Seperate month and year into seperate columns
drought_df['year'] = drought_df['MapDate'].dt.year
drought_df['month'] = drought_df['MapDate'].dt.month
drought_df

Unnamed: 0,MapDate,Region,None,D0,D1,D2,D3,D4,ValidStart,ValidEnd,StatisticFormatID,year,month
0,2023-01-31,High Plains,16.96,18.32,31.46,16.13,9.55,7.57,1/31/2023,2/6/2023,2,2023,1
1,2023-01-31,West,18.89,20.56,31.72,22.45,6.24,0.14,1/31/2023,2/6/2023,2,2023,1
2,2023-01-24,High Plains,15.12,20.02,31.06,16.58,9.66,7.57,1/24/2023,1/30/2023,2,2023,1
3,2023-01-24,West,18.12,22.49,30.55,22.41,6.28,0.14,1/24/2023,1/30/2023,2,2023,1
4,2023-01-17,High Plains,13.84,20.16,30.92,17.04,10.38,7.66,1/17/2023,1/23/2023,2,2023,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2405,2000-01-18,West,52.42,47.41,0.16,0.01,0.00,0.00,1/18/2000,1/24/2000,2,2000,1
2406,2000-01-11,High Plains,42.85,43.68,5.07,8.40,0.00,0.00,1/11/2000,1/17/2000,2,2000,1
2407,2000-01-11,West,55.28,44.55,0.16,0.01,0.00,0.00,1/11/2000,1/17/2000,2,2000,1
2408,2000-01-04,High Plains,47.23,40.73,6.04,6.01,0.00,0.00,1/4/2000,1/10/2000,2,2000,1


In [8]:
# Convert month number to month name
drought_df['month'] = pd.to_datetime(drought_df['month'], format='%m').dt.month_name().str.slice(stop=3)
drought_df

Unnamed: 0,MapDate,Region,None,D0,D1,D2,D3,D4,ValidStart,ValidEnd,StatisticFormatID,year,month
0,2023-01-31,High Plains,16.96,18.32,31.46,16.13,9.55,7.57,1/31/2023,2/6/2023,2,2023,Jan
1,2023-01-31,West,18.89,20.56,31.72,22.45,6.24,0.14,1/31/2023,2/6/2023,2,2023,Jan
2,2023-01-24,High Plains,15.12,20.02,31.06,16.58,9.66,7.57,1/24/2023,1/30/2023,2,2023,Jan
3,2023-01-24,West,18.12,22.49,30.55,22.41,6.28,0.14,1/24/2023,1/30/2023,2,2023,Jan
4,2023-01-17,High Plains,13.84,20.16,30.92,17.04,10.38,7.66,1/17/2023,1/23/2023,2,2023,Jan
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2405,2000-01-18,West,52.42,47.41,0.16,0.01,0.00,0.00,1/18/2000,1/24/2000,2,2000,Jan
2406,2000-01-11,High Plains,42.85,43.68,5.07,8.40,0.00,0.00,1/11/2000,1/17/2000,2,2000,Jan
2407,2000-01-11,West,55.28,44.55,0.16,0.01,0.00,0.00,1/11/2000,1/17/2000,2,2000,Jan
2408,2000-01-04,High Plains,47.23,40.73,6.04,6.01,0.00,0.00,1/4/2000,1/10/2000,2,2000,Jan


In [9]:
# Filter out region data to only show "West"
filtered_drought_df = drought_df[drought_df['Region']=='West']
filtered_drought_df

Unnamed: 0,MapDate,Region,None,D0,D1,D2,D3,D4,ValidStart,ValidEnd,StatisticFormatID,year,month
1,2023-01-31,West,18.89,20.56,31.72,22.45,6.24,0.14,1/31/2023,2/6/2023,2,2023,Jan
3,2023-01-24,West,18.12,22.49,30.55,22.41,6.28,0.14,1/24/2023,1/30/2023,2,2023,Jan
5,2023-01-17,West,16.80,23.83,26.94,24.95,7.35,0.14,1/17/2023,1/23/2023,2,2023,Jan
7,2023-01-10,West,13.72,25.58,26.34,25.61,8.62,0.14,1/10/2023,1/16/2023,2,2023,Jan
9,2023-01-03,West,12.08,25.51,23.57,26.43,12.14,0.27,1/3/2023,1/9/2023,2,2023,Jan
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2401,2000-02-01,West,56.94,42.11,0.94,0.01,0.00,0.00,2/1/2000,2/7/2000,2,2000,Feb
2403,2000-01-25,West,49.76,50.07,0.16,0.01,0.00,0.00,1/25/2000,1/31/2000,2,2000,Jan
2405,2000-01-18,West,52.42,47.41,0.16,0.01,0.00,0.00,1/18/2000,1/24/2000,2,2000,Jan
2407,2000-01-11,West,55.28,44.55,0.16,0.01,0.00,0.00,1/11/2000,1/17/2000,2,2000,Jan


In [10]:
# Drop MapDate, ValidStart, ValidEnd, and StatisticFormatID columns
new_drought_df = filtered_drought_df.drop(['MapDate','ValidStart','ValidEnd','StatisticFormatID'], axis=1)
new_drought_df

Unnamed: 0,Region,None,D0,D1,D2,D3,D4,year,month
1,West,18.89,20.56,31.72,22.45,6.24,0.14,2023,Jan
3,West,18.12,22.49,30.55,22.41,6.28,0.14,2023,Jan
5,West,16.80,23.83,26.94,24.95,7.35,0.14,2023,Jan
7,West,13.72,25.58,26.34,25.61,8.62,0.14,2023,Jan
9,West,12.08,25.51,23.57,26.43,12.14,0.27,2023,Jan
...,...,...,...,...,...,...,...,...,...
2401,West,56.94,42.11,0.94,0.01,0.00,0.00,2000,Feb
2403,West,49.76,50.07,0.16,0.01,0.00,0.00,2000,Jan
2405,West,52.42,47.41,0.16,0.01,0.00,0.00,2000,Jan
2407,West,55.28,44.55,0.16,0.01,0.00,0.00,2000,Jan


In [11]:
# Get averages by month & year
final_drought_df = new_drought_df.groupby(['year', 'month'], as_index=False).mean().sort_values(['year', 'month'],ascending=True)
final_drought_df.head(20)

Unnamed: 0,year,month,None,D0,D1,D2,D3,D4
0,2000,Apr,78.8375,16.0375,3.485,1.64,0.0,0.0
1,2000,Aug,33.954,41.568,18.802,5.588,0.086,0.0
2,2000,Dec,79.4475,13.085,4.8625,2.4025,0.2,0.0
3,2000,Feb,64.482,30.818,4.568,0.13,0.0,0.0
4,2000,Jan,59.47,40.36,0.1625,0.01,0.0,0.0
5,2000,Jul,45.0175,40.5175,14.2,0.2625,0.0,0.0
6,2000,Jun,58.035,26.365,9.73,4.8825,0.9825,0.0
7,2000,Mar,81.71,12.185,5.1675,0.9375,0.0,0.0
8,2000,May,74.394,15.01,6.984,3.338,0.272,0.0
9,2000,Nov,71.8875,20.4025,5.0375,2.435,0.235,0.0
