# Drought Prediction

## Load Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA, KernelPCA
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import pickle

In [3]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.under_sampling import NearMiss

## Data Wrangling

#### Reading the input data

In [4]:
# Load the drought time series from the single combined CSV.
# The separate split files (data/train_timeseries.csv, data/test_timeseries.csv,
# data/validation_timeseries.csv) are not read by this notebook.
drought_df = pd.read_csv('data/all_timeseries.csv')

# Preview the first rows
drought_df.head()

Unnamed: 0,fips,date,PRECTOT,PS,QV2M,T2M,T2MDEW,T2MWET,T2M_MAX,T2M_MIN,...,WS10M_MIN,WS10M_RANGE,WS50M,WS50M_MAX,WS50M_MIN,WS50M_RANGE,score,year,month,day
0,1001,2000-01-04,15.95,100.29,6.42,11.4,6.09,6.1,18.09,2.16,...,2.08,3.59,6.73,9.31,3.74,5.58,1,2000,1,4
1,1001,2000-01-11,1.33,100.4,6.63,11.48,7.84,7.84,18.88,5.72,...,1.05,1.43,3.55,6.38,1.71,4.67,2,2000,1,11
2,1001,2000-01-18,1.11,100.39,9.53,14.28,13.26,13.26,18.04,8.98,...,1.67,1.92,5.19,6.4,3.84,2.55,2,2000,1,18
3,1001,2000-01-25,0.0,100.11,2.05,-0.78,-7.93,-7.72,5.65,-5.46,...,2.28,2.32,5.75,8.03,3.96,4.07,2,2000,1,25
4,1001,2000-02-01,0.0,101.0,3.36,2.06,-1.73,-1.7,11.02,-4.21,...,0.88,1.86,4.18,6.38,1.27,5.11,1,2000,2,1


In [5]:
# Display the column labels of the loaded time series frame
drought_df.columns

Index(['fips', 'date', 'PRECTOT', 'PS', 'QV2M', 'T2M', 'T2MDEW', 'T2MWET',
       'T2M_MAX', 'T2M_MIN', 'T2M_RANGE', 'TS', 'WS10M', 'WS10M_MAX',
       'WS10M_MIN', 'WS10M_RANGE', 'WS50M', 'WS50M_MAX', 'WS50M_MIN',
       'WS50M_RANGE', 'score', 'year', 'month', 'day'],
      dtype='object')

In [6]:
# Load the soil table; it is joined to the time series on the 'fips' column further down
soil_df = pd.read_csv('data/soil_data.csv')

## Data Engineering

In [7]:
# Row filter implementing the 3-sigma rule, one column at a time
def remove_outliers(df, columns):
    """Keep only rows whose values lie within mean +/- 3 standard deviations.

    The columns are processed in the order given. For each column the mean and
    standard deviation are computed on the rows that survived the previous
    columns, so the result depends on the column order.

    Rows whose value in a processed column is NaN fail the range check and are
    dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of str
        Names of the numeric columns to screen.

    Returns
    -------
    pandas.DataFrame
        The filtered rows (the input object itself when `columns` is empty).
    """
    kept = df
    for name in columns:
        values = kept[name]
        center = values.mean()
        spread = values.std()
        # between() is inclusive on both ends, matching <= upper and >= lower
        in_band = values.between(center - 3 * spread, center + 3 * spread)
        kept = kept[in_band]
    return kept

In [8]:
# Measurement columns to screen for outliers; fips, date, score and the
# year/month/day columns are deliberately left out.
measures = [
    'PRECTOT', 'PS', 'QV2M',
    'T2M', 'T2MDEW', 'T2MWET', 'T2M_MAX', 'T2M_MIN', 'T2M_RANGE', 'TS',
    'WS10M', 'WS10M_MAX', 'WS10M_MIN', 'WS10M_RANGE',
    'WS50M', 'WS50M_MAX', 'WS50M_MIN', 'WS50M_RANGE',
]

# Apply the 3-sigma filter; drought_df itself is left unchanged
cleaned_drought_df = remove_outliers(drought_df, measures)

# Report how many rows the filter discarded
print(f'Total rows before removing outliers: {len(drought_df)}')
print(f'Total rows after removing outliers: {len(cleaned_drought_df)}')
print(f'Number of outliers: {len(drought_df)-len(cleaned_drought_df)}')

Total rows before removing outliers: 3403260
Total rows after removing outliers: 3054658
Number of outliers: 348602


In [9]:
# # Remove outliers

# # All cols except fips
# cols = list(soil_df.drop(['fips'], axis=1).columns)

# cleaned_soil_df = remove_outliers(soil_df, cols)

# # Print the number of rows before and after removing outliers
# print(f'Total rows before removing outliers: {len(soil_df)}')
# print(f'Total rows after removing outliers: {len(cleaned_soil_df)}')
# print(f'Number of outliers: {len(soil_df)-len(cleaned_soil_df)}')

In [13]:
# Left-join the soil attributes onto every cleaned time series row using the
# 'fips' key, then discard the 'fips' and 'date' columns from the result.
combined_df = (
    cleaned_drought_df
    .merge(soil_df, how='left', on='fips')
    .drop(columns=['fips', 'date'])
)

# Display the resulting column labels
combined_df.columns

Index(['PRECTOT', 'PS', 'QV2M', 'T2M', 'T2MDEW', 'T2MWET', 'T2M_MAX',
       'T2M_MIN', 'T2M_RANGE', 'TS', 'WS10M', 'WS10M_MAX', 'WS10M_MIN',
       'WS10M_RANGE', 'WS50M', 'WS50M_MAX', 'WS50M_MIN', 'WS50M_RANGE',
       'score', 'year', 'month', 'day', 'lat', 'lon', 'elevation', 'slope1',
       'slope2', 'slope3', 'slope4', 'slope5', 'slope6', 'slope7', 'slope8',
       'aspectN', 'aspectE', 'aspectS', 'aspectW', 'aspectUnknown', 'WAT_LAND',
       'NVG_LAND', 'URB_LAND', 'GRS_LAND', 'FOR_LAND', 'CULTRF_LAND',
       'CULTIR_LAND', 'CULT_LAND', 'SQ1', 'SQ2', 'SQ3', 'SQ4', 'SQ5', 'SQ6',
       'SQ7'],
      dtype='object')

In [14]:
# Hold out 20% of the rows as a test set; 'score' is the target and every
# other column is a predictor. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    combined_df.drop(columns=['score']),  # predictors
    combined_df['score'],                 # target
    test_size=0.2,
    random_state=42,
)

# Report the size of each piece
print("Train features shape", X_train.shape)
print("Train target shape", y_train.shape)
print("Test features shape", X_test.shape)
print("Test target shape", y_test.shape)

Train features shape (2443726, 52)
Train target shape (2443726,)
Test features shape (610932, 52)
Test target shape (610932,)


#### Standardizing the data

In [15]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
# NOTE: X_train and X_test are rebound to the scaled outputs, so re-running
# this cell on its own would scale data that has already been scaled.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#### Fixing class imbalance

In [16]:
# Oversample the training split with SMOTE (fixed random_state for repeatability).
# Only X_train / y_train are resampled; the test split is not passed in.
sm = SMOTE(random_state=5)

# The target is handed over as a plain NumPy array. np.asarray replaces the
# earlier y_train.ravel() call, which relied on the deprecated Series.ravel;
# it also accepts y_train when that name already holds an ndarray.
X_train_ures_SMOTE, y_train_ures_SMOTE = sm.fit_resample(X_train, np.asarray(y_train))

  X_train_ures_SMOTE, y_train_ures_SMOTE = sm.fit_resample(X_train, y_train.ravel())


In [17]:
# Report the effect of the oversampling step: overall shapes first, then the
# number of rows per 'score' label before and after resampling.
print('Before OverSampling, the shape of train_X: {}'.format(X_train.shape))
print('Before OverSampling, the shape of train_y: {} \n'.format(y_train.shape))

print('After OverSampling, the shape of train_X: {}'.format(X_train_ures_SMOTE.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_ures_SMOTE.shape))

# One line per label 0..5. The vectorised .sum() on the boolean mask replaces
# the builtin sum(), which walks the mask element by element in Python.
for label in range(6):
    print("Counts of label '{}' - Before Oversampling:{}, After OverSampling: {}".format(
        label,
        (y_train == label).sum(),
        (y_train_ures_SMOTE == label).sum(),
    ))

Before OverSampling, the shape of train_X: (2443726, 52)
Before OverSampling, the shape of train_y: (2443726,) 

After OverSampling, the shape of train_X: (9103362, 52)
After OverSampling, the shape of train_y: (9103362,) 

Counts of label '0' - Before Oversampling:1517227, After OverSampling: 1517227
Counts of label '1' - Before Oversampling:412980, After OverSampling: 1517227
Counts of label '2' - Before Oversampling:249363, After OverSampling: 1517227
Counts of label '3' - Before Oversampling:155322, After OverSampling: 1517227
Counts of label '4' - Before Oversampling:79514, After OverSampling: 1517227
Counts of label '5' - Before Oversampling:29320, After OverSampling: 1517227


#### PCA

In [18]:
# Fit PCA on the SMOTE-resampled training features and project the test
# features with the same fitted components.
# NOTE(review): PCA() is constructed without n_components, and the shapes
# printed further down still have 52 columns, so no dimensions are dropped
# here despite the "PCAreduced" name — confirm whether a cut-off was intended.
pca = PCA()
X_train_ures_SMOTE_PCAreduced = pca.fit_transform(X_train_ures_SMOTE)
X_test_SMOTE_PCA_transformed = pca.transform(X_test)

In [19]:
# Print the fraction of total variance attributed to each fitted component
print(pca.explained_variance_ratio_)

[1.86640783e-01 1.64789967e-01 9.69241030e-02 8.71608053e-02
 7.18999624e-02 4.74643829e-02 3.52313530e-02 2.86831096e-02
 2.66474972e-02 2.42013134e-02 2.28222807e-02 1.98126887e-02
 1.80508414e-02 1.80282418e-02 1.71054400e-02 1.60981698e-02
 1.42803431e-02 1.16296010e-02 1.00580784e-02 9.64310198e-03
 8.89189248e-03 8.14791935e-03 7.26556868e-03 7.09963541e-03
 6.27328493e-03 5.81834095e-03 5.35582580e-03 3.66526267e-03
 3.48718102e-03 2.84125576e-03 2.51774065e-03 2.38704766e-03
 2.03092384e-03 1.23370175e-03 1.08161241e-03 7.58772375e-04
 7.25047682e-04 6.71654546e-04 6.31249887e-04 6.25187605e-04
 5.30179549e-04 4.86321737e-04 1.87302199e-04 9.53610263e-05
 1.80462363e-05 1.54791662e-06 4.19116679e-08 2.94164235e-08
 1.48756707e-09 1.97428594e-16 4.50340589e-17 0.00000000e+00]


In [23]:
# Shapes of the final arrays: resampled + PCA-transformed training data ...
print(f"X_train Shape: {X_train_ures_SMOTE_PCAreduced.shape}")
print(f"y_train Shape: {y_train_ures_SMOTE.shape}")

# ... and the PCA-transformed test data (the test split was not resampled)
print(f"X_test  Shape: {X_test_SMOTE_PCA_transformed.shape}")
print(f"y_test  Shape: {y_test.shape}")

X_train Shape: (9103362, 52)
y_train Shape: (9103362,)
X_test  Shape: (610932, 52)
y_test  Shape: (610932,)


In [25]:
# Save the final arrays to disk as one pickled list, in the order
# [X_train, X_test, y_train, y_test].
# The path uses a forward slash like the other data/ paths in this notebook;
# the earlier backslash form contained an invalid string escape and only
# resolved to the data folder on Windows.
with open('data/Xy_trainTest.pkl', 'wb') as f:
    pickle.dump([X_train_ures_SMOTE_PCAreduced, X_test_SMOTE_PCA_transformed,
                 y_train_ures_SMOTE, y_test], f)

In [None]:
# Read the pickled list back and unpack it in the stored order.
# NOTE: this rebinds X_train, X_test, y_train and y_test to the loaded arrays,
# replacing whatever those names held before this cell ran.
# The path uses a forward slash like the other data/ paths in this notebook;
# the earlier backslash form contained an invalid string escape and only
# resolved to the data folder on Windows.
# pickle.load can execute arbitrary code: only load a file this notebook wrote.
with open('data/Xy_trainTest.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)