# Drought Prediction

## Load Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA, KernelPCA
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import pickle

In [10]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.under_sampling import NearMiss

## Data Wrangling

#### Reading the input data

In [2]:
# Load the combined time-series file into a single frame.
# (Earlier revisions read data/train_timeseries.csv, data/test_timeseries.csv and
# data/validation_timeseries.csv separately; those reads are no longer used.)
drought_df = pd.read_csv(filepath_or_buffer='data/all_timeseries.csv')

# Preview the first five rows.
drought_df.head(n=5)

Unnamed: 0,fips,date,PRECTOT,PS,QV2M,T2M,T2MDEW,T2MWET,T2M_MAX,T2M_MIN,...,WS10M_MIN,WS10M_RANGE,WS50M,WS50M_MAX,WS50M_MIN,WS50M_RANGE,score,year,month,day
0,1001,2000-01-04,15.95,100.29,6.42,11.4,6.09,6.1,18.09,2.16,...,2.08,3.59,6.73,9.31,3.74,5.58,1,2000,1,4
1,1001,2000-01-11,1.33,100.4,6.63,11.48,7.84,7.84,18.88,5.72,...,1.05,1.43,3.55,6.38,1.71,4.67,2,2000,1,11
2,1001,2000-01-18,1.11,100.39,9.53,14.28,13.26,13.26,18.04,8.98,...,1.67,1.92,5.19,6.4,3.84,2.55,2,2000,1,18
3,1001,2000-01-25,0.0,100.11,2.05,-0.78,-7.93,-7.72,5.65,-5.46,...,2.28,2.32,5.75,8.03,3.96,4.07,2,2000,1,25
4,1001,2000-02-01,0.0,101.0,3.36,2.06,-1.73,-1.7,11.02,-4.21,...,0.88,1.86,4.18,6.38,1.27,5.11,1,2000,2,1


In [8]:
# Display the column labels currently held in drought_df.
drought_df.columns

Index(['fips', 'date', 'PRECTOT', 'PS', 'QV2M', 'T2M', 'T2MDEW', 'T2MWET',
       'T2M_MAX', 'T2M_MIN', 'T2M_RANGE', 'TS', 'WS10M', 'WS10M_MAX',
       'WS10M_MIN', 'WS10M_RANGE', 'WS50M', 'WS50M_MAX', 'WS50M_MIN',
       'WS50M_RANGE', 'score', 'year', 'month', 'day'],
      dtype='object')

#### Distribution of continuous variables

## Data Engineering

In [11]:
# Remove the 'fips' and 'date' columns (the year/month/day columns stay in the frame).
# Reassigning instead of inplace=True, together with errors='ignore', keeps this cell
# idempotent: re-running it once the columns are already gone is a no-op rather than
# a KeyError.
drought_df = drought_df.drop(columns=['fips', 'date'], errors='ignore')

In [12]:
# Feature frame: every column of drought_df except the 'score' target.
drought_vars = drought_df.drop('score', axis=1)

In [13]:
# Split features and the 'score' target into train and test partitions.
# 20% of the rows are held out; the fixed seed makes the split repeatable.
# (A previous revision built X/y from separately loaded train and test frames instead.)
# NOTE(review): this is a random row-wise split of dated observations; consider whether
# a time-based or stratified split is more appropriate — confirm with the author.
X_train, X_test, y_train, y_test = train_test_split(
    drought_vars,
    drought_df['score'],
    test_size=0.2,
    random_state=42,
)

In [14]:
# Report the dimensions of each partition produced by the split.
for label, part in (
    ("Train features shape", X_train),
    ("Train target shape", y_train),
    ("Test features shape", X_test),
    ("Test target shape", y_test),
):
    print(label, part.shape)

Train features shape (2722608, 21)
Train target shape (2722608,)
Test features shape (680652, 21)
Test target shape (680652,)


#### Standardizing the data

In [15]:
# Learn the scaling statistics from the training features only, then apply the same
# fitted scaler to both partitions so the test set never influences the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#### Fixing class imbalance

In [16]:
# Upsample the minority classes of the training set with SMOTE (fixed seed).
# np.asarray(...).ravel() produces the same 1-D label array as the former
# y_train.ravel(), but avoids the deprecated pandas Series.ravel() call (which emits a
# FutureWarning) and also works if y_train is already a NumPy array.
sm = SMOTE(random_state=5)
X_train_ures_SMOTE, y_train_ures_SMOTE = sm.fit_resample(X_train, np.asarray(y_train).ravel())

  X_train_ures_SMOTE, y_train_ures_SMOTE = sm.fit_resample(X_train, y_train.ravel())


In [18]:
# Report how oversampling changed the training-set size and the per-class counts.
print('Before OverSampling, the shape of train_X: {}'.format(X_train.shape))
print('Before OverSampling, the shape of train_y: {} \n'.format(y_train.shape))

print('After OverSampling, the shape of train_X: {}'.format(X_train_ures_SMOTE.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_ures_SMOTE.shape))

# One line per class label 0-5 (the labels this cell previously enumerated by hand).
# The vectorized .sum() on each boolean mask replaces builtin sum(), which walked the
# multi-million-element masks one item at a time in Python; the printed text is unchanged.
for label in range(6):
    count_before = (y_train == label).sum()
    count_after = (y_train_ures_SMOTE == label).sum()
    print("Counts of label '{}' - Before Oversampling:{}, After OverSampling: {}".format(label, count_before, count_after))

Before OverSampling, the shape of train_X: (2722608, 21)
Before OverSampling, the shape of train_y: (2722608,) 

After OverSampling, the shape of train_X: (10044654, 21)
After OverSampling, the shape of train_y: (10044654,) 

Counts of label '0' - Before Oversampling:1674109, After OverSampling: 1674109
Counts of label '1' - Before Oversampling:464085, After OverSampling: 1674109
Counts of label '2' - Before Oversampling:282257, After OverSampling: 1674109
Counts of label '3' - Before Oversampling:177218, After OverSampling: 1674109
Counts of label '4' - Before Oversampling:92226, After OverSampling: 1674109
Counts of label '5' - Before Oversampling:32713, After OverSampling: 1674109


#### PCA

In [19]:
# Fit PCA on the SMOTE-resampled training features and project the training set.
# NOTE(review): PCA() is built without n_components, and the transformed training
# matrix printed below still has 21 columns, so no dimensionality reduction takes
# place despite the "PCAreduced" name — confirm whether that is intended.
pca = PCA()
X_train_ures_SMOTE_PCAreduced = pca.fit_transform(X_train_ures_SMOTE)
# The test features are only projected with the already-fitted PCA, never used to fit it.
X_test_SMOTE_PCA_transformed = pca.transform(X_test)

In [20]:
# Print the per-component explained-variance ratios of the fitted PCA.
variance_ratios = pca.explained_variance_ratio_
print(variance_ratios)

[3.60752799e-01 2.33445467e-01 1.03661179e-01 6.95540526e-02
 5.02034100e-02 4.68868457e-02 4.41810402e-02 4.20141053e-02
 2.61907313e-02 9.02768732e-03 6.60694011e-03 3.17351746e-03
 1.85191061e-03 1.72714197e-03 3.99159016e-04 2.65649782e-04
 5.36328537e-05 4.59920489e-06 7.46393820e-08 5.37417171e-08
 3.54261366e-09]


In [21]:
# Show the dimensions of the PCA-transformed training features and the resampled labels.
for name, arr in (
    ("X_train", X_train_ures_SMOTE_PCAreduced),
    ("y_train", y_train_ures_SMOTE),
):
    print(f"{name} Shape: {arr.shape}")

X_train Shape: (10044654, 21)
y_train Shape: (10044654,)


In [23]:
# Persist the current X_train, X_test, y_train and y_test objects to disk as one pickled list.
# The path uses a forward slash: the previous 'data\X...' literal contained the invalid
# escape sequence '\X' (a warning in current Python, planned to become an error) and a
# backslash is not a path separator outside Windows. Forward slashes work on every
# platform and match the 'data/...' path used when reading the CSV.
with open('data/Xy_trainTest.pkl', 'wb') as f:
    pickle.dump([X_train, X_test, y_train, y_test], f)



In [27]:
# Restore X_train, X_test, y_train and y_test from the pickled list on disk.
# The path uses a forward slash: the previous 'data\X...' literal contained the invalid
# escape sequence '\X' and a Windows-only separator.
# NOTE: pickle.load can execute arbitrary code — only load a file produced by a trusted run.
with open('data/Xy_trainTest.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)