# Drought Prediction

## Load Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA, KernelPCA
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
import pickle

In [3]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NeighbourhoodCleaningRule
from imblearn.under_sampling import NearMiss

In [4]:
# Import PyTorch libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torchviz import make_dot

## Data Wrangling

#### Reading the input data

In [6]:
# Raw drought time-series table.
# NOTE(review): `drought_df` is not referenced again in the visible part of this
# notebook -- confirm it is still needed before paying the cost of loading it.
drought_df = pd.read_csv('data/all_timeseries.csv')

# Pre-computed train/test split.
# The path uses a forward slash: the earlier 'data\Xy_trainTest.pkl' spelling relied on
# the invalid escape sequence '\X' (a warning today, an error in future Python versions)
# and only resolved on Windows. Forward slashes work on every platform.
# SECURITY: pickle.load can execute arbitrary code; only load files you produced yourself.
with open('data/Xy_trainTest.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

## PyTorch

#### Model 1

In [7]:
# Convert data to PyTorch tensors.
# Every input goes through np.asarray first so that NumPy arrays and pandas objects are
# handled the same way (previously only y_test was assumed to be a pandas object).
# Features become float32; labels become int64 class indices, as CrossEntropyLoss expects.
X_train_tensor = torch.tensor(np.asarray(X_train), dtype=torch.float32)
y_train_tensor = torch.tensor(np.asarray(y_train), dtype=torch.long)
X_test_tensor = torch.tensor(np.asarray(X_test), dtype=torch.float32)
y_test_tensor = torch.tensor(np.asarray(y_test), dtype=torch.long)

In [8]:
# Wrap the tensors in datasets and mini-batch loaders (64 samples per batch).
# Training batches are reshuffled on every pass; the test order stays fixed so that
# predictions line up with y_test.
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

Architecture: It has one input layer, two hidden layers (both with the same number of neurons), and one output layer.  
Activation Function: ReLU activation function is applied after each hidden layer.

In [9]:
# Fully connected classifier: input -> hidden -> hidden -> output
class DroughtNet(nn.Module):
    """Feed-forward network with two equally sized hidden layers and ReLU activations.

    Args:
        input_dim: number of input features per sample.
        hidden_dim: width shared by both hidden layers.
        output_dim: number of output logits (one per class).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay stable.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
        self.fc3 = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return raw (un-normalised) class logits for the batch `x`."""
        first_hidden = self.relu(self.fc1(x))
        second_hidden = self.relu(self.fc2(first_hidden))
        return self.fc3(second_hidden)

In [10]:
# Initialize the model, loss function, and optimizer

# Fix the RNG so weight initialisation and batch shuffling can be reproduced.
torch.manual_seed(42)

input_dim = X_train.shape[1]  # one input unit per feature column
hidden_dim = 128
# CrossEntropyLoss takes class-index targets in [0, C-1], so the output layer is sized
# from the largest label id. Counting distinct labels (the previous approach) gives too
# few logits whenever the label ids are not contiguous.
output_dim = int(np.max(y_train)) + 1

model = DroughtNet(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [20]:
# Train Model 1 for a fixed number of epochs, recording per-epoch metrics
num_epochs = 3

train_history = {'loss': [], 'accuracy': [], 'validation_loss': [], 'validation_accuracy': []}

for epoch in range(num_epochs):
    # --- Training pass: one optimizer step per mini-batch ---
    model.train()
    running_loss, correct, total = 0.0, 0, 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by batch size so the epoch figure is a per-sample average
        running_loss += loss.item() * X_batch.shape[0]
        predicted = outputs.argmax(dim=1)
        correct += int((predicted == y_batch).sum())
        total += y_batch.shape[0]

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_accuracy = correct / total
    train_history['loss'].append(epoch_loss)
    train_history['accuracy'].append(epoch_accuracy)

    # --- Validation pass: no gradients, no weight updates ---
    model.eval()
    val_running_loss, val_correct, val_total = 0.0, 0, 0
    with torch.no_grad():
        for X_val, y_val in test_loader:
            outputs = model(X_val)
            val_loss = criterion(outputs, y_val)
            val_running_loss += val_loss.item() * X_val.shape[0]

            val_predicted = outputs.argmax(dim=1)
            val_correct += int((val_predicted == y_val).sum())
            val_total += y_val.shape[0]

    val_epoch_loss = val_running_loss / len(test_loader.dataset)
    val_epoch_accuracy = val_correct / val_total
    train_history['validation_loss'].append(val_epoch_loss)
    train_history['validation_accuracy'].append(val_epoch_accuracy)

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}, Validation Loss: {val_epoch_loss:.4f}, Validation Accuracy: {val_epoch_accuracy:.4f}')

# Persist weights, optimizer state and the metric history in a single checkpoint file
checkpoint_payload = {
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'train_history': train_history,
}
torch.save(checkpoint_payload, 'saved_model.pth')

Epoch 1/3, Loss: 0.9895, Accuracy: 0.6303, Validation Loss: 0.9396, Validation Accuracy: 0.6417
Epoch 2/3, Loss: 0.9193, Accuracy: 0.6458, Validation Loss: 0.9040, Validation Accuracy: 0.6497
Epoch 3/3, Loss: 0.8957, Accuracy: 0.6515, Validation Loss: 0.8872, Validation Accuracy: 0.6538


In [21]:
# # Load the saved checkpoint and restore the model weights
# checkpoint = torch.load('saved_model.pth')
# model.load_state_dict(checkpoint['model_state_dict'])

# # Load the optimizer state
# optimizer = optim.Adam(model.parameters(), lr=0.001)
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# # Retrieve training history
# train_history = checkpoint['train_history']

In [22]:
# Collect Model 1's class predictions over the whole test set
model.eval()
y_pred_list = []

with torch.no_grad():
    for X_batch, _ in test_loader:
        outputs = model(X_batch)
        # The predicted class is the index of the largest logit in each row
        batch_pred = outputs.argmax(dim=1)
        y_pred_list.append(batch_pred.numpy())

# Join the per-batch arrays into one 1-D prediction vector
y_pred = np.concatenate(y_pred_list)

In [23]:
# Ground truth as a NumPy array (left untouched when it already is one)
if isinstance(y_test, np.ndarray):
    y_test_numpy = y_test
else:
    y_test_numpy = y_test.to_numpy()

# Predictions: one array over all batches, then forced to 1-D
y_pred_numpy = np.concatenate(y_pred_list)
y_pred_flat = y_pred_numpy.flatten()

# Sanity checks: both arrays should cover the same label set and have the same length
print(f"Unique values in y_test: {np.unique(y_test_numpy)}")
print(f"Unique values in y_pred: {np.unique(y_pred_numpy)}")
print(f"Shape of y_test: {y_test_numpy.shape}")
print(f"Shape of y_pred: {y_pred_flat.shape}")

# Overall accuracy plus per-class precision / recall / F1
accuracy = accuracy_score(y_true=y_test_numpy, y_pred=y_pred_flat)
report = classification_report(y_true=y_test_numpy, y_pred=y_pred_flat)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Unique values in y_test: [0 1 2 3 4 5]
Unique values in y_pred: [0 1 2 3 4 5]
Shape of y_test: (680652,)
Shape of y_pred: (680652,)
Accuracy: 0.6538480750809518
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.95      0.83    419153
           1       0.34      0.15      0.21    115637
           2       0.32      0.15      0.20     70070
           3       0.32      0.27      0.29     44252
           4       0.38      0.25      0.30     23193
           5       0.43      0.33      0.37      8347

    accuracy                           0.65    680652
   macro avg       0.42      0.35      0.37    680652
weighted avg       0.58      0.65      0.60    680652



In [26]:
# Visualize the model architecture
# make_dot(model(X_train_tensor), params=dict(model.named_parameters()))

#### Model 2

Architecture: It has one input layer, a variable number of hidden layers (as specified by the hidden_dims list), and one output layer.  
Activation Function: ReLU activation function is applied after each hidden layer.  
Flexibility: High, as the number and size of hidden layers can be adjusted by passing different hidden_dims lists.

In [27]:
class DroughtNetComplex(nn.Module):
    """Feed-forward classifier with a configurable stack of hidden layers.

    Args:
        input_dim: number of input features per sample.
        hidden_dims: non-empty list of hidden-layer widths, ordered input side first.
        output_dim: number of output logits (one per class).
    """

    def __init__(self, input_dim, hidden_dims, output_dim):
        super().__init__()
        first_width, last_width = hidden_dims[0], hidden_dims[-1]
        self.input_layer = nn.Linear(input_dim, first_width)
        # One Linear per consecutive pair of widths; empty when only one width is given
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(hidden_dims, hidden_dims[1:])]
        )
        self.output_layer = nn.Linear(last_width, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return raw class logits; ReLU follows every layer except the output layer."""
        activations = self.relu(self.input_layer(x))
        for hidden in self.hidden_layers:
            activations = self.relu(hidden(activations))
        return self.output_layer(activations)

In [28]:
# Widths of the hidden layers, listed from the input side to the output side.
# NOTE(review): a later cell rebinds `hidden_dims` to [128, 64, 32] before training,
# so these widths only apply to models built before that rebinding.
hidden_dims = [256, 128, 64]  # three hidden layers with 256, 128 and 64 neurons

In [29]:
# Build the deeper network from the current `hidden_dims` widths.
# NOTE(review): `model_complex` and its optimizer/criterion are not used again in the
# visible part of this notebook -- confirm whether this cell is still needed.
model_complex = DroughtNetComplex(
    input_dim=input_dim,
    hidden_dims=hidden_dims,
    output_dim=output_dim,
)

# Loss function and optimizer dedicated to this model
criterion_complex = nn.CrossEntropyLoss()
optimizer_complex = optim.Adam(params=model_complex.parameters(), lr=0.001)

In [31]:
# Initialize the model, loss function, and optimizer.
# NOTE: this rebinds `model`, `criterion`, `optimizer` and `hidden_dims`; Model 1 is no
# longer reachable through `model` after this cell runs.
input_dim = X_train.shape[1]
hidden_dims = [128, 64, 32]  # Example hidden dimensions, you can adjust as needed
# CrossEntropyLoss takes class-index targets in [0, C-1], so the output layer is sized
# from the largest label id. Counting distinct labels (the previous approach) gives too
# few logits whenever the label ids are not contiguous.
output_dim = int(np.max(y_train)) + 1

model = DroughtNetComplex(input_dim, hidden_dims, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 3

# Per-epoch metrics; every list grows by one entry per epoch
train_history = {'loss': [], 'accuracy': [], 'validation_loss': [], 'validation_accuracy': []}

for epoch in range(num_epochs):
    # --- Training pass ---
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        # Weight the batch loss by batch size so the epoch figure is a per-sample average
        running_loss += loss.item() * X_batch.size(0)

        # The predicted class is the index of the largest logit in each row
        predicted = outputs.argmax(dim=1)
        correct += (predicted == y_batch).sum().item()
        total += y_batch.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_accuracy = correct / total
    train_history['loss'].append(epoch_loss)
    train_history['accuracy'].append(epoch_accuracy)

    # --- Validation pass (run on test_loader; no gradients, no weight updates) ---
    model.eval()
    val_running_loss = 0.0
    val_correct = 0
    val_total = 0
    with torch.no_grad():
        for X_val, y_val in test_loader:
            outputs = model(X_val)
            val_loss = criterion(outputs, y_val)
            val_running_loss += val_loss.item() * X_val.size(0)

            val_predicted = outputs.argmax(dim=1)
            val_correct += (val_predicted == y_val).sum().item()
            val_total += y_val.size(0)

    val_epoch_loss = val_running_loss / len(test_loader.dataset)
    val_epoch_accuracy = val_correct / val_total
    train_history['validation_loss'].append(val_epoch_loss)
    train_history['validation_accuracy'].append(val_epoch_accuracy)

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}, Validation Loss: {val_epoch_loss:.4f}, Validation Accuracy: {val_epoch_accuracy:.4f}')

# Save the trained model and associated data
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'train_history': train_history,
}, 'saved_model2.pth')

Epoch 1/3, Loss: 0.9778, Accuracy: 0.6325, Validation Loss: 0.9253, Validation Accuracy: 0.6434
Epoch 2/3, Loss: 0.9115, Accuracy: 0.6467, Validation Loss: 0.9024, Validation Accuracy: 0.6489
Epoch 3/3, Loss: 0.8910, Accuracy: 0.6516, Validation Loss: 0.8805, Validation Accuracy: 0.6545


In [32]:
# Evaluate the complex model
# Switch to eval mode explicitly, as the Model 1 evaluation cell does, so this cell does
# not depend on the mode the training loop happened to leave the model in.
model.eval()
y_pred_list = []

with torch.no_grad():
    for X_batch, _ in test_loader:
        outputs = model(X_batch)
        # The predicted class is the index of the largest logit in each row
        batch_pred = outputs.argmax(dim=1)
        y_pred_list.append(batch_pred.numpy())

# Join the per-batch arrays into one 1-D prediction vector
y_pred = np.concatenate(y_pred_list)


In [33]:
# Labels as a NumPy array: pass an ndarray through, convert anything else
already_ndarray = isinstance(y_test, np.ndarray)
y_test_numpy = y_test if already_ndarray else y_test.to_numpy()

# Stitch the per-batch predictions together
y_pred_numpy = np.concatenate(y_pred_list)

# Label sets seen in the ground truth and in the predictions
print(f"Unique values in y_test: {np.unique(y_test_numpy)}")
print(f"Unique values in y_pred: {np.unique(y_pred_numpy)}")

# Force predictions to 1-D, then confirm both arrays have the same length
y_pred_flat = y_pred_numpy.flatten()
print(f"Shape of y_test: {y_test_numpy.shape}")
print(f"Shape of y_pred: {y_pred_flat.shape}")

# Headline accuracy and the per-class breakdown
accuracy = accuracy_score(y_test_numpy, y_pred_flat)
report = classification_report(y_test_numpy, y_pred_flat)
print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Unique values in y_test: [0 1 2 3 4 5]
Unique values in y_pred: [0 1 2 3 4 5]
Shape of y_test: (680652,)
Shape of y_pred: (680652,)
Accuracy: 0.6544754147493873
Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.95      0.83    419153
           1       0.34      0.14      0.20    115637
           2       0.32      0.17      0.22     70070
           3       0.35      0.20      0.26     44252
           4       0.34      0.31      0.32     23193
           5       0.45      0.29      0.35      8347

    accuracy                           0.65    680652
   macro avg       0.42      0.34      0.36    680652
weighted avg       0.58      0.65      0.60    680652



#### Random search over the number and width of hidden layers

In [34]:
import random

# Factory for networks with a variable hidden-layer layout
def create_model(input_dim, hidden_dims, output_dim):
    """Build a fresh, untrained DroughtNetComplex.

    Args:
        input_dim: number of input features.
        hidden_dims: list of hidden-layer widths, input side first.
        output_dim: number of output classes.

    Returns:
        A newly constructed DroughtNetComplex instance.
    """
    network = DroughtNetComplex(
        input_dim=input_dim,
        hidden_dims=hidden_dims,
        output_dim=output_dim,
    )
    return network

# Define a function to train the model
def _run_epoch(model, loader, criterion, optimizer, training):
    """Run one full pass over `loader` and return (mean_loss, accuracy).

    Args:
        model: the network to run.
        loader: iterable of (features, labels) batches exposing a `.dataset` attribute.
        criterion: loss function called as criterion(outputs, labels).
        optimizer: optimizer stepped after every batch when `training` is True;
            ignored otherwise.
        training: True for a training pass (train mode, gradients, weight updates),
            False for an evaluation pass (eval mode, gradients disabled).

    Returns:
        Tuple (loss_sum / len(loader.dataset), correct / samples_seen).
    """
    model.train(training)  # model.train(False) is the same as model.eval()

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    with torch.set_grad_enabled(training):
        for X_batch, y_batch in loader:
            if training:
                optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            if training:
                loss.backward()
                optimizer.step()

            # Weight the batch loss by batch size so the result is a per-sample average
            loss_sum += loss.item() * X_batch.size(0)
            # The predicted class is the index of the largest logit in each row
            n_correct += (outputs.argmax(dim=1) == y_batch).sum().item()
            n_seen += y_batch.size(0)

    return loss_sum / len(loader.dataset), n_correct / n_seen


def train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs):
    """Train `model` for `num_epochs`, evaluating on `test_loader` after every epoch.

    Args:
        model: network to train; updated in place.
        train_loader: loader of training batches.
        test_loader: loader used for the per-epoch validation metrics.
        criterion: loss function.
        optimizer: optimizer bound to `model`'s parameters.
        num_epochs: number of passes over `train_loader`.

    Returns:
        Dict with keys 'loss', 'accuracy', 'validation_loss' and 'validation_accuracy',
        each a list holding one float per epoch.

    Side effects:
        Prints one summary line per epoch.
    """
    train_history = {'loss': [], 'accuracy': [], 'validation_loss': [], 'validation_accuracy': []}

    for epoch in range(num_epochs):
        epoch_loss, epoch_accuracy = _run_epoch(
            model, train_loader, criterion, optimizer, training=True)
        train_history['loss'].append(epoch_loss)
        train_history['accuracy'].append(epoch_accuracy)

        # Calculate validation loss and accuracy
        val_epoch_loss, val_epoch_accuracy = _run_epoch(
            model, test_loader, criterion, None, training=False)
        train_history['validation_loss'].append(val_epoch_loss)
        train_history['validation_accuracy'].append(val_epoch_accuracy)

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}, Validation Loss: {val_epoch_loss:.4f}, Validation Accuracy: {val_epoch_accuracy:.4f}')

    return train_history

In [37]:
def generate_hidden_layer_choices(num_choices, min_layers, max_layers, min_neurons, max_neurons):
    """Draw random hidden-layer layouts.

    Args:
        num_choices: how many layouts to generate.
        min_layers, max_layers: inclusive bounds on the number of hidden layers.
        min_neurons, max_neurons: inclusive bounds on each layer's width.

    Returns:
        A list of `num_choices` lists; each inner list holds the layer widths of one layout.
    """
    # For each layout the depth is drawn first (the range() argument), then one width
    # per layer.
    return [
        [
            random.randint(min_neurons, max_neurons)
            for _layer in range(random.randint(min_layers, max_layers))
        ]
        for _choice in range(num_choices)
    ]


In [38]:
# Parameters for hidden layer generation
num_choices = 100  # Number of different hidden layer configurations to generate
min_layers = 1    # Minimum number of hidden layers
max_layers = 5    # Maximum number of hidden layers
min_neurons = 8  # Minimum number of neurons in each hidden layer
max_neurons = 1024 # Maximum number of neurons in each hidden layer

# Seed both RNGs so the search space, the sampled candidates and the weight
# initialisation can be regenerated.
random.seed(42)
torch.manual_seed(42)

# Generate hidden layer choices
hidden_layer_choices = generate_hidden_layer_choices(num_choices, min_layers, max_layers, min_neurons, max_neurons)
print(f'Hyperparameter random search space: {hidden_layer_choices}')

# Number of random searches (never more than the number of available configurations)
num_searches = min(10, len(hidden_layer_choices))

# Defined here so the cell does not depend on values left behind by earlier cells
num_epochs = 3  # epochs spent on each candidate
criterion = nn.CrossEntropyLoss()

# Store the best model and its performance
best_model = None
best_val_accuracy = 0
best_hidden_dims = None

# Sample without replacement so no configuration is trained twice
search_candidates = random.sample(hidden_layer_choices, num_searches)

# NOTE(review): candidates are ranked on `test_loader`, i.e. the held-out test split, so
# the reported best accuracy is optimistically biased. Consider carving a separate
# validation split out of the training data for model selection.
for i, hidden_dims in enumerate(search_candidates):
    print(f"Random Search {i+1}/{num_searches}")
    model = create_model(input_dim, hidden_dims, output_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    train_history = train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs)

    # Get the validation accuracy of the final epoch
    val_accuracy = train_history['validation_accuracy'][-1]

    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_model = model
        best_hidden_dims = hidden_dims

print(f"Best validation accuracy: {best_val_accuracy}")
print(f"Best hidden dimensions: {best_hidden_dims}")


Hyperparameter random search space: [[989, 614, 167], [36, 796, 477], [397, 324], [881, 626, 379], [672], [313, 985, 401], [891, 405], [203, 752], [325, 167, 998], [738, 881, 242], [425, 661, 526], [937], [746, 232, 707, 249], [228], [694, 924], [574, 484, 564, 146, 820], [522, 470], [502, 171], [554, 557], [65, 786, 148, 786, 489], [536, 155, 544, 413], [636, 395, 585, 285], [595, 202], [947, 620, 958, 51, 328], [994, 600, 985, 752], [580], [1021, 348, 253], [474], [394, 895, 600, 702], [497, 149, 282, 507], [53, 755, 595, 418, 900], [406, 273, 784, 936, 380], [629], [227, 508], [704, 617, 199], [793, 901, 57, 244, 353], [127, 500, 552], [810, 485, 356], [696, 917, 18, 642, 596], [504, 291, 191, 1014, 454], [264, 210, 269, 567], [824, 895, 19, 901, 815], [296, 70, 1019], [241], [287, 440], [742, 859], [633, 166, 677], [192], [204, 826, 224, 892, 81], [602, 584], [66, 434, 436, 91], [82], [1022], [735, 815], [599, 290, 131, 808], [75, 180], [361, 699, 446], [208, 410, 273, 673], [542, 

In [40]:
# Retrain from scratch with the winning layer layout, this time for 10 epochs

model = create_model(input_dim, best_hidden_dims, output_dim)
optimizer = optim.Adam(params=model.parameters(), lr=0.001)
train_history = train_model(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=10,
)

Epoch 1/10, Loss: 0.9320, Accuracy: 0.6439, Validation Loss: 0.8609, Validation Accuracy: 0.6618
Epoch 2/10, Loss: 0.8298, Accuracy: 0.6712, Validation Loss: 0.8085, Validation Accuracy: 0.6777


KeyboardInterrupt: 