# Drought Prediction

## Load Libraries

In [1]:
# General purpose libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os
import pickle

# Scikit-learn libraries for machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, ParameterGrid
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA, KernelPCA
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree, metrics
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay, classification_report,
                             accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, roc_curve, auc, cohen_kappa_score)
from sklearn.naive_bayes import GaussianNB

# Imbalanced-learn libraries for handling imbalanced datasets
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NeighbourhoodCleaningRule, NearMiss

# PyTorch libraries for deep learning
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import random_split
from torchviz import make_dot
import random
from torch.utils.tensorboard import SummaryWriter

# Scipy library for statistical functions
from scipy.stats import uniform
# Base classes for custom estimators in scikit-learn
from sklearn.base import BaseEstimator, ClassifierMixin

In [2]:
# Check device, use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


## Data Load and Setup

In [3]:
# drought_df =  pd.read_csv('data/all_timeseries.csv')

# Load training and testing data from a pickle file
with open('data/Xy_trainTest.pkl', 'rb') as f:
    # Unpickle the data into training and testing datasets
    X_train, X_test, y_train, y_test = pickle.load(f)

In [4]:
# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long)

In [5]:
# Create DataLoaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

In [6]:
# Define the validation split ratio
val_split_ratio = 0.2
val_size = int(len(train_dataset) * val_split_ratio)
train_size = len(train_dataset) - val_size

# Split the dataset
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=512, shuffle=False, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False, num_workers=4, pin_memory=True)

## Define Neural Network Classes/Functions

#### Class: DroughtClassifier

In [7]:
class DroughtClassifier(nn.Module):
    """
    A neural network classifier for drought prediction.

    Args:
        input_size (int): The number of input features.
        hidden_sizes (list of int): A list containing the sizes of the hidden layers.
        output_size (int): The number of output classes.
        dropout_prob (float, optional): The probability of an element to be zeroed in dropout. Default is 0.5.

    Attributes:
        layers (nn.ModuleList): A list of linear layers.
        dropout (nn.Dropout): Dropout layer for regularization.
    """
    def __init__(self, input_size, hidden_sizes, output_size, dropout_prob=0.5):
        super(DroughtClassifier, self).__init__()
        self.layers = nn.ModuleList()
        
        # Input layer
        self.layers.append(nn.Linear(input_size, hidden_sizes[0]))
        
        # Hidden layers
        for i in range(len(hidden_sizes) - 1):
            self.layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i+1]))
        
        # Output layer
        self.layers.append(nn.Linear(hidden_sizes[-1], output_size))
        
        # Dropout layer
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        """
        Defines the forward pass of the neural network.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            torch.Tensor: The output tensor after passing through the network.
        """
        # Apply each layer followed by ReLU activation and dropout, except the last layer
        for layer in self.layers[:-1]:
            x = self.dropout(F.relu(layer(x)))
        # Apply the last layer without activation or dropout
        x = self.layers[-1](x)
        return x

#### Class: EarlyStopping

In [8]:
class EarlyStopping:
    """
    Early stopping to stop the training when the loss does not improve after a given patience.

    Args:
        patience (int, optional): How long to wait after last time validation loss improved. Default is 5.
        delta (float, optional): Minimum change in the monitored quantity to qualify as an improvement. Default is 0.

    Attributes:
        patience (int): How long to wait after last time validation loss improved.
        delta (float): Minimum change in the monitored quantity to qualify as an improvement.
        best_loss (float): Best recorded validation loss.
        counter (int): Counter for how many epochs have passed since the last improvement.
        early_stop (bool): Whether early stopping is triggered.
    """
    def __init__(self, patience=5, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_loss = float('inf')
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss):
        """
        Checks if the validation loss has improved and updates the counter and early stop flag accordingly.

        Args:
            val_loss (float): The current validation loss.

        Returns:
            None
        """
        if val_loss < self.best_loss - self.delta:
            # If the validation loss has improved (by more than delta), reset the counter
            self.best_loss = val_loss
            self.counter = 0
        else:
            # If the validation loss has not improved, increment the counter
            self.counter += 1
            if self.counter >= self.patience:
                # If the counter exceeds the patience, set the early stop flag
                self.early_stop = True


#### Function: get_log_dir

In [9]:
# Define a function to get a unique log directory
def get_log_dir(base_dir='runs'):
    """
    Generates a unique log directory path based on the current date and time.

    Args:
        base_dir (str, optional): The base directory where logs will be saved. Default is 'runs'.

    Returns:
        str: A unique directory path for saving logs.
    """
    # Get the current date and time as a formatted string
    current_time = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    # Create a log directory path by combining the base directory and the current time
    log_dir = os.path.join(base_dir, current_time)
    return log_dir

#### Function train_model

In [10]:
# Training function with early stopping
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=25, patience=5, log_dir=None, hparams=None):
    """
    Trains the model with early stopping and logs metrics to TensorBoard.

    Args:
        model (nn.Module): The neural network model to be trained.
        train_loader (DataLoader): DataLoader for the training dataset.
        val_loader (DataLoader): DataLoader for the validation dataset.
        criterion (nn.Module): The loss function.
        optimizer (torch.optim.Optimizer): The optimizer for training.
        scheduler (torch.optim.lr_scheduler): Learning rate scheduler.
        num_epochs (int, optional): The maximum number of epochs for training. Default is 25.
        patience (int, optional): The number of epochs with no improvement after which training will be stopped. Default is 5.
        log_dir (str, optional): The directory to save TensorBoard logs. If None, a new directory will be created.
        hparams (dict, optional): Hyperparameters to log.

    Returns:
        None
    """
    if log_dir is None:
        log_dir = get_log_dir()                         # Create a unique log directory if not provided
    writer = SummaryWriter(log_dir=log_dir)             # Initialize TensorBoard writer
    early_stopping = EarlyStopping(patience=patience)   # Initialize early stopping
    
    # For each epoch
    for epoch in range(num_epochs):
        model.train()                # Set model to training mode
        running_loss = 0.0
        correct_predictions = 0

        # Training loop
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device) # Move inputs and labels to GPU
            
            optimizer.zero_grad() # Zero the parameter gradients
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * inputs.size(0)             # Accumulate loss
            _, preds = torch.max(outputs, 1)                         # Get predictions
            correct_predictions += torch.sum(preds == labels).item() # Count correct predictions
        
        epoch_loss = running_loss / len(train_loader.dataset)             # Calculate average loss for this epoch
        epoch_accuracy = correct_predictions / len(train_loader.dataset)  # Calculate accuracy for this epoch
        
        # Validation loop
        model.eval() # Set model to evaluation mode
        val_running_loss = 0.0
        val_correct_predictions = 0
        with torch.no_grad():                                           # Disable gradient computation for validation
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)   # Move inputs and labels to GPU
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_running_loss += loss.item() * inputs.size(0)        # Accumulate validation loss
                _, preds = torch.max(outputs, 1)                        # Get predictions
                val_correct_predictions += torch.sum(preds == labels).item() # Count correct predictions
        
        val_loss = val_running_loss / len(val_loader.dataset)               # Calculate average validation loss
        val_accuracy = val_correct_predictions / len(val_loader.dataset)    # Calculate validation accuracy
        
        # Log metrics to TensorBoard
        writer.add_scalar('Loss/train', epoch_loss, epoch)                      # Log training loss
        writer.add_scalar('Loss/validation', val_loss, epoch)                   # Log validation loss
        writer.add_scalar('Accuracy/train', epoch_accuracy, epoch)              # Log training accuracy
        writer.add_scalar('Accuracy/validation', val_accuracy, epoch)           # Log validation accuracy
        writer.add_scalar('Learning_Rate', scheduler.get_last_lr()[0], epoch)   # Log learning rate
        
        # Print metrics for the current epoch
        print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {epoch_loss:.4f}, Training Accuracy: {epoch_accuracy:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

        # Update learning rate
        scheduler.step()

        # Check early stopping criteria
        early_stopping(val_loss)
        if early_stopping.early_stop:
            print(f"Early stopping at epoch {epoch+1}")
            break

    # Log hyperparameters and final metrics to TensorBoard
    if hparams is not None:
        writer.add_hparams(hparams, {'hparam/accuracy': val_accuracy, 'hparam/loss': val_loss})
    print('Training complete')
    writer.close()

#### Function evaluate_model

In [11]:
# Evaluation function
def evaluate_model(model, test_loader, criterion):
    """
    Evaluates the model on the test dataset and prints the test loss and accuracy.

    Args:
        model (nn.Module): The trained neural network model to be evaluated.
        test_loader (DataLoader): DataLoader for the test dataset.
        criterion (nn.Module): The loss function.

    Returns:
        None
    """
    model.eval() # Set model to evaluation mode
    running_loss = 0.0
    correct_predictions = 0
    
    with torch.no_grad():                                           # Disable gradient computation
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)   # Move inputs and labels to GPU

            outputs = model(inputs)             # Forward pass
            loss = criterion(outputs, labels)   # Compute loss

            running_loss += loss.item() * inputs.size(0)    # Accumulate loss
            
            _, preds = torch.max(outputs, 1)                            # Get predictions
            correct_predictions += torch.sum(preds == labels).item()    # Count correct predictions
    
    test_loss = running_loss / len(test_loader.dataset)         # Calculate average test loss
    accuracy = correct_predictions / len(test_loader.dataset)   # Calculate test accuracy
    
    # Print test metrics
    print(f'Test Loss: {test_loss:.4f}, Accuracy: {accuracy:.4f}')

#### Class: PyTorchClassifier

In [12]:
# Define a custom PyTorch classifier for hyperparameter search
class PyTorchClassifier(BaseEstimator, ClassifierMixin):
    """
    Custom PyTorch classifier for hyperparameter search with scikit-learn compatibility.

    Args:
        hidden_sizes (tuple): Sizes of hidden layers.
        dropout_prob (float): Dropout probability.
        lr (float): Learning rate.
        num_epochs (int): Number of epochs to train.
        patience (int): Patience for early stopping.
        log_dir (str): Directory to save TensorBoard logs.
    """
    def __init__(self, hidden_sizes=(512, 128, 64, 32), dropout_prob=0.5, lr=0.001, num_epochs=3, patience=5, log_dir=None):
        self.hidden_sizes = hidden_sizes
        self.dropout_prob = dropout_prob
        self.lr = lr
        self.num_epochs = num_epochs
        self.patience = patience
        self.log_dir = log_dir
        self.model = None

    def fit(self, X, y):
        """
        Train the PyTorch model on the given dataset.

        Args:
            X (numpy.ndarray): Training data features.
            y (numpy.ndarray): Training data labels.

        Returns:
            self: Returns an instance of self.
        """
        input_size = X.shape[1]         # Number of input features
        output_size = len(np.unique(y)) # Number of unique classes

        # Initialize the model
        self.model = DroughtClassifier(input_size, self.hidden_sizes, output_size, self.dropout_prob).to(device)

        # Define loss function and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.005)

        # Create DataLoader for training data
        train_tensor = torch.tensor(X, dtype=torch.float32)
        labels_tensor = torch.tensor(y, dtype=torch.long)
        train_dataset = TensorDataset(train_tensor, labels_tensor)
        train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=4, pin_memory=True)
        
        # Define hyperparameters for logging
        hparams = {
            'hidden_sizes': str(self.hidden_sizes),
            'dropout_prob': self.dropout_prob,
            'lr': self.lr
        }

        # Train the model
        train_model(self.model, train_loader, val_loader, criterion, optimizer, scheduler, self.num_epochs, self.patience, log_dir=self.log_dir, hparams=hparams)
        return self

    def predict(self, X):
        """
        Predict the labels for the given dataset.

        Args:
            X (numpy.ndarray): Data features.

        Returns:
            numpy.ndarray: Predicted labels.
        """
        self.model.eval()   # Set model to evaluation mode
        test_tensor = torch.tensor(X, dtype=torch.float32)
        test_loader = DataLoader(test_tensor, batch_size=512, shuffle=False, num_workers=4, pin_memory=True)
        predictions = []

        with torch.no_grad():  # Disable gradient computation
            for inputs in test_loader:
                inputs = inputs.to(device)              # Move inputs to GPU
                outputs = self.model(inputs)            # Forward pass
                _, preds = torch.max(outputs, 1)        # Get predictions
                predictions.extend(preds.cpu().numpy()) # Store predictions

        # Return predictions as a numpy array
        return np.array(predictions)

## Initialize the Base Model, Loss Function, and Optimizer

In [13]:
# Define model parameters
input_size = X_train.shape[1]  # Number of features
hidden_sizes = [512, 128, 64, 32]   # model3.2

output_size = 6                # Number of output classes
dropout_prob = 0.5             # Dropout probability

# Initialize the model
base_model = DroughtClassifier(input_size, hidden_sizes, output_size, dropout_prob).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(base_model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.005)
# writer = SummaryWriter()

In [14]:
# Define hyperparameter grid for RandomizedSearchCV
lr_space = [10**(-4 * np.random.uniform(0.5, 1)) for _ in range(5)] #learning rate values between 0.1 (1e-1) and 0.001 (1e-4)
param_grid = {
    'hidden_sizes': [(512, 128, 64, 32), (256, 64, 32), (512, 256, 128, 64)],
    'dropout_prob': [0.3, 0.4, 0.5],
    'lr': lr_space
}

# Initialize ParameterGrid
grid = ParameterGrid(param_grid)

## ParameterGrid Search

In [15]:
# Ensure y_test is a numpy array
y_test = y_test.to_numpy() if not isinstance(y_test, np.ndarray) else y_test

In [16]:
# Iterate over each hyperparameter combination
best_model = None
best_params = None
best_val_accuracy = 0

for params in grid:
    # Print the current hyperparameter combination being trained
    print(f"Training with parameters: {params}")

    # Generate a unique log directory for each iteration
    log_dir = get_log_dir()

    # Create a PyTorchClassifier model with the current hyperparameters
    model = PyTorchClassifier(hidden_sizes=params['hidden_sizes'],dropout_prob=params['dropout_prob'],lr=params['lr'],num_epochs=25,patience=5,log_dir=log_dir)
    
    # Train the model on the training data
    model.fit(X_train, y_train)

    # Evaluate on validation set
    val_predictions = model.predict(X_test)
    val_accuracy = accuracy_score(y_test, val_predictions)

    # Print the validation accuracy for the current hyperparameters
    print(f"Validation Accuracy: {val_accuracy:.4f}")

    # Update the best model and parameters if the current model performs better
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_model = model
        best_params = params

Training with parameters: {'dropout_prob': 0.3, 'hidden_sizes': (512, 128, 64, 32), 'lr': 0.009880946584662877}
Epoch 1/25, Training Loss: 1.4385, Training Accuracy: 0.3794, Validation Loss: 1.3593, Validation Accuracy: 0.4048
Epoch 2/25, Training Loss: 1.4317, Training Accuracy: 0.3818, Validation Loss: 1.3535, Validation Accuracy: 0.4053
Epoch 3/25, Training Loss: 1.4362, Training Accuracy: 0.3816, Validation Loss: 1.3496, Validation Accuracy: 0.4124
Epoch 4/25, Training Loss: 1.4374, Training Accuracy: 0.3801, Validation Loss: 1.3833, Validation Accuracy: 0.3773
Epoch 5/25, Training Loss: 1.4405, Training Accuracy: 0.3776, Validation Loss: 1.3436, Validation Accuracy: 0.4067
Epoch 6/25, Training Loss: 1.4419, Training Accuracy: 0.3764, Validation Loss: 1.3515, Validation Accuracy: 0.3570
Epoch 7/25, Training Loss: 1.4499, Training Accuracy: 0.3741, Validation Loss: 1.3754, Validation Accuracy: 0.4056
Epoch 8/25, Training Loss: 1.4572, Training Accuracy: 0.3720, Validation Loss: 1.34

In [None]:
print(f"Best Hyperparameters: {best_params}")
print(f"Best Validation Accuracy: {best_val_accuracy:.4f}")

# Evaluate the best model using the test set
evaluate_model(best_model.model, test_loader, criterion)

## Random Search

In [None]:
# Initialize RandomizedSearchCV with the custom PyTorch classifier
torch_classifier = PyTorchClassifier(log_dir=get_log_dir())
random_search = RandomizedSearchCV(torch_classifier, param_distributions=param_grid, n_iter=10, cv=3, random_state=42, verbose=3)

In [None]:
# Fit RandomizedSearchCV to find the best hyperparameters
random_search.fit(X_train, y_train)

In [None]:
# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Evaluate the model using the best hyperparameters
best_model = random_search.best_estimator_
evaluate_model(best_model, test_loader, criterion)
