# Drought Prediction

## Load Libraries

In [24]:
# Drought Prediction
## Load Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import datetime
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, ParameterGrid
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA, KernelPCA
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree, metrics
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import (confusion_matrix, ConfusionMatrixDisplay, classification_report,
                             accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, roc_curve, auc, cohen_kappa_score)
from sklearn.naive_bayes import GaussianNB
import pickle
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NeighbourhoodCleaningRule, NearMiss
# Import PyTorch libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import random_split
from torchviz import make_dot
import random
from torch.utils.tensorboard import SummaryWriter

from scipy.stats import uniform
from sklearn.base import BaseEstimator, ClassifierMixin

In [2]:
# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


## Data Wrangling

In [3]:
# drought_df =  pd.read_csv('data/all_timeseries.csv')

with open('data/Xy_trainTest.pkl', 'rb') as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

In [4]:
# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long)

In [5]:
# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

In [6]:
# Define the validation split ratio
val_split_ratio = 0.2
val_size = int(len(train_dataset) * val_split_ratio)
train_size = len(train_dataset) - val_size

# Split the dataset
train_dataset, val_dataset = random_split(train_dataset, [train_size, val_size])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=512, shuffle=False, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False, num_workers=4, pin_memory=True)

#### Define Neural Network Models

In [7]:
class DroughtClassifier(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size, dropout_prob=0.5):
        super(DroughtClassifier, self).__init__()
        self.layers = nn.ModuleList()
        
        # Input layer
        self.layers.append(nn.Linear(input_size, hidden_sizes[0]))
        
        # Hidden layers
        for i in range(len(hidden_sizes) - 1):
            self.layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i+1]))
        
        # Output layer
        self.layers.append(nn.Linear(hidden_sizes[-1], output_size))
        
        # Dropout layer
        self.dropout = nn.Dropout(dropout_prob)
        
    def forward(self, x):
        for layer in self.layers[:-1]:
            x = self.dropout(F.relu(layer(x)))
        x = self.layers[-1](x)
        return x

#### Train the Model

In [8]:
class EarlyStopping:
    def __init__(self, patience=5, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_loss = float('inf')
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss):
        if val_loss < self.best_loss - self.delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

In [9]:
# Define a function to get a unique log directory
def get_log_dir(base_dir='runs'):
    current_time = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    log_dir = os.path.join(base_dir, current_time)
    return log_dir

In [14]:
# Training function with early stopping
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=25, patience=5, log_dir=None):
    if log_dir is None:
        log_dir = get_log_dir()
    writer = SummaryWriter(log_dir=log_dir)
    early_stopping = EarlyStopping(patience=patience)
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct_predictions = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)  # Move to GPU
            
            optimizer.zero_grad()  # Zero the parameter gradients
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * inputs.size(0)
            _, preds = torch.max(outputs, 1)
            correct_predictions += torch.sum(preds == labels).item()
        
        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_accuracy = correct_predictions / len(train_loader.dataset)
        
        # Validate the model
        model.eval()
        val_running_loss = 0.0
        val_correct_predictions = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)  # Move to GPU
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_running_loss += loss.item() * inputs.size(0)
                _, preds = torch.max(outputs, 1)
                val_correct_predictions += torch.sum(preds == labels).item()
        
        val_loss = val_running_loss / len(val_loader.dataset)
        val_accuracy = val_correct_predictions / len(val_loader.dataset)
        
        # Log metrics to TensorBoard
        writer.add_scalar('Loss/train', epoch_loss, epoch)
        writer.add_scalar('Loss/validation', val_loss, epoch)
        writer.add_scalar('Accuracy/train', epoch_accuracy, epoch)
        writer.add_scalar('Accuracy/validation', val_accuracy, epoch)
        writer.add_scalar('Learning_Rate', scheduler.get_last_lr()[0], epoch)
        
        print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {epoch_loss:.4f}, Training Accuracy: {epoch_accuracy:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

        # Update learning rate
        scheduler.step()

        # Check early stopping criteria
        early_stopping(val_loss)
        if early_stopping.early_stop:
            print(f"Early stopping at epoch {epoch+1}")
            break

    print('Training complete')
    writer.close()

In [15]:
# Evaluation function
def evaluate_model(model, test_loader, criterion):
    model.eval()
    running_loss = 0.0
    correct_predictions = 0
    
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)  # Move to GPU
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            running_loss += loss.item() * inputs.size(0)
            
            _, preds = torch.max(outputs, 1)
            correct_predictions += torch.sum(preds == labels).item()
    
    test_loss = running_loss / len(test_loader.dataset)
    accuracy = correct_predictions / len(test_loader.dataset)
    print(f'Test Loss: {test_loss:.4f}, Accuracy: {accuracy:.4f}')

In [27]:
# Define a custom PyTorch classifier for hyperparameter search
class PyTorchClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, hidden_sizes=(512, 128, 64, 32), dropout_prob=0.5, lr=0.001, num_epochs=3, patience=5, log_dir=None):
        self.hidden_sizes = hidden_sizes
        self.dropout_prob = dropout_prob
        self.lr = lr
        self.num_epochs = num_epochs
        self.patience = patience
        self.log_dir = log_dir
        self.model = None

    def fit(self, X, y):
        input_size = X.shape[1]
        output_size = len(np.unique(y))
        self.model = DroughtClassifier(input_size, self.hidden_sizes, output_size, self.dropout_prob).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.005)

        train_tensor = torch.tensor(X, dtype=torch.float32)
        labels_tensor = torch.tensor(y, dtype=torch.long)
        train_dataset = TensorDataset(train_tensor, labels_tensor)
        train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, num_workers=4, pin_memory=True)

        train_model(self.model, train_loader, val_loader, criterion, optimizer, scheduler, self.num_epochs, self.patience, log_dir=self.log_dir)
        return self

    def predict(self, X):
        self.model.eval()
        test_tensor = torch.tensor(X, dtype=torch.float32)
        test_loader = DataLoader(test_tensor, batch_size=512, shuffle=False, num_workers=4, pin_memory=True)
        predictions = []

        with torch.no_grad():
            for inputs in test_loader:
                inputs = inputs.to(device)
                outputs = self.model(inputs)
                _, preds = torch.max(outputs, 1)
                predictions.extend(preds.cpu().numpy())

        return np.array(predictions)

#### Initialize the Base Model, Loss Function, and Optimizer

In [16]:
# Define model parameters
input_size = X_train.shape[1]  # Number of features
hidden_sizes = [512, 128, 64, 32]   # model3.2

output_size = 6                # Number of output classes
dropout_prob = 0.5             # Dropout probability

# Initialize the model
base_model = DroughtClassifier(input_size, hidden_sizes, output_size, dropout_prob).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(base_model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.005)
writer = SummaryWriter()

In [29]:
# Define hyperparameter grid for RandomizedSearchCV
lr_space = [10**(-4 * np.random.uniform(0.5, 1)) for _ in range(5)] #learning rate values between 0.1 (1e-1) and 0.001 (1e-4)
param_grid = {
    'hidden_sizes': [(512, 128, 64, 32), (256, 64, 32), (512, 256, 128, 64)],
    'dropout_prob': [0.3, 0.4, 0.5],
    'lr': lr_space
}

# Initialize ParameterGrid
grid = ParameterGrid(param_grid)

#### ParameterGrid Search

In [32]:
# Iterate over each hyperparameter combination
best_model = None
best_params = None
best_val_accuracy = 0

for params in grid:
    print(f"Training with parameters: {params}")
    log_dir = get_log_dir()

    model = PyTorchClassifier(hidden_sizes=params['hidden_sizes'],dropout_prob=params['dropout_prob'],lr=params['lr'],num_epochs=2,patience=5,log_dir=log_dir)
    
    model.fit(X_train, y_train)

    # Evaluate on validation set
    val_predictions = model.predict(X_test)
    val_accuracy = accuracy_score(y_test, val_predictions)

    print(f"Validation Accuracy: {val_accuracy:.4f}")

    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_model = model
        best_params = params

Training with parameters: {'dropout_prob': 0.3, 'hidden_sizes': (512, 128, 64, 32), 'lr': 0.0037391969594745378}


In [None]:
print(f"Best Hyperparameters: {best_params}")
print(f"Best Validation Accuracy: {best_val_accuracy:.4f}")

# Evaluate the best model using the test set
evaluate_model(best_model.model, test_loader, criterion)

#### Random Search

In [26]:
# Initialize RandomizedSearchCV with the custom PyTorch classifier
torch_classifier = PyTorchClassifier(log_dir=get_log_dir())
random_search = RandomizedSearchCV(torch_classifier, param_distributions=param_grid, n_iter=10, cv=3, random_state=42, verbose=3)

In [23]:
# Fit RandomizedSearchCV to find the best hyperparameters
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Epoch 1/3, Training Loss: 1.4444, Training Accuracy: 0.3817, Validation Loss: 1.3074, Validation Accuracy: 0.4427
Epoch 2/3, Training Loss: 1.3959, Training Accuracy: 0.4021, Validation Loss: 1.2850, Validation Accuracy: 0.4502
Epoch 3/3, Training Loss: 1.3844, Training Accuracy: 0.4075, Validation Loss: 1.2767, Validation Accuracy: 0.4542
Training complete
[CV 1/3] END dropout_prob=0.5, hidden_sizes=(256, 64, 32), lr=0.004292205732785885;, score=0.453 total time= 4.4min
Epoch 1/3, Training Loss: 1.4512, Training Accuracy: 0.3776, Validation Loss: 1.3187, Validation Accuracy: 0.4329


KeyboardInterrupt: 

In [None]:
# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Evaluate the model using the best hyperparameters
best_model = random_search.best_estimator_
evaluate_model(best_model, test_loader, criterion)
