In [None]:
import torch
import numpy as np
from scipy.sparse import load_npz
import os
from torch.utils.data import Dataset, DataLoader, random_split
import yaml 
from torch import nn
from pprint import pprint
from datetime import datetime
import pickle
import time
from typing import Tuple
from tqdm import tqdm

: 

In [None]:
config = {
    "model": {
        "architecture": "GraphConvModel",
        "input_dim": 4,
        "hidden_dims": [128, 128, 128, 128, 128],
        "output_dim": 32,
        "num_classes": 28,
        "num_layers": 3,
        "dropout": 0.5,
        "activation": "LeakyReLU",
        "normalization": "batch"
    },
    "training": {
        "epochs": 10000,
        "batch_size": 256,
        "num_workers": 4,
        "learning_rate": 0.004,
        "weight_decay": 0.001,
        "optimizer": {
            "type": "Adam",
            "params": {
                "betas": [0.9, 0.999],
                "momentum": 0.9
            }
        },
        "scheduler": {
            "type": "StepLR",
            "params": {
                "step_size": 10,
                "gamma": 0.5
            }
        },
        "early_stopping": {
            "patience": 30
        }
    },
    "dataset": {
        "adj_dir": "data/processed_data/adj_matrix",
        "features_dir": "data/processed_data/adj_matrix",
        "split_ratio": {
            "train": 0.85,
            "val": 0.15,
            "test": 0.0
        },
        "transform": {
            "normalize": True,
            "augment": {
                "rotate": 15,
                "flip": True
            }
        }
    },
    "logging": {
        "log_dir": "logs/",
        "checkpoint_path": "Models/",
        "save_interval": 5,
        "tensorboard": True,
        "verbose": True
    },
    "evaluation": {
        "metrics": ["accuracy", "f1_score", "precision", "recall"],
        "confusion_matrix": True,
        "class_labels": [
            "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", 
            "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"
        ]
    },
    "seed": 42,
    "debug_mode": False,
    "distributed_training": {
        "enabled": False,
        "num_nodes": 1,
        "gpus_per_node": 1
    },
    "custom_params": {
        "attention_heads": 4,
        "kernel_size": 3,
        "pooling_type": "max"
    },
    "resume_training": {
        "checkpoint_file": "",
        "strict": True
    }
}

: 

In [3]:
def load_numpy_obj(file_path: str, extension: str, dtype = torch.float32) -> torch.Tensor:
    """Load a numpy object into a torch tensor
    Args:
        file_path: str, path to the file
        extension: str, file extension to load. Eithere  of .npy or .npz
        dtype: torch.dtype, data type to load the data
    Returns:
        matrix: torch.Tensor, tensor with the data loaded"""
    if extension == ".npy":
        matrix = torch.tensor(np.load(file_path), dtype = dtype)
    elif extension == ".npz":
        matrix = torch.tensor(load_npz(file_path).toarray(), dtype = dtype)
    return matrix



# Collect all paths and labels withouth loading the data
def load_data(features_dir, adj_matrix_dir):
    label_mapping = {
            "A": 0,
            "B": 1,
            "C": 2,
            "D": 3,
            "E": 4,
            "F": 5,
            "G": 6,
            "H": 7,
            "I": 8,
            "J": 9, 
            "K": 10,
            "L": 11,
            "M": 12,
            "N": 13,
            "O": 14,
            "P": 15,
            "Q": 16,
            "R": 17,
            "S": 18,
            "T": 19,
            "U": 20,
            "V": 21,
            "W": 22,
            "X": 23,
            "Y": 24,
            "Z": 25,
            "del": 26,
            "nothing": 27,
            "space": 28
        }
    adj_matrices, features, labels = [], [], []
    loop = tqdm(sorted(os.listdir(features_dir)))
    for label in loop:
            adj_label_dir = os.path.join(adj_matrix_dir, label)
            features_label_dir = os.path.join(features_dir, label)
            if os.path.isdir(adj_label_dir):
                for adj_matrix_file, feature_file in zip(sorted(os.listdir(adj_label_dir)), sorted(os.listdir(features_label_dir))):
                    adj_path = os.path.join(adj_label_dir, adj_matrix_file)
                    feature_path = os.path.join(features_label_dir, feature_file)
                    adj_matrices.append(load_numpy_obj(adj_path, extension = ".npz"))
                    features.append(load_numpy_obj(feature_path, extension = ".npy"))
                    labels.append(label_mapping[label])
    return adj_matrices, features, labels

In [4]:

class GraphDataset(Dataset):
    def __init__(self, adj_matrices, features, labels, transform = None):
        """ Initialize the dataset
        Args:
            adj_matrix_dir: str, path to the directory containing the adjacency matrices
            features_dir: str, path to the directory containing the features
            transform: callable, transformation to apply to the data
        Returns:
            (adj_matrix, feature_matrix, label)"""
        
        super().__init__()
        self.adj_matrices = adj_matrices
        self.features = features
        self.labels = labels
        self.transform = transform



    def __len__(self):
        return len(self.adj_matrices)
    
    
    def __getitem__(self, idx):
        
        # Load the adjacency matrix and feature matrix as float tensors
        adj_matrix, feature_matrix, label = self.adj_matrices[idx], self.features[idx], self.labels[idx]
        
        # Apply transformations if any
        if self.transform:
            adj_matrix, feature_matrix = self.transform(adj_matrix, feature_matrix)
    
        return adj_matrix, feature_matrix, label
    

def load_dataloader(adj_matrices: list, features: list, labels: list, config: dict):
    """ Load the data as dataloaders and split it into train, val and test sets
    Args:
        adj_matrix_dir: str, path to the directory containing the adjacency matrices
        features_dir: str, path to the directory containing the features
        config: dict, configuration dictionary
    Returns:
        train_data: DataLoader, data loader for the training data
        val_data: DataLoader, data loader for the validation data
        test_data: DataLoader, data loader for the test data
    Dataset format: (adj_matrix, feature_matrix, label)"""
    
    dataset = GraphDataset(adj_matrices, features, labels)
    train_split, val_split = config.get("dataset").get("split_ratio").get("train"), \
                             config.get("dataset").get("split_ratio").get("val")
    
    train_data, val_data, test_data = random_split(dataset = dataset,  lengths = [int(train_split*len(dataset)), int(val_split*len(dataset)), 
                                                                       len(dataset) - int(train_split*len(dataset)) - int(val_split*len(dataset))])
    
    info = f"""
{"-*"*10} Data Information {"-*"*10} \n
Total number of samples: {len(dataset)}
Number of training samples: {len(train_data)}
Number of validation samples: {len(val_data)}
Number of test samples: {len(test_data)}
{"-"*50}
"""
    print(info)

    batch_Size = config.get("training").get("batch_size")
    num_workers = config.get("training").get("num_workers", 0)

    train_loader = DataLoader(train_data, batch_size=batch_Size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size = batch_Size, shuffle = False)
    test_loader = DataLoader(test_data, batch_size = batch_Size, shuffle = False)
    return train_loader, val_loader, test_loader

In [None]:
class GCNLayer(nn.Module):
    def __init__(self, input_dim: int, output_dim: int):
        """Initialize the GCN layer.

        Args:
            input_dim (int): Input dimension of the layer.
            output_dim (int): Output dimension of the layer.
        """
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.activation = nn.LeakyReLU(negative_slope=0.2, inplace=True)

        # Initialize weight matrix W with Xavier initialization
        self.W = nn.Parameter(torch.empty(input_dim, output_dim))
        torch.nn.init.xavier_uniform_(self.W)

        # Residual connection transformation (if input and output dimensions differ)
        self.residual_transform = nn.Linear(input_dim, output_dim) if input_dim != output_dim else None

        # Apply Xavier initialization to residual transform weights
        if self.residual_transform is not None:
            torch.nn.init.xavier_uniform_(self.residual_transform.weight)
            if self.residual_transform.bias is not None:
                torch.nn.init.zeros_(self.residual_transform.bias)

    def calculate_degree_matrix(self, A: torch.Tensor):
        """Calculate the normalized adjacency matrix and D^{-1/2}.

        Args:
            A (torch.Tensor): Adjacency matrix.

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: Normalized adjacency matrix and D^{-1/2}.
        """
        A_hat = A + torch.eye(A.size(1), device=A.device)  # Add self-connections
        degrees = A_hat.sum(dim=1)
        D_neg_sqrt = torch.diag_embed(degrees.pow(-0.5))
        return A_hat, D_neg_sqrt

    def forward(self, X: torch.Tensor, A: torch.Tensor):
        """Forward pass of the GCN layer.

        Args:
            X (torch.Tensor): Input feature matrix of shape (N, F).
            A (torch.Tensor): Adjacency matrix of shape (N, N).

        Returns:
            torch.Tensor: Output feature matrix of shape (N, output_dim).
        """
        A_hat, D_neg_sqrt = self.calculate_degree_matrix(A)

        # Graph convolution operation
        support = torch.matmul(D_neg_sqrt, torch.matmul(A_hat, D_neg_sqrt))
        output = torch.matmul(support, torch.matmul(X, self.W))

        # Residual connection
        if self.residual_transform is not None:
            X_transformed = self.residual_transform(X)
        else:
            X_transformed = X

        # Add the residual connection to the output
        output += X_transformed

        # Activation function
        return self.activation(output)


class GCN(nn.Module):
    def __init__(self, input_dim: int, hidden_dims: list, num_landmarks: int, num_classes: int, dropout: float = 0.5):
        """Initialize the GCN model.

        Args:
            input_dim (int): Input dimension of the model.
            hidden_dims (list): List of hidden dimensions.
            num_landmarks (int): Number of landmarks per graph.
            num_classes (int): Number of classes in the dataset.
            dropout (float, optional): Dropout rate. Defaults to 0.5.
        """
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dims = hidden_dims
        self.num_classes = num_classes

        # Extra layers
        self.dropout = nn.Dropout(dropout)
        self.batch_norm = nn.BatchNorm1d(hidden_dims[1])

        # Initialize the GCN layers
        self.layers = nn.ModuleList()
        current_input_dim = input_dim
        for idx, hidden_dim in enumerate(hidden_dims):
            self.layers.append(GCNLayer(current_input_dim, hidden_dim))
            self.layers.append(self.dropout)
            current_input_dim = hidden_dim

        # Insert batch normalization after the second layer
        self.layers.insert(3, self.batch_norm)

        # Output layer
        self.output_layer = nn.Linear(current_input_dim * num_landmarks, num_classes)

        # Apply Xavier initialization to the output layer
        torch.nn.init.xavier_uniform_(self.output_layer.weight)
        if self.output_layer.bias is not None:
            torch.nn.init.zeros_(self.output_layer.bias)

    def forward(self, X: torch.Tensor, A: torch.Tensor) -> torch.Tensor:
        """Forward pass of the GCN model.

        Args:
            X (torch.Tensor): Input feature matrix of shape (batch_size, N, F).
            A (torch.Tensor): Adjacency matrix of shape (N, N).

        Returns:
            torch.Tensor: Output feature matrix of shape (batch_size, num_classes).
        """
        for layer in self.layers:
            # If layer is dropout or batch normalization, don't pass adjacency matrix
            if isinstance(layer, nn.Dropout):
                X = layer(X)
            elif isinstance(layer, nn.BatchNorm1d):
                original_shape = X.shape
                X = layer(X.view(-1, X.size(-1)))  # BatchNorm1d expects (batch_size * N, num_features)
                X = X.view(original_shape)         # Reshape back to original shape
            else:
                X = layer(X, A)

        # Flatten the output and pass it through the output layer
        X = X.view(X.size(0), -1)  # Flatten features
        output = self.output_layer(X)

        return output


In [6]:
def calculate_metrics_multiclass(y_pred, y_true, num_classes):
    """
    Calculate accuracy, precision, recall, and F1 score for multi-class classification.

    Args:
        y_pred (torch.Tensor): Raw logits or probabilities from the model.
        y_true (torch.Tensor): Ground truth class labels (integers 0 to num_classes - 1).
        num_classes (int): Number of classes.

    Returns:
        dict: Metrics - accuracy, precision, recall, F1 score.
    """
    # Convert logits to predicted classes
    y_pred_classes = torch.argmax(y_pred, dim=1)
    
    # Initialize metrics
    precision_per_class = []
    recall_per_class = []
    f1_per_class = []
    
    # Calculate accuracy
    accuracy = (y_pred_classes == y_true).float().mean().item()

    for class_idx in range(num_classes):
        # True Positives, False Positives, False Negatives
        TP = ((y_pred_classes == class_idx) & (y_true == class_idx)).sum().item()
        FP = ((y_pred_classes == class_idx) & (y_true != class_idx)).sum().item()
        FN = ((y_pred_classes != class_idx) & (y_true == class_idx)).sum().item()

        # Precision, Recall, F1 Score for the class
        precision = TP / (TP + FP + 1e-7) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN + 1e-7) if (TP + FN) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall + 1e-7) if (precision + recall) > 0 else 0

        precision_per_class.append(precision)
        recall_per_class.append(recall)
        f1_per_class.append(f1)
    
    # Macro-Averaged Metrics
    macro_precision = sum(precision_per_class) / num_classes
    macro_recall = sum(recall_per_class) / num_classes
    macro_f1 = sum(f1_per_class) / num_classes

    return {
        "accuracy": accuracy,
        "precision": macro_precision,
        "recall": macro_recall,
        "f1_score": macro_f1
    }


def train_model(model: nn.Module, optimiser: torch.optim.Optimizer,
          criterion: torch.nn.modules.loss._Loss, 
          train_loader: torch.utils.data.DataLoader, val_loader: torch.utils.data.DataLoader, 
          device = torch.device("cuda" if torch.cuda.is_available() else "cpu"), 
          config: dict = None, **kwargs) -> Tuple[nn.Module, dict]:
    """ Train a model based on the provided configuration in the config file
    Args:
        model (nn.Module): PyTorch model to train
        optimiser (torch.optim.Optimizer): Optimiser to use for training
        criterion (torch.nn.modules.loss._Loss): Loss function to use for training
        train_loader (torch.utils.data.DataLoader): DataLoader for training data
        val_loader (torch.utils.data.DataLoader): DataLoader for validation data
        config (dict): Configuration dictionary (default: None)"""
    
    # Set up the hyperparameters
    if config:
        NUM_EPOCHS = config.get("training").get("epochs", 100)
        EARLY_STOPPING_PATIENCE = config.get("training").get("early_stopping").get("patience", 15)
        MODEL_SAVE_DIR = config.get("logging").get("checkpoint_path", "Models") 
        LOG_DIR = config.get("logging").get("log_dir", "logs")
    else:
        NUM_EPOCHS = kwargs.get("epochs", 100)  
        EARLY_STOPPING_PATIENCE = kwargs.get("early_stopping_patience", 15)
        MODEL_SAVE_DIR = kwargs.get("model_save_dir", "Models")
        LOG_DIR = kwargs.get("log_dir", "logs")

    WEIGHT_DECAY = optimiser.param_groups[0].get("weight_decay", 0)
    LEARNING_RATE = optimiser.param_groups[0].get("lr", 0.01)
    N_PARAMS = sum(p.numel() for p in model.parameters() if p.requires_grad)

    
    if not os.path.exists(MODEL_SAVE_DIR): os.makedirs(MODEL_SAVE_DIR)
    if not os.path.exists(LOG_DIR): os.makedirs(LOG_DIR)

    start_time = time.strftime("%Y-%m-%d, %H:%M:%S")
    session_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_save_path = os.path.join(MODEL_SAVE_DIR, f"{session_id}.pth")
    logs_save_path = os.path.join(LOG_DIR, f"{session_id}.pkl")


    training_info = {
        "📅 Training Start Time": start_time,
        "📈 Total Number of Epochs": NUM_EPOCHS,
        "💻 Device Used for Training": device,
        "🆔 Session ID": session_id,

        "🔢 Number of Trainable Parameters": N_PARAMS,
        "🚦 Early Stopping Patience": EARLY_STOPPING_PATIENCE,
        "📉 Weight Decay": WEIGHT_DECAY,
        "📈 Initial Learning Rate": LEARNING_RATE,

        "📂 Model Save Path": model_save_path,
        "📄 Logs Save Path": logs_save_path
    }

    print("\n\n════════════════════════════════════════════")
    print("TRAINING SESSION START")
    print("════════════════════════════════════════════")
    pprint(training_info)
    print("════════════════════════════════════════════")


    train_losses, val_losses = [], [] # Lists to store the training and validation losses
    best_val_loss = float("inf") # Variable to store the best validation loss
    patience = 0 # Variable to store the patience
    
    model.to(device) # Move the model to the device
    loop = tqdm(range(NUM_EPOCHS), desc="Training", position=0, leave=True)
    for epoch in loop:
        model.train()
        epoch_loss = 0.0
        model.to(device)

         # Wrap train_loader with tqdm for a single-line progress bar
        for adj_matrix, features, labels in train_loader:
            # Move tensors to the specified device
            features, adj_matrix, labels = features.to(device), adj_matrix.to(device), labels.to(device)
    
            # Forward pass
            optimiser.zero_grad()  # Zero out the gradients
            output = model(features, adj_matrix)  # Forward pass
            loss = criterion(output, labels)  # Calculate the loss
            loss.backward()  # Backward pass
            optimiser.step()  # Update the weights
    
            epoch_loss += loss.item()


        avg_epoch_loss = epoch_loss / len(train_loader) # Calculate the average loss for the epoch
        train_losses.append(avg_epoch_loss)
        

        # Validation loop
        model.eval()
        val_epoch_loss = 0.0

        with torch.no_grad():
            for adj_matrix, features, labels in val_loader:
                features, adj_matrix, labels = features.to(device), adj_matrix.to(device), labels.to(device)
                output = model(features, adj_matrix)
                loss = criterion(output, labels)
                val_epoch_loss += loss.item()

        avg_val_epoch_loss = val_epoch_loss / len(val_loader)
        val_losses.append(avg_val_epoch_loss)

        loop.set_description(f"Epoch: {epoch+1}/{NUM_EPOCHS}, Train Loss: {avg_epoch_loss:.4f}, Val Loss: {avg_val_epoch_loss:.4f}, Patience: {patience}")

        if avg_val_epoch_loss < best_val_loss:
            best_val_loss = avg_val_epoch_loss
            best_epoch = epoch
            torch.save(model.state_dict(), model_save_path)
            patience = 0
        else:
            patience += 1
            if patience > EARLY_STOPPING_PATIENCE:
                print(f"Early stopping at epoch {epoch}")
                break

    end_time = time.strftime("%Y-%m-%d, %H:%M:%S")
    training_info = {
        "Training Session Summary": {
            "📅 Training Start Time": start_time,
            "⏰ Estimated Training End": end_time,
            "🆔 Session ID": session_id, 
        },
        "File Paths": {
            "📂 Model Saved at": model_save_path,
            "📄 Logs Saved at": logs_save_path
        },
        "Best Performance Metrics": {
            "🔍 Best Validation Loss": round(best_val_loss, 4),
            "🔍 Best Training Loss": round(min(train_losses), 4),
            "🏆 Best Epoch": best_epoch
        }
    }
        
    print("\n\n════════════════════════════════════════════")
    print("TRAINING SESSION END")
    print("════════════════════════════════════════════")
    print(training_info)
    print("════════════════════════════════════════════")


    results = {
        "description": training_info,
        "train_losses": train_losses,
        "val_losses": val_losses,
        "best_epoch": best_epoch,
        "best_val_loss": best_val_loss, 
        "model_path": model_save_path, 
    }

    # Save the results dictionary in logs 
    with open(logs_save_path, "wb") as file:
        pickle.dump(results, file)

    
    return model, results

In [None]:
# Setup the data loaders
BASE_ADJ_MATRIX_PATH = config.get("dataset").get("adj_dir", "/kaggle/input/graph-data/processed_data/adj_matrix")
BASE_FEATURES_PATH = config.get("dataset").get("features_dir", "/kaggle/input/graph-data/processed_data/extended_features")

adj_matrices, features, labels = load_data(features_dir = BASE_FEATURES_PATH, 
                                          adj_matrix_dir = BASE_ADJ_MATRIX_PATH)


train_loader, val_loader, test_loader = load_dataloader(adj_matrices, features, labels, config)
A, X, labels = next(iter(train_loader))

FileNotFoundError: [Errno 2] No such file or directory: 'data/processed_data/adj_matrix'

In [11]:
# Setup the model
INPUT_DIM = config.get("model").get("input_dim", 4)
HIDDEN_DIMS = config.get("model").get("hidden_dims", [64, 128, 32])
NUM_CLASSES = config.get("model").get("num_classes", 28)
NUM_LANDMARKS = A.shape[1]

LEARNING_RATE = config.get("training").get("learning_rate", 4e-2)
WEIGHT_DECAY = config.get("training").get("weight_decay", 0.001)

model = GCN(input_dim=INPUT_DIM, hidden_dims=HIDDEN_DIMS, num_classes=NUM_CLASSES, num_landmarks = 21)

# Setup the optimizer
optimizer_name = config.get("training").get("optimizer", "optimiser").get("type", "Adam")
if optimizer_name not in ["Adam", "SGD"]:
    raise ValueError(f"Optimizer {optimizer_name} not supported. Supported optimizers are Adam and SGD")

if optimizer_name == "Adam":
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
elif optimizer_name == "SGD":
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=WEIGHT_DECAY)

# Setup the loss function
loss_fn = nn.NLL

In [13]:
# Train the model
print(f"Hidden Dimensions: {HIDDEN_DIMS}")
model, results = train_model(model=model, 
                           optimiser=optimizer, 
                           criterion=loss_fn, 
                           train_loader=train_loader, 
                           val_loader=val_loader, 
                           config=config, 
                            device = "cpu")

Hidden Dimensions: [128, 128, 128, 128, 128]


════════════════════════════════════════════
TRAINING SESSION START
════════════════════════════════════════════
{'🆔 Session ID': '20241130_180434',
 '💻 Device Used for Training': 'cpu',
 '📂 Model Save Path': 'Models/20241130_180434.pth',
 '📄 Logs Save Path': 'logs/20241130_180434.pkl',
 '📅 Training Start Time': '2024-11-30, 18:04:34',
 '📈 Initial Learning Rate': 0.004,
 '📈 Total Number of Epochs': 10000,
 '📉 Weight Decay': 0.001,
 '🔢 Number of Trainable Parameters': 142236,
 '🚦 Early Stopping Patience': 30}
════════════════════════════════════════════


Training:   0%|          | 0/10000 [00:00<?, ?it/s]


IndexError: Target 28 is out of bounds.

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
for adj_matrix, features, labels in train_loader:
            # Move tensors to the specified device
            features, adj_matrix, labels = features.to(device), adj_matrix.to(device), labels.to(device)
    
            # Forward pass
            # optimiser.zero_grad()  # Zero out the gradients
            output = model(features, adj_matrix)  # Forward pass
            break

In [None]:
adj_matrix.shape