### Step 1. prepare Dataset + DataLoader

In [9]:
import torch
from torch.utils.data import Dataset

class Twitter15Dataset(Dataset):
    def __init__(self, graph_data_list):
        """
        Custom dataset for Twitter15 graph data.

        Args:
            graph_data_list (list): List of graph samples. 
                                    Each graph is a dictionary with keys: 
                                    - 'x': node feature matrix (Tensor of shape [seq_len, feature_dim])
                                    - 'edge_index': edge list (not used here)
                                    - 'y': label (int in [0, 3])
        """
        self.graphs = graph_data_list

    def __len__(self):
        # Return the number of samples in the dataset
        return len(self.graphs)

    def __getitem__(self, idx):
        # Return the features and label for a given index
        graph = self.graphs[idx]
        x = graph['x']  # Feature tensor: (sequence_length, feature_dim)
        y = graph['y']  # Label: integer in range [0, 3]
        return x, y


def collate_fn(batch):
    """
    Collate function to handle batches of variable-length sequences.
    Pads each sequence in the batch to the length of the longest sequence.

    Args:
        batch (list of tuples): Each tuple is (x, y), where
                                - x: Tensor of shape (seq_len, feature_dim)
                                - y: label (int)

    Returns:
        padded_xs (Tensor): Padded input features (batch_size, max_seq_len, feature_dim)
        masks (Tensor): Boolean masks indicating valid positions (batch_size, max_seq_len)
        ys (Tensor): Tensor of labels (batch_size,)
    """
    xs, ys = zip(*batch)

    # Determine the maximum sequence length in this batch
    max_len = max(x.shape[0] for x in xs)
    feature_dim = xs[0].shape[1]

    padded_xs = []
    masks = []

    for x in xs:
        seq_len = x.shape[0]
        pad_len = max_len - seq_len

        if pad_len > 0:
            # Pad with zeros at the end of the sequence
            pad = torch.zeros((pad_len, feature_dim), dtype=x.dtype)
            x_padded = torch.cat([x, pad], dim=0)
        else:
            x_padded = x

        # Create a mask where True indicates valid (non-padded) positions
        mask = torch.cat([torch.ones(seq_len), torch.zeros(pad_len)]).bool()

        padded_xs.append(x_padded)
        masks.append(mask)

    # Stack all sequences and masks into tensors
    padded_xs = torch.stack(padded_xs)    # Shape: (batch_size, max_len, feature_dim)
    masks = torch.stack(masks)            # Shape: (batch_size, max_len)
    ys = torch.tensor(ys)                 # Shape: (batch_size,)

    return padded_xs, masks, ys


In [10]:
from sklearn.model_selection import train_test_split
import torch

# Load your cleaned graph data
graph_data_list = torch.load("../processed/twitter15_graph_data_clean.pt", weights_only=False)

# Split into Train / Validation / Test sets in a 7:1.5:1.5 ratio
train_graphs, temp_graphs = train_test_split(graph_data_list, test_size=0.3, random_state=42)
val_graphs, test_graphs = train_test_split(temp_graphs, test_size=0.5, random_state=42)

# Output the number of samples in each split
print(f"Train: {len(train_graphs)}, Val: {len(val_graphs)}, Test: {len(test_graphs)}")


Train: 1043, Val: 223, Test: 224


In [11]:
from torch.utils.data import DataLoader

batch_size = 16

# Build Dataset objects for training, validation, and test sets
train_dataset = Twitter15Dataset(train_graphs)
val_dataset = Twitter15Dataset(val_graphs)
test_dataset = Twitter15Dataset(test_graphs)

# Build DataLoaders for each dataset
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

print("DataLoaders created successfully!")


DataLoaders created successfully!


### Step 2: BiLSTM-CNN

In [12]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class BiLSTM_CNN(nn.Module):
    def __init__(self, hidden_dim, num_classes, kernel_size=3):
        super(BiLSTM_CNN, self).__init__()
        self.bilstm = nn.LSTM(
            input_size=833,               # Input feature dimension
            hidden_size=hidden_dim,       # Hidden size of the LSTM
            bidirectional=True,           # Use bidirectional LSTM
            batch_first=True,             # Input and output tensors are provided as (batch, seq, feature)
            dropout=0.2                   # Added dropout to prevent overfitting
        )
        self.conv1d = nn.Conv1d(
            in_channels=hidden_dim * 2,   # BiLSTM outputs 2 * hidden_dim channels
            out_channels=hidden_dim,      # Output channels after 1D convolution
            kernel_size=kernel_size,
            padding=kernel_size // 2      # Added padding to preserve sequence length
        )
        self.fc = nn.Linear(hidden_dim, num_classes)  # Final classification layer

    def forward(self, x, lengths):
        # Pack the padded sequence for LSTM
        packed_input = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        lstm_out, _ = self.bilstm(packed_input)
        
        # Unpack sequence
        lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)

        # Transpose for Conv1d: (batch_size, channels, seq_len)
        lstm_out = lstm_out.permute(0, 2, 1)
        cnn_out = self.conv1d(lstm_out)             # Apply 1D convolution
        cnn_out = cnn_out.max(dim=2)[0]             # Max pooling over time (seq_len dimension)

        output = self.fc(cnn_out)                   # Fully connected layer for classification

        return output, lstm_out                     # Return both prediction and LSTM features


### Step 3. Trainer

In [13]:
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score

def train_one_epoch(model, train_loader, optimizer, loss_fn, device):
    model.train()  # Set model to training mode
    running_loss = 0.0
    all_preds = []
    all_labels = []

    for x, mask, y in train_loader:
        # Move tensors to the specified device
        x = x.to(device)
        mask = mask.to(device)
        y = y.to(device)

        # Replace NaNs with zeros to prevent propagation issues
        x = torch.nan_to_num(x, nan=0.0)

        # Compute the actual sequence lengths from the mask
        lengths = mask.sum(dim=1).cpu()

        # Forward pass
        logits, _ = model(x, lengths)

        # Compute loss
        loss = loss_fn(logits, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()

        # Gradient clipping to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Accumulate loss
        running_loss += loss.item() * x.size(0)

        # Get predicted class indices
        preds = logits.argmax(dim=-1)
        all_preds.extend(preds.detach().cpu().tolist())
        all_labels.extend(y.cpu().tolist())

    # Compute average metrics for the epoch
    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_acc = accuracy_score(all_labels, all_preds)
    epoch_f1 = f1_score(all_labels, all_preds, average='macro')

    return epoch_loss, epoch_acc, epoch_f1


def evaluate_one_epoch(model, val_loader, loss_fn, device):
    model.eval()  # Set model to evaluation mode
    running_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient calculation for faster inference
        for x, mask, y in val_loader:
            x = x.to(device)
            mask = mask.to(device)
            y = y.to(device)

            x = torch.nan_to_num(x, nan=0.0)  # Replace NaNs with zeros

            lengths = mask.sum(dim=1).cpu()   # Compute actual lengths
            logits, _ = model(x, lengths)     # Forward pass

            # Clamp logits to avoid numerical overflow
            logits = torch.clamp(logits, min=-10, max=10)

            loss = loss_fn(logits, y)
            running_loss += loss.item() * x.size(0)

            preds = logits.argmax(dim=-1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(y.cpu().tolist())

    # Compute average validation metrics
    epoch_loss = running_loss / len(val_loader.dataset)
    epoch_acc = accuracy_score(all_labels, all_preds)
    epoch_f1 = f1_score(all_labels, all_preds, average='macro')

    return epoch_loss, epoch_acc, epoch_f1


### Step 4: Runner

In [14]:
# Device configuration: use GPU if available, else fallback to CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
batch_size = 32
hidden_dim = 128
num_classes = 4
learning_rate = 5e-5
weight_decay = 1e-2
max_epochs = 1000         # Maximum number of epochs
patience = 10             # Patience for early stopping

# Path to save the best model
save_path = os.path.abspath("../checkpoints/best_model.pt")

# Initialize the model and move it to the selected device
model = BiLSTM_CNN(hidden_dim, num_classes).to(device)

# Define optimizer (AdamW helps with weight decay regularization)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Define loss function with label smoothing to improve generalization
def smooth_cross_entropy(preds, targets, smoothing=0.1):
    confidence = 1.0 - smoothing
    logprobs = F.log_softmax(preds, dim=-1)
    nll_loss = -logprobs.gather(dim=-1, index=targets.unsqueeze(1)).squeeze(1)
    smooth_loss = -logprobs.mean(dim=-1)
    loss = confidence * nll_loss + smoothing * smooth_loss
    return loss.mean()

loss_fn = smooth_cross_entropy

# EarlyStopping utility to stop training when validation performance stops improving
early_stopper = EarlyStopping(patience=patience, verbose=True)

# Training loop
for epoch in range(1, max_epochs + 1):
    # Train for one epoch
    train_loss, train_acc, train_f1 = train_one_epoch(model, train_loader, optimizer, loss_fn, device)
    
    # Evaluate on validation set
    val_loss, val_acc, val_f1 = evaluate_one_epoch(model, val_loader, loss_fn, device)

    # Print performance metrics
    print(f"Epoch {epoch}:")
    print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Train F1: {train_f1:.4f}")
    print(f"  Val   Loss: {val_loss:.4f} | Val   Acc: {val_acc:.4f} | Val   F1: {val_f1:.4f}")

    # Early stopping check
    early_stopper(val_f1, model, save_path)
    if early_stopper.early_stop:
        print("Early stopping triggered!")
        break

print("Training completed.")




Epoch 1:
  Train Loss: 1.3854 | Train Acc: 0.2733 | Train F1: 0.2127
  Val   Loss: 1.3844 | Val   Acc: 0.2825 | Val   F1: 0.2697
Epoch 2:
  Train Loss: 1.3643 | Train Acc: 0.3988 | Train F1: 0.3627
  Val   Loss: 1.3698 | Val   Acc: 0.3767 | Val   F1: 0.3689
Epoch 3:
  Train Loss: 1.3414 | Train Acc: 0.4593 | Train F1: 0.4492
  Val   Loss: 1.3595 | Val   Acc: 0.3587 | Val   F1: 0.3467
EarlyStopping counter: 1 out of 10
Epoch 4:
  Train Loss: 1.3019 | Train Acc: 0.5283 | Train F1: 0.5242
  Val   Loss: 1.3363 | Val   Acc: 0.3991 | Val   F1: 0.3839
Epoch 5:
  Train Loss: 1.2459 | Train Acc: 0.5551 | Train F1: 0.5455
  Val   Loss: 1.3026 | Val   Acc: 0.3901 | Val   F1: 0.3856
Epoch 6:
  Train Loss: 1.1772 | Train Acc: 0.5858 | Train F1: 0.5736
  Val   Loss: 1.2704 | Val   Acc: 0.4529 | Val   F1: 0.4527
Epoch 7:
  Train Loss: 1.1129 | Train Acc: 0.6194 | Train F1: 0.6154
  Val   Loss: 1.2622 | Val   Acc: 0.4484 | Val   F1: 0.4463
EarlyStopping counter: 1 out of 10
Epoch 8:
  Train Loss: 1.05

In [15]:
# Load the best trained model (previously saved)
model.load_state_dict(torch.load(save_path))
model.to(device)
model.eval()  # Set the model to evaluation mode (important for dropout/batchnorm layers)

# Test function
def test_model(model, test_loader, loss_fn, device):
    model.eval()  # Set the model to evaluation mode
    running_loss = 0.0
    all_preds = []  # To store predictions
    all_labels = []  # To store true labels

    with torch.no_grad():  # Disable gradient calculation, as it's not needed during evaluation
        for x, mask, y in test_loader:
            x = x.to(device)
            mask = mask.to(device)
            y = y.to(device)

            # Replace any NaNs in the input with 0.0
            x = torch.nan_to_num(x, nan=0.0)

            # Calculate sequence lengths from the mask
            lengths = mask.sum(dim=1).cpu()

            # Forward pass through the model
            logits, _ = model(x, lengths)
            logits = torch.clamp(logits, min=-10, max=10)  # Clip logits to avoid extreme values

            # Compute the loss
            loss = loss_fn(logits, y)
            running_loss += loss.item() * x.size(0)  # Accumulate the loss

            # Get the predictions (take the index with the highest logit for each sample)
            preds = logits.argmax(dim=-1)
            all_preds.extend(preds.cpu().tolist())  # Append predictions to the list
            all_labels.extend(y.cpu().tolist())  # Append true labels to the list

    # Calculate average test loss
    test_loss = running_loss / len(test_loader.dataset)

    # Calculate accuracy and F1 score for evaluation
    test_acc = accuracy_score(all_labels, all_preds)
    test_f1 = f1_score(all_labels, all_preds, average='macro')

    return test_loss, test_acc, test_f1

# Call the test function to evaluate the model
test_loss, test_acc, test_f1 = test_model(model, test_loader, loss_fn, device)

# Print final results
print("=== Final Test Results ===")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")


=== Final Test Results ===
Test Loss: 1.3627
Test Accuracy: 0.5045
Test F1 Score: 0.5043
