In [None]:
# --- Simple Neural Network Model Definition ---
class SimpleTextNN(nn.Module):
    """Classifier that fuses a BiLSTM text encoding with numerical features.

    Token ids are embedded and encoded by a single-layer bidirectional LSTM.
    The final forward and backward hidden states are concatenated with the
    numerical feature vector and passed through a two-layer MLP head
    (Linear -> ReLU -> Dropout(0.5) -> Linear) that produces the logits.

    Padding positions are excluded from the recurrence, so the encoding of a
    sequence does not depend on how much padding the batch carries.

    Args:
        vocab_size (int): Number of rows in the embedding table.
        embedding_dim (int): Size of each token embedding.
        hidden_dim_lstm (int): LSTM hidden size per direction.
        num_numerical_features (int): Width of the numerical feature vector.
        hidden_dim_fc (int): Width of the hidden fully connected layer.
        output_dim (int): Number of output logits.
        pad_idx (int | None): Token id used for padding. Its embedding is
            frozen at zero and it is skipped by the LSTM. With ``None`` every
            position is treated as a real token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim_lstm, num_numerical_features, hidden_dim_fc, output_dim, pad_idx):
        super().__init__()
        # Embedding layer for text
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        # Bidirectional LSTM over the embedded token sequence
        self.lstm = nn.LSTM(embedding_dim, hidden_dim_lstm, batch_first=True, num_layers=1, bidirectional=True)

        # The text encoding is the concatenation of the final forward and
        # backward hidden states, hence twice the per-direction hidden size.
        lstm_output_features = hidden_dim_lstm * 2

        # Fully connected head over [text encoding ; numerical features]
        self.fc1 = nn.Linear(lstm_output_features + num_numerical_features, hidden_dim_fc)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)  # Dropout for regularization
        self.fc2 = nn.Linear(hidden_dim_fc, output_dim)  # Output layer

    def _encode_text(self, text_data):
        """Return the (batch_size, 2 * hidden_dim_lstm) BiLSTM text encoding."""
        # nn.Embedding stores a normalized (non-negative) padding index.
        pad_idx = self.embedding.padding_idx
        if pad_idx is None:
            # No padding token is known: run over the full sequence.
            _, (hidden, _) = self.lstm(self.embedding(text_data))
        else:
            is_pad = text_data == pad_idx
            # A stable sort on the pad flag moves real tokens to the front
            # while preserving their order, so both left- and right-padded
            # batches end up in the layout pack_padded_sequence expects.
            _, order = torch.sort(is_pad.long(), dim=1, stable=True)
            compact_ids = text_data.gather(1, order)
            # Packing needs CPU lengths >= 1; an all-padding row is encoded
            # as a single padding token (zero embedding).
            lengths = (~is_pad).sum(dim=1).clamp(min=1).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                self.embedding(compact_ids), lengths,
                batch_first=True, enforce_sorted=False,
            )
            _, (hidden, _) = self.lstm(packed)

        # hidden: (num_layers * 2, batch_size, hidden_dim_lstm); the last two
        # entries are the forward and backward states of the top layer.
        return torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)

    def forward(self, text_data, numerical_data):
        """Compute class logits for a batch.

        Args:
            text_data (Tensor): Token ids of shape (batch_size, seq_len).
            numerical_data (Tensor): Numerical features of shape
                (batch_size, num_numerical_features). A non-empty 1-D tensor
                is interpreted as one feature per sample.

        Returns:
            Tensor: Logits of shape (batch_size, output_dim).
        """
        hidden_combined = self._encode_text(text_data)

        # A single numerical column may arrive as (batch_size,).
        if numerical_data.dim() == 1 and numerical_data.numel() > 0:
            numerical_data = numerical_data.unsqueeze(1)
        # Align dtype/device with the text encoding so that e.g. float64
        # features do not promote the concatenation and break fc1.
        numerical_data = numerical_data.to(
            device=hidden_combined.device, dtype=hidden_combined.dtype
        )

        # combined_features: (batch_size, 2 * hidden_dim_lstm + num_numerical_features)
        combined_features = torch.cat((hidden_combined, numerical_data), dim=1)

        x = self.fc1(combined_features)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.fc2(x)
        return logits

# --- Training Function ---
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader``, updating weights only if ``optimizer`` is given.

    Returns:
        tuple[float, float]: Mean loss per sample and accuracy, both computed
        over the samples the loader actually yielded (NaN if it yielded none).
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is equivalent to eval()

    loss_sum = 0.0
    num_correct = 0
    num_seen = 0

    # Gradients are only tracked for the training pass.
    with torch.set_grad_enabled(is_training):
        for batch in loader:
            texts = batch['text'].to(device)
            numerical_feats = batch['numerical'].to(device)
            labels = batch['label'].to(device)

            if is_training:
                optimizer.zero_grad()

            outputs = model(texts, numerical_feats)
            loss = criterion(outputs, labels)

            if is_training:
                loss.backward()
                optimizer.step()

            batch_size = labels.size(0)
            # Undo the per-batch averaging so a smaller last batch is
            # weighted correctly (assumes the criterion uses mean reduction).
            loss_sum += loss.item() * batch_size
            num_correct += (outputs.argmax(dim=1) == labels).sum().item()
            num_seen += batch_size

    if num_seen == 0:
        return float('nan'), float('nan')
    return loss_sum / num_seen, num_correct / num_seen


def train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs, device):
    """
    Trains a PyTorch model and reports loss/accuracy after every epoch.

    Metrics are normalized by the number of samples actually processed, so
    they stay correct when a loader drops the last batch or uses a sampler,
    and the loaders only need to be iterables of batch dicts with the keys
    'text', 'numerical' and 'label'.

    Args:
        model (nn.Module): The neural network model to train.
        train_loader (DataLoader): DataLoader for the training set.
        val_loader (DataLoader): DataLoader for the validation set.
        optimizer (optim.Optimizer): The optimizer to use (e.g., Adam).
        criterion (nn.Module): The loss function (e.g., CrossEntropyLoss).
        num_epochs (int): The number of epochs to train for.
        device (torch.device): The device to train on ('cuda' or 'cpu').

    Returns:
        dict[str, list[float]]: Per-epoch values under the keys 'train_loss',
        'train_acc', 'val_loss' and 'val_acc'.
    """
    model.to(device)  # Move model to the specified device

    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

    for epoch in range(num_epochs):
        epoch_loss, epoch_acc_train = _run_epoch(
            model, train_loader, criterion, device, optimizer=optimizer
        )
        epoch_val_loss, epoch_acc_val = _run_epoch(
            model, val_loader, criterion, device
        )

        history['train_loss'].append(epoch_loss)
        history['train_acc'].append(epoch_acc_train)
        history['val_loss'].append(epoch_val_loss)
        history['val_acc'].append(epoch_acc_val)

        print(f"Epoch [{epoch+1}/{num_epochs}] | "
              f"Train Loss: {epoch_loss:.4f} | Train Acc: {epoch_acc_train:.4f} | "
              f"Val Loss: {epoch_val_loss:.4f} | Val Acc: {epoch_acc_val:.4f}")

    print("Finished Training")
    return history


# --- Main Execution ---
if __name__ == '__main__':
    # Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    print(f"Using device: {device}")

    # Derive the numerical feature count from the training dataset itself so
    # it matches what the dataset will actually feed to the model:
    #   * 2-D (or higher) storage -> size of the second dimension
    #   * empty storage           -> no numerical features
    #   * non-empty 1-D storage   -> treated as a single feature per sample
    # NOTE(review): the 1-D case presumably means one valid numerical column;
    # confirm the Dataset yields it with a shape the model accepts.
    feature_store = train_dataset.numerical_features
    if len(feature_store.shape) > 1:
        num_numerical_features = feature_store.shape[1]
    elif feature_store.nelement() == 0:
        num_numerical_features = 0
    else:
        num_numerical_features = 1

    print(f"Number of numerical features being used: {num_numerical_features}")

    # Build the model; the padding token id comes from the vocabulary.
    pad_idx = vocab[PAD_TOKEN]
    model_kwargs = dict(
        vocab_size=actual_Vocab_Size,
        embedding_dim=Embedding_Dim,
        hidden_dim_lstm=Hidden_Dim_LSTM,
        num_numerical_features=num_numerical_features,
        hidden_dim_fc=Hidden_Dim_FC,
        output_dim=num_classes,
        pad_idx=pad_idx,
    )
    model = SimpleTextNN(**model_kwargs)

    # Adam optimizer with cross-entropy loss for multi-class classification.
    optimizer = optim.Adam(model.parameters(), lr=Learning_Rate)
    criterion = nn.CrossEntropyLoss()

    # Kick off the training loop.
    print("Starting training...")
    train_model(model, train_loader, val_loader, optimizer, criterion, Num_Epochs, device)
