### Install the Required Libraries:

In [1]:
! pip install pandas numpy torch optuna tqdm transformers scikit-learn




### Import Libraries:

In [2]:
import pandas as pd
import numpy as np
import torch
import optuna
import random
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup, get_scheduler
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from sklearn.metrics import f1_score
import time
import datetime


### Configure the device:

In [3]:
# Pick the compute device: prefer a CUDA GPU when PyTorch can see one,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Report which device was selected.
print(f"Using device: {device}")


Using device: cuda


### Load and prepare data:

In [4]:
# Read the competition's training and test CSV files.
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# Apply the same two preparation steps to both frames:
#   1. replace missing 'keyword' values with an empty string, and
#   2. build 'text_combined' as "<keyword> <text>" (joined by a single space).
for frame in (train, test):
    frame['keyword'] = frame['keyword'].fillna('')
    frame['text_combined'] = frame['keyword'] + ' ' + frame['text']


### Preprocess texts with the BERT tokenizer:

In [6]:
# Load the BERT tokenizer
# Initialize the tokenizer from the pre-trained BERT model ('bert-base-uncased'),
# specifying that all text will be converted to lowercase.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Define the maximum sequence length
# Sequences longer than this value will be truncated, and shorter ones will be padded.
MAX_LEN = 128

# Tokenize the training data in one batched call.
# Calling the tokenizer directly on a list of strings replaces the deprecated
# per-row `encode_plus` loop and returns already-stacked tensors, so no
# `torch.cat` over thousands of 1-row tensors is needed.
encoded_train = tokenizer(
    train['text_combined'].tolist(),  # All input texts, as a list of strings
    add_special_tokens=True,          # Add '[CLS]' and '[SEP]' tokens
    max_length=MAX_LEN,               # Pad or truncate to this length
    padding='max_length',             # Pad every sequence to MAX_LEN
    truncation=True,                  # Truncate sequences longer than MAX_LEN
    return_attention_mask=True,       # Generate the attention mask
    return_tensors='pt',              # Return PyTorch tensors
)

# One row per training example, MAX_LEN columns each.
input_ids = encoded_train['input_ids']
attention_masks = encoded_train['attention_mask']
labels = torch.tensor(train['target'].values)  # Convert target labels to a tensor


### Split the data into training and validation sets:

In [8]:
# Split input IDs, attention masks and labels into training and validation
# subsets with a single call. Passing all three arrays together guarantees that
# row i of every output refers to the same example, instead of relying on two
# separate calls happening to use the same seed and sizes.
(
    train_inputs, validation_inputs,
    train_masks, validation_masks,
    train_labels, validation_labels,
) = train_test_split(
    input_ids,              # The tokenized input IDs
    attention_masks,        # The attention masks
    labels,                 # The target labels
    random_state=42,        # Seed for reproducibility
    test_size=0.1           # 10% of the data will be used for validation
)


### Create the DataLoaders:

In [11]:
# Number of examples per batch for both loaders below.
batch_size = 16

# Datasets: each item is an (input_ids, attention_mask, label) triple.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)

# Samplers: training batches are drawn in random order, validation batches
# are read in their stored order.
train_sampler = RandomSampler(train_data)
validation_sampler = SequentialSampler(validation_data)

# Loaders: wrap each dataset with its sampler and the shared batch size.
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
validation_dataloader = DataLoader(
    validation_data, batch_size=batch_size, sampler=validation_sampler
)


### Load the pre-trained BERT model:

In [12]:
# Build a BERT sequence classifier from the 'bert-base-uncased' checkpoint with
# a two-class head (labels 0 and 1), without returning attention weights or
# hidden states, and place it on the selected device (GPU or CPU).
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
).to(device)

# Display the model architecture as the cell output.
model


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

### Configure the optimizer and scheduler:

In [13]:
# Training length
# `epochs` full passes over the training set; the scheduler needs the total
# number of optimizer steps, i.e. batches per epoch times the number of epochs.
epochs = 10
total_steps = epochs * len(train_dataloader)

# AdamW optimizer (Adam with weight decay) over all model parameters, using a
# small fine-tuning learning rate and an epsilon that guards against division
# by zero inside Adam.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Linear learning-rate schedule with no warm-up: the rate decays linearly
# across `total_steps` steps, starting from the first one.
scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=0,
)


### Hyperparameter search:

In [15]:
def _validation_f1(model, dataloader):
    """Return the F1 score of `model` over all batches yielded by `dataloader`.

    Each batch is indexed as (input_ids, attention_mask, labels). The model is
    switched to evaluation mode and run without gradient tracking; the predicted
    class is the arg-max of the logits.
    """
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            outputs = model(b_input_ids, attention_mask=b_input_mask)
            predictions.append(outputs.logits.detach().cpu().numpy())
            true_labels.append(b_labels.cpu().numpy())

    flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1).flatten()
    flat_true_labels = np.concatenate(true_labels, axis=0)
    return f1_score(flat_true_labels, flat_predictions)


# Define the objective function for hyperparameter tuning
def objective(trial):
    """Optuna objective: fine-tune a fresh BERT classifier and score it.

    Samples a learning rate, AdamW epsilon, batch size and epoch count from
    `trial`, trains a newly loaded 'bert-base-uncased' classifier on
    `train_data`, and evaluates on `validation_data` after every epoch.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The highest validation F1 score observed across the trial's epochs
        (0 if no epoch scored above 0).
    """
    # Define the hyperparameters to search
    lr = trial.suggest_float("lr", 1e-5, 5e-5, log=True)  # Learning rate
    eps = trial.suggest_float("eps", 1e-8, 1e-6, log=True)  # Epsilon for AdamW
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])  # Batch sizes
    epochs = trial.suggest_int("epochs", 3, 10)  # Number of epochs

    # Prepare the DataLoaders with the selected batch size
    train_dataloader = DataLoader(
        train_data, sampler=RandomSampler(train_data), batch_size=batch_size
    )
    validation_dataloader = DataLoader(
        validation_data, sampler=SequentialSampler(validation_data), batch_size=batch_size
    )

    # Load a fresh pre-trained BERT model so trials do not share weights
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=2,              # Binary classification
        output_attentions=False,   # Do not return attention weights
        output_hidden_states=False # Do not return hidden states
    )
    model.to(device)  # Move the model to the selected device

    # Configure the optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=lr, eps=eps)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_scheduler(
        "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )

    # Train the model
    best_f1 = 0  # Track the best F1 score
    for epoch in range(epochs):
        model.train()  # Set the model to training mode

        for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}"):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()  # Clear the gradients
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)

            # The running loss is not needed here, so it is not pulled back to
            # the host with .item() on every step.
            outputs.loss.backward()  # Backpropagate the loss
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
            optimizer.step()  # Update weights
            scheduler.step()  # Update learning rate

        # Validate after each epoch and keep the best F1 score seen so far
        best_f1 = max(best_f1, _validation_f1(model, validation_dataloader))

    # Return the best F1 score
    return best_f1


# Create an in-memory study whose goal is to maximize the value returned by
# `objective` (the validation F1 score).
study = optuna.create_study(direction="maximize")

# Only ONE trial is run here, which keeps the notebook fast but is not a real
# search; raise n_trials (e.g. to 20) to actually compare hyperparameters.
# NOTE(review): the final training cell uses the globally configured
# optimizer/scheduler/epochs, not `study.best_params` -- confirm this is intended.
study.optimize(func=objective, n_trials=1)

# Report the best hyperparameters found and the F1 score they achieved.
print("\nBest hyperparameters:", study.best_params)
print("Best F1 Score:", study.best_value)


[I 2025-01-06 13:45:50,222] A new study created in memory with name: no-name-db50f922-3550-4332-bc80-8ea7f83beca9
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Training Epoch 1: 100%|██████████| 429/429 [02:34<00:00,  2.79it/s]
Training Epoch 2: 100%|██████████| 429/429 [02:33<00:00,  2.79it/s]
Training Epoch 3: 100%|██████████| 429/429 [02:33<00:00,  2.80it/s]
[I 2025-01-06 13:53:49,301] Trial 0 finished with value: 0.8005997001499251 and parameters: {'lr': 4.465530103671243e-05, 'eps': 4.3427525694836487e-07, 'batch_size': 16, 'epochs': 3}. Best is trial 0 with value: 0.8005997001499251.



Best hyperparameters: {'lr': 4.465530103671243e-05, 'eps': 4.3427525694836487e-07, 'batch_size': 16, 'epochs': 3}
Best F1 Score: 0.8005997001499251


### Train the model:

In [16]:
# Seed for reproducibility
# Seeds Python, NumPy and PyTorch (CPU and all GPUs) for the steps that follow.
# NOTE(review): `model` was created in an earlier cell, before these seeds are
# set, so its newly initialized classifier weights are presumably not covered
# by this seed -- confirm if full run-to-run reproducibility is required.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Training and validation loop with early stopping on the validation F1 score
best_f1 = 0  # Track the best F1 score
early_stopping_patience = 3  # Maximum number of epochs to wait for improvement
patience_counter = 0  # Counter for early stopping patience

for epoch_i in range(epochs):
    print(f"\n======== Epoch {epoch_i + 1}/{epochs} ========")

    # Training phase
    print("Training...")
    model.train()
    total_loss = 0

    # Progress bar for batches
    progress_bar = tqdm(train_dataloader, desc="Batch")
    for batch in progress_bar:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()  # Clear gradients
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss

        # Read the scalar loss once; it is reused for the running total and the bar.
        batch_loss = loss.item()
        total_loss += batch_loss

        loss.backward()  # Backpropagate the loss
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # Gradient clipping
        optimizer.step()  # Update model weights
        scheduler.step()  # Update learning rate

        progress_bar.set_postfix({"Batch Loss": batch_loss})

    # Calculate average training loss
    avg_train_loss = total_loss / len(train_dataloader)
    print(f"\nAverage training loss: {avg_train_loss:.2f}")

    # Validation phase
    print("\nValidation...")
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in tqdm(validation_dataloader, desc="Validation Batches"):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            outputs = model(b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits

            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.cpu().numpy()

            predictions.append(logits)
            true_labels.append(label_ids)

    # Calculate metrics
    flat_predictions = np.concatenate(predictions, axis=0)
    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
    flat_true_labels = np.concatenate(true_labels, axis=0)

    f1 = f1_score(flat_true_labels, flat_predictions)
    print(f"F1 Score: {f1:.2f}")

    # Early Stopping based on F1 Score
    if f1 > best_f1:
        best_f1 = f1
        patience_counter = 0
        torch.save(model.state_dict(), "best_model.pt")  # Save the best model
        print("New best model saved.")
    else:
        patience_counter += 1
        print(f"Early stopping patience: {patience_counter}/{early_stopping_patience}")
        if patience_counter >= early_stopping_patience:
            print("Early stopping activated.")
            break

# When the loop ends, `model` holds the weights of the LAST epoch trained,
# which after early stopping is several epochs past the best one. Reload the
# checkpoint saved at the best F1 so later cells predict with the best weights.
# `best_f1 > 0` is true exactly when a checkpoint was written during this run.
if best_f1 > 0:
    model.load_state_dict(torch.load("best_model.pt", map_location=device))
    print(f"Restored best model weights (F1 Score: {best_f1:.2f}).")

print("\nTraining complete!")



Training...


Batch: 100%|██████████| 429/429 [02:34<00:00,  2.77it/s, Batch Loss=1.38]  



Average training loss: 0.46

Validation...


Validation Batches: 100%|██████████| 48/48 [00:05<00:00,  8.34it/s]


F1 Score: 0.77
New best model saved.

Training...


Batch: 100%|██████████| 429/429 [02:34<00:00,  2.78it/s, Batch Loss=0.0397]



Average training loss: 0.34

Validation...


Validation Batches: 100%|██████████| 48/48 [00:05<00:00,  8.34it/s]


F1 Score: 0.79
New best model saved.

Training...


Batch: 100%|██████████| 429/429 [02:34<00:00,  2.78it/s, Batch Loss=0.0892]



Average training loss: 0.24

Validation...


Validation Batches: 100%|██████████| 48/48 [00:05<00:00,  8.34it/s]


F1 Score: 0.80
New best model saved.

Training...


Batch: 100%|██████████| 429/429 [02:33<00:00,  2.79it/s, Batch Loss=0.0187] 



Average training loss: 0.17

Validation...


Validation Batches: 100%|██████████| 48/48 [00:05<00:00,  8.34it/s]


F1 Score: 0.79
Early stopping patience: 1/3

Training...


Batch: 100%|██████████| 429/429 [02:34<00:00,  2.78it/s, Batch Loss=0.00316] 



Average training loss: 0.12

Validation...


Validation Batches: 100%|██████████| 48/48 [00:05<00:00,  8.35it/s]


F1 Score: 0.80
New best model saved.

Training...


Batch: 100%|██████████| 429/429 [02:33<00:00,  2.79it/s, Batch Loss=0.000935]



Average training loss: 0.09

Validation...


Validation Batches: 100%|██████████| 48/48 [00:05<00:00,  8.34it/s]


F1 Score: 0.80
Early stopping patience: 1/3

Training...


Batch: 100%|██████████| 429/429 [02:34<00:00,  2.78it/s, Batch Loss=0.0042]  



Average training loss: 0.07

Validation...


Validation Batches: 100%|██████████| 48/48 [00:05<00:00,  8.34it/s]


F1 Score: 0.79
Early stopping patience: 2/3

Training...


Batch: 100%|██████████| 429/429 [02:34<00:00,  2.78it/s, Batch Loss=0.000228]



Average training loss: 0.05

Validation...


Validation Batches: 100%|██████████| 48/48 [00:05<00:00,  8.34it/s]

F1 Score: 0.80
Early stopping patience: 3/3
Early stopping activated.

Training complete!





### Predict on the test set:

In [17]:
print("Predicting on the test set...")

# Tokenize the test data in one batched call.
# Calling the tokenizer directly on a list of strings replaces the deprecated
# per-row `encode_plus` loop and returns already-stacked tensors.
encoded_test = tokenizer(
    test['text_combined'].tolist(),   # All input texts, as a list of strings
    add_special_tokens=True,          # Add '[CLS]' and '[SEP]' tokens
    max_length=MAX_LEN,               # Pad or truncate to this length
    padding='max_length',             # Pad every sequence to MAX_LEN
    truncation=True,                  # Truncate sequences longer than MAX_LEN
    return_attention_mask=True,       # Generate the attention mask
    return_tensors='pt',              # Return PyTorch tensors
)
test_input_ids = encoded_test['input_ids']
test_attention_masks = encoded_test['attention_mask']

# Create the DataLoader for the test set
# Combine the input IDs and attention masks into a single dataset and load it in batches.
test_dataset = TensorDataset(test_input_ids, test_attention_masks)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Prediction
model.eval()  # Set the model to evaluation mode
predictions = []

# Inference only: disable gradient tracking once for the whole loop.
with torch.no_grad():
    for b_input_ids, b_input_mask in test_dataloader:
        # Move the batch to the appropriate device (GPU or CPU)
        outputs = model(b_input_ids.to(device), attention_mask=b_input_mask.to(device))

        # Keep the raw output scores on the CPU as NumPy arrays
        predictions.append(outputs.logits.cpu().numpy())

# Flatten predictions
# Concatenate all predictions into a single array and take the class with the highest score.
flat_predictions = np.concatenate(predictions, axis=0)
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()


Predicting on the test set...


### Create the submission file:

In [18]:
# Build the submission table: the test set's 'id' column plus a 'target'
# column holding the predicted class for each row.
submission = test[['id']].assign(target=flat_predictions)

# Write it to 'submission.csv' without the DataFrame index.
submission.to_csv('submission.csv', index=False)

# Confirmation message
print("The submission file has been successfully created.")


The submission file has been successfully created.
