In [None]:
# %%
# from google.colab import drive
# `Dataset` (from the `datasets` module) is a class for loading, processing, and manipulating structured data for machine learning tasks. Its import is left commented out below.
# from datasets import Dataset
import torch
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
import matplotlib.pyplot as plt
import argparse

In [None]:
class ToxicDataset(torch.utils.data.Dataset):
    """Map-style torch dataset over pre-tokenized inputs and optional labels.

    Args:
        encodings: Mapping from field name to an indexable collection of
            per-example values. It must contain an ``'input_ids'`` entry,
            whose length defines the dataset size.
        labels: Optional indexable collection of per-example labels. When
            omitted (``None``), items carry no ``'labels'`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from ``encodings['input_ids']``."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per encoding field.

        A ``'labels'`` tensor is appended only when labels were supplied.
        """
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])

        # Unlabeled datasets return the encoded fields alone.
        if self.labels is None:
            return sample

        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
def tokenize_function(examples, tokenizer):
    """Tokenize the ``'string'`` field of ``examples`` with ``tokenizer``.

    Args:
        examples: Mapping (or similar) exposing the text under ``'string'``.
        tokenizer: Callable accepting the text plus the keyword arguments
            ``truncation``, ``padding`` and ``max_length``.

    Returns:
        Whatever ``tokenizer`` returns when called with truncation and
        padding enabled and ``max_length=256``.
    """
    texts = examples['string']
    return tokenizer(texts, truncation=True, padding=True, max_length=256)

In [None]:
def _loss_curve(log_history, key):
    """Extract one loss curve from a Trainer log history.

    Args:
        log_history: List of dicts, as found in ``trainer.state.log_history``.
        key: Metric name to pull out, e.g. ``'loss'`` or ``'eval_loss'``.

    Returns:
        ``(xs, ys)``: two lists of equal length. ``ys`` holds the values
        logged under ``key``; ``xs`` holds each entry's ``'epoch'`` value, or
        the entry's 1-based position among the matching entries when no
        epoch was recorded.
    """
    entries = [entry for entry in log_history if key in entry]
    xs = [entry.get('epoch', position) for position, entry in enumerate(entries, start=1)]
    ys = [entry[key] for entry in entries]
    return xs, ys


def main():
    """Fine-tune BERT on the CSV data and write test predictions.

    Reads ``train_x/train_y/val_x/val_y/test_x.csv`` from ``./kaggle_data/``,
    tokenizes the ``'string'`` column, fine-tunes ``bert-base-uncased`` with
    two output labels, saves a loss plot to ``./training.png`` and writes the
    thresholded test predictions to ``prediction.csv``.
    """
    data_dir = "./kaggle_data/"

    # %%
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print(device)

    # %%
    # Step 1: Load the Data
    # Load training and validation datasets
    train_x = pd.read_csv(f'{data_dir}/train_x.csv')
    train_y = pd.read_csv(f'{data_dir}/train_y.csv')
    valid_x = pd.read_csv(f'{data_dir}/val_x.csv')
    valid_y = pd.read_csv(f'{data_dir}/val_y.csv')

    # Merge X and Y datasets
    train_data = train_x.copy()
    train_data['y'] = train_y['y']
    valid_data = valid_x.copy()
    valid_data['y'] = valid_y['y']

    # Load test data
    test_data = pd.read_csv(f'{data_dir}/test_x.csv')  # Replace with your test file path

    # %%
    # Step 2: Prepare the Data for Tokenization

    # Load the tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  # for BERT large use 'bert-large-uncased'

    # Replace missing text with the placeholder 'Missing'; the tokenizer only
    # accepts strings. Assign the result back instead of using a chained
    # `inplace=True` call, which is deprecated in pandas 2.x and does not
    # modify the frame under copy-on-write.
    for frame in (train_data, valid_data, test_data):
        frame['string'] = frame['string'].fillna('Missing')

    # Tokenize training, validation, and test data
    train_encodings = tokenizer(list(train_data['string']), truncation=True, padding=True, max_length=256)
    valid_encodings = tokenizer(list(valid_data['string']), truncation=True, padding=True, max_length=256)
    test_encodings = tokenizer(list(test_data['string']), truncation=True, padding=True, max_length=256)

    # %%
    # Step 3: Prepare Torch Datasets

    # Convert to PyTorch Datasets
    train_dataset = ToxicDataset(train_encodings, train_data['y'].tolist())
    valid_dataset = ToxicDataset(valid_encodings, valid_data['y'].tolist())
    test_dataset = ToxicDataset(test_encodings)

    # %%
    # !pip install transformers[torch]
    # !pip install 'accelerate>=0.26.0'

    # %%
    # Step 4: Fine-tune BERT

    # Load the pre-trained BERT model
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)  # for BERT large use 'bert-large-uncased'

    # Define training arguments
    common_args = dict(
        output_dir='./results',
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=1,  # change the number for different amount of training data used
        weight_decay=0.01,
        logging_dir='./logs',
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
    )
    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy`; try the new name first and fall back to the old one so
    # the notebook runs on either side of the rename.
    try:
        training_args = TrainingArguments(eval_strategy="epoch", **common_args)
    except TypeError:
        training_args = TrainingArguments(evaluation_strategy="epoch", **common_args)

    # Define Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
    )

    # Train the model
    trainer.train()

    # %%
    # Step 5: Access Training and Validation Metrics
    metrics = trainer.state.log_history  # Retrieve logs for training/validation
    train_epochs, train_loss = _loss_curve(metrics, 'loss')
    eval_epochs, eval_loss = _loss_curve(metrics, 'eval_loss')

    # %%
    # Step 6: Visualize Loss
    # Training and evaluation losses are logged at different cadences, so the
    # two curves usually have different lengths; each one is plotted against
    # its own x-values. Markers keep single-point curves visible.
    fig, ax = plt.subplots()
    ax.plot(train_epochs, train_loss, marker='o', label='Training Loss')
    ax.plot(eval_epochs, eval_loss, marker='o', label='Validation Loss')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.set_title('Training and Validation Loss')
    ax.legend()
    fig.savefig("./training.png")

    # %%
    # Step 7: Generate Predictions on Test Data
    # Put the model in evaluation mode
    model.eval()

    # Use the Trainer for prediction
    test_predictions = trainer.predict(test_dataset).predictions
    test_predictions = torch.softmax(torch.tensor(test_predictions), dim=1)[:, 1] > 0.5  # Threshold at 0.5
    test_predictions = test_predictions.int().tolist()

    # Add IDs for test data (the DataFrame's row index)
    test_ids = test_data.index.tolist()

    # %%
    # Step 8: Save Predictions in the Required Format
    pred_df = pd.DataFrame({'ID': test_ids, 'pred': test_predictions})
    pred_df.to_csv('prediction.csv', index=False)
    print("Predictions saved to prediction.csv")

In [None]:
# Run the full pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()