In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install --upgrade datasets



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import os
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Constants
RANDOM_STATE = 42
K_FOLDS = 5
MODEL_NAME = "distilbert-base-uncased"
OUTPUT_DIR = "./bert_results"
METRIC_FILE = "bert_metrics.csv"

In [6]:
os.makedirs(OUTPUT_DIR, exist_ok=True)

def load_data(file_path='/content/drive/MyDrive/ds/data_filtered.csv'):
    """Load data from CSV file"""
    df = pd.read_csv(file_path)
    return df

def tokenize_function(example, tokenizer):
    """Tokenize text data"""
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

def compute_metrics(eval_pred):
    """Compute evaluation metrics"""
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

def train_and_evaluate_fold(train_df, test_df, learning_rate, num_epochs, tokenizer):
    """Train and evaluate model on a single fold"""
    train_dataset = Dataset.from_pandas(train_df)
    test_dataset = Dataset.from_pandas(test_df)

    train_tokenized = train_dataset.map(
        lambda x: tokenize_function(x, tokenizer), batched=True
    )
    test_tokenized = test_dataset.map(
        lambda x: tokenize_function(x, tokenizer), batched=True
    )

    train_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
    test_tokenized.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model  = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, num_labels=2
    ).to(device)

    fold_output_dir = os.path.join(OUTPUT_DIR, f"lr_{learning_rate}_epochs_{num_epochs}")
    os.makedirs(fold_output_dir, exist_ok=True)

    training_args = TrainingArguments(
        output_dir=fold_output_dir,
        report_to="none",
        eval_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        logging_dir=os.path.join(fold_output_dir, 'logs'),
        logging_steps=10,
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        no_cuda=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=test_tokenized,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    eval_results = trainer.evaluate()

    return eval_results

def k_fold_cross_validation(df, k=K_FOLDS):
    """Perform k-fold cross-validation"""
    kf = KFold(n_splits=k, shuffle=True, random_state=RANDOM_STATE)

    learning_rates = [2e-5, 5e-5, 1e-4]
    num_epochs_list = [2, 3, 4]

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    hyperparam_results = []

    best_lr = None
    best_epochs = None
    best_avg_acc = 0

    print("Performing hyperparameter tuning on first fold...")

    for train_index, test_index in kf.split(df):
        train_df = df.iloc[train_index].reset_index(drop=True)
        test_df = df.iloc[test_index].reset_index(drop=True)

        for lr in learning_rates:
            for epochs in num_epochs_list:
                print(f"\nTraining with lr={lr}, epochs={epochs}")
                eval_results = train_and_evaluate_fold(train_df, test_df, lr, epochs, tokenizer)

                accuracy = eval_results.get('eval_accuracy', 0)
                f1 = eval_results.get('eval_f1', 0)
                precision = eval_results.get('eval_precision', 0)
                recall = eval_results.get('eval_recall', 0)

                hyperparam_results.append({
                    'learning_rate': lr,
                    'epochs': epochs,
                    'accuracy': accuracy,
                    'f1': f1,
                    'precision': precision,
                    'recall': recall
                })

                if accuracy > best_avg_acc:
                    best_avg_acc = accuracy
                    best_lr = lr
                    best_epochs = epochs

        break

    plot_hyperparameter_tuning(hyperparam_results)

    print(f"\nRunning {k}-fold cross-validation with best hyperparameters: lr={best_lr}, epochs={best_epochs}")

    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    fold_results = []

    for fold, (train_index, test_index) in enumerate(kf.split(df)):
        print(f"\nTraining fold {fold + 1}/{k}")

        train_df = df.iloc[train_index].reset_index(drop=True)
        test_df = df.iloc[test_index].reset_index(drop=True)

        eval_results = train_and_evaluate_fold(train_df, test_df, best_lr, best_epochs, tokenizer)

        accuracy = eval_results.get('eval_accuracy', 0)
        f1 = eval_results.get('eval_f1', 0)
        precision = eval_results.get('eval_precision', 0)
        recall = eval_results.get('eval_recall', 0)

        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

        fold_results.append({
            'fold': fold + 1,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1
        })

        print(f"Fold {fold + 1} results: accuracy={accuracy:.3f}, f1={f1:.3f}, precision={precision:.3f}, recall={recall:.3f}")

    metrics = {
        'Accuracy': (np.mean(accuracies), np.std(accuracies)),
        'Precision': (np.mean(precisions), np.std(precisions)),
        'Recall': (np.mean(recalls), np.std(recalls)),
        'F1 Score': (np.mean(f1_scores), np.std(f1_scores))
    }

    print("\nFinal BERT metrics:")
    for metric, (mean, std) in metrics.items():
        print(f"{metric}: {mean:.3f} ± {std:.3f}")

    metrics_df = pd.DataFrame(fold_results)
    metrics_df.to_csv(os.path.join(OUTPUT_DIR, METRIC_FILE), index=False)

    plot_fold_results(fold_results)

    return metrics, hyperparam_results

def plot_hyperparameter_tuning(results):
    """Plot hyperparameter tuning results"""
    if not results:
        return

    df = pd.DataFrame(results)

    fig, axs = plt.subplots(2, 2, figsize=(15, 12))
    axs = axs.flatten()

    for i, epochs in enumerate(sorted(df['epochs'].unique())):
        subset = df[df['epochs'] == epochs]
        axs[0].plot(subset['learning_rate'], subset['accuracy'], 'o-',
                   label=f'{epochs} epochs', markersize=8)

    axs[0].set_title('Accuracy vs Learning Rate')
    axs[0].set_xlabel('Learning Rate')
    axs[0].set_ylabel('Accuracy')
    axs[0].set_xscale('log')
    axs[0].legend()
    axs[0].grid(True, alpha=0.3)

    for i, epochs in enumerate(sorted(df['epochs'].unique())):
        subset = df[df['epochs'] == epochs]
        axs[1].plot(subset['learning_rate'], subset['f1'], 'o-',
                   label=f'{epochs} epochs', markersize=8)

    axs[1].set_title('F1 Score vs Learning Rate')
    axs[1].set_xlabel('Learning Rate')
    axs[1].set_ylabel('F1 Score')
    axs[1].set_xscale('log')
    axs[1].legend()
    axs[1].grid(True, alpha=0.3)


    for i, lr in enumerate(sorted(df['learning_rate'].unique())):
        subset = df[df['learning_rate'] == lr]
        axs[2].plot(subset['epochs'], subset['accuracy'], 'o-',
                   label=f'LR: {lr}', markersize=8)

    axs[2].set_title('Accuracy vs Number of Epochs')
    axs[2].set_xlabel('Number of Epochs')
    axs[2].set_ylabel('Accuracy')
    axs[2].legend()
    axs[2].grid(True, alpha=0.3)


    for i, lr in enumerate(sorted(df['learning_rate'].unique())):
        subset = df[df['learning_rate'] == lr]
        axs[3].plot(subset['epochs'], subset['f1'], 'o-',
                   label=f'LR: {lr}', markersize=8)

    axs[3].set_title('F1 Score vs Number of Epochs')
    axs[3].set_xlabel('Number of Epochs')
    axs[3].set_ylabel('F1 Score')
    axs[3].legend()
    axs[3].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, 'hyperparameter_tuning.png'))
    plt.close()

def plot_fold_results(fold_results):
    """Plot metrics across folds"""
    df = pd.DataFrame(fold_results)

    metrics = ['accuracy', 'precision', 'recall', 'f1']

    plt.figure(figsize=(12, 8))

    for i, metric in enumerate(metrics):
        plt.plot(df['fold'], df[metric], 'o-', label=metric.capitalize(), markersize=8)

    plt.xlabel('Fold')
    plt.ylabel('Score')
    plt.title('Performance Metrics Across Folds')
    plt.xticks(df['fold'])
    plt.ylim(0, 1)
    plt.grid(True, alpha=0.3)
    plt.legend()

    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, 'fold_results.png'))
    plt.close()

def main():
    # Load data
    print("Loading data...")
    df = load_data()
    print("Using device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

    # Perform k-fold cross-validation
    print(f"Performing {K_FOLDS}-fold cross-validation...")
    metrics, hyperparam_results = k_fold_cross_validation(df, k=K_FOLDS)

    # Print final metrics
    print("\nFinal BERT metrics:")
    for metric, (mean, std) in metrics.items():
        print(f"{metric}: {mean:.3f} ± {std:.3f}")

    # Find best hyperparameters
    best_result = max(hyperparam_results, key=lambda x: x['accuracy'])
    print(f"\nBest hyperparameters: lr={best_result['learning_rate']}, epochs={best_result['epochs']}")
    print(f"Best validation accuracy: {best_result['accuracy']:.3f}")

    print("\nDone! Results saved to CSV and PNG files.")

if __name__ == "__main__":
    main()

Loading data...
Using device: Tesla T4
Performing 5-fold cross-validation...
Performing hyperparameter tuning on first fold...

Training with lr=2e-05, epochs=2


Map:   0%|          | 0/16248 [00:00<?, ? examples/s]

Map:   0%|          | 0/4062 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3592,0.351869,0.857706,0.408998,0.696864,0.289436
2,0.1876,0.354024,0.856721,0.512563,0.60835,0.442836



Training with lr=2e-05, epochs=3


Map:   0%|          | 0/16248 [00:00<?, ? examples/s]

Map:   0%|          | 0/4062 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3639,0.357516,0.854013,0.421463,0.646707,0.31259
2,0.1605,0.36936,0.86583,0.506787,0.676329,0.40521
3,0.2042,0.378173,0.85352,0.550943,0.57571,0.52822



Training with lr=2e-05, epochs=4


Map:   0%|          | 0/16248 [00:00<?, ? examples/s]

Map:   0%|          | 0/4062 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3607,0.357322,0.85229,0.407115,0.641745,0.298119
2,0.1782,0.39024,0.866076,0.464567,0.726154,0.341534
3,0.2387,0.391771,0.869276,0.541054,0.671674,0.452967
4,0.1034,0.458491,0.861891,0.564779,0.608696,0.526773



Training with lr=5e-05, epochs=2


Map:   0%|          | 0/16248 [00:00<?, ? examples/s]

Map:   0%|          | 0/4062 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3765,0.355525,0.85549,0.458986,0.63198,0.360347
2,0.1288,0.361546,0.864845,0.538267,0.64257,0.463097



Training with lr=5e-05, epochs=3


Map:   0%|          | 0/16248 [00:00<?, ? examples/s]

Map:   0%|          | 0/4062 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3719,0.362324,0.844412,0.487844,0.554328,0.435601
2,0.1456,0.379806,0.864845,0.517998,0.658482,0.426918
3,0.183,0.411643,0.861645,0.553259,0.613757,0.503618



Training with lr=5e-05, epochs=4


Map:   0%|          | 0/16248 [00:00<?, ? examples/s]

Map:   0%|          | 0/4062 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3458,0.382758,0.839242,0.51305,0.529231,0.497829
2,0.1414,0.381302,0.868291,0.538395,0.666667,0.45152
3,0.2082,0.408279,0.870015,0.542461,0.676026,0.452967
4,0.153,0.522624,0.859921,0.557198,0.602694,0.51809



Training with lr=0.0001, epochs=2


Map:   0%|          | 0/16248 [00:00<?, ? examples/s]

Map:   0%|          | 0/4062 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4668,0.406592,0.839242,0.116373,0.895833,0.062229
2,0.1987,0.393204,0.849828,0.448463,0.59759,0.3589



Training with lr=0.0001, epochs=3


Map:   0%|          | 0/16248 [00:00<?, ? examples/s]

Map:   0%|          | 0/4062 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4778,0.439212,0.838503,0.098901,0.972973,0.052098
2,0.2346,0.432508,0.85229,0.321267,0.735751,0.205499
3,0.3366,0.371362,0.856228,0.471014,0.62954,0.376266



Training with lr=0.0001, epochs=4


Map:   0%|          | 0/16248 [00:00<?, ? examples/s]

Map:   0%|          | 0/4062 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4753,0.439842,0.837272,0.11984,0.75,0.065123
2,0.3035,0.408635,0.842935,0.254673,0.660606,0.157742
3,0.3379,0.369153,0.846873,0.464716,0.573248,0.390738
4,0.2593,0.383181,0.851059,0.471616,0.594714,0.390738



Running 5-fold cross-validation with best hyperparameters: lr=5e-05, epochs=4

Training fold 1/5


Map:   0%|          | 0/16248 [00:00<?, ? examples/s]

Map:   0%|          | 0/4062 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3699,0.364305,0.848104,0.481948,0.574,0.41534
2,0.1562,0.395187,0.868045,0.503704,0.699229,0.393632
3,0.305,0.420738,0.869769,0.514233,0.703518,0.40521
4,0.1168,0.513146,0.865091,0.570533,0.622222,0.526773


Fold 1 results: accuracy=0.870, f1=0.514, precision=0.704, recall=0.405

Training fold 2/5


Map:   0%|          | 0/16248 [00:00<?, ? examples/s]

Map:   0%|          | 0/4062 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3346,0.461472,0.838011,0.167089,1.0,0.09116
2,0.2723,0.343626,0.855736,0.586742,0.599424,0.574586
3,0.2071,0.392132,0.863368,0.601579,0.626308,0.578729
4,0.1403,0.525522,0.863122,0.584454,0.636808,0.540055


Fold 2 results: accuracy=0.863, f1=0.602, precision=0.626, recall=0.579

Training fold 3/5


Map:   0%|          | 0/16248 [00:00<?, ? examples/s]

Map:   0%|          | 0/4062 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3042,0.356894,0.85549,0.372193,0.790909,0.243357
2,0.4351,0.325678,0.864106,0.550489,0.658869,0.472727
3,0.195,0.394245,0.874692,0.577593,0.710204,0.486713
4,0.0537,0.523433,0.871738,0.609738,0.656452,0.569231


Fold 3 results: accuracy=0.875, f1=0.578, precision=0.710, recall=0.487

Training fold 4/5


Map:   0%|          | 0/16248 [00:00<?, ? examples/s]

Map:   0%|          | 0/4062 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3275,0.396768,0.839488,0.243619,0.875,0.141509
2,0.2481,0.348164,0.864106,0.514085,0.741117,0.393531
3,0.1365,0.4273,0.871,0.593168,0.699634,0.514825
4,0.0899,0.532359,0.865583,0.604348,0.653605,0.561995


Fold 4 results: accuracy=0.871, f1=0.593, precision=0.700, recall=0.515

Training fold 5/5


Map:   0%|          | 0/16248 [00:00<?, ? examples/s]

Map:   0%|          | 0/4062 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3538,0.35931,0.856967,0.497839,0.679245,0.392906
2,0.2699,0.378053,0.861645,0.531667,0.683084,0.435198
3,0.1238,0.532535,0.850812,0.561505,0.597843,0.529332
4,0.1166,0.679528,0.84712,0.536221,0.592409,0.489768


Fold 5 results: accuracy=0.862, f1=0.532, precision=0.683, recall=0.435

Final BERT metrics:
Accuracy: 0.868 ± 0.005
Precision: 0.685 ± 0.030
Recall: 0.484 ± 0.061
F1 Score: 0.564 ± 0.035

Final BERT metrics:
Accuracy: 0.868 ± 0.005
Precision: 0.685 ± 0.030
Recall: 0.484 ± 0.061
F1 Score: 0.564 ± 0.035

Best hyperparameters: lr=5e-05, epochs=4
Best validation accuracy: 0.870

Done! Results saved to CSV and PNG files.
