In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
import pickle
import os
import time
import shutil
import logging
from tqdm.auto import tqdm
from huggingface_hub import snapshot_download

In [5]:
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


In [6]:
local_model_path = '../bert-base-uncased'

# Verify local model exists
if not os.path.exists(local_model_path) or not os.path.isfile(os.path.join(local_model_path, 'pytorch_model.bin')):
    raise FileNotFoundError(f"Local model directory {local_model_path} is missing or incomplete. Ensure bert-base-uncased is in {local_model_path}.")

In [7]:
# Load preprocessed data
df = pd.read_pickle('../data/preprocessed_scam_data.pkl')
print('Loaded data shape:', df.shape)

Loaded data shape: (545, 20)


In [8]:
# Convert text labels to numeric (scam: 1, legit: 0)
labels = df['label'].map({'scam': 1, 'legit': 0})
print('Label distribution:', labels.value_counts())

Label distribution: label
1    348
0    197
Name: count, dtype: int64


In [9]:
# Custom Dataset for BERT
class ScamDataset(Dataset):
    def __init__(self, input_ids, labels):
        self.input_ids = input_ids
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [10]:
# Prepare data for BERT
bert_input_ids = np.array(df['bert_input_ids'].tolist())  # Convert to numpy array
X_train, X_test, y_train, y_test = train_test_split(bert_input_ids, labels, test_size=0.2, random_state=42, stratify=labels)

In [11]:
# Create datasets
train_dataset = ScamDataset(X_train, y_train.values)
test_dataset = ScamDataset(X_test, y_test.values)

In [12]:
# Load BERT model from local directory
logger.info(f'Loading BERT model from {local_model_path}...')
try:
    model = BertForSequenceClassification.from_pretrained(
        local_model_path,
        num_labels=2,
        from_tf=False,  # Ensure PyTorch weights
    )
    logger.info('BERT model loaded successfully.')
except Exception as e:
    logger.error(f'Failed to load BERT model: {str(e)}')
    raise

2025-04-22 13:09:09,928 - INFO - Loading BERT model from ../bert-base-uncased...
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-04-22 13:09:10,309 - INFO - BERT model loaded successfully.


In [13]:
# Training arguments
training_args = TrainingArguments(
    output_dir='../outputs/bert_results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='../outputs/logs',
    logging_steps=10,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True
)

In [14]:
# Function to evaluate model
def evaluate_model(y_true, y_pred, y_pred_proba=None):
    metrics = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1-score': f1_score(y_true, y_pred)
    }
    if y_pred_proba is not None:
        metrics['ROC-AUC'] = roc_auc_score(y_true, y_pred_proba)
    cm = confusion_matrix(y_true, y_pred)
    print('Metrics:', metrics)
    print('Confusion Matrix:\n', cm)
    return metrics

In [15]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

In [16]:
# Train
print('Fine-tuning BERT...')
trainer.train()

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Fine-tuning BERT...


Epoch,Training Loss,Validation Loss
1,0.7565,0.652868
2,0.513,0.512825
3,0.4126,0.389587


TrainOutput(global_step=165, training_loss=0.5711986729592988, metrics={'train_runtime': 2992.9706, 'train_samples_per_second': 0.437, 'train_steps_per_second': 0.055, 'total_flos': 134433304848000.0, 'train_loss': 0.5711986729592988, 'epoch': 3.0})

In [17]:
# Evaluate
print('Evaluating BERT...')
predictions = trainer.predict(test_dataset)
bert_pred = np.argmax(predictions.predictions, axis=1)
bert_pred_proba = torch.softmax(torch.tensor(predictions.predictions), dim=1)[:, 1].numpy()
bert_metrics = evaluate_model(y_test, bert_pred, bert_pred_proba)

Evaluating BERT...


Metrics: {'Accuracy': 0.8532110091743119, 'Precision': 0.8292682926829268, 'Recall': 0.9714285714285714, 'F1-score': 0.8947368421052632, 'ROC-AUC': 0.8923076923076922}
Confusion Matrix:
 [[25 14]
 [ 2 68]]


In [18]:
# Save model and metrics
trainer.save_model('../models/bert_scam_model')
with open('../metrics/bert_metrics.pkl', 'wb') as f:
    pickle.dump(bert_metrics, f)


print('BERT fine-tuning complete. Model and metrics saved.')

BERT fine-tuning complete. Model and metrics saved.


In [None]:
print('hey')