In [1]:
from custom_utils import load_and_concatenate_parquet_files
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig
import torch
from datasets import load_dataset
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the preprocessed parquet shards and expose the cleaned text under the
# column name `text`.
df = (
    load_and_concatenate_parquet_files('data/preprocessed_big_training_df')
    .rename(columns={'preprocessed_text': 'text'})
)

# Human-readable class name for each numeric label: 1 -> "real", anything else -> "fake".
# NOTE(review): in the rendered sample, Reuters-style rows carry label 0 ("fake");
# confirm the label semantics against the dataset documentation.
df["label_names"] = df["label"].map(lambda value: "real" if value == 1 else "fake")
display(df)


Unnamed: 0,text,label,label_names
0,donald trump respond mockery fake swedish atta...,1,real
1,tweetwavethis time true pantstweetwave anthony...,1,real
2,rubio prospect trump president worrisome reute...,0,fake
3,trump lifts cyber command status boost cyber d...,0,fake
4,big republican lie economy tear apart minute v...,1,real
...,...,...,...
63116,half briton want stay eu polledinburgh reuters...,0,fake
63117,bill hillary clinton inc sale right pricein sp...,1,real
63118,orlando gunman shoot time autopsy find new yor...,0,fake
63119,lethal gap supreme court handle death penalty ...,0,fake


In [3]:
# Stratified 70 / 20 / 10 split into train / test / validation.
# A fixed seed makes the split identical on every Restart & Run All; without it
# each run trains and evaluates on different rows.
SEED = 42

# First carve off 30% as a holdout, then split that holdout 2:1 into test and
# validation, stratifying on the label both times to keep class balance.
train, holdout = train_test_split(
    df, test_size=0.3, stratify=df['label'], random_state=SEED
)
test, validation = train_test_split(
    holdout, test_size=1/3, stratify=holdout['label'], random_state=SEED
)

# Every row must land in exactly one split.
assert len(train) + len(test) + len(validation) == len(df)
train.shape, test.shape, validation.shape

((44184, 3), (12624, 3), (6313, 3))

In [4]:
# Convert each pandas split into a Hugging Face Dataset (without carrying the
# pandas index over as a column) and group them by split name.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in (
        ('train', train),
        ('test', test),
        ('validation', validation),
    )
})

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_names'],
        num_rows: 44184
    })
    test: Dataset({
        features: ['text', 'label', 'label_names'],
        num_rows: 12624
    })
    validation: Dataset({
        features: ['text', 'label', 'label_names'],
        num_rows: 6313
    })
})

In [5]:
# Build the label-name <-> label-id lookups from the training split.
# The two columns are read directly rather than iterating row by row: row
# iteration decodes every field of every example (including the article text)
# just to collect a handful of distinct pairs.
label2id = dict(zip(dataset['train']['label_names'], dataset['train']['label']))
id2label = {v: k for k, v in label2id.items()}

# If two names shared an id, inverting the dict would silently drop one of them.
assert len(id2label) == len(label2id), "label names and ids are not one-to-one"
label2id, id2label

({'real': 1, 'fake': 0}, {1: 'real', 0: 'fake'})

In [6]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Accuracy metric from the `evaluate` library; used by compute_metrics.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(pred):
    """Score a batch of predictions with the module-level accuracy metric.

    Args:
        pred: An iterable that unpacks into ``(logits, labels)``.

    Returns:
        Whatever ``accuracy_metric.compute`` returns for the arg-max class of
        each row of logits compared against ``labels``.
    """
    raw_scores, gold_labels = pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = torch.tensor(raw_scores).argmax(dim=-1)
    return accuracy_metric.compute(references=gold_labels, predictions=predicted_ids)

# Fine-tuning function
def fine_tune_model(model_ckpt, dataset, output_dir, training_batch_size=32, target_modules=None):
    """Fine-tune a sequence classifier with LoRA and save the adapter and tokenizer.

    Args:
        model_ckpt: Checkpoint name or path passed to the ``Auto*.from_pretrained``
            loaders.
        dataset: A ``DatasetDict`` with at least ``"train"`` and ``"test"`` splits,
            each holding a ``"text"`` and a ``"label"`` column. When a
            ``"validation"`` split is present it is used for the per-epoch
            evaluation so the test split stays untouched until the end.
        output_dir: Directory for checkpoints, logs, the saved LoRA adapter and
            the tokenizer.
        training_batch_size: Per-device batch size for both training and evaluation.
        target_modules: Optional list of module names to wrap with LoRA. When
            ``None`` the names are looked up from the checkpoint's model type,
            falling back to ``["query", "value"]``.

    Returns:
        The metrics dict produced by ``trainer.evaluate`` on the test split.
    """
    # Local import: the notebook's import cell does not bring this name in.
    from transformers import DataCollatorWithPadding

    print(f"Training {model_ckpt}")

    # Tokenizer and dataset preparation
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    def tokenize_and_format(batch):
        # Truncate only. Padding is left to the data collator so each training
        # batch is padded to its own longest sequence; padding here would pad
        # per map-batch and produce rows of differing lengths across map-batches.
        # Plain Python lists are returned: `map` stores results in Arrow, so
        # building tensors (or moving them to the GPU) here gains nothing.
        tokens = dict(tokenizer(batch['text'], truncation=True))
        tokens['labels'] = batch['label']
        return tokens

    # Drop the raw columns (text, label, label_names) so the collator only sees
    # model inputs plus `labels`.
    tokenized_dataset = dataset.map(
        tokenize_and_format,
        batched=True,
        remove_columns=dataset["train"].column_names,
    )

    # Config and model
    config = AutoConfig.from_pretrained(model_ckpt, num_labels=2)  # Adjust num_labels if needed
    # No explicit .to(device): the Trainer handles device placement.
    model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config)

    # The attention projection layers are named differently per architecture,
    # so a single hard-coded ["query", "value"] fails for DistilBERT and DeBERTa.
    if target_modules is None:
        attention_projections = {
            "bert": ["query", "value"],
            "roberta": ["query", "value"],
            "distilbert": ["q_lin", "v_lin"],
            "deberta": ["in_proj"],
            "deberta-v2": ["query_proj", "value_proj"],
        }
        target_modules = attention_projections.get(config.model_type, ["query", "value"])

    # PEFT: LoRA
    peft_config = LoraConfig(
        task_type="SEQ_CLS",
        r=8,  # Smaller rank to reduce file size
        lora_alpha=32,  # Adjust scaling factor
        lora_dropout=0.1,
        target_modules=target_modules
    )
    model = get_peft_model(model, peft_config)

    # Training arguments
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy`; adjust if the installed version rejects this name.
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=5,  # Adjust based on needs
        per_device_train_batch_size=training_batch_size,
        per_device_eval_batch_size=training_batch_size,
        learning_rate=2e-5,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir=f"{output_dir}/logs",
        save_total_limit=4,  # Limit checkpoints
        fp16=torch.cuda.is_available(),  # Mixed precision needs a GPU; stay in fp32 on CPU
        remove_unused_columns=False  # Columns were already pruned in `map` above
    )

    # Evaluate each epoch on the validation split when there is one; fall back
    # to the test split for datasets that only provide train/test.
    eval_split = "validation" if "validation" in tokenized_dataset else "test"

    # Trainer setup
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset[eval_split],
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics
    )

    # Train
    trainer.train()
    # Final evaluation must use the tokenized test split; the raw split has no
    # input_ids for the model to consume.
    test_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
    print(f"Test Results: {test_results}")
    # Save LoRA-only model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Finished training {model_ckpt}. Model saved to {output_dir}")
    return test_results


In [7]:
import os

# Base checkpoints to fine-tune; each gets its own output directory.
model_checkpoints = [
    "bert-base-uncased",
    "distilbert-base-uncased",
    "roberta-base",
    "microsoft/deberta-base"
]

# Iterate over models
for model_ckpt in model_checkpoints:
    # Checkpoint ids may contain "/", which would otherwise nest directories.
    output_dir = f"models/{model_ckpt.replace('/', '_')}"
    # exist_ok replaces the separate existence check, so re-running the cell
    # is safe and there is no gap between checking and creating.
    os.makedirs(output_dir, exist_ok=True)

    fine_tune_model(model_ckpt, dataset, output_dir)

Training bert-base-uncased


Map: 100%|██████████| 44184/44184 [00:13<00:00, 3216.55 examples/s]
Map: 100%|██████████| 12624/12624 [00:03<00:00, 3485.95 examples/s]
Map: 100%|██████████| 6313/6313 [00:01<00:00, 3476.62 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 