In [2]:
from custom_utils import load_and_concatenate_parquet_files
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, TrainingArguments, Trainer, DebertaV2Tokenizer
from peft import get_peft_model, LoraConfig, PeftModelForSequenceClassification
import torch
from datasets import load_dataset
import evaluate
import os

  from .autonotebook import tqdm as notebook_tqdm
2025-01-30 17:03:57.027446: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-30 17:03:57.030660: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-01-30 17:03:57.030670: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [9]:
df  = load_and_concatenate_parquet_files('data/preprocessed_big_training_df')

df = df.rename(columns={'preprocessed_text': 'text'})
df["label_names"] = df["label"].apply(lambda x: "real" if x == 1 else "fake")
display(df)


Unnamed: 0,text,label,label_names
0,donald trump respond mockery fake swedish atta...,1,real
1,tweetwavethis time true pantstweetwave anthony...,1,real
2,rubio prospect trump president worrisome reute...,0,fake
3,trump lifts cyber command status boost cyber d...,0,fake
4,big republican lie economy tear apart minute v...,1,real
...,...,...,...
63116,half briton want stay eu polledinburgh reuters...,0,fake
63117,bill hillary clinton inc sale right pricein sp...,1,real
63118,orlando gunman shoot time autopsy find new yor...,0,fake
63119,lethal gap supreme court handle death penalty ...,0,fake


In [6]:
train,test      = train_test_split(df,test_size=0.3,stratify=df['label'])
test,validation = train_test_split(test,test_size=1/3,stratify=test['label'])
train.shape, test.shape, validation.shape

((44184, 3), (12624, 3), (6313, 3))

In [7]:
dataset = DatasetDict(
    {'train':Dataset.from_pandas(train,preserve_index=False),
     'test':Dataset.from_pandas(test,preserve_index=False),
     'validation': Dataset.from_pandas(validation,preserve_index=False)
     }    
)

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_names'],
        num_rows: 44184
    })
    test: Dataset({
        features: ['text', 'label', 'label_names'],
        num_rows: 12624
    })
    validation: Dataset({
        features: ['text', 'label', 'label_names'],
        num_rows: 6313
    })
})

In [8]:
label2id = {x['label_names']:x['label'] for x in dataset['train']}
id2label = {v:k for k,v in label2id.items()}
label2id, id2label

({'real': 1, 'fake': 0}, {1: 'real', 0: 'fake'})

In [9]:
# Load dataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Load the evaluation metric
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(pred):
    logits, labels = pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    result = {}
    for metric in [accuracy_metric, f1_metric, precision_metric, recall_metric]:
        result.update(metric.compute(predictions=predictions, references=labels))
    return result

In [10]:
def find_model_file(model_dir):
    """
    Look for a model file in the given directory, with the following priority:
    1. model.safetensors or adapter_model.safetensors in the root directory
    2. model.safetensors or adapter_model.safetensors in the latest checkpoint directory
    Returns None if no model file is found.
    """
    if not os.path.exists(model_dir):
        return None
        
    # First check for model files in root directory
    root_model = os.path.join(model_dir, "model.safetensors")
    root_adapter = os.path.join(model_dir, "adapter_model.safetensors")
    if os.path.exists(root_model):
        print(f"Found final model file: {root_model}")
        return model_dir
    elif os.path.exists(root_adapter):
        print(f"Found final adapter file: {root_adapter}")
        return model_dir
        
    # If not found, look for checkpoint directories
    checkpoint_dirs = sorted([
        d for d in os.listdir(model_dir) 
        if os.path.isdir(os.path.join(model_dir, d)) 
        and d.startswith('checkpoint-')
    ], key=lambda x: int(x.split('-')[-1]), reverse=True)
    
    # Check each checkpoint directory for model files
    for checkpoint_dir in checkpoint_dirs:
        full_path = os.path.join(model_dir, checkpoint_dir)
        if os.path.exists(os.path.join(full_path, "model.safetensors")) or \
           os.path.exists(os.path.join(full_path, "adapter_model.safetensors")):
            print(f"Found checkpoint model file in: {full_path}")
            return full_path

In [14]:
def fine_tune_model(
    model_ckpt, 
    dataset, 
    output_dir, 
    use_peft=True,
    training_batch_size=32, 
    checkpoint=None, 
    epochs=5
):
    print(f"Using Model: {model_ckpt} with device {device}")
    print(f"Training mode: {'PEFT' if use_peft else 'Full model'}")
    print(f"Tokenizing Data")
    
    # Tokenizer and dataset preparation
    if model_ckpt == "microsoft/deberta-v3-base":
        tokenizer = DebertaV2Tokenizer.from_pretrained(model_ckpt, use_fast=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_ckpt) 
        
    def tokenize_and_format(batch):
        tokens = tokenizer(batch['text'], padding=True, truncation=True)
        # Convert to PyTorch tensors and move to the correct device
        tokens = {key: torch.tensor(val).to(device) for key, val in tokens.items()}
        tokens['labels'] = torch.tensor(batch['label']).to(device)
        return tokens
        
    tokenized_dataset = dataset.map(tokenize_and_format, batched=True)

    # Config and model
    config = AutoConfig.from_pretrained(model_ckpt, num_labels=2)
    model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config).to(device)
    
    if use_peft:
        # Set target modules based on model architecture
        if model_ckpt == "distilbert-base-uncased":
            target_modules = ["q_lin", "k_lin", "v_lin"]
        elif model_ckpt == "microsoft/deberta-v3-base":
            target_modules = None
        else:
            target_modules = ["query", "value"]
            
        # PEFT: LoRA configuration
        peft_config = LoraConfig(
            task_type="SEQ_CLS",
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
            target_modules=target_modules
        )
        model = get_peft_model(model, peft_config)
        
        if checkpoint:
            checkpoint_dir = find_model_file(checkpoint)
            if checkpoint_dir:
                print(f"Loading LoRA weights from {checkpoint_dir}")
                from_pretrained_kwargs = {
                    "is_trainable": True,
                    "inference_mode": False
                }
                model = PeftModelForSequenceClassification.from_pretrained(
                    model,
                    checkpoint_dir,
                    **from_pretrained_kwargs
                )
            else:
                print(f"No checkpoint found in {checkpoint}")
    else:
        if checkpoint:
            checkpoint_dir = find_model_file(checkpoint)
            if checkpoint_dir:
                print(f"Loading full model weights from {checkpoint_dir}")
                model = AutoModelForSequenceClassification.from_pretrained(
                    checkpoint_dir,
                    config=config
                )
            else:
                print(f"No checkpoint found in {checkpoint}")

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=training_batch_size,
        per_device_eval_batch_size=training_batch_size,
        learning_rate=2e-5,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir=f"{output_dir}/logs",
        save_total_limit=4,
        fp16=True,
        logging_steps=50,
        report_to="tensorboard",
        lr_scheduler_type="linear",
        warmup_steps=500,
    )
    
    # Trainer setup
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        compute_metrics=compute_metrics
    )
    
    print("Starting Training")
    trainer.train(resume_from_checkpoint=checkpoint is not None)
    test_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
    print(f"Test Results: {test_results}")
    
    # Save model
    if use_peft:
        model.save_pretrained(output_dir)
    else:
        trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)
    
    torch.cuda.empty_cache()
    print(f"Finished training {model_ckpt}. Model saved to {output_dir}")

In [16]:
import os

model_checkpoints = [
    # "bert-base-uncased",
    "distilbert-base-uncased",
    # "roberta-base",
    # "microsoft/deberta-v3-base"
]

# Iterate over models
for model_ckpt in model_checkpoints:
    output_dir = f"models/{model_ckpt.replace('/', '_')}_without_peft"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        
    fine_tune_model(model_ckpt, dataset, output_dir, training_batch_size=32, epochs=1, use_peft=False, checkpoint="./models/distilbert-base-uncased_without_peft")

# checkpoint="/home/developing_nacho/fhdw/knowledge_engineering/fakenews_detection/models/distilbert-base-uncased/checkpoint-3455/rng_state.pth"

Using Model: distilbert-base-uncased with device cuda
Training mode: Full model
Tokenizing Data


Map: 100%|██████████| 44184/44184 [00:10<00:00, 4276.51 examples/s]
Map: 100%|██████████| 12624/12624 [00:02<00:00, 4448.26 examples/s]
Map: 100%|██████████| 6313/6313 [00:01<00:00, 4406.46 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Found final model file: ./models/distilbert-base-uncased_without_peft/model.safetensors
Loading full model weights from ./models/distilbert-base-uncased_without_peft
Starting Training


  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)


Epoch,Training Loss,Validation Loss


Test Results: {'eval_loss': 0.021424539387226105, 'eval_accuracy': 0.9965937896070975, 'eval_f1': 0.9962117875077086, 'eval_precision': 0.994547053649956, 'eval_recall': 0.9978821037769149, 'eval_runtime': 37.4306, 'eval_samples_per_second': 337.265, 'eval_steps_per_second': 10.553, 'epoch': 5.0}
Finished training distilbert-base-uncased. Model saved to models/distilbert-base-uncased_without_peft


In [6]:
# Load dataset
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Load the evaluation metric
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(pred):
    logits, labels = pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    result = {}
    for metric in [accuracy_metric, f1_metric, precision_metric, recall_metric]:
        result.update(metric.compute(predictions=predictions, references=labels))
    return result

# Fine-tuning function
def fine_tune_model(model_ckpt, dataset, output_dir, training_batch_size=32, checkpoint=None, epochs=5):
    print(f"Using Model: {model_ckpt} with device {device}")
    print(f"Tokenizing Data")
    # Tokenizer and dataset preparation
    if model_ckpt == "microsoft/deberta-v3-base":
        tokenizer = DebertaV2Tokenizer.from_pretrained(model_ckpt, use_fast=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_ckpt) 
    def tokenize_and_format(batch):
        tokens = tokenizer(batch['text'], padding=True, truncation=True)
        # Convert to PyTorch tensors and move to the correct device
        tokens = {key: torch.tensor(val).to(device) for key, val in tokens.items()}
        tokens['labels'] = torch.tensor(batch['label']).to(device)
        return tokens
    tokenized_dataset = dataset.map(tokenize_and_format, batched=True)

    # Config and model
    config = AutoConfig.from_pretrained(model_ckpt, num_labels=2)  # Adjust num_labels if needed
    model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config).to(device)
        
    if model_ckpt == "distilbert-base-uncased":
        target_modules = ["q_lin", "k_lin","v_lin"]
    elif model_ckpt == "microsoft/deberta-v3-base":
        target_modules = None
    else:
        target_modules = ["query", "value"]

    
    # PEFT: LoRA
    peft_config = LoraConfig(
        task_type="SEQ_CLS",
        r=8,  # Smaller rank to reduce file size
        lora_alpha=32,  # Adjust scaling factor
        lora_dropout=0.1,
        target_modules=target_modules
    )
    model = get_peft_model(model, peft_config)
    
    if checkpoint:
        print(f"Loading LoRA weights from {checkpoint}")
        model.load_state_dict(torch.load(checkpoint), strict=False)

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=epochs,  # Adjust based on needs
        per_device_train_batch_size=training_batch_size,
        per_device_eval_batch_size=training_batch_size,
        learning_rate=2e-5,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_dir=f"{output_dir}/logs",
        save_total_limit=4,  # Limit checkpoints
        fp16=True,  # Mixed precision for speed
        logging_steps=50,
        report_to="tensorboard",
        lr_scheduler_type="linear",
        warmup_steps=500,
    )
    
    # Trainer setup
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        compute_metrics=compute_metrics
    )
    print("Starting Training")
    # Train
    trainer.train(resume_from_checkpoint=checkpoint != None)
    test_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
    print(f"Test Results: {test_results}")
    # Save LoRA-only model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    torch.cuda.empty_cache()
    print(f"Finished training {model_ckpt}. Model saved to {output_dir}")


In [7]:
import os

model_checkpoints = [
    # "bert-base-uncased",
    "distilbert-base-uncased",
    # "roberta-base",
    # "microsoft/deberta-v3-base"
]

# Iterate over models
for model_ckpt in model_checkpoints:
    output_dir = f"models/{model_ckpt.replace('/', '_')}_cpu_test"
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        
    fine_tune_model(model_ckpt, dataset, output_dir, training_batch_size=32, epochs=5)

# checkpoint="/home/developing_nacho/fhdw/knowledge_engineering/fakenews_detection/models/distilbert-base-uncased/checkpoint-3455/rng_state.pth"

Using Model: distilbert-base-uncased with device cpu
Tokenizing Data


Map: 100%|██████████| 44184/44184 [00:10<00:00, 4106.36 examples/s]
Map: 100%|██████████| 12624/12624 [00:02<00:00, 4307.86 examples/s]
Map: 100%|██████████| 6313/6313 [00:01<00:00, 4248.42 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting Training


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 