# Training Infrastructure

## Loading data into a dataset dictionary

In [3]:
import pandas as pd
from datasets import Dataset, DatasetDict

train = pd.read_parquet("/home/developing_nacho/fhdw/knowledge_engineering/fakenews_detection/data/model_training/train_df.parquet")
valid = pd.read_parquet("/home/developing_nacho/fhdw/knowledge_engineering/fakenews_detection/data/model_training/valid_df.parquet")
test = pd.read_parquet("/home/developing_nacho/fhdw/knowledge_engineering/fakenews_detection/data/model_training/test_df.parquet")

dataset = DatasetDict(
    {'train':Dataset.from_pandas(train,preserve_index=False),
     'validation': Dataset.from_pandas(valid,preserve_index=False),
     'test':Dataset.from_pandas(test,preserve_index=False)
     }    
)

dataset

DatasetDict({
    train: Dataset({
        features: ['preprocessed_text', 'label', 'label_names'],
        num_rows: 44153
    })
    validation: Dataset({
        features: ['preprocessed_text', 'label', 'label_names'],
        num_rows: 12616
    })
    test: Dataset({
        features: ['preprocessed_text', 'label', 'label_names'],
        num_rows: 6308
    })
})

# Model Training

## Transformer Model Training

### Method for evaluating performance while training

In [4]:
import torch
import evaluate

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(pred):
    logits, labels = pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    result = {}
    for metric in [accuracy_metric, f1_metric, precision_metric, recall_metric]:
        result.update(metric.compute(predictions=predictions, references=labels))
    return result

2025-02-02 23:51:48.144052: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-02 23:51:48.430439: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2025-02-02 23:51:48.430453: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [5]:
import glob
import os
import pandas as pd

def load_dataset(path):
    parquet_files = glob.glob(f"{path}/*.parquet")
    dataframes = {file.split('/')[-1].replace('.parquet', ''): pd.read_parquet(file) for file in parquet_files}
    
    dataset = DatasetDict(
        {name: Dataset.from_pandas(df, preserve_index=False) for name, df in dataframes.items()}
    )
    return dataset

def save_dataset_as_parquet(dataset_dict, folder_path):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    
    for split, dataset in dataset_dict.items():
        file_path = os.path.join(folder_path, f"{split}.parquet")
        df = dataset.to_pandas()
        df.to_parquet(file_path)

### Tokenizing the Dataset

In [6]:
from transformers import AutoTokenizer, DebertaV2Tokenizer
import os
def tokenize_data(dataset, model_name, model_dir=None, save_and_load=False):
    """"Tokenize the given dataset using the given model tokenizer."""
    if model_name == "microsoft/deberta-v3-base":
        tokenizer = DebertaV2Tokenizer.from_pretrained(model_name, use_fast=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name) 
        
    def tokenize_and_format(batch):
        tokens = tokenizer(batch['preprocessed_text'], padding=True, truncation=True)
        tokens = {key: torch.tensor(val).to(device) for key, val in tokens.items()}
        tokens['labels'] = torch.tensor(batch['label']).to(device)
        return tokens
    if save_and_load:
        if model_dir and os.path.exists(f"{model_dir}/tokenized_dataset"):
            print(f"Loading tokenized dataset from {model_dir}")
            tokenized_dataset = load_dataset(f"{model_dir}/tokenized_dataset")
        else:
            print(f"Tokenizing Data")
            tokenized_dataset = dataset.map(tokenize_and_format, batched=True)
            print(f"Saving tokenized dataset to {model_dir}")
            save_dataset_as_parquet(tokenized_dataset, f"{model_dir}/tokenized_dataset")
    else:
        print(f"Tokenizing Data")
        tokenized_dataset = dataset.map(tokenize_and_format, batched=True)

    return tokenized_dataset, tokenizer

### Getting Model Path

In [7]:
import shutil
import os
from pathlib import Path
def get_model_dir(model_name, use_peft, from_checkpoint):
    
    root_path = Path().resolve().parent
    
    if use_peft:
        model_dir = f"{root_path}/models/with_peft/{model_name}"
    else:
        model_dir = f"{root_path}/models/without_peft/{model_name}"
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
        
        if from_checkpoint:
            print(f"WARNING: Cannot continue training from checkpoint as the model directory is empty. Starting from scratch.")
            from_checkpoint = False
        
    elif not from_checkpoint:
    
        input("WARNING: The model directory already exists. As 'from_checkpoint' is set to 'False' the content will be overwritten. Press Enter to continue or Ctrl+C to cancel.")
        # Overwrite the existing directory
        for filename in os.listdir(model_dir):
            file_path = os.path.join(model_dir, filename)
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
    
    print(f"Model directory: {model_dir}")
    return model_dir, from_checkpoint

### Looking for Model Checkpoints

In [8]:
import os
def find_model_file(model_dir):
    """
    Look for a model file in the given directory, with the following priority:
    1. model.safetensors or adapter_model.safetensors in the root directory
    2. model.safetensors or adapter_model.safetensors in the latest checkpoint directory
    Returns None if no model file is found.
    """
    checkpoint_dirs = sorted([
        d for d in os.listdir(model_dir) 
        if os.path.isdir(os.path.join(model_dir, d)) 
        and d.startswith('checkpoint-')
    ], key=lambda x: int(x.split('-')[-1]), reverse=True)
    
    # Check each checkpoint directory for model files
    for checkpoint_dir in checkpoint_dirs:
        full_path = os.path.join(model_dir, checkpoint_dir)
        if os.path.exists(os.path.join(full_path, "model.safetensors")) or \
        os.path.exists(os.path.join(full_path, "adapter_model.safetensors")):
            print(f"Found checkpoint model file in: {full_path}")
            return full_path
        else:
            print(f"No model file found in {full_path}")
            return None

### Loading Model Object without PEFT

In [9]:
from transformers import  AutoModelForSequenceClassification
def load_model_object(model_dir, model, config, from_checkpoint=False):
    
    if not from_checkpoint:
        return model

    checkpoint_dir = find_model_file(model_dir)
         
    if checkpoint_dir is None:
        print(f"No checkpoint found in {from_checkpoint}")
        return model 
    else:
        print(f"Loading full model weights from {checkpoint_dir}")
        model = AutoModelForSequenceClassification.from_pretrained(
            checkpoint_dir,
            config=config
        )
        return model

### Setup of PEFT Model

In [10]:
from peft import get_peft_model, LoraConfig, PeftModelForSequenceClassification

def setup_peft(model_name, model, model_dir, from_checkpoint=False):
    if model_name == "distilbert-base-uncased":
        target_modules = ["q_lin", "k_lin", "v_lin"]
    elif model_name == "microsoft/deberta-v3-base":
        target_modules = None
    else:
        target_modules = ["query", "key", "value"]
        
    # PEFT: LoRA configuration
    peft_config = LoraConfig(
        task_type="SEQ_CLS",
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=target_modules
    )
    model = get_peft_model(model, peft_config)
    
    if from_checkpoint:
        checkpoint_dir = find_model_file(model_dir)
        if checkpoint_dir:
            print(f"Loading LoRA weights from {checkpoint_dir}")
            from_pretrained_kwargs = {
                "is_trainable": True,
                "inference_mode": False
            }
            model = PeftModelForSequenceClassification.from_pretrained(
                model,
                checkpoint_dir,
                **from_pretrained_kwargs,
            )
        else:
            print(f"No checkpoint found in {model_dir}")
    return model

### Load Previous Training Config

In [11]:
import json
def get_completed_epochs(model_dir):
    checkpoint_dir = find_model_file(model_dir)
    trainer_state_path = os.path.join(checkpoint_dir, "trainer_state.json")
    if os.path.exists(trainer_state_path):
        with open(trainer_state_path, "r") as f:
            trainer_state = json.load(f)
            completed_epochs = trainer_state.get("epoch", 0)
            per_device_train_batch_size = trainer_state.get("train_batch_size", 32)
            print(f"Resuming from checkpoint. Completed epochs: {completed_epochs}, Previous Batch Size: {per_device_train_batch_size}")
        return completed_epochs, per_device_train_batch_size 
    else:
        print("Starting from scratch.")
        return None, None

In [12]:
import pandas as pd
import json
from datetime import datetime
import os
from pathlib import Path
def process_training_logs(data, use_peft, model_name):
    """
    Process training logs into separate training and evaluation dataframes with Parquet storage.
    
    Args:
        data (list): List of dictionaries containing training and evaluation logs
        save_dir (str, optional): Directory to save the processed DataFrames as Parquet files
        
    Returns:
        tuple: (training_df, eval_df, summary_df) containing processed DataFrames
    """
    root_path = Path().resolve().parent
    if use_peft:
        save_dir = f"{root_path}/data/model_evaluation/with_peft/{model_name}"
    else:
        save_dir = f"{root_path}/data/model_evaluation/without_peft/{model_name}"
    # Initialize log containers
    training_logs = []
    eval_logs = []
    summary_logs = []
    
    for entry in data:
        # Process evaluation logs
        if any(key.startswith('eval_') for key in entry.keys()):
            eval_entry = {'epoch': entry.get('epoch'), 'step': entry.get('step')}
            for key, value in entry.items():
                if key.startswith('eval_'):
                    clean_key = key[5:]
                    eval_entry[clean_key] = value
            eval_logs.append(eval_entry)
            
        # Process training summary logs
        elif 'train_loss' in entry:
            summary_entry = {
                'epoch': entry.get('epoch'),
                'step': entry.get('step'),
                'total_flos': entry.get('total_flos'),
                'train_loss': entry.get('train_loss'),
                'train_runtime': entry.get('train_runtime'),
                'train_samples_per_second': entry.get('train_samples_per_second'),
                'train_steps_per_second': entry.get('train_steps_per_second')
            }
            summary_logs.append(summary_entry)
            
        # Process regular training logs
        else:
            training_logs.append(entry)
    
    # Create DataFrames
    training_df = pd.DataFrame(training_logs)
    eval_df = pd.DataFrame(eval_logs)
    summary_df = pd.DataFrame(summary_logs)
    
    # Sort and clean up DataFrames
    if not training_df.empty:
        training_df = training_df.sort_values(['epoch', 'step']).reset_index(drop=True)
        training_df['loss_change'] = training_df['loss'].diff() if 'loss' in training_df.columns else None
        training_df['loss_change_rate'] = (training_df['loss_change'] / training_df['loss'].shift(1)) if 'loss' in training_df.columns else None

    if not eval_df.empty:
        eval_df = eval_df.sort_values(['epoch', 'step']).reset_index(drop=True)
        if 'loss' in eval_df.columns:
            eval_df['loss_change'] = eval_df['loss'].diff()
            eval_df['best_loss_so_far'] = eval_df['loss'].cummin()
        if 'accuracy' in eval_df.columns:
            eval_df['best_accuracy_so_far'] = eval_df['accuracy'].cummax()

    # Save DataFrames if directory is provided
    if save_dir:
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        os.makedirs(save_dir, exist_ok=True)
        
        # Save DataFrames as Parquet with optimal compression
        if not training_df.empty:
            training_df.to_parquet(
                os.path.join(save_dir, f'training_logs.parquet'),
                compression='brotli',  # Typically best compression ratio for ML metrics
                index=False
            )
        if not eval_df.empty:
            eval_df.to_parquet(
                os.path.join(save_dir, f'eval_logs.parquet'),
                compression='brotli',
                index=False
            )
        if not summary_df.empty:
            summary_df.to_parquet(
                os.path.join(save_dir, f'summary_logs.parquet'),
                compression='brotli',
                index=False
            )
            
        # Save minimal training configuration summary
        config_summary = {
            'total_steps': len(training_df) if not training_df.empty else 0,
            'total_epochs': float(training_df['epoch'].max()) if not training_df.empty else 0,
            'eval_frequency': len(eval_df) / len(training_df) if not training_df.empty and not eval_df.empty else 0,
            'final_train_loss': float(training_df['loss'].iloc[-1]) if not training_df.empty and 'loss' in training_df else None,
            'best_eval_loss': float(eval_df['loss'].min()) if not eval_df.empty and 'loss' in eval_df else None,
            'best_eval_accuracy': float(eval_df['accuracy'].max()) if not eval_df.empty and 'accuracy' in eval_df else None,
            'timestamp': timestamp
        }
        
        with open(os.path.join(save_dir, f'training_summary.json'), 'w') as f:
            json.dump(config_summary, f)
    
    return training_df, eval_df, summary_df

In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, roc_curve, auc
import torch
from datetime import datetime
import os

def process_model_evaluation(trainer, tokenized_dataset, test_results, model_name, use_peft):
    """
    Process and save detailed evaluation metrics for the model.
    
    Args:
        trainer: HuggingFace trainer instance
        tokenized_dataset: Dictionary containing the dataset splits
        test_results: Dictionary containing initial test results
        model_name: Name of the model being evaluated
        use_peft: Boolean indicating if PEFT was used
        save_dir: Directory to save the evaluation results
        
    Returns:
        dict: Dictionary containing all computed metrics
    """
    
    root_path = Path().resolve().parent
    if use_peft:
        save_dir = f"{root_path}/data/model_evaluation/with_peft/{model_name}"
    else:
        save_dir = f"{root_path}/data/model_evaluation/without_peft/{model_name}"
        
    os.makedirs(save_dir, exist_ok=True)
    
    # Get predictions and labels for the test set
    test_pred = trainer.predict(tokenized_dataset["test"])
    predictions = np.argmax(test_pred.predictions, axis=1)
    labels = test_pred.label_ids
    
    # Calculate probabilities for ROC curve
    probabilities = torch.nn.functional.softmax(torch.tensor(test_pred.predictions), dim=1).numpy()
    
    # Calculate confusion matrix
    cm = confusion_matrix(labels, predictions)
    tn, fp, fn, tp = cm.ravel()
    
    # Calculate ROC curve and AUC
    fpr, tpr, _ = roc_curve(labels, probabilities[:, 1])
    roc_auc = auc(fpr, tpr)
    
    # Compile all metrics
    metrics_dict = {
        'model_name': model_name,
        'use_peft': use_peft,
        'accuracy': test_results['eval_accuracy'],
        'precision': test_results['eval_precision'],
        'recall': test_results['eval_recall'],
        'f1': test_results['eval_f1'],
        'loss': test_results['eval_loss'],
        'roc_auc': roc_auc,
        'true_negatives': int(tn),
        'false_positives': int(fp),
        'false_negatives': int(fn),
        'true_positives': int(tp)
    }
    
    # Create DataFrames for different aspects of evaluation
    main_metrics_df = pd.DataFrame([metrics_dict])
    
    # Create confusion matrix DataFrame
    confusion_df = pd.DataFrame({
        'model_name': [model_name],
        'predicted_negative_actual_negative': [tn],
        'predicted_positive_actual_negative': [fp],
        'predicted_negative_actual_positive': [fn],
        'predicted_positive_actual_positive': [tp]
    })
    
    # Create ROC curve DataFrame
    roc_df = pd.DataFrame({
        'model_name': model_name,
        'false_positive_rate': fpr,
        'true_positive_rate': tpr,
        'auc': roc_auc
    })
    
    # Create predictions DataFrame
    predictions_df = pd.DataFrame({
        'model_name': model_name,
        'true_label': labels,
        'predicted_label': predictions,
        'confidence_negative': probabilities[:, 0],
        'confidence_positive': probabilities[:, 1]
    })
    
    
    main_metrics_df.to_parquet(f'{save_dir}/sklearn_metrics.parquet', compression='brotli', index=False)
    confusion_df.to_parquet(f'{save_dir}/sklearn_confusion.parquet', compression='brotli', index=False)
    roc_df.to_parquet(f'{save_dir}/sklearn_roc.parquet', compression='brotli', index=False)
    predictions_df.to_parquet(f'{save_dir}/sklearn_predictions.parquet', compression='brotli', index=False)
    
    return metrics_dict

### Code for Transformer Model Training

In [14]:
from transformers import AutoConfig, AutoModelForSequenceClassification, TrainingArguments, Trainer
from pprint import pprint
def fine_tune_model(
    model_name, 
    dataset, 
    training_batch_size=32, 
    epochs=5,
    use_peft=True,
    from_checkpoint=True
):
    print(f"Using Model: {model_name} with device {device}")
    print(f"Training mode: {'PEFT' if use_peft else 'Full model'}")
    
    model_dir, from_checkpoint = get_model_dir(model_name, use_peft, from_checkpoint)
    
    tokenized_dataset, tokenizer = tokenize_data(dataset, model_name, model_dir)

    config = AutoConfig.from_pretrained(model_name, num_labels=2)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config).to(device)
    
    if use_peft:
        model = setup_peft(model_name, model, model_dir, from_checkpoint=from_checkpoint)
        
    else:
        model = load_model_object(model_dir, model, config, from_checkpoint=from_checkpoint)

    model = model.to(device)
    
    if from_checkpoint:
        completed_epochs, old_batch_size = get_completed_epochs(model_dir)
        if completed_epochs is not None:
            epochs = completed_epochs + epochs
        if old_batch_size is not None:
            training_batch_size = old_batch_size

    training_args = TrainingArguments(
        output_dir=model_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=training_batch_size,
        per_device_eval_batch_size=training_batch_size,
        learning_rate=2e-5,
        weight_decay=0.01,
        evaluation_strategy="steps",
        eval_steps=200,
        save_strategy="steps",
        save_steps=200,
        logging_dir=f"{model_dir}/logs",
        save_total_limit=4,
        fp16=True,
        logging_steps=50,
        report_to="tensorboard",
        lr_scheduler_type="linear",
        warmup_steps=500,
        metric_for_best_model="eval_loss",
        load_best_model_at_end=True,
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        compute_metrics=compute_metrics,
    )
    
    print("Starting Training")
    
    trainer.train(resume_from_checkpoint=from_checkpoint)
    test_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
    print("Test Results:")
    pprint(f"{test_results}")
    
    process_model_evaluation(
        trainer=trainer,
        tokenized_dataset=tokenized_dataset,
        test_results=test_results,
        model_name=model_name,
        use_peft=use_peft,
    )
    process_training_logs(trainer.state.log_history, use_peft, model_name)
    
    if use_peft:
        model.save_pretrained(model_dir)
    else:
        trainer.save_model(model_dir)
    tokenizer.save_pretrained(model_dir)
    
    torch.cuda.empty_cache()
    print(f"Finished training {model_name}. Model saved to {model_dir}")

In [15]:
from warnings import filterwarnings
filterwarnings("ignore", category=FutureWarning)

possible_models = {"bert": "bert-base-uncased", "distilbert": "distilbert-base-uncased", "roberta": "roberta-base"}

# , "deberta": "microsoft/deberta-v3-base"
# current_model = possible_models["distilbert"]

for current_model in possible_models.values():
    fine_tune_model(current_model, dataset, training_batch_size=32, epochs=8, use_peft=True, from_checkpoint=False)

for current_model in possible_models.values():
    fine_tune_model(current_model, dataset, training_batch_size=32, epochs=8, use_peft=False, from_checkpoint=False)




Using Model: distilbert-base-uncased with device cuda
Training mode: PEFT
Model directory: /home/developing_nacho/fhdw/knowledge_engineering/fakenews_detection/models/with_peft/distilbert-base-uncased
Tokenizing Data


Map:   0%|          | 0/44153 [00:00<?, ? examples/s]

Map:   0%|          | 0/12616 [00:00<?, ? examples/s]

Map:   0%|          | 0/6308 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting Training


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
200,0.6803,0.672272,0.56785,0.072158,0.968037,0.037476
400,0.5408,0.481963,0.793358,0.763495,0.784197,0.743857
600,0.3491,0.322739,0.861842,0.842305,0.862676,0.822874
800,0.2906,0.265652,0.886969,0.876301,0.86033,0.892876
1000,0.262,0.245312,0.894023,0.881124,0.886404,0.875906
1200,0.2269,0.234588,0.898779,0.888773,0.87603,0.901891


Test Results:
("{'eval_loss': 0.22801239788532257, 'eval_accuracy': 0.904565630944832, "
 "'eval_f1': 0.8948655256723717, 'eval_precision': 0.8843631342768381, "
 "'eval_recall': 0.9056203605514316, 'eval_runtime': 13.5983, "
 "'eval_samples_per_second': 463.883, 'eval_steps_per_second': 14.561, "
 "'epoch': 1.0}")
Finished training distilbert-base-uncased. Model saved to /home/developing_nacho/fhdw/knowledge_engineering/fakenews_detection/models/with_peft/distilbert-base-uncased


# Training Sklearn Models

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.pipeline import make_pipeline
import joblib
import pandas as pd
from pathlib import Path
from lightgbm import LGBMClassifier
from time import time

tfidf_vectorizer = TfidfVectorizer(max_features=10000)


models = {
    'Logistic_Regression': LogisticRegression(),
    'Passive_Aggressive': PassiveAggressiveClassifier(),
    'Multinomial_NB': MultinomialNB(),
    'K-Nearest_Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(),
    'Decsion_Tree': DecisionTreeClassifier(),
    'Random_Forest': RandomForestClassifier(),
    'Gradient_Boosting': GradientBoostingClassifier(),
    'LightGBM': LGBMClassifier(),

}
evaluation_results = []
root_path = Path().resolve().parent

train  = pd.read_parquet(f"{root_path}/data/model_training/train_df.parquet")
test = pd.read_parquet(f"{root_path}/data/model_training/test_df.parquet")

X_train = train["preprocessed_text"].tolist()
y_train = train["label"].tolist()
X_valid = test["preprocessed_text"].tolist()
y_valid = test["label"].tolist()

for model_name, model in models.items():
    start_time = time()
    print(f'Training {model_name}...')
    pipeline = make_pipeline(tfidf_vectorizer, model)
    pipeline.fit(X_train, y_train)

    joblib.dump(pipeline, f'{root_path}/models/sklearn_models/{model_name}_model.pkl')

    # Evaluate on training data
    train_pred = pipeline.predict(X_train)
    accuracy_train = accuracy_score(y_train, train_pred)
    precision_train = precision_score(y_train, train_pred, average='binary', zero_division=0)
    recall_train = recall_score(y_train, train_pred, average='binary', zero_division=0)
    f1_train = f1_score(y_train, train_pred, average='binary', zero_division=0)
    cm_train = confusion_matrix(y_train, train_pred)
    
    # Evaluate on validation data
    test_pred = pipeline.predict(X_valid)
    accuracy_test = accuracy_score(y_valid, test_pred)
    precision_test = precision_score(y_valid, test_pred, average='binary', zero_division=0)
    recall_test = recall_score(y_valid, test_pred, average='binary', zero_division=0)
    f1_test = f1_score(y_valid, test_pred, average='binary', zero_division=0)
    cm_test = confusion_matrix(y_valid, test_pred)
    
    training_time = time() - start_time
    
    tn, fp, fn, tp = cm_train.ravel()
    evaluation_results.append({
        'Model': model_name,
        'Dataset': 'Train',
        'Accuracy': accuracy_train,
        'Precision': precision_train,
        'Recall': recall_train,
        'F1': f1_train,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'tp': tp,
        'Training_Time': training_time
    })
    
    tn, fp, fn, tp = cm_test.ravel()
    evaluation_results.append({
        'Model': model_name,
        'Dataset': 'Test',
        'Accuracy': accuracy_test,
        'Precision': precision_test,
        'Recall': recall_test,
        'F1': f1_test,
        'tn': tn,
        'fp': fp,
        'fn': fn,
        'tp': tp,
        'Training_Time': training_time
    })
    print(f'{model_name} accuracy: {accuracy_test:.4f}')
    print(f'Precision: {precision_test:.4f}, Recall: {recall_test:.4f}, F1 Score: {f1_test:.4f}')

evaluation_df = pd.DataFrame(evaluation_results)
evaluation_df.to_parquet(f"{root_path}/data/model_evaluation/sklearn_models_evaluation.parquet", index=False)
display(evaluation_df)


Training Logistic_Regression...
Logistic_Regression accuracy: 0.9461
Precision: 0.9430, Recall: 0.9364, F1 Score: 0.9397
Training Passive_Aggressive...
Passive_Aggressive accuracy: 0.9396
Precision: 0.9316, Recall: 0.9339, F1 Score: 0.9327


Unnamed: 0,Model,Dataset,Accuracy,Precision,Recall,F1,tn,fp,fn,tp,Training_Time
0,Logistic_Regression,Train,0.956152,0.950883,0.951364,0.951123,23380,973,963,18837,16.303879
1,Logistic_Regression,Test,0.9461,0.94304,0.936373,0.939695,3319,160,180,2649,16.303879
2,Passive_Aggressive,Train,0.999932,0.999949,0.999899,0.999924,24352,1,2,19798,15.580786
3,Passive_Aggressive,Test,0.939601,0.931594,0.933899,0.932745,3285,194,187,2642,15.580786
