### Train Basic Model

In this notebook, we fine-tune a BERT BASE model to detect fake news.

In [1]:
#!pip install accelerate -U
#!pip install datasets
#!pip install transformers
# !pip install evaluate

In [2]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset, DatasetDict
from transformers import pipeline
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import pandas as pd
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
import numpy as np
from transformers import TrainingArguments
from sklearn.model_selection import KFold
import evaluate

In [3]:
data = pd.read_csv("../Dataset/Training_Emotions.csv")

### Data Preperation

In [4]:
data.columns

Index(['Unnamed: 0', 'id', 'fake', 'tweet', 'valence', 'arousal'], dtype='object')

In [5]:
data = data[['id','tweet','fake']]

In [6]:
data = data.rename(columns={'fake': 'label'})
data = data.rename(columns={'tweet': 'text'})
data['label'] = data['label'].astype(int)
data = data[data['label'].isin([0, 1])]
data = data[:50000]

#### For 5-K-Fold Cross Validation

In [7]:
train_val_df, test_df = train_test_split(data, test_size=0.1, random_state=42)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

### Define Model

In [8]:
BERT_BASE_UNCASED = "google-bert/bert-base-uncased"

def tokenizer(checkpoint):
    return AutoTokenizer.from_pretrained(checkpoint)

tokenizer_Bert_uncased = tokenizer(BERT_BASE_UNCASED)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [9]:
def tokenize_function(batch, tokenizer):
  return tokenizer(batch["text"].tolist(), truncation=True)

In [10]:
data_collator_bert_uncased = DataCollatorWithPadding(tokenizer=tokenizer_Bert_uncased, return_tensors="pt")

In [12]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="binary")
    acc = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average="binary")
    recall = recall_score(labels, preds, average="binary")
    return {"precision": precision, "recall": recall, "acc": acc, "f1": f1}

In [13]:
fold_results = []
model_name = 'Detection_Model'
batch_size = 8
num_labels = 2
for fold, (train_index, val_index) in enumerate(kf.split(train_val_df), 1):
    print(f"Training Fold {fold}")

    train_df = train_val_df.iloc[train_index]
    val_df = train_val_df.iloc[val_index]
    train_df = train_df.drop(columns=['id'])
    val_df = val_df.drop(columns=['id'])

    tokenized_datasets_BERT_uncased = {
    "train": Dataset.from_dict(tokenize_function(train_df, tokenizer_Bert_uncased)),
    "val": Dataset.from_dict(tokenize_function(val_df, tokenizer_Bert_uncased))
    }


    tokenized_datasets_BERT_uncased["train"] = tokenized_datasets_BERT_uncased["train"].add_column("labels", train_df["label"].tolist())
    tokenized_datasets_BERT_uncased["val"] = tokenized_datasets_BERT_uncased["val"].add_column("labels", val_df["label"].tolist())


    model_BERT_uncased = AutoModelForSequenceClassification.from_pretrained(BERT_BASE_UNCASED, num_labels=num_labels)
    logging_steps = len(tokenized_datasets_BERT_uncased["train"]) // batch_size

    training_args = TrainingArguments(
        output_dir=f"{model_name}_fold_{fold}",
        num_train_epochs=3,
        learning_rate=4e-5,
        save_strategy="epoch",
        save_total_limit=1,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        evaluation_strategy="epoch",
        disable_tqdm=False,
        logging_steps=logging_steps,
        log_level="info",
        label_smoothing_factor=0.1,
        weight_decay=0.01,
        lr_scheduler_type='cosine',
        warmup_steps=500
    )

    trainer = Trainer(
        model=model_BERT_uncased,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=tokenized_datasets_BERT_uncased["train"],
        eval_dataset=tokenized_datasets_BERT_uncased["val"],
        tokenizer=tokenizer_Bert_uncased,
    )

    trainer.train()

    eval_results = trainer.evaluate()
    fold_results.append(eval_results)
    print(f"Fold {fold} Results:", eval_results)
    print("\n")


avg_results = {key: np.mean([res[key] for res in fold_results]) for key in fold_results[0].keys()}
print("Average Results across all Folds:", avg_results)

Training Fold 1


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
***** Running training *****
  Num examples = 35,989
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 13,497
  Number of trainable parameters = 109,483,778


Epoch,Training Loss,Validation Loss,Precision,Recall,Acc,F1
1,0.3096,0.233799,0.983624,0.98426,0.983441,0.983942
2,0.2211,0.227946,0.983116,0.991807,0.986997,0.987442
3,0.2047,0.222395,0.990937,0.990082,0.99022,0.990509



***** Running Evaluation *****
  Num examples = 8998
  Batch size = 8
Saving model checkpoint to Detection_Model_fold_1/checkpoint-4499
Configuration saved in Detection_Model_fold_1/checkpoint-4499/config.json
Model weights saved in Detection_Model_fold_1/checkpoint-4499/model.safetensors
tokenizer config file saved in Detection_Model_fold_1/checkpoint-4499/tokenizer_config.json
Special tokens file saved in Detection_Model_fold_1/checkpoint-4499/special_tokens_map.json

***** Running Evaluation *****
  Num examples = 8998
  Batch size = 8
Saving model checkpoint to Detection_Model_fold_1/checkpoint-8998
Configuration saved in Detection_Model_fold_1/checkpoint-8998/config.json
Model weights saved in Detection_Model_fold_1/checkpoint-8998/model.safetensors
tokenizer config file saved in Detection_Model_fold_1/checkpoint-8998/tokenizer_config.json
Special tokens file saved in Detection_Model_fold_1/checkpoint-8998/special_tokens_map.json
Deleting older checkpoint [Detection_Model_fold_1/

Fold 1 Results: {'eval_loss': 0.22239454090595245, 'eval_precision': 0.9909365558912386, 'eval_recall': 0.9900819318671842, 'eval_acc': 0.9902200488997555, 'eval_f1': 0.9905090595340811, 'eval_runtime': 47.6629, 'eval_samples_per_second': 188.784, 'eval_steps_per_second': 23.603, 'epoch': 3.0}


Training Fold 2


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "_name_or_path": "google-bert/bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--google-bert--bert-base-uncased/snapshots/86b5e0934494

Epoch,Training Loss,Validation Loss,Precision,Recall,Acc,F1
1,0.306,0.243278,0.973979,0.990534,0.98144,0.982187
2,0.2216,0.226769,0.986278,0.989673,0.987553,0.987973
3,0.205,0.220582,0.98886,0.993115,0.990665,0.990983



***** Running Evaluation *****
  Num examples = 8998
  Batch size = 8
Saving model checkpoint to Detection_Model_fold_2/checkpoint-4499
Configuration saved in Detection_Model_fold_2/checkpoint-4499/config.json
Model weights saved in Detection_Model_fold_2/checkpoint-4499/model.safetensors
tokenizer config file saved in Detection_Model_fold_2/checkpoint-4499/tokenizer_config.json
Special tokens file saved in Detection_Model_fold_2/checkpoint-4499/special_tokens_map.json

***** Running Evaluation *****
  Num examples = 8998
  Batch size = 8
Saving model checkpoint to Detection_Model_fold_2/checkpoint-8998
Configuration saved in Detection_Model_fold_2/checkpoint-8998/config.json
Model weights saved in Detection_Model_fold_2/checkpoint-8998/model.safetensors
tokenizer config file saved in Detection_Model_fold_2/checkpoint-8998/tokenizer_config.json
Special tokens file saved in Detection_Model_fold_2/checkpoint-8998/special_tokens_map.json
Deleting older checkpoint [Detection_Model_fold_2/

Fold 2 Results: {'eval_loss': 0.22058166563510895, 'eval_precision': 0.988860325621251, 'eval_recall': 0.9931153184165232, 'eval_acc': 0.9906645921315848, 'eval_f1': 0.9909832546157149, 'eval_runtime': 48.6988, 'eval_samples_per_second': 184.768, 'eval_steps_per_second': 23.101, 'epoch': 3.0}


Training Fold 3


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "_name_or_path": "google-bert/bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--google-bert--bert-base-uncased/snapshots/86b5e0934494

Epoch,Training Loss,Validation Loss,Precision,Recall,Acc,F1
1,0.3106,0.242724,0.965051,0.991875,0.977103,0.978279
2,0.224,0.223234,0.985179,0.994869,0.989552,0.99
3,0.2072,0.217699,0.99188,0.992517,0.991886,0.992198



***** Running Evaluation *****
  Num examples = 8997
  Batch size = 8
Saving model checkpoint to Detection_Model_fold_3/checkpoint-4499
Configuration saved in Detection_Model_fold_3/checkpoint-4499/config.json
Model weights saved in Detection_Model_fold_3/checkpoint-4499/model.safetensors
tokenizer config file saved in Detection_Model_fold_3/checkpoint-4499/tokenizer_config.json
Special tokens file saved in Detection_Model_fold_3/checkpoint-4499/special_tokens_map.json

***** Running Evaluation *****
  Num examples = 8997
  Batch size = 8
Saving model checkpoint to Detection_Model_fold_3/checkpoint-8998
Configuration saved in Detection_Model_fold_3/checkpoint-8998/config.json
Model weights saved in Detection_Model_fold_3/checkpoint-8998/model.safetensors
tokenizer config file saved in Detection_Model_fold_3/checkpoint-8998/tokenizer_config.json
Special tokens file saved in Detection_Model_fold_3/checkpoint-8998/special_tokens_map.json
Deleting older checkpoint [Detection_Model_fold_3/

Fold 3 Results: {'eval_loss': 0.21769903600215912, 'eval_precision': 0.9918803418803419, 'eval_recall': 0.9925165704511439, 'eval_acc': 0.99188618428365, 'eval_f1': 0.9921983541733461, 'eval_runtime': 47.2213, 'eval_samples_per_second': 190.529, 'eval_steps_per_second': 23.824, 'epoch': 3.0}


Training Fold 4


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "_name_or_path": "google-bert/bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--google-bert--bert-base-uncased/snapshots/86b5e0934494

Epoch,Training Loss,Validation Loss,Precision,Recall,Acc,F1
1,0.3156,0.230321,0.986504,0.98372,0.984773,0.98511
2,0.2244,0.217528,0.987298,0.995442,0.991108,0.991353
3,0.2069,0.212647,0.991998,0.995659,0.993665,0.993825



***** Running Evaluation *****
  Num examples = 8997
  Batch size = 8
Saving model checkpoint to Detection_Model_fold_4/checkpoint-4499
Configuration saved in Detection_Model_fold_4/checkpoint-4499/config.json
Model weights saved in Detection_Model_fold_4/checkpoint-4499/model.safetensors
tokenizer config file saved in Detection_Model_fold_4/checkpoint-4499/tokenizer_config.json
Special tokens file saved in Detection_Model_fold_4/checkpoint-4499/special_tokens_map.json

***** Running Evaluation *****
  Num examples = 8997
  Batch size = 8
Saving model checkpoint to Detection_Model_fold_4/checkpoint-8998
Configuration saved in Detection_Model_fold_4/checkpoint-8998/config.json
Model weights saved in Detection_Model_fold_4/checkpoint-8998/model.safetensors
tokenizer config file saved in Detection_Model_fold_4/checkpoint-8998/tokenizer_config.json
Special tokens file saved in Detection_Model_fold_4/checkpoint-8998/special_tokens_map.json
Deleting older checkpoint [Detection_Model_fold_4/

Fold 4 Results: {'eval_loss': 0.21264666318893433, 'eval_precision': 0.9919982698961938, 'eval_recall': 0.995658780117213, 'eval_acc': 0.9936645548516172, 'eval_f1': 0.9938251543711407, 'eval_runtime': 47.5638, 'eval_samples_per_second': 189.157, 'eval_steps_per_second': 23.652, 'epoch': 3.0}


Training Fold 5


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "_name_or_path": "google-bert/bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--google-bert--bert-base-uncased/snapshots/86b5e0934494

Epoch,Training Loss,Validation Loss,Precision,Recall,Acc,F1
1,0.3168,0.245956,0.979338,0.980993,0.979104,0.980165
2,0.2222,0.225827,0.984915,0.992819,0.988218,0.988851
3,0.2059,0.218795,0.989903,0.993875,0.991442,0.991885



***** Running Evaluation *****
  Num examples = 8997
  Batch size = 8
Saving model checkpoint to Detection_Model_fold_5/checkpoint-4499
Configuration saved in Detection_Model_fold_5/checkpoint-4499/config.json
Model weights saved in Detection_Model_fold_5/checkpoint-4499/model.safetensors
tokenizer config file saved in Detection_Model_fold_5/checkpoint-4499/tokenizer_config.json
Special tokens file saved in Detection_Model_fold_5/checkpoint-4499/special_tokens_map.json

***** Running Evaluation *****
  Num examples = 8997
  Batch size = 8
Saving model checkpoint to Detection_Model_fold_5/checkpoint-8998
Configuration saved in Detection_Model_fold_5/checkpoint-8998/config.json
Model weights saved in Detection_Model_fold_5/checkpoint-8998/model.safetensors
tokenizer config file saved in Detection_Model_fold_5/checkpoint-8998/tokenizer_config.json
Special tokens file saved in Detection_Model_fold_5/checkpoint-8998/special_tokens_map.json
Deleting older checkpoint [Detection_Model_fold_5/

Fold 5 Results: {'eval_loss': 0.21879464387893677, 'eval_precision': 0.9899032393773665, 'eval_recall': 0.9938753959873284, 'eval_acc': 0.9914415916416583, 'eval_f1': 0.9918853409210665, 'eval_runtime': 48.6429, 'eval_samples_per_second': 184.96, 'eval_steps_per_second': 23.128, 'epoch': 3.0}




NameError: name 'np' is not defined

In [15]:
print("Training Final Model on Entire Dataset")

full_train_df = train_val_df.drop(columns=['id'])
tokenized_dataset_full = Dataset.from_dict(tokenize_function(full_train_df, tokenizer_Bert_uncased))
tokenized_dataset_full = tokenized_dataset_full.add_column("labels", full_train_df["label"].tolist())

final_model = AutoModelForSequenceClassification.from_pretrained(BERT_BASE_UNCASED, num_labels=num_labels)

final_training_args = TrainingArguments(
    output_dir=f"{model_name}_final",
    num_train_epochs=3,  
    learning_rate=4e-5,
    save_strategy="epoch",
    save_total_limit=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    disable_tqdm=False,
    logging_steps=len(tokenized_dataset_full) // batch_size,
    log_level="info",
    label_smoothing_factor=0.1,
    weight_decay=0.01,
    lr_scheduler_type='cosine',
    warmup_steps=500
)

final_trainer = Trainer(
    model=final_model,
    args=final_training_args,
    train_dataset=tokenized_dataset_full,
    tokenizer=tokenizer_Bert_uncased,
)

final_trainer.train()

final_trainer.save_model(f"{model_name}_final")

print("Final Model Training Completed")

print("Evaluating Final Model on Test Set")

test_df = test_df.drop(columns=['id']) 
tokenized_test_dataset = Dataset.from_dict(tokenize_function(test_df, tokenizer_Bert_uncased))
tokenized_test_dataset = tokenized_test_dataset.add_column("labels", test_df["label"].tolist())

test_results = final_trainer.evaluate(eval_dataset=tokenized_test_dataset)

print("Test Set Evaluation Results:")
for key, value in test_results.items():
    print(f"{key}: {value}")

predictions = final_trainer.predict(test_dataset=tokenized_test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

Training Final Model on Entire Dataset


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594/config.json
Model config BertConfig {
  "_name_or_path": "google-bert/bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--google-bert--bert-base-uncased/snapshots/86b5e0934494

Step,Training Loss
5623,0.3003
11246,0.2198
16869,0.2039


Saving model checkpoint to Detection_Model_final/checkpoint-5624
Configuration saved in Detection_Model_final/checkpoint-5624/config.json
Model weights saved in Detection_Model_final/checkpoint-5624/model.safetensors
tokenizer config file saved in Detection_Model_final/checkpoint-5624/tokenizer_config.json
Special tokens file saved in Detection_Model_final/checkpoint-5624/special_tokens_map.json
Saving model checkpoint to Detection_Model_final/checkpoint-11248
Configuration saved in Detection_Model_final/checkpoint-11248/config.json
Model weights saved in Detection_Model_final/checkpoint-11248/model.safetensors
tokenizer config file saved in Detection_Model_final/checkpoint-11248/tokenizer_config.json
Special tokens file saved in Detection_Model_final/checkpoint-11248/special_tokens_map.json
Deleting older checkpoint [Detection_Model_final/checkpoint-5624] due to args.save_total_limit
Saving model checkpoint to Detection_Model_final/checkpoint-16872
Configuration saved in Detection_Mod

Final Model Training Completed
Evaluating Final Model on Test Set



***** Running Evaluation *****
  Num examples = 4999
  Batch size = 8



***** Running Prediction *****
  Num examples = 4999
  Batch size = 8


Test Set Evaluation Results:
eval_loss: 0.21292193233966827
eval_runtime: 25.3158
eval_samples_per_second: 197.465
eval_steps_per_second: 24.688
epoch: 3.0


### Evaluation with Evaluate Dataset

In [17]:
ev = pd.read_csv('../Dataset/evaluation.csv')
ev['label'] = ev['BinaryNumTarget']
ev['text'] = ev['tweet']

In [29]:
def tokenize_function(examples):
    return tokenizer_Bert_uncased(examples['text'], padding="max_length", truncation=True)

In [30]:
ev_data = ev[['label', 'text']]
dataset = Dataset.from_pandas(ev_data)
dataset = dataset.map(tokenize_function)

Map:   0%|          | 0/30193 [00:00<?, ? examples/s]

In [35]:
sample_text = ev_data['text'].iloc[0]
sample_prediction = pipe(sample_text)
print("Sample prediction:", sample_prediction)

Sample prediction: [{'label': 'LABEL_0', 'score': 0.9538986086845398}]


In [43]:
pipe = pipeline("text-classification", model=f"{model_name}_final", device=0) # Laden des Modells

loading configuration file Detection_Model_final/config.json
Model config BertConfig {
  "_name_or_path": "Detection_Model_final",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading configuration file Detection_Model_final/config.json
Model config BertConfig {
  "_name_or_path": "Detection_Model_final",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,


Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [None]:
accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

class CustomCombinedMetric:
    def compute(self, predictions=None, references=None, **kwargs):
        accuracy = accuracy_metric.compute(predictions=predictions, references=references, **kwargs)
        precision = precision_metric.compute(predictions=predictions, references=references, **kwargs)
        recall = recall_metric.compute(predictions=predictions, references=references, **kwargs)
        f1 = f1_metric.compute(predictions=predictions, references=references, **kwargs)

        return {
            "accuracy": accuracy["accuracy"],
            "precision": precision["precision"],
            "recall": recall["recall"],
            "f1": f1["f1"]
        }

custom_metric = CustomCombinedMetric()

In [48]:
eval = evaluator("text-classification") # Erstellen eines Evaluators für die Textklassifikation

results = eval.compute(model_or_pipeline=pipe,
                       data=dataset,
                       metric=custom_metric,
                       label_mapping={"LABEL_0": 0, "LABEL_1": 1},
                       ) 
print(results)

{'accuracy': 0.786871129069652, 'precision': 0.7692877422540427, 'recall': 0.8221093595409273, 'f1': 0.7948219239230941, 'total_time_in_seconds': 304.37948868999956, 'samples_per_second': 99.19525172325449, 'latency_in_seconds': 0.010081127701453965}
