In [None]:
# Attach Google Drive to the Colab VM so the dataset and checkpoints under
# MyDrive can be addressed by ordinary file paths.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split

In [None]:
# Read the AFND training split from Drive and preview its first rows.
train = pd.read_csv(
    '/content/drive/MyDrive/NLP Project/Datasets/AFND/train.csv',
)
train.head(n=5)

Unnamed: 0,text,label
0,ايران تبدا اختبار انظمة التبريد مفاعل اراكستخت...,0
1,بومبيو استبعد ترشحي انتخابات الراسة الامريكية ...,0
2,هبوطا باسعار النفط عقب اعلان تعويم السفينة الب...,0
3,عودة المهاجرين فرنسا ستكون وفق الاطر القانونية...,1
4,ريس الوزرا يتفقد مركز لقاحات كورونا بارض المعا...,0


In [None]:
# Read the AFND test split from Drive and preview its first rows.
test = pd.read_csv(
    '/content/drive/MyDrive/NLP Project/Datasets/AFND/test.csv',
)
test.head(n=5)

Unnamed: 0,text,label
0,وفيات الثلاثا اقرا ايضا وفيات وعشرات الاصابات ...,1
1,الضمان مدرسة تعهدت بتقديم رسوم سواليف قال النا...,1
2,اقل الف وفاة يومية بكورونا امريكا للمرة الاولي...,1
3,تتدخل مسال اجتماعية تعنيكعلق الامين العام السا...,1
4,وفاة الامير فيليب زوج ملكة بريطانيا الملكة الي...,1


In [None]:
# Swap the numeric class ids for readable names: 0 -> 'real', 1 -> 'fake'.
# `Series.map` yields NaN for any value that is not a key of the mapping.
label_mapping = {0: 'real', 1: 'fake'}
train['label'] = train['label'].map(label_mapping)
test['label'] = test['label'].map(label_mapping)

In [None]:
# Encode the class names as integer targets: 'real' -> 0, 'fake' -> 1.
# `Series.map` yields NaN for values that are not keys, so this cell expects
# the label column to hold the strings 'fake' / 'real' when it runs;
# executing it a second time would turn every label into NaN.
label_mapping = dict(fake=1, real=0)
train['label'] = train['label'].map(label_mapping)
test['label'] = test['label'].map(label_mapping)

### Train/validation split

In [None]:
# Discard rows with any missing value in either split.
train, test = train.dropna(), test.dropna()

# Hold out 20% of the training rows for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    train['text'],
    train['label'],
    test_size=0.2,
    random_state=42,
)

# The test split is used as a whole.
X_test = test['text']
y_test = test['label']

#### Preprocessing the dataset

In [None]:
# Fetch the tokenizer published with the AraBERT checkpoint that is
# fine-tuned later in this notebook, so token ids match the model vocabulary.
# NOTE(review): this looks like the AraBERT v1 checkpoint, which presumably
# expects Farasa pre-segmented text; no such preprocessing is applied in the
# cells visible here -- confirm against the model card on the Hugging Face Hub.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    'aubmindlab/bert-base-arabert'
)

In [None]:
def _encode_texts(texts):
    """Tokenize an iterable of strings with the notebook's `tokenizer`.

    The texts are materialised into a list and passed to the tokenizer with
    padding and truncation enabled; the result is returned as PyTorch
    tensors (``return_tensors='pt'``).
    """
    return tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        return_tensors='pt',
    )


# Encode every split with identical tokenizer settings.
X_train_encoded = _encode_texts(X_train.values)
X_val_encoded = _encode_texts(X_val.values)
X_test_encoded = _encode_texts(X_test)

In [None]:
class FakeNewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a
            container indexable by example position, such as the output of
            a Hugging Face tokenizer.
        labels: Indexable sequence with one label per example; its length
            defines the length of the dataset.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a tensor that does not share storage with it.

        Calling ``torch.tensor`` on an existing tensor emits a copy-construct
        ``UserWarning`` on every access, so tensors are copied with
        ``detach().clone()`` instead. Any other value (Python scalars, NumPy
        scalars, lists) is still converted with ``torch.tensor``.
        """
        if isinstance(value, torch.Tensor):
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with the target under ``'labels'``."""
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

In [None]:
# Wrap each split's encodings and label array in a FakeNewsDataset.
train_dataset = FakeNewsDataset(encodings=X_train_encoded, labels=y_train.values)
val_dataset = FakeNewsDataset(encodings=X_val_encoded, labels=y_val.values)
test_dataset = FakeNewsDataset(encodings=X_test_encoded, labels=y_test.values)

# Batched iterators over the three datasets, 32 examples per batch.
# NOTE(review): these loaders are not referenced by any cell visible here
# (the Trainer is given the datasets directly) -- confirm they are needed.
train_loader = DataLoader(dataset=train_dataset, batch_size=32)
val_loader = DataLoader(dataset=val_dataset, batch_size=32)
test_loader = DataLoader(dataset=test_dataset, batch_size=32)

## Training

In [None]:
pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import Trainer, TrainingArguments, BertForSequenceClassification
import evaluate
import numpy as np
import os
# Save models as artifacts of the experiment.
os.environ.update({'HF_MLFLOW_LOG_ARTIFACTS': "1"})

In [None]:
def compute_metrics(eval_preds):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_preds: Pair ``(logits, labels)``. The predicted class of each
            example is the argmax of ``logits`` over the last axis.

    Returns:
        The result of the combined metric's ``compute`` call on the
        predicted classes and the reference labels.
    """
    scores, references = eval_preds
    predicted_classes = np.argmax(scores, axis=-1)
    combined_metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])
    return combined_metric.compute(
        predictions=predicted_classes,
        references=references,
    )

In [None]:
# Fine-tuning configuration, grouped by concern.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy`; confirm against the installed
# version before re-running.
training_args = TrainingArguments(
    # Output locations: checkpoints go to Drive, logs stay on the VM.
    output_dir='/content/drive/MyDrive/NLP Project/saved_models/bert/',
    logging_dir='./logs',
    # Optimisation schedule.
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    # Logging, evaluation and checkpointing share one 400-step cadence.
    logging_steps=400,
    evaluation_strategy='steps',
    eval_steps=400,
    save_steps=400,
    # Retain at most three checkpoints and restore the best one after training.
    save_total_limit=3,
    load_best_model_at_end=True,
)

# AraBERT encoder with a two-class sequence-classification head.
model = BertForSequenceClassification.from_pretrained(
    'aubmindlab/bert-base-arabert',
    num_labels=2,
)

# Trainer wiring: train on the training split, evaluate on the validation
# split, and report the metrics produced by `compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)


Downloading pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at aubmindlab/bert-base-arabert were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [None]:
# Run the fine-tuning loop configured above. The call is the last expression
# of the cell, so the value it returns is displayed as the cell output.
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
400,0.6943,0.641899,0.616484,0.56546,0.645122,0.503309
800,0.6412,0.661642,0.649844,0.53727,0.779042,0.410022
1200,0.6297,0.576233,0.715469,0.714129,0.711448,0.716829
1600,0.6182,0.563904,0.683359,0.580825,0.84502,0.442483
2000,0.5992,0.606101,0.719297,0.690339,0.761841,0.631106
2400,0.6032,0.583255,0.738594,0.733046,0.742405,0.723921
2800,0.5886,0.570622,0.708828,0.743337,0.660183,0.850457
3200,0.5846,0.573744,0.737187,0.730923,0.742203,0.719981
3600,0.5772,0.581289,0.730313,0.736247,0.714625,0.759218
4000,0.6078,0.706942,0.496094,0.662727,0.495931,0.998582


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encod

TrainOutput(global_step=32000, training_loss=0.659716462135315, metrics={'train_runtime': 57697.0652, 'train_samples_per_second': 4.437, 'train_steps_per_second': 0.555, 'total_flos': 6.735643017216e+16, 'train_loss': 0.659716462135315, 'epoch': 5.0})

### Calculate performance

In [None]:
# Load the weights stored in the step-32000 checkpoint directory on Drive.
# NOTE(review): in the cells visible here the reloaded `model` is never handed
# to `trainer`, so the `trainer.evaluate(...)` calls that follow presumably
# still score the model held by the trainer -- confirm which model is meant
# to be evaluated.
model = BertForSequenceClassification.from_pretrained(
    '/content/drive/MyDrive/NLP Project/saved_models/bert/checkpoint-32000/'
)

In [None]:
# Score the validation split with the trainer.
eval_result = trainer.evaluate(eval_dataset=val_dataset)

# Report each returned entry, rounded to four decimal places.
print("Evaluation results:")
for metric_name, metric_value in eval_result.items():
    print(f"{metric_name}: {metric_value:.4f}")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Evaluation results:
eval_loss: 0.5639
eval_accuracy: 0.6834
eval_f1: 0.5808
eval_precision: 0.8450
eval_recall: 0.4425
eval_runtime: 416.3875
eval_samples_per_second: 30.7410
eval_steps_per_second: 3.8430
epoch: 5.0000


In [None]:
# Score the held-out test split with the trainer.
# Note: this rebinds `eval_result`, replacing the validation scores.
eval_result = trainer.evaluate(eval_dataset=test_dataset)

# Report each returned entry, rounded to four decimal places.
print("Evaluation results:")
for name, score in eval_result.items():
    print("{}: {:.4f}".format(name, score))


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Evaluation results:
eval_loss: 0.5588
eval_accuracy: 0.6824
eval_f1: 0.5842
eval_precision: 0.8500
eval_recall: 0.4451
eval_runtime: 524.5172
eval_samples_per_second: 30.5040
eval_steps_per_second: 3.8130
epoch: 5.0000
