In [43]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
! pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [45]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split

#### Twitter fake news dataset

In [46]:
train = pd.read_csv('/content/drive/MyDrive/NLP Project/Datasets/dataset CHECKED/train.csv')
train.head()

Unnamed: 0,label,text
0,real,【#武汉卫健委回应集中核酸检测#】#武汉卫健委集中核酸检测十问十答#：①为什么要在全市范围内...
1,real,【#青海连续14天无新增病例#】记者从青海省卫健委获悉，​2月19日0-24时，青海省报告新...
2,real,【人民直播：#湖北疫情防控工作发布会#】目前，江汉方舱医院医疗救治工作进展如何？3月1日下午...
3,real,【#北京全市核酸检测76499人阳性59人#】在北京今天上午召开的新冠肺炎疫情防控工作新闻发...
4,fake,美国的10艘医疗军舰已经开进纽约港，每艘船上有1000张病床，上面拥有所有的医疗抢救设施，每...


In [47]:
test = pd.read_csv('/content/drive/MyDrive/NLP Project/Datasets/dataset CHECKED/test.csv')
test.head()

Unnamed: 0,label,text
0,real,【基因研究：#巴西流行的新冠病毒与欧美样本相似#】巴西奥斯瓦尔多·克鲁斯基金会日前公布一份研...
1,real,【#北京快递小哥外卖骑手全员核酸检测#】据北京日报，北京快递企业、外卖企业近日陆续安排快递员...
2,real,【继续加油！全国#连续5天治愈出院超千人#】从12日至16日，全国连续治愈出院人数分别为11...
3,real,【#民警抗疫一线奋战16天牺牲#】12日，江苏徐州公安局交警支队指导员司元羽连续奋战防疫检查...
4,real,【#全球新冠肺炎超1200万例#】根据美国约翰斯·霍普金斯大学最新统计数据，全球新冠肺炎确诊...


In [48]:
#encode output
label_mapping = {'fake': 1, 'real': 0}
train.label = train.label.map(label_mapping)
test.label = test.label.map(label_mapping)

### Train val split

In [49]:
train = train.dropna()
test = test.dropna()
X_train, X_val, y_train, y_val = train_test_split(train['text'], train['label'], test_size=0.2, random_state=42)
X_test, y_test = test['text'], test.label

#### Preprocessing the dataset

In [50]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

In [51]:
X_train_encoded = tokenizer(
    list(X_train.values),
    padding=True,
    truncation=True,
    return_tensors='pt'
)
X_val_encoded = tokenizer(
    list(X_val.values),
    padding=True,
    truncation=True,
    return_tensors='pt'
)
X_test_encoded = tokenizer(
    list(X_test),
    padding=True,
    truncation=True,
    return_tensors='pt'
)

In [52]:
class FakeNewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [53]:
train_dataset = FakeNewsDataset(X_train_encoded, y_train.values)
val_dataset = FakeNewsDataset(X_val_encoded, y_val.values)
test_dataset = FakeNewsDataset(X_test_encoded, y_test.values)
train_loader = DataLoader(train_dataset, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

## Training

In [54]:
pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [55]:
from transformers import Trainer, TrainingArguments, BertForSequenceClassification
import evaluate
import numpy as np
import os
os.environ['HF_MLFLOW_LOG_ARTIFACTS'] = "1" # save models as artifact for the expirment

In [56]:
def compute_metrics(eval_preds):
    metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels) 

In [57]:
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/NLP Project/saved_models/bert/',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=400,
    evaluation_strategy='steps',
    eval_steps=400,
    load_best_model_at_end=True,
    save_total_limit=3,
    save_steps=400

)

model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    compute_metrics=compute_metrics
)


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [58]:
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
400,0.1157,0.023889,0.997033,0.990991,1.0,0.982143
800,0.0225,0.018419,0.997033,0.99115,0.982456,1.0


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=845, training_loss=0.06543705824916884, metrics={'train_runtime': 671.4444, 'train_samples_per_second': 10.023, 'train_steps_per_second': 1.258, 'total_flos': 1770737402572800.0, 'train_loss': 0.06543705824916884, 'epoch': 5.0})

### Calculate performance

In [59]:
eval_result = trainer.evaluate(eval_dataset=val_dataset)

# Print the evaluation results
print("Evaluation results:")
for key, value in eval_result.items():
    print(f"{key}: {value:.4f}")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Evaluation results:
eval_loss: 0.0184
eval_accuracy: 0.9970
eval_f1: 0.9912
eval_precision: 0.9825
eval_recall: 1.0000
eval_runtime: 15.9181
eval_samples_per_second: 21.1710
eval_steps_per_second: 2.7010
epoch: 5.0000


In [60]:
eval_result = trainer.evaluate(eval_dataset=test_dataset)

# Print the evaluation results
print("Evaluation results:")
for key, value in eval_result.items():
    print(f"{key}: {value:.4f}")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Evaluation results:
eval_loss: 0.0196
eval_accuracy: 0.9976
eval_f1: 0.9924
eval_precision: 0.9848
eval_recall: 1.0000
eval_runtime: 18.4140
eval_samples_per_second: 22.8630
eval_steps_per_second: 2.8780
epoch: 5.0000
