In [1]:
# Mount Google Drive at /content/drive; the dataset CSVs and the checkpoint
# output directory used later in this notebook live under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
! pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.model_selection import train_test_split

# CHECKED dataset

#### CHECKED fake news dataset (Chinese-language COVID-19 posts)

In [4]:
# Load the training CSV from the "dataset CHECKED" folder on Drive and preview
# its first rows (the preview shows a string `label` and a `text` column).
train = pd.read_csv('/content/drive/MyDrive/NLP Project/Datasets/dataset CHECKED/train.csv')
train.head()

Unnamed: 0,label,text
0,real,【#武汉卫健委回应集中核酸检测#】#武汉卫健委集中核酸检测十问十答#：①为什么要在全市范围内...
1,real,【#青海连续14天无新增病例#】记者从青海省卫健委获悉，​2月19日0-24时，青海省报告新...
2,real,【人民直播：#湖北疫情防控工作发布会#】目前，江汉方舱医院医疗救治工作进展如何？3月1日下午...
3,real,【#北京全市核酸检测76499人阳性59人#】在北京今天上午召开的新冠肺炎疫情防控工作新闻发...
4,fake,美国的10艘医疗军舰已经开进纽约港，每艘船上有1000张病床，上面拥有所有的医疗抢救设施，每...


In [5]:
# Load the held-out test CSV from the same Drive folder and preview its first rows.
test = pd.read_csv('/content/drive/MyDrive/NLP Project/Datasets/dataset CHECKED/test.csv')
test.head()

Unnamed: 0,label,text
0,real,【基因研究：#巴西流行的新冠病毒与欧美样本相似#】巴西奥斯瓦尔多·克鲁斯基金会日前公布一份研...
1,real,【#北京快递小哥外卖骑手全员核酸检测#】据北京日报，北京快递企业、外卖企业近日陆续安排快递员...
2,real,【继续加油！全国#连续5天治愈出院超千人#】从12日至16日，全国连续治愈出院人数分别为11...
3,real,【#民警抗疫一线奋战16天牺牲#】12日，江苏徐州公安局交警支队指导员司元羽连续奋战防疫检查...
4,real,【#全球新冠肺炎超1200万例#】根据美国约翰斯·霍普金斯大学最新统计数据，全球新冠肺炎确诊...


In [6]:
# Encode the string labels as integers: 'fake' -> 1, 'real' -> 0.
label_mapping = {'fake': 1, 'real': 0}

# Series.map turns every value that is not a key of the mapping into NaN, so
# running this cell a second time on already-encoded labels would wipe the
# column (and the dropna() in the next cell would then discard every row).
# Only map while the column still holds the raw, non-numeric labels.
if not pd.api.types.is_numeric_dtype(train['label']):
    train['label'] = train['label'].map(label_mapping)
if not pd.api.types.is_numeric_dtype(test['label']):
    test['label'] = test['label'].map(label_mapping)

### Train/validation split

In [7]:
# Drop rows with a missing value in any column (this includes labels that were
# not in the label mapping and therefore became NaN when encoded).
train = train.dropna()
test = test.dropna()

# If any label was NaN before dropna(), the column is float64 even though the
# remaining values are whole numbers. Cast back to int so the label tensors
# built later are integer class ids rather than floats.
X_train, X_val, y_train, y_val = train_test_split(
    train['text'],
    train['label'].astype(int),
    test_size=0.2,      # hold out 20% of the training rows for validation
    random_state=42,    # fixed seed so the split is reproducible
)
X_test, y_test = test['text'], test['label'].astype(int)

#### Preprocessing the dataset

In [8]:
from transformers import AutoTokenizer

# Load the tokenizer for 'distilbert-base-multilingual-cased', the same
# checkpoint name the classifier is initialised from in the training section.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-multilingual-cased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [9]:
# Tokenizer settings shared by all three splits: padding and truncation are
# enabled and the result is returned as PyTorch tensors.
tokenize_kwargs = {'padding': True, 'truncation': True, 'return_tensors': 'pt'}

# Each split is tokenized in a single call on the full list of texts.
X_train_encoded = tokenizer(X_train.tolist(), **tokenize_kwargs)
X_val_encoded = tokenizer(X_val.tolist(), **tokenize_kwargs)
X_test_encoded = tokenizer(X_test.tolist(), **tokenize_kwargs)

In [10]:
class FakeNewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with classification labels.

    Args:
        encodings: Mapping from field name to a per-example indexable container
            (a tensor or a list of lists); iterated with ``.items()``.
        labels: Indexable sequence of labels aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor_copy(value):
        """Return ``value`` as a tensor that does not share storage with its source."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a UserWarning on every call;
            # clone().detach() is the copy idiom that warning recommends.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, with its label under ``'labels'``."""
        item = {key: self._as_tensor_copy(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor_copy(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

In [11]:
# Wrap each split's encodings together with its label array.
train_dataset = FakeNewsDataset(X_train_encoded, y_train.values)
val_dataset = FakeNewsDataset(X_val_encoded, y_val.values)
test_dataset = FakeNewsDataset(X_test_encoded, y_test.values)
# Batches of 32; no shuffle argument is passed to any loader.
# NOTE(review): the Trainer in the training section is given the datasets
# directly, not these loaders — confirm whether they are still needed.
train_loader = DataLoader(train_dataset, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

## Training

In [12]:
pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [13]:
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification
import evaluate
import numpy as np
import os
os.environ['HF_MLFLOW_LOG_ARTIFACTS'] = "1" # save models as artifacts for the experiment

In [14]:
# Combined metric object, built on first use and then reused by every
# evaluation pass instead of being re-created each time.
_combined_metric = None


def compute_metrics(eval_preds):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        eval_preds: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` over its last axis.

    Returns:
        Whatever the combined metric's ``compute`` returns for the predicted
        classes and the reference labels.
    """
    global _combined_metric
    if _combined_metric is None:
        # Building the combined metric is the expensive part, so do it once.
        _combined_metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return _combined_metric.compute(predictions=predictions, references=labels)

In [15]:
# Fine-tuning configuration. Logging, evaluation and checkpointing all use the
# same 400-step interval.
# NOTE(review): the recorded run ended at global_step=845, so warmup_steps=500
# covers more than half of training — confirm this is intended.
# NOTE(review): the per-device batch size here is 8, while the DataLoaders
# built earlier use batch_size=32 and are not passed to the Trainer.
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/NLP Project/saved_models/distilbert/',          # checkpoint output directory (on Drive)
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size per device during evaluation
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs (relative to the working directory)
    logging_steps=400,               # logging interval, in steps
    evaluation_strategy='steps',     # evaluate on a step interval
    eval_steps=400,                  # evaluation interval, in steps
    load_best_model_at_end=True,     # presumably restores the best checkpoint after training — see TrainingArguments docs
    save_total_limit=3,              # presumably caps the number of checkpoints kept on disk — confirm against docs
    save_steps=400                   # checkpoint interval, kept equal to eval_steps

)

# Binary classifier (num_labels=2) initialised from the multilingual DistilBERT checkpoint.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-multilingual-cased', num_labels=2)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # validation dataset, used for the periodic evaluations
    compute_metrics=compute_metrics      # metric function applied to each evaluation's predictions
)


Downloading pytorch_model.bin:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.

In [16]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell's result.
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
400,0.1578,0.092299,0.985163,0.954955,0.963636,0.946429
800,0.0381,0.105422,0.982196,0.946429,0.946429,0.946429


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=845, training_loss=0.09272905015248871, metrics={'train_runtime': 362.8877, 'train_samples_per_second': 18.546, 'train_steps_per_second': 2.329, 'total_flos': 891505592954880.0, 'train_loss': 0.09272905015248871, 'epoch': 5.0})

### Calculate performance

In [17]:
# Score the trained model on the validation split.
eval_result = trainer.evaluate(eval_dataset=val_dataset)

# Report every returned entry, one per line, with four decimal places.
print("Evaluation results:")
for metric_name, metric_value in eval_result.items():
    print("{}: {:.4f}".format(metric_name, metric_value))


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Evaluation results:
eval_loss: 0.0923
eval_accuracy: 0.9852
eval_f1: 0.9550
eval_precision: 0.9636
eval_recall: 0.9464
eval_runtime: 6.5305
eval_samples_per_second: 51.6040
eval_steps_per_second: 6.5850
epoch: 5.0000


In [18]:
# Score the trained model on the held-out test split.
# Note: this reuses (and overwrites) the `eval_result` name from the validation run.
eval_result = trainer.evaluate(eval_dataset=test_dataset)

# Report every returned entry, one per line, with four decimal places.
print("Evaluation results:")
for metric_name, metric_value in eval_result.items():
    print("{}: {:.4f}".format(metric_name, metric_value))


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Evaluation results:
eval_loss: 0.0460
eval_accuracy: 0.9929
eval_f1: 0.9771
eval_precision: 0.9697
eval_recall: 0.9846
eval_runtime: 7.7818
eval_samples_per_second: 54.1000
eval_steps_per_second: 6.8110
epoch: 5.0000
