In [1]:
import pandas as pd

# Directory name (under ../../data/models/) where fine-tuning checkpoints are written.
finetuned_dirname = "distilbert-base-finetuned-phemernr2-rnr"

# Load the tweet dataset together with its split-assignment columns.
dataset_path = "../../data/phemernr2_dataset_with_tvt.csv"
data = pd.read_csv(dataset_path, sep=",")
# data = data[['tweet_text', 'tvt2', 'label2']]
# data.columns = ['tweet_text', 'tvt2', 'label']

# Quick sanity check: (rows, columns) first, then a preview of the first rows.
print(data.shape)
data.head()

(6425, 5)


Unnamed: 0,tweet_id,tweet_text,label,tvt2,tvt2_1
0,552833795142209536,the east london mosque would like to offer its...,non-rumours,training,training
1,580318210609696769,breaking - a germanwings airbus a320 plane rep...,true,validation,testting
2,552798891994009601,reports that two of the dead in the #charliehe...,true,training,training
3,576790814942236672,after #putin disappeared russian tv no longer ...,non-rumours,validation,training
4,499678822598340608,saw #ferguson for myself. #justiceformichaelbr...,non-rumours,testting,testting


In [2]:
# Collapse the individual veracity labels into a single 'rumors' class.
# NOTE(review): 'unverfied' may be a misspelling of 'unverified' -- confirm against
# the label values actually present in the CSV before changing it.
rumour_label_map = {
    'true': 'rumors',
    'unverfied': 'rumors',
    'false': 'rumors',
}
data['label'] = data['label'].replace(rumour_label_map)

In [3]:
# Alias, not a copy: `combined_data` and `data` refer to the same DataFrame object,
# so a change made through either name is visible through the other.
combined_data = data

In [4]:
import torch

class CustomTextDataset(torch.utils.data.dataset.Dataset):
    """Map-style dataset pairing raw texts with labels and, once tokenized, model inputs.

    Every item is a dict with the keys ``"text"``, ``"label"``,
    ``"attention_mask"`` and ``"input_ids"``. The last two are ``None`` until
    :meth:`tokenize` has completed successfully.
    """

    def __init__(self, texts, labels):
        """Store the parallel ``texts`` and ``labels`` sequences (matched by position)."""
        self.labels = labels
        self.texts = texts
        # Filled in by tokenize(); None means "not tokenized yet".
        self.attention_mask = None
        self.input_ids = None
        # Never populated or returned by this class; kept so that existing reads
        # of the attribute keep working.
        self.token_type_ids = None

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at position ``idx`` as a dict.

        ``"attention_mask"`` and ``"input_ids"`` are ``None`` while the dataset
        has not been tokenized.
        """
        sample = {
            "text": self.texts[idx],
            "label": self.labels[idx],
            "attention_mask": self.attention_mask[idx] if self.attention_mask else None,
            "input_ids": self.input_ids[idx] if self.input_ids else None,
        }
        return sample

    def tokenize(self, tokenizer):
        """Encode every text with ``tokenizer`` and store the encodings on the dataset.

        Each text is encoded on its own with ``padding="max_length"`` and
        ``truncation=True``; the ``'attention_mask'`` and ``'input_ids'`` entries
        of each result are kept.

        The encodings are collected in local lists and published only after all
        texts have been processed. If the tokenizer raises part-way through, the
        dataset keeps its previous state instead of ending up with fewer
        encodings than texts.
        """
        attention_mask = []
        input_ids = []

        for text in self.texts:
            encoded = tokenizer(text, padding="max_length", truncation=True)

            attention_mask.append(encoded['attention_mask'])
            input_ids.append(encoded['input_ids'])

        # Publish all fields together so they always stay aligned with self.texts.
        self.attention_mask = attention_mask
        self.input_ids = input_ids
        self.token_type_ids = []

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Distinct label strings present in the data (not used further in this cell).
labels_str = combined_data['label'].unique().tolist()

# Binary target in row order: 1 for "non-rumours", 0 for every other label value.
# Iterating the single column avoids building a full row Series per record,
# which is what iterrows() does.
labels = [
    1 if label_name == "non-rumours" else 0
    for label_name in combined_data['label']
]

print(len(labels))
labels[:10]

6425


[1, 0, 0, 1, 1, 1, 1, 0, 1, 1]

In [6]:
def _build_split_dataset(split_name):
    """Return a CustomTextDataset with the rows whose 'tvt2' value equals ``split_name``.

    Texts and labels are paired by row position: ``labels`` was built in row
    order, so zipping it against the per-row membership flags stays correct even
    if the DataFrame index is not a plain 0..n-1 range (indexing ``labels`` with
    the index label would not).
    """
    in_split = (combined_data['tvt2'] == split_name).tolist()
    split_texts = [
        text for text, keep in zip(combined_data['tweet_text'], in_split) if keep
    ]
    split_labels = [label for label, keep in zip(labels, in_split) if keep]
    return CustomTextDataset(split_texts, split_labels)


train_dataset = _build_split_dataset('training')
# The rows marked 'validation' in 'tvt2' are the ones stored under the name test_dataset.
test_dataset = _build_split_dataset('validation')
train_dataset[0]

{'text': 'the east london mosque would like to offer its sincere condolences to the families of those killed during the #charliehebdo attacks (1/2)',
 'label': 1,
 'attention_mask': None,
 'input_ids': None}

In [7]:
from transformers import AutoTokenizer

# Pretrained tokenizer, loaded by checkpoint name.
tokenizer_checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

In [8]:
# inputs = tokenizer(["you're stuck in a timewrap from 2004 though", "summa lumma dumma lumma"], padding="max_length", truncation=True)
# for k,v in inputs.items():
#     print(k)

In [9]:
def tokenize_function(examples):
    """Encode ``examples["text"]`` with the notebook-level ``tokenizer``.

    Uses ``padding="max_length"`` and ``truncation=True`` and returns whatever
    the tokenizer returns.
    """
    raw_text = examples["text"]
    encoded = tokenizer(raw_text, padding="max_length", truncation=True)
    return encoded

# Tokenize both datasets in place, training data first.
for split_dataset in (train_dataset, test_dataset):
    split_dataset.tokenize(tokenizer)

In [10]:
# Sample counts, one per line: training set first, then the evaluation set.
print(len(train_dataset), len(test_dataset), sep="\n")

4336
1462


### Fine Tuning

In [11]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained checkpoint for sequence classification with two output
# labels; hidden states are not requested in the model outputs.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-cased",
    num_labels=2,
    output_hidden_states=False,
)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.wei

In [12]:
from transformers import TrainingArguments

epochs = 20
batch_size = 8

# Batches per epoch = ceil(len(train_dataset) / batch_size), computed with exact
# integer arithmetic. The earlier `round(x + 0.49)` trick rounds *down* whenever
# the fractional part is below 0.01 (possible with larger batch sizes), which
# would undercount the steps.
steps_per_epoch = -(-len(train_dataset) // batch_size)

# Checkpoint interval = batches per epoch * number of epochs.
save_steps = steps_per_epoch * epochs
# save_steps = 1_000_000

training_args = TrainingArguments(
    output_dir=f"../../data/models/{finetuned_dirname}",
    num_train_epochs=epochs,
    save_steps=save_steps,
    logging_steps=300,
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size
)

print(f"Save Steps : {save_steps}")

Save Steps : 10840


In [13]:
import numpy as np
from datasets import load_metric

# Accuracy metric object; compute_metrics() below calls its .compute() method.
# NOTE(review): the cell output echoes this line, which looks like the tail of a
# warning raised by `load_metric` -- confirm whether the API is deprecated in the
# installed `datasets` version.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Score predictions with the notebook-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of each
    sample is the argmax over the last axis of ``logits``; the return value is
    whatever ``metric.compute`` returns for those predictions and ``labels``.
    """
    logits, references = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    scores = metric.compute(predictions=predicted_classes, references=references)
    return scores

  metric = load_metric("accuracy")


In [14]:
from transformers import Trainer

# Gather everything the Trainer needs in one place, then construct it.
trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_config)

In [15]:
import time

# Time the training run with a monotonic clock: unlike time.time(), perf_counter()
# is not affected by system clock adjustments while training is in progress.
start = time.perf_counter()

trainer.train()

print(f"Execution Time : {round(time.perf_counter() - start)} seconds")

***** Running training *****
  Num examples = 4336
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10840
  Number of trainable parameters = 65783042
  0%|          | 0/10840 [00:00<?, ?it/s]The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
  3%|▎         | 301/10840 [00:46<26:44,  6.57it/s]

{'loss': 0.5141, 'learning_rate': 9.723247232472326e-06, 'epoch': 0.55}


  6%|▌         | 601/10840 [01:31<25:47,  6.62it/s]

{'loss': 0.393, 'learning_rate': 9.44649446494465e-06, 'epoch': 1.11}


  8%|▊         | 901/10840 [02:17<25:14,  6.56it/s]

{'loss': 0.3334, 'learning_rate': 9.169741697416974e-06, 'epoch': 1.66}


 11%|█         | 1201/10840 [03:02<24:27,  6.57it/s]

{'loss': 0.2829, 'learning_rate': 8.892988929889298e-06, 'epoch': 2.21}


 14%|█▍        | 1501/10840 [03:48<23:47,  6.54it/s]

{'loss': 0.2529, 'learning_rate': 8.616236162361624e-06, 'epoch': 2.77}


 17%|█▋        | 1801/10840 [04:35<24:16,  6.21it/s]

{'loss': 0.1909, 'learning_rate': 8.33948339483395e-06, 'epoch': 3.32}


 19%|█▉        | 2101/10840 [05:24<23:30,  6.19it/s]

{'loss': 0.1649, 'learning_rate': 8.062730627306274e-06, 'epoch': 3.87}


 22%|██▏       | 2401/10840 [06:12<22:46,  6.18it/s]

{'loss': 0.1032, 'learning_rate': 7.785977859778598e-06, 'epoch': 4.43}


 25%|██▍       | 2701/10840 [07:01<22:33,  6.01it/s]

{'loss': 0.0993, 'learning_rate': 7.509225092250923e-06, 'epoch': 4.98}


 28%|██▊       | 3001/10840 [07:50<21:03,  6.20it/s]

{'loss': 0.0636, 'learning_rate': 7.232472324723247e-06, 'epoch': 5.54}


 30%|███       | 3301/10840 [08:37<19:08,  6.56it/s]

{'loss': 0.0623, 'learning_rate': 6.955719557195573e-06, 'epoch': 6.09}


 33%|███▎      | 3601/10840 [09:22<18:16,  6.60it/s]

{'loss': 0.0406, 'learning_rate': 6.678966789667897e-06, 'epoch': 6.64}


 36%|███▌      | 3901/10840 [10:08<17:43,  6.52it/s]

{'loss': 0.0494, 'learning_rate': 6.402214022140222e-06, 'epoch': 7.2}


 39%|███▉      | 4201/10840 [10:54<16:58,  6.52it/s]

{'loss': 0.027, 'learning_rate': 6.125461254612547e-06, 'epoch': 7.75}


 42%|████▏     | 4501/10840 [11:39<16:10,  6.53it/s]

{'loss': 0.0247, 'learning_rate': 5.8487084870848706e-06, 'epoch': 8.3}


 44%|████▍     | 4801/10840 [12:25<15:25,  6.53it/s]

{'loss': 0.0376, 'learning_rate': 5.571955719557196e-06, 'epoch': 8.86}


 47%|████▋     | 5101/10840 [13:11<14:34,  6.56it/s]

{'loss': 0.0266, 'learning_rate': 5.295202952029521e-06, 'epoch': 9.41}


 50%|████▉     | 5401/10840 [13:56<13:53,  6.53it/s]

{'loss': 0.0189, 'learning_rate': 5.018450184501845e-06, 'epoch': 9.96}


 53%|█████▎    | 5701/10840 [14:42<13:06,  6.53it/s]

{'loss': 0.0139, 'learning_rate': 4.741697416974171e-06, 'epoch': 10.52}


 55%|█████▌    | 6001/10840 [15:28<12:22,  6.52it/s]

{'loss': 0.028, 'learning_rate': 4.464944649446495e-06, 'epoch': 11.07}


 58%|█████▊    | 6301/10840 [16:14<11:34,  6.53it/s]

{'loss': 0.0184, 'learning_rate': 4.1881918819188195e-06, 'epoch': 11.62}


 61%|██████    | 6601/10840 [16:59<10:43,  6.59it/s]

{'loss': 0.0155, 'learning_rate': 3.911439114391144e-06, 'epoch': 12.18}


 64%|██████▎   | 6901/10840 [17:45<09:58,  6.58it/s]

{'loss': 0.012, 'learning_rate': 3.634686346863469e-06, 'epoch': 12.73}


 66%|██████▋   | 7201/10840 [18:30<09:17,  6.53it/s]

{'loss': 0.0042, 'learning_rate': 3.3579335793357935e-06, 'epoch': 13.28}


 69%|██████▉   | 7501/10840 [19:16<08:28,  6.56it/s]

{'loss': 0.0145, 'learning_rate': 3.0811808118081183e-06, 'epoch': 13.84}


 72%|███████▏  | 7801/10840 [20:02<07:48,  6.48it/s]

{'loss': 0.0152, 'learning_rate': 2.8044280442804427e-06, 'epoch': 14.39}


 75%|███████▍  | 8101/10840 [20:47<06:58,  6.54it/s]

{'loss': 0.0136, 'learning_rate': 2.527675276752768e-06, 'epoch': 14.94}


 78%|███████▊  | 8401/10840 [21:33<06:11,  6.56it/s]

{'loss': 0.0044, 'learning_rate': 2.2509225092250924e-06, 'epoch': 15.5}


 80%|████████  | 8701/10840 [22:19<05:26,  6.55it/s]

{'loss': 0.0034, 'learning_rate': 1.974169741697417e-06, 'epoch': 16.05}


 83%|████████▎ | 9001/10840 [23:05<04:42,  6.51it/s]

{'loss': 0.0057, 'learning_rate': 1.6974169741697418e-06, 'epoch': 16.61}


 86%|████████▌ | 9301/10840 [23:50<03:55,  6.54it/s]

{'loss': 0.0047, 'learning_rate': 1.4206642066420664e-06, 'epoch': 17.16}


 89%|████████▊ | 9601/10840 [24:36<03:10,  6.52it/s]

{'loss': 0.0005, 'learning_rate': 1.1439114391143912e-06, 'epoch': 17.71}


 91%|█████████▏| 9901/10840 [25:22<02:23,  6.56it/s]

{'loss': 0.005, 'learning_rate': 8.671586715867159e-07, 'epoch': 18.27}


 94%|█████████▍| 10201/10840 [26:07<01:37,  6.54it/s]

{'loss': 0.0037, 'learning_rate': 5.904059040590407e-07, 'epoch': 18.82}


 97%|█████████▋| 10501/10840 [26:53<00:51,  6.54it/s]

{'loss': 0.0021, 'learning_rate': 3.136531365313653e-07, 'epoch': 19.37}


100%|█████████▉| 10801/10840 [27:38<00:05,  6.53it/s]

{'loss': 0.0007, 'learning_rate': 3.690036900369004e-08, 'epoch': 19.93}


100%|██████████| 10840/10840 [27:44<00:00,  6.59it/s]Saving model checkpoint to ../../data/models/distilbert-base-finetuned-phemernr2-rnr\checkpoint-10840
Configuration saved in ../../data/models/distilbert-base-finetuned-phemernr2-rnr\checkpoint-10840\config.json
Model weights saved in ../../data/models/distilbert-base-finetuned-phemernr2-rnr\checkpoint-10840\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 10840/10840 [27:45<00:00,  6.51it/s]

{'train_runtime': 1665.9109, 'train_samples_per_second': 52.056, 'train_steps_per_second': 6.507, 'train_loss': 0.07890130751441397, 'epoch': 20.0}
Execution Time : 1666 seconds





In [16]:
# Run evaluation with the trainer and show the resulting metrics as the cell output.
eval_results = trainer.evaluate()
eval_results

***** Running Evaluation *****
  Num examples = 1462
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
100%|██████████| 183/183 [00:08<00:00, 21.14it/s]


{'eval_loss': 1.231092095375061,
 'eval_accuracy': 0.8679890560875513,
 'eval_runtime': 8.7058,
 'eval_samples_per_second': 167.935,
 'eval_steps_per_second': 21.021,
 'epoch': 20.0}