In [1]:
import pandas as pd

# Directory name for the fine-tuned model; it is appended to "../../data/models/"
# when the training arguments are built.
finetuned_dirname = "20-epoch-roberta-finetuned-phemernr2-rnr"

# Load the dataset CSV (comma-separated, which is the pandas default).
data = pd.read_csv("../../data/phemernr2_dataset_with_tvt.csv")
# Disabled alternative that used the `label2` column as the label:
# data = data[['tweet_text', 'tvt2', 'label2']]
# data.columns = ['tweet_text', 'tvt2', 'label']
print(data.shape)
data.head()

(6425, 5)


Unnamed: 0,tweet_id,tweet_text,label,tvt2,tvt2_1
0,552833795142209536,the east london mosque would like to offer its...,non-rumours,training,training
1,580318210609696769,breaking - a germanwings airbus a320 plane rep...,true,validation,testting
2,552798891994009601,reports that two of the dead in the #charliehe...,true,training,training
3,576790814942236672,after #putin disappeared russian tv no longer ...,non-rumours,validation,training
4,499678822598340608,saw #ferguson for myself. #justiceformichaelbr...,non-rumours,testting,testting


In [2]:
# Collapse the listed rumour sub-labels into the single class 'rumors'; any
# other label value is left unchanged.
# NOTE(review): 'unverfied' looks like a misspelling of 'unverified' — confirm
# which spelling the CSV actually contains.
data['label'] = data['label'].replace(
    {'true': 'rumors', 'unverfied': 'rumors', 'false': 'rumors'}
)

In [3]:
# Alias only: `combined_data` refers to the same DataFrame object as `data`
# (no copy is made).
combined_data = data

In [4]:
import torch

class CustomTextDataset(torch.utils.data.dataset.Dataset):
    """Dataset pairing raw texts with labels and, once tokenized, tokenizer outputs.

    Until `tokenize` has been called, the "attention_mask" and "input_ids"
    entries of every sample are None.
    """

    def __init__(self, texts, labels):
        """Keep `texts` and `labels`; all tokenizer outputs start out as None."""
        self.texts = texts
        self.labels = labels
        self.input_ids = None
        self.attention_mask = None
        self.token_type_ids = None

    def __len__(self):
        """Return the number of samples, measured on `labels`."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict with text, label, attention_mask and input_ids."""
        text = self.texts[idx]
        label = self.labels[idx]

        # A missing (None) or empty tokenizer output is reported as None.
        mask = None
        if self.attention_mask:
            mask = self.attention_mask[idx]

        ids = None
        if self.input_ids:
            ids = self.input_ids[idx]

        return {
            "text": text,
            "label": label,
            "attention_mask": mask,
            "input_ids": ids,
        }

    def tokenize(self, tokenizer):
        """Run `tokenizer` on each text and store its attention mask and input ids.

        `tokenizer` is called once per text with padding="max_length" and
        truncation=True. Any earlier results are discarded, and
        `token_type_ids` is reset to an empty list that is not filled here.
        """
        masks, ids = [], []
        self.attention_mask, self.input_ids, self.token_type_ids = masks, ids, []

        for text in self.texts:
            encoded = tokenizer(text, padding="max_length", truncation=True)
            masks.append(encoded['attention_mask'])
            ids.append(encoded['input_ids'])

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Distinct label strings present in the data (not used further in this cell).
labels_str = combined_data['label'].unique().tolist()

# Binary targets in row order: 1 for "non-rumours", 0 for every other label value.
labels = [
    1 if label == "non-rumours" else 0
    for label in combined_data['label']
]

print(len(labels))
labels[:10]

6425


[1, 0, 0, 1, 1, 1, 1, 0, 1, 1]

In [6]:
def _split_dataset(frame, all_labels, split_name, split_column='tvt2', text_column='tweet_text'):
    """Build a `CustomTextDataset` from the rows of `frame` assigned to one split.

    Args:
        frame: DataFrame holding the text and split-assignment columns.
        all_labels: one label per row of `frame`, in row order.
        split_name: value of `split_column` that selects the rows to keep.
        split_column: column holding the split assignment of each row.
        text_column: column holding the text of each row.

    Returns:
        A `CustomTextDataset` with the selected texts and their labels.

    Raises:
        ValueError: if `all_labels` and `frame` differ in length.
    """
    if len(all_labels) != len(frame):
        raise ValueError(
            f"expected one label per row, got {len(all_labels)} labels for {len(frame)} rows"
        )

    # Rows and labels are paired by position rather than by DataFrame index
    # label, so the pairing stays correct even if `frame` was filtered or
    # re-ordered and no longer has a 0..n-1 index.
    keep = (frame[split_column] == split_name).tolist()
    texts = [text for text, flag in zip(frame[text_column], keep) if flag]
    split_labels = [label for label, flag in zip(all_labels, keep) if flag]
    return CustomTextDataset(texts, split_labels)


train_dataset = _split_dataset(combined_data, labels, 'training')
# NOTE: despite its name, `test_dataset` holds the rows marked 'validation' in `tvt2`.
test_dataset = _split_dataset(combined_data, labels, 'validation')
train_dataset[0]

{'text': 'the east london mosque would like to offer its sincere condolences to the families of those killed during the #charliehebdo attacks (1/2)',
 'label': 1,
 'attention_mask': None,
 'input_ids': None}

In [7]:
from transformers import AutoTokenizer

# Load the tokenizer for the "roberta-base" checkpoint, the same checkpoint
# name the classification model is loaded from.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [8]:
# inputs = tokenizer(["you're stuck in a timewrap from 2004 though", "summa lumma dumma lumma"], padding="max_length", truncation=True)
# for k,v in inputs.items():
#     print(k)

In [9]:
def tokenize_function(examples):
    """Tokenize `examples["text"]` with the notebook-level `tokenizer`.

    Uses padding="max_length" and truncation=True and returns the tokenizer's
    output unchanged. This helper is not called anywhere in the visible part
    of the notebook.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

# Tokenize both datasets in place so their samples carry `input_ids` and
# `attention_mask` instead of None.
for split_dataset in (train_dataset, test_dataset):
    split_dataset.tokenize(tokenizer)

In [10]:
# Sample counts, one per line: training split first, then the evaluation split.
print(len(train_dataset), len(test_dataset), sep="\n")

4336
1462


### Fine Tuning

In [11]:
from transformers import AutoModelForSequenceClassification

# Sequence-classification model on top of the "roberta-base" checkpoint with
# two output labels (matching the 0/1 targets built earlier).
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
    output_hidden_states=False,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import TrainingArguments

epochs = 20
batch_size = 8

# Batches per epoch = ceil(len(train_dataset) / batch_size), computed with
# exact integer arithmetic. (Adding 0.49 and rounding under-counts whenever the
# fractional part of the division is below 0.01.)
steps_per_epoch = (len(train_dataset) + batch_size - 1) // batch_size

# One save interval spanning every epoch; presumably intended so that a
# checkpoint is written only at the very end of training — TODO confirm.
save_steps = steps_per_epoch * epochs
# save_steps = 1_000_000

training_args = TrainingArguments(
    output_dir=f"../../data/models/{finetuned_dirname}",
    num_train_epochs=epochs,
    save_steps=save_steps,
    # NOTE(review): `logging_strategy="epoch"` is also set below, so this
    # step-based interval presumably has no effect — confirm against the
    # installed transformers version.
    logging_steps=300,
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    logging_strategy="epoch"
)

print(f"Save Steps : {save_steps}")

Save Steps : 10840


In [13]:
import numpy as np
from datasets import load_metric

# Accuracy metric object used by `compute_metrics`.
# NOTE(review): the cell output shows a warning attributed to this line;
# `load_metric` is presumably deprecated in the installed `datasets` release —
# confirm before upgrading.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute the notebook-level `metric` for one evaluation pass.

    `eval_pred` unpacks into (logits, labels). The predicted class of each row
    is the argmax over the last axis of the logits, and the result of
    `metric.compute` is returned as-is.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

  metric = load_metric("accuracy")


In [14]:
from transformers import Trainer

# Wire the model, training arguments, datasets and metric callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # `test_dataset` was built from the rows whose `tvt2` value is 'validation'.
    eval_dataset=test_dataset,
    # Called with the evaluation predictions; returns the accuracy metric dict.
    compute_metrics=compute_metrics,
)

In [15]:
import time

# Time the whole fine-tuning run with the wall clock.
start = time.time()
trainer.train()
elapsed_seconds = round(time.time() - start)

print(f"Execution Time : {elapsed_seconds} seconds")

  5%|▌         | 542/10840 [02:46<52:45,  3.25it/s]

{'loss': 0.4714, 'learning_rate': 9.5e-06, 'epoch': 1.0}


                                                   
  5%|▌         | 542/10840 [03:03<52:45,  3.25it/s]

{'eval_loss': 0.42061421275138855, 'eval_accuracy': 0.8392612859097127, 'eval_runtime': 16.73, 'eval_samples_per_second': 87.388, 'eval_steps_per_second': 10.938, 'epoch': 1.0}


 10%|█         | 1084/10840 [05:48<49:33,  3.28it/s]  

{'loss': 0.3675, 'learning_rate': 9e-06, 'epoch': 2.0}


                                                    
 10%|█         | 1084/10840 [06:05<49:33,  3.28it/s]

{'eval_loss': 0.3693391680717468, 'eval_accuracy': 0.859781121751026, 'eval_runtime': 16.762, 'eval_samples_per_second': 87.221, 'eval_steps_per_second': 10.918, 'epoch': 2.0}


 15%|█▌        | 1626/10840 [08:51<47:45,  3.22it/s]   

{'loss': 0.2991, 'learning_rate': 8.5e-06, 'epoch': 3.0}


                                                    
 15%|█▌        | 1626/10840 [09:08<47:45,  3.22it/s]

{'eval_loss': 0.4314260482788086, 'eval_accuracy': 0.884404924760602, 'eval_runtime': 16.7753, 'eval_samples_per_second': 87.152, 'eval_steps_per_second': 10.909, 'epoch': 3.0}


 20%|██        | 2168/10840 [11:54<44:40,  3.23it/s]   

{'loss': 0.2321, 'learning_rate': 8.000000000000001e-06, 'epoch': 4.0}


                                                    
 20%|██        | 2168/10840 [12:11<44:40,  3.23it/s]

{'eval_loss': 0.6248192191123962, 'eval_accuracy': 0.8632010943912448, 'eval_runtime': 16.8565, 'eval_samples_per_second': 86.732, 'eval_steps_per_second': 10.856, 'epoch': 4.0}


 25%|██▌       | 2710/10840 [14:57<41:24,  3.27it/s]   

{'loss': 0.159, 'learning_rate': 7.500000000000001e-06, 'epoch': 5.0}


                                                    
 25%|██▌       | 2710/10840 [15:14<41:24,  3.27it/s]

{'eval_loss': 0.6204315423965454, 'eval_accuracy': 0.8905608755129959, 'eval_runtime': 16.7643, 'eval_samples_per_second': 87.209, 'eval_steps_per_second': 10.916, 'epoch': 5.0}


 30%|███       | 3252/10840 [18:00<38:31,  3.28it/s]   

{'loss': 0.117, 'learning_rate': 7e-06, 'epoch': 6.0}


                                                    
 30%|███       | 3252/10840 [18:17<38:31,  3.28it/s]

{'eval_loss': 0.7819859385490417, 'eval_accuracy': 0.874829001367989, 'eval_runtime': 16.7649, 'eval_samples_per_second': 87.206, 'eval_steps_per_second': 10.916, 'epoch': 6.0}


 35%|███▌      | 3794/10840 [21:03<35:55,  3.27it/s]   

{'loss': 0.0828, 'learning_rate': 6.5000000000000004e-06, 'epoch': 7.0}


                                                    
 35%|███▌      | 3794/10840 [21:20<35:55,  3.27it/s]

{'eval_loss': 0.8589742183685303, 'eval_accuracy': 0.872093023255814, 'eval_runtime': 16.7909, 'eval_samples_per_second': 87.071, 'eval_steps_per_second': 10.899, 'epoch': 7.0}


 40%|████      | 4336/10840 [24:07<33:50,  3.20it/s]   

{'loss': 0.0575, 'learning_rate': 6e-06, 'epoch': 8.0}


                                                    
 40%|████      | 4336/10840 [24:23<33:50,  3.20it/s]

{'eval_loss': 0.957779049873352, 'eval_accuracy': 0.8768809849521204, 'eval_runtime': 16.8063, 'eval_samples_per_second': 86.991, 'eval_steps_per_second': 10.889, 'epoch': 8.0}


 45%|████▌     | 4878/10840 [27:10<30:28,  3.26it/s]  

{'loss': 0.0459, 'learning_rate': 5.500000000000001e-06, 'epoch': 9.0}


                                                    
 45%|████▌     | 4878/10840 [27:27<30:28,  3.26it/s]

{'eval_loss': 1.0177148580551147, 'eval_accuracy': 0.8775649794801642, 'eval_runtime': 16.7585, 'eval_samples_per_second': 87.239, 'eval_steps_per_second': 10.92, 'epoch': 9.0}


 50%|█████     | 5420/10840 [30:13<27:52,  3.24it/s]  

{'loss': 0.0438, 'learning_rate': 5e-06, 'epoch': 10.0}


                                                    
 50%|█████     | 5420/10840 [30:30<27:52,  3.24it/s]

{'eval_loss': 0.9731283187866211, 'eval_accuracy': 0.8837209302325582, 'eval_runtime': 16.7974, 'eval_samples_per_second': 87.037, 'eval_steps_per_second': 10.895, 'epoch': 10.0}


 55%|█████▌    | 5962/10840 [33:17<25:08,  3.23it/s]  

{'loss': 0.027, 'learning_rate': 4.5e-06, 'epoch': 11.0}


                                                    
 55%|█████▌    | 5962/10840 [33:34<25:08,  3.23it/s]

{'eval_loss': 1.0612434148788452, 'eval_accuracy': 0.8850889192886456, 'eval_runtime': 16.784, 'eval_samples_per_second': 87.107, 'eval_steps_per_second': 10.903, 'epoch': 11.0}


 60%|██████    | 6504/10840 [36:20<22:36,  3.20it/s]  

{'loss': 0.0238, 'learning_rate': 4.000000000000001e-06, 'epoch': 12.0}


                                                    
 60%|██████    | 6504/10840 [36:37<22:36,  3.20it/s]

{'eval_loss': 1.2270673513412476, 'eval_accuracy': 0.8700410396716827, 'eval_runtime': 16.776, 'eval_samples_per_second': 87.148, 'eval_steps_per_second': 10.908, 'epoch': 12.0}


 65%|██████▌   | 7046/10840 [39:24<19:22,  3.26it/s]  

{'loss': 0.0311, 'learning_rate': 3.5e-06, 'epoch': 13.0}


                                                    
 65%|██████▌   | 7046/10840 [39:41<19:22,  3.26it/s]

{'eval_loss': 1.110488772392273, 'eval_accuracy': 0.8816689466484268, 'eval_runtime': 16.7673, 'eval_samples_per_second': 87.193, 'eval_steps_per_second': 10.914, 'epoch': 13.0}


 70%|███████   | 7588/10840 [42:27<16:35,  3.27it/s]  

{'loss': 0.0232, 'learning_rate': 3e-06, 'epoch': 14.0}


                                                    
 70%|███████   | 7588/10840 [42:44<16:35,  3.27it/s]

{'eval_loss': 1.1665211915969849, 'eval_accuracy': 0.8714090287277702, 'eval_runtime': 16.792, 'eval_samples_per_second': 87.065, 'eval_steps_per_second': 10.898, 'epoch': 14.0}


 75%|███████▌  | 8130/10840 [45:31<13:46,  3.28it/s]  

{'loss': 0.0136, 'learning_rate': 2.5e-06, 'epoch': 15.0}


                                                    
 75%|███████▌  | 8130/10840 [45:47<13:46,  3.28it/s]

{'eval_loss': 1.1477108001708984, 'eval_accuracy': 0.8830369357045144, 'eval_runtime': 16.7404, 'eval_samples_per_second': 87.334, 'eval_steps_per_second': 10.932, 'epoch': 15.0}


 80%|████████  | 8672/10840 [48:33<11:17,  3.20it/s]  

{'loss': 0.0129, 'learning_rate': 2.0000000000000003e-06, 'epoch': 16.0}


                                                    
 80%|████████  | 8672/10840 [48:50<11:17,  3.20it/s]

{'eval_loss': 1.1054612398147583, 'eval_accuracy': 0.8864569083447332, 'eval_runtime': 16.7875, 'eval_samples_per_second': 87.088, 'eval_steps_per_second': 10.901, 'epoch': 16.0}


 85%|████████▌ | 9214/10840 [51:37<08:15,  3.28it/s]  

{'loss': 0.0139, 'learning_rate': 1.5e-06, 'epoch': 17.0}


                                                    
 85%|████████▌ | 9214/10840 [51:53<08:15,  3.28it/s]

{'eval_loss': 1.2362123727798462, 'eval_accuracy': 0.874829001367989, 'eval_runtime': 16.7585, 'eval_samples_per_second': 87.239, 'eval_steps_per_second': 10.92, 'epoch': 17.0}


 90%|█████████ | 9756/10840 [54:40<05:37,  3.21it/s]  

{'loss': 0.0087, 'learning_rate': 1.0000000000000002e-06, 'epoch': 18.0}


                                                    
 90%|█████████ | 9756/10840 [54:57<05:37,  3.21it/s]

{'eval_loss': 1.2058629989624023, 'eval_accuracy': 0.8830369357045144, 'eval_runtime': 16.7747, 'eval_samples_per_second': 87.155, 'eval_steps_per_second': 10.909, 'epoch': 18.0}


 95%|█████████▌| 10298/10840 [57:44<02:46,  3.26it/s] 

{'loss': 0.0079, 'learning_rate': 5.000000000000001e-07, 'epoch': 19.0}


                                                     
 95%|█████████▌| 10298/10840 [58:01<02:46,  3.26it/s]

{'eval_loss': 1.2166485786437988, 'eval_accuracy': 0.8809849521203831, 'eval_runtime': 16.8467, 'eval_samples_per_second': 86.783, 'eval_steps_per_second': 10.863, 'epoch': 19.0}


100%|██████████| 10840/10840 [1:00:57<00:00,  3.10it/s]

{'loss': 0.0072, 'learning_rate': 0.0, 'epoch': 20.0}


                                                       
100%|██████████| 10840/10840 [1:01:15<00:00,  2.95it/s]

{'eval_loss': 1.1956737041473389, 'eval_accuracy': 0.8823529411764706, 'eval_runtime': 17.7531, 'eval_samples_per_second': 82.352, 'eval_steps_per_second': 10.308, 'epoch': 20.0}
{'train_runtime': 3675.0034, 'train_samples_per_second': 23.597, 'train_steps_per_second': 2.95, 'train_loss': 0.10226292942282898, 'epoch': 20.0}
Execution Time : 3675 seconds





In [16]:
# Evaluate the trained model once more with the Trainer's `eval_dataset`; the
# returned metrics dict is displayed as the cell output.
trainer.evaluate()

100%|██████████| 183/183 [00:17<00:00, 10.35it/s]


{'eval_loss': 1.1956737041473389,
 'eval_accuracy': 0.8823529411764706,
 'eval_runtime': 17.782,
 'eval_samples_per_second': 82.218,
 'eval_steps_per_second': 10.291,
 'epoch': 20.0}