In [1]:
import pandas as pd

# Name of the sub-directory (under ../../data/models/) that receives the fine-tuned checkpoints.
finetuned_dirname = "distilbert-base-finetuned-phemernr2-tf"

# Load the processed dataset, keep only the text, split and label columns, and
# lower-case the tweet text.  The whole pipeline is a single chain so that no
# column is assigned onto a sliced frame (which can raise SettingWithCopyWarning)
# and re-running the cell always starts again from the CSV.
data = (
    pd.read_csv("../../data/processed/phemernr2-tf_dataset.csv", sep=",")
    .loc[:, ['tweet_text', 'tvt2', 'label']]
    .assign(tweet_text=lambda frame: frame['tweet_text'].str.lower())
)
print(data.shape)
data.head()

(1705, 3)


Unnamed: 0,tweet_text,tvt2,label
0,breaking - a germanwings airbus a320 plane rep...,training,True
1,reports that two of the dead in the #charliehe...,training,True
2,'no survivors' in #germanwings crash says fren...,training,False
3,tragedy mounts as soldier shot this am dies of...,training,True
4,watch the moment gunfire and explosions were h...,training,True


In [2]:
# Alias only: combined_data and data refer to the same DataFrame object (no copy is made).
combined_data = data

In [3]:
import torch

class CustomTextDataset(torch.utils.data.dataset.Dataset):
    """Map-style dataset pairing raw texts with their labels.

    Until ``tokenize`` has been called, the ``attention_mask`` and ``input_ids``
    entries of every sample are ``None``.

    Args:
        texts: indexable sequence of input texts.
        labels: indexable sequence of labels, positionally aligned with ``texts``.
    """

    def __init__(self, texts, labels):
        self.labels = labels
        self.texts = texts
        # Filled in by tokenize(); None means "not tokenized yet".
        self.attention_mask = None
        self.input_ids = None
        self.token_type_ids = None

    def __len__(self):
        """Return the number of samples (taken from the labels sequence)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict.

        Keys are ``text``, ``label``, ``attention_mask`` and ``input_ids``; the
        last two are ``None`` when the dataset has not been tokenized.
        """
        attention_mask = None
        if self.attention_mask is not None:
            attention_mask = self.attention_mask[idx]

        input_ids = None
        if self.input_ids is not None:
            input_ids = self.input_ids[idx]

        return {
            "text": self.texts[idx],
            "label": self.labels[idx],
            "attention_mask": attention_mask,
            "input_ids": input_ids,
        }

    def tokenize(self, tokenizer):
        """Tokenize every text and store the resulting masks and ids.

        ``tokenizer`` is called once per text with ``padding="max_length"`` and
        ``truncation=True`` and must return a mapping containing
        ``attention_mask`` and ``input_ids``.

        The results are collected in local lists and published on the instance
        only after every text succeeded, so an exception raised by the
        tokenizer leaves the dataset in its previous state instead of
        half-tokenized.
        """
        attention_masks = []
        input_id_lists = []

        for text in self.texts:
            encoded = tokenizer(text, padding="max_length", truncation=True)
            attention_masks.append(encoded['attention_mask'])
            input_id_lists.append(encoded['input_ids'])

        self.attention_mask = attention_masks
        self.input_ids = input_id_lists
        # Collection of token_type_ids is intentionally disabled; the attribute
        # is reset to an empty list to signal that tokenize() has run.
        self.token_type_ids = []

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Distinct label values in order of first appearance.  A value's position in
# this list is its integer class id, so the ids depend on the row order of
# combined_data.
labels_str = combined_data['label'].unique().tolist()
label_to_id = {label_value: class_id for class_id, label_value in enumerate(labels_str)}

# One class id per row, positionally aligned with the rows of combined_data.
# A dict lookup over the column replaces the row-by-row iterrows()/list.index scan.
labels = [label_to_id[label_value] for label_value in combined_data['label']]

print(len(labels))
labels[:10]

1705


[0, 0, 1, 0, 0, 0, 0, 1, 0, 0]

In [5]:
def _make_split_dataset(split_name):
    """Build a CustomTextDataset from the rows whose 'tvt2' value equals split_name.

    ``labels`` holds one class id per row in row order, so rows are selected by
    position with a boolean mask.  Indexing ``labels`` with the DataFrame index
    label instead is only correct while the index is the default 0..n-1 range.
    """
    in_split = (combined_data['tvt2'] == split_name).to_numpy()
    split_texts = combined_data.loc[in_split, 'tweet_text'].tolist()
    split_labels = [class_id for class_id, keep in zip(labels, in_split) if keep]
    return CustomTextDataset(split_texts, split_labels)


train_dataset = _make_split_dataset('training')
# Despite its name, test_dataset is built from the 'validation' split.
test_dataset = _make_split_dataset('validation')
train_dataset[0]

{'text': 'breaking - a germanwings airbus a320 plane reportedly crashed in the region of digne (french alps) #flightradar24 - french tv #itele',
 'label': 0,
 'attention_mask': None,
 'input_ids': None}

In [6]:
from transformers import AutoTokenizer

# Tokenizer matching the "distilbert-base-cased" checkpoint.
# NOTE(review): tweet_text is lower-cased when the CSV is loaded, yet this is the
# *cased* checkpoint — confirm whether an uncased checkpoint was intended.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")

In [7]:
# inputs = tokenizer(["you're stuck in a timewrap from 2004 though", "summa lumma dumma lumma"], padding="max_length", truncation=True)
# for k,v in inputs.items():
#     print(k)

In [8]:
def tokenize_function(examples):
    """Tokenize ``examples["text"]`` with the notebook-level ``tokenizer``.

    The text is truncated and padded to the maximum length; the tokenizer's
    output is returned unchanged.
    """
    encoded = tokenizer(examples["text"], truncation=True, padding="max_length")
    return encoded

# Populate input_ids / attention_mask on both splits before they are handed to the Trainer.
for split_dataset in (train_dataset, test_dataset):
    split_dataset.tokenize(tokenizer)

In [9]:
# Number of examples in the training split, then in the evaluation split (one per line).
for split_dataset in (train_dataset, test_dataset):
    print(len(split_dataset))

1176
371


### Fine Tuning

In [10]:
from transformers import AutoModelForSequenceClassification

from transformers import set_seed

# The classification head is not part of the pretrained checkpoint and is
# randomly initialised when the model is built.  Seed the Python / NumPy /
# PyTorch RNGs first so that this initialisation, and therefore the whole
# fine-tuning run, is reproducible after Restart & Run All.
set_seed(42)

# Two-class sequence classifier on top of the pretrained DistilBERT encoder;
# hidden states are not returned in the model output.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-cased",
    output_hidden_states=False,
    num_labels=2,
)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier

In [11]:
from transformers import TrainingArguments

import math

epochs = 20
batch_size = 8

# Optimization steps per epoch is the ceiling of examples / batch size.
# math.ceil replaces the previous round(x + 0.49) approximation, which rounds
# *down* whenever the fractional part is below roughly 0.01
# (e.g. 1025 examples with a batch size of 1024 gave 1 instead of 2).
steps_per_epoch = math.ceil(len(train_dataset) / batch_size)

# Saving every (steps_per_epoch * epochs) steps means a single checkpoint is
# written at the last optimization step (assuming one device and no gradient
# accumulation, as in this run).
save_steps = steps_per_epoch * epochs
# save_steps = 1_000_000

training_args = TrainingArguments(
    output_dir=f"../../data/models/{finetuned_dirname}",
    num_train_epochs=epochs,
    save_steps=save_steps,
    logging_steps=300,
    learning_rate=1e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size
)

print(f"Save Steps : {save_steps}")

Save Steps : 2940


In [12]:
import numpy as np
from datasets import load_metric

# Load the "accuracy" metric object through datasets.load_metric.
# NOTE(review): this cell's output echoes this line, which looks like a
# deprecation warning — confirm load_metric is still supported by the
# installed `datasets` version.
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Accuracy is computed directly with NumPy so the function does not depend on
    a metric object loaded through the deprecated ``datasets.load_metric``.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class of each
            example is the argmax of ``logits`` over the last axis.

    Returns:
        dict with a single key ``"accuracy"`` holding the fraction of examples
        whose predicted class equals the reference label, as a Python float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}

  metric = load_metric("accuracy")


In [13]:
from transformers import Trainer

# Wire the model, hyper-parameters, both tokenized splits and the metric
# callback into a single Trainer instance.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

In [14]:
import time

# Measure elapsed time with the monotonic performance counter: unlike
# time.time(), it cannot jump when the system clock is adjusted during the run.
start = time.perf_counter()

trainer.train()

print(f"Execution Time : {round(time.perf_counter() - start)} seconds")

***** Running training *****
  Num examples = 1176
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2940
  Number of trainable parameters = 65783042
  0%|          | 0/2940 [00:00<?, ?it/s]The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
 10%|█         | 300/2940 [01:03<08:57,  4.92it/s]

{'loss': 0.4536, 'learning_rate': 8.979591836734695e-06, 'epoch': 2.04}


 20%|██        | 600/2940 [02:04<08:17,  4.70it/s]

{'loss': 0.1821, 'learning_rate': 7.959183673469388e-06, 'epoch': 4.08}


 31%|███       | 900/2940 [03:08<07:18,  4.65it/s]

{'loss': 0.0773, 'learning_rate': 6.938775510204082e-06, 'epoch': 6.12}


 41%|████      | 1200/2940 [04:13<06:16,  4.62it/s]

{'loss': 0.0365, 'learning_rate': 5.918367346938776e-06, 'epoch': 8.16}


 51%|█████     | 1500/2940 [05:17<05:07,  4.68it/s]

{'loss': 0.0162, 'learning_rate': 4.897959183673469e-06, 'epoch': 10.2}


 61%|██████    | 1800/2940 [06:19<03:53,  4.88it/s]

{'loss': 0.0131, 'learning_rate': 3.877551020408164e-06, 'epoch': 12.24}


 71%|███████▏  | 2100/2940 [07:22<02:56,  4.76it/s]

{'loss': 0.0051, 'learning_rate': 2.8571428571428573e-06, 'epoch': 14.29}


 82%|████████▏ | 2401/2940 [08:26<01:55,  4.68it/s]

{'loss': 0.0095, 'learning_rate': 1.8367346938775512e-06, 'epoch': 16.33}


 92%|█████████▏| 2701/2940 [09:30<00:51,  4.65it/s]

{'loss': 0.0012, 'learning_rate': 8.163265306122449e-07, 'epoch': 18.37}


100%|██████████| 2940/2940 [10:21<00:00,  4.69it/s]Saving model checkpoint to ../../data/models/distilbert-base-finetuned-phemernr2-tf\checkpoint-2940
Configuration saved in ../../data/models/distilbert-base-finetuned-phemernr2-tf\checkpoint-2940\config.json
Model weights saved in ../../data/models/distilbert-base-finetuned-phemernr2-tf\checkpoint-2940\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 2940/2940 [10:22<00:00,  4.72it/s]

{'train_runtime': 622.971, 'train_samples_per_second': 37.755, 'train_steps_per_second': 4.719, 'train_loss': 0.0811973880646991, 'epoch': 20.0}
Execution Time : 623 seconds





In [15]:
# Evaluate on the eval_dataset passed to the Trainer and display the returned metrics dict.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 371
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
100%|██████████| 47/47 [00:02<00:00, 18.38it/s]


{'eval_loss': 0.9550685286521912,
 'eval_accuracy': 0.876010781671159,
 'eval_runtime': 2.6282,
 'eval_samples_per_second': 141.161,
 'eval_steps_per_second': 17.883,
 'epoch': 20.0}