# Fine-Tuning with DistilBERT

# Load the pre-trained model and its respective tokenizer 
To use a different model, simply change the checkpoint to any pre-trained text-classification model available on Hugging Face. Note that some models cannot be fine-tuned directly with the transformers API. [A list of models can be found here](https://huggingface.co/models?pipeline_tag=text-classification&sort=downloads)

In [1]:
from transformers import  AutoModelForSequenceClassification, AutoTokenizer
# Hub identifier of the checkpoint that both the model and the tokenizer are loaded from.
# NOTE(review): the name suggests this checkpoint was already fine-tuned on SST-2 — confirm
# that this is the intended starting point.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
# Sequence-classification model configured with two output labels
classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# Tokenizer loaded from the same checkpoint as the model
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load the data and preprocess it 

In [2]:
import pandas as pd
# Load the training data. The location is defined once in `train_path` and
# reused for the read, so it only has to be changed in a single place.
train_path = './nlp-getting-started/train.csv'
df = pd.read_csv(train_path)

In [3]:
# Preview the first rows to sanity-check the columns and their contents
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Summarise column dtypes and non-null counts to spot missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
# Only the text and its label are used from here on; id, keyword and location are discarded
df = df[["text", "target"]]

In [6]:
# Hold out 20% of the rows for evaluation, stratifying on `target` so both
# parts keep the same class balance
from sklearn.model_selection import train_test_split
df_train, df_eval = train_test_split(
    df,
    train_size=0.8,
    stratify=df["target"],
    random_state=42,  # fixed seed so the split is reproducible
)

***

# Turn pandas dataframe into dataset
We will use the Trainer API from Hugging Face for fine-tuning, which requires the data to be provided as a `Dataset`. Therefore, we convert our pandas DataFrames into `Dataset` objects stored in a `DatasetDict`.

In [7]:
from datasets import Dataset, DatasetDict
# Wrap each pandas split in a Dataset and collect them under their split names
raw_datasets = DatasetDict({
    split_name: Dataset.from_pandas(split_frame)
    for split_name, split_frame in (("train", df_train), ("eval", df_eval))
})

In [8]:
# Inspect the converted datasets: the overall structure, the column schema of
# the train split, and one example row
print("Dataset Dict:\n", raw_datasets)
print("\n\nTrain's features:\n", raw_datasets["train"].features)
print("\n\nFirst row of Train:\n", raw_datasets["train"][0])

Dataset Dict:
 DatasetDict({
    train: Dataset({
        features: ['text', 'target', '__index_level_0__'],
        num_rows: 6090
    })
    eval: Dataset({
        features: ['text', 'target', '__index_level_0__'],
        num_rows: 1523
    })
})


Train's features:
 {'text': Value(dtype='string', id=None), 'target': Value(dtype='int64', id=None), '__index_level_0__': Value(dtype='int64', id=None)}


First row of Train:
 {'text': 'Sassy city girl country hunk stranded in Smoky Mountain snowstorm #AoMS http://t.co/nkKcTttsD9 #ibooklove #bookboost', 'target': 1, '__index_level_0__': 6234}


***

# Tokenizing
Neural networks require numeric input for training to take place. Therefore, we will use the tokenizer to convert each text into a sequence of numbers (token IDs).

In [9]:
def tokenize_batch(batch):
    """Tokenize the `text` column of a batch, truncating inputs that exceed the tokenizer's maximum length."""
    return tokenizer(batch["text"], truncation=True)


# batched=True passes multiple texts to the tokenizer at the same time
tokenized_datasets = raw_datasets.map(tokenize_batch, batched=True)
print(tokenized_datasets)

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'target', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 6090
    })
    eval: Dataset({
        features: ['text', 'target', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1523
    })
})


In [10]:
# Show the first training example, now including the tokenizer's
# `input_ids` and `attention_mask` alongside the original columns
print(tokenized_datasets["train"][0])

{'text': 'Sassy city girl country hunk stranded in Smoky Mountain snowstorm #AoMS http://t.co/nkKcTttsD9 #ibooklove #bookboost', 'target': 1, '__index_level_0__': 6234, 'input_ids': [101, 21871, 6508, 2103, 2611, 2406, 15876, 8950, 15577, 1999, 20629, 3137, 4586, 19718, 1001, 20118, 5244, 8299, 1024, 1013, 1013, 1056, 1012, 2522, 1013, 25930, 2243, 6593, 4779, 16150, 2683, 1001, 21307, 14659, 14301, 2063, 1001, 2338, 5092, 14122, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


We want to remove `text` and `__index_level_0__`, as they are not needed for fine-tuning our model. We will also rename "target" to "labels", because the Trainer API requires the target column to be named "labels".

In [11]:
# Drop the raw text and the index column carried over from pandas, then
# expose the target under the name "labels"
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text", "__index_level_0__"])
    .rename_column("target", "labels")
)
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 6090
    })
    eval: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1523
    })
})


***

# Training
We will be using Trainer API from HuggingFace for training. 

In [12]:
!pip -q install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer
import numpy as np
import evaluate

# Collator that pads each batch of examples fed to the model during training
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Run configuration: one epoch, evaluate at the end of each epoch, no
# checkpoint saving and no external experiment reporting
training_args = TrainingArguments(
    output_dir="test-trainer",
    num_train_epochs=1,
    evaluation_strategy="epoch",
    save_strategy="no",
    weight_decay=5e-4,
    report_to="none",
)

# Metric for validation error (F1 and accuracy). Loaded once here instead of
# inside `compute_metrics`, so it is not reloaded on every evaluation call.
glue_mrpc_metric = evaluate.load("glue", "mrpc")


def compute_metrics(eval_preds):
    """Score evaluation predictions with the GLUE/MRPC metric.

    Args:
        eval_preds: a pair ``(logits, labels)`` as handed over by the Trainer.

    Returns:
        The result of the metric's ``compute`` call for the predicted class
        ids against the reference labels.
    """
    logits, labels = eval_preds
    # The predicted class is the index of the largest logit for each example
    predictions = np.argmax(logits, axis=-1)
    return glue_mrpc_metric.compute(predictions=predictions, references=labels)

# Assemble the Trainer from the model, its run configuration, both dataset
# splits, the padding collator and the metric callback
trainer = Trainer(
    model=classifier,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
# Start the fine-tuning; the value returned by `train()` is shown as the cell result
trainer.train()

  0%|          | 0/762 [00:00<?, ?it/s]

{'loss': 0.4859, 'grad_norm': 2.137977361679077, 'learning_rate': 1.7191601049868766e-05, 'epoch': 0.66}


  0%|          | 0/191 [00:00<?, ?it/s]

{'eval_loss': 0.3781295716762543, 'eval_accuracy': 0.8483256730137886, 'eval_f1': 0.8132578819725141, 'eval_runtime': 12.428, 'eval_samples_per_second': 122.546, 'eval_steps_per_second': 15.369, 'epoch': 1.0}
{'train_runtime': 162.9135, 'train_samples_per_second': 37.382, 'train_steps_per_second': 4.677, 'train_loss': 0.4633314878608924, 'epoch': 1.0}


TrainOutput(global_step=762, training_loss=0.4633314878608924, metrics={'train_runtime': 162.9135, 'train_samples_per_second': 37.382, 'train_steps_per_second': 4.677, 'total_flos': 79399199421768.0, 'train_loss': 0.4633314878608924, 'epoch': 1.0})

# Quick evaluation using classification metrics
We'll use [sklearn.metrics.classification_report](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html) for a quick evaluation.

In [15]:
from sklearn.metrics import classification_report

# Predict on the evaluation split and reduce the raw model outputs to class ids
y_pred = np.argmax(trainer.predict(tokenized_datasets["eval"]).predictions, axis=-1)

# Reference labels for the same split, as an array
y_true = np.array(tokenized_datasets["eval"]["labels"])

# Per-class precision / recall / F1, reported to three decimals
print(classification_report(y_true, y_pred, digits=3))

  0%|          | 0/191 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0      0.839     0.908     0.872       869
           1      0.863     0.769     0.813       654

    accuracy                          0.848      1523
   macro avg      0.851     0.839     0.843      1523
weighted avg      0.849     0.848     0.847      1523



# Checking Outputs

In [16]:
# Read the test set and keep the ids aside for the submission file
df_test = pd.read_csv("./nlp-getting-started/test.csv")
ids = df_test["id"]
df_test = df_test[["text"]]  # only the text column is kept

# Convert to a Dataset, tokenize with truncation, then drop the raw text column
test_dataset = (
    Dataset.from_pandas(df_test)
    .map(lambda batch: tokenizer(batch["text"], truncation=True), batched=True)
    .remove_columns("text")
)

# Predict and take the highest-scoring class for every row
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)

# Pair each id with its predicted target and write the result to CSV
submission = pd.DataFrame({"id": ids, "target": preds})
submission.to_csv("outputs.csv", index=False)

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

  0%|          | 0/408 [00:00<?, ?it/s]