<a href="https://colab.research.google.com/github/ThierrryScotto/Finetunig-BERT/blob/main/Finetunig_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [None]:
!pip install -U -q transformers
!pip install -U -q accelerate
!pip install -U -q datasets
!pip install -U -q bertviz
!pip install -U -q umap-learn
!pip install -U -q evaluate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/324.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m317.4/324.4 kB[0m [31m66.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.4/324.4 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoModel

In [None]:
# Location of the labelled tweets CSV in the project's GitHub repository.
url_dataset = (
    'https://raw.githubusercontent.com/ThierrryScotto/Finetunig-BERT/'
    'main/twitter_multi_class_sentiment.csv'
)

# Read the CSV straight from the URL into a DataFrame and display it.
df = pd.read_csv(filepath_or_buffer=url_dataset)
df

# Data Analysis

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Number of rows for each distinct value of the 'label' column.
df['label'].value_counts()

In [None]:
# Rows per class name, sorted so the most frequent class ends up on top of the chart.
label_counts = df['label_name'].value_counts(ascending=True)

# Horizontal bar chart drawn through the explicit Axes handle.
ax = label_counts.plot.barh()
ax.set_title('Frequency of Classes')
ax.set_ylabel('Emotions')

In [None]:
# Whitespace-delimited word count of every tweet.
# `.str.len()` is the vectorised equivalent of `.apply(len)` and yields NaN for a
# missing text instead of raising TypeError on a float.
df['words per tweet'] = df['text'].str.split().str.len()
df

In [None]:
# Distribution of tweet length (in words), one box per class name.
df.boxplot("words per tweet", by='label_name')

# Data Preparation

### Tokenizing Test

In [None]:
# Checkpoint name; reused further down when the models are loaded.
model_ckpt = "bert-base-uncased"
# Tokenizer matching that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Sample sentence used to sanity-check the tokenizer.
text = "I love machne learning! Tokenization is awesome!"

# Calling the tokenizer on a single string returns its encoding; print it for inspection.
encoded_text = tokenizer(text)
print(encoded_text)

In [None]:
# Peek at a handful of vocabulary entries (token -> id) instead of dumping the
# entire mapping into the cell output.
dict(list(tokenizer.vocab.items())[:20])

In [None]:
# Number of entries in the tokenizer's vocabulary.
len(tokenizer.vocab)

## Data Loader and Train Test Split

In [None]:
# Re-display the frame that is about to be split.
df

In [None]:
# Fixed seed so the stratified splits (and every metric computed on them) are
# identical after Restart & Run All.
SPLIT_SEED = 42

# 70% train / 30% hold-out, preserving the class proportions.
train, test = train_test_split(
    df, test_size=0.3, stratify=df['label_name'], random_state=SPLIT_SEED
)
# Split the hold-out 2:1, i.e. 20% test and 10% validation of the full dataset.
test, validation = train_test_split(
    test, test_size=1/3, stratify=test['label_name'], random_state=SPLIT_SEED
)

In [None]:
# Report (rows, columns) of each split, in train / test / validation order.
for split_frame in (train, test, validation):
    print(split_frame.shape)

In [None]:
# Wrap the three pandas splits in a DatasetDict; the pandas index is dropped
# rather than stored as an extra column.
dataset = DatasetDict({
    split_name: Dataset.from_pandas(split_frame, preserve_index=False)
    for split_name, split_frame in (
        ('train', train),
        ('test', test),
        ('validation', validation),
    )
})

dataset

## Tokenization of the tweets

In [None]:
# Inspect one training record (row 129).
dataset['train'][129]

In [None]:
def tokenize(batch):
    """Tokenize the ``'text'`` field of ``batch`` with the module-level tokenizer.

    Args:
        batch: Mapping with a ``'text'`` entry (the cells below pass both a
            single record and a slice of records).

    Returns:
        Whatever the tokenizer returns for that text, with padding enabled
        and truncation capped at 512 tokens.
    """
    return tokenizer(
        batch['text'],
        padding=True,
        truncation=True,
        max_length=512,
    )

In [None]:
# Tokenize a single record: here batch['text'] is one string.
print(tokenize(dataset['train'][129]))

In [None]:
# Tokenize a two-record slice: here batch['text'] holds several texts at once.
print(tokenize(dataset['train'][:2]))

In [None]:
# The first two raw training records, for comparison with their tokenized form.
dataset['train'][0], dataset['train'][1]

Tokenize every split of the dataset:

In [None]:
# Apply `tokenize` to every split. With batched=True and batch_size=None each
# split is passed to the function as one batch.
emotion_encoded = dataset.map(tokenize, batched=True, batch_size=None)

In [None]:
# First training record after the tokenization map.
emotion_encoded['train'][0]

In [None]:
# Build the class-name -> id mapping from the training split's two label
# columns, then invert it to get id -> class-name.
train_split = dataset['train']
label2id = dict(zip(train_split['label_name'], train_split['label']))
id2label = {label_id: label_name for label_name, label_id in label2id.items()}

label2id, id2label

# Model Building

In [None]:
# Load the pretrained checkpoint via AutoModel for inspection in the next cells.
# NOTE: `model` is rebound to a sequence-classification model in the Fine-Tuning section.
model= AutoModel.from_pretrained(model_ckpt)

In [None]:
# Show the module structure of the loaded model.
model

In [None]:
# Architecture name(s) recorded in the model's config.
model.config.architectures

In [None]:
# Full configuration of the loaded model.
model.config

## Fine-Tuning

In [None]:
from transformers import AutoModelForSequenceClassification, AutoConfig

In [None]:
# One output class per entry in the label mapping.
num_labels = len(label2id)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Checkpoint configuration extended with the number of classes and both
# directions of the label mapping.
config = AutoConfig.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

# Load the checkpoint as a sequence-classification model with that
# configuration, then move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config)
model = model.to(device)

In [None]:
# Confirm which device was selected (cuda or cpu).
device

In [None]:
# Show the module structure of the model currently bound to `model`.
model

## Compute metrics

In [None]:
import evaluate

# Load the accuracy metric from the `evaluate` library; `compute_metrics` uses it.
accuracy = evaluate.load('accuracy')

In [None]:
def compute_metrics(pred):
    """Score a batch of predictions with the module-level ``accuracy`` metric.

    Args:
        pred: A ``(logits, labels)`` pair; ``logits`` holds one row of class
            scores per example.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row
        against ``labels``.
    """
    logits, labels = pred
    # The predicted class is the index of the highest score along the last axis.
    return accuracy.compute(
        predictions=np.argmax(logits, axis=-1),
        references=labels,
    )

## Training

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# Examples per device and step, used for both training and evaluation.
batch_size = 64

# Directory that receives checkpoints and trainer state.
training_dir = "bert_base_train_dir"

# `eval_strategy` replaces the deprecated `evaluation_strategy` keyword
# (renamed in transformers 4.41). Recent releases reject the old spelling, and
# recent releases are what `pip install -U transformers` installs.
training_args = TrainingArguments(
    output_dir = training_dir,
    overwrite_output_dir = True,
    num_train_epochs = 2,
    learning_rate = 2e-5,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    weight_decay = 0.01,
    eval_strategy = "epoch",   # evaluate at the end of every epoch
    save_strategy = "epoch",   # checkpoint at the same cadence as evaluation
    disable_tqdm = False
)

In [None]:
# Show the tokenizer's settings before it is handed to the Trainer.
tokenizer

In [None]:
# Train on the tokenized 'train' split and evaluate on the 'validation' split.
# `processing_class` is the current name of the former `tokenizer` argument
# (deprecated since transformers 4.46 and scheduled for removal in 5.0).
trainer = Trainer(model = model,
                  args = training_args,
                  compute_metrics = compute_metrics,
                  train_dataset = emotion_encoded['train'],
                  eval_dataset = emotion_encoded['validation'],
                  processing_class = tokenizer)

In [None]:
# Run fine-tuning; evaluation and checkpointing follow the strategies set in `training_args`.
trainer.train()