<a href="https://colab.research.google.com/github/Rstam59/ds-portfolio/blob/main/Classification_with_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets

In [None]:
from huggingface_hub import list_datasets

# list_datasets() yields entries lazily; collect them so they can be counted and sliced.
all_dataset = [entry for entry in list_datasets()]
print(f"Total number of datasets: {len(all_dataset)}")
print(f"First 10 datasets: {all_dataset[:10]}")


In [None]:
!pip install -U datasets

In [None]:
# !pip install -U fsspec


In [None]:
from datasets import load_dataset  # not load_datasets

# Load the "emotion" dataset by name; printing the result shows what was loaded.
emotions = load_dataset("emotion")
print(emotions)


In [None]:
# Select the training split; the bare name on the last line displays it.
train_ds = emotions['train']
train_ds

In [None]:
# Number of rows in the training split.
len(train_ds)

In [None]:
 train_ds.features  # feature schema of the split

In [None]:
# First five entries of the 'text' column.
train_ds['text'][:5]

In [None]:
# NOTE(review): repeats the assignment made a few cells above; one copy can be removed.
train_ds = emotions['train']
train_ds

In [None]:
len(train_ds)

In [None]:
# Look at the first example.
train_ds[0]

In [None]:
train_ds.column_names

In [None]:
print(train_ds.features)

In [None]:
# Slice out the first five rows.
print(train_ds[:5])

In [None]:
print(train_ds['text'][:5])

# What if my dataset is not on the Hub?

In [None]:
# Download the raw training file into the working directory as train.txt.
dataset_url = "https://huggingface.co/datasets/transformersbook/emotion-train-split/raw/main/train.txt"
!wget {dataset_url}

In [None]:
# Peek at the first three lines to see the file layout.
!head -n 3 train.txt

In [None]:
# Parse the local file with the CSV loader: ';' is the separator and the two columns are named explicitly.
emotions_local = load_dataset('csv', data_files='train.txt', sep = ';', names = ['text', 'label'])

In [None]:
# NOTE(review): identical to the previous cell; one copy can be removed.
emotions_local = load_dataset('csv', data_files = 'train.txt', sep = ';', names = ['text', 'label'])

In [None]:
# Simpler: pass the URL as data_files directly, with no separate wget step.
dataset_url = "https://huggingface.co/datasets/transformersbook/emotion-train-split/raw/main/train.txt"
emotions_remote = load_dataset("csv", data_files=dataset_url, sep=";",
                               names=["text", "label"])

# From datasets to DataFrames

In [None]:
# NOTE(review): this cell and the four after it are repeated almost verbatim
# further down (the later bar chart only adds a title); one copy can be removed.
import pandas as pd

# Switch the output format so indexing returns pandas objects.
emotions.set_format(type = 'pandas')

# [:] pulls the whole training split into one DataFrame.
df = emotions['train'][:]
df.head()

In [None]:
def label_int2str(row):
    """Convert an integer label id to its string name via the train split's `label` feature."""
    return emotions['train'].features['label'].int2str(row)

# Add a human-readable label column next to the integer ids.
df['label_name'] = df['label'].apply(label_int2str)
df.head()

In [None]:
import matplotlib.pyplot as plt

# Horizontal bar chart of how many examples each class has.
df['label_name'].value_counts().plot.barh()
plt.show()

In [None]:
# Whitespace-split word count per example, then its distribution per class.
df['Words Per Tweet'] = df['text'].str.split().apply(len)
df.boxplot('Words Per Tweet', by = 'label_name', grid = False, showfliers = False, color = 'black')
plt.suptitle('')  # clear the figure-level title
plt.xlabel('')
plt.show()

In [None]:
# Undo the pandas output format set above.
emotions.reset_format()

In [None]:
import pandas as pd

# Have the dataset hand back pandas objects, then materialise the full train split.
emotions.set_format('pandas')
df = emotions['train'][:]
df.head()

In [None]:
def label_int2str(row):
    """Translate an integer class id into its label name.

    The lookup goes through the `label` feature of the module-level
    `emotions['train']` split.
    """
    label_feature = emotions['train'].features['label']
    return label_feature.int2str(row)


# Attach the readable class name next to each integer label.
df['label_name'] = df['label'].apply(label_int2str)
df.head()

In [None]:
import matplotlib.pyplot as plt

# Class balance: one horizontal bar per label, titled through the plot call itself.
df['label_name'].value_counts().plot.barh(title='Frequency of classes')
plt.show()

# How long are our tweets?

In [None]:
# Count whitespace-separated words in each tweet.
df['Words Per Tweet'] = df['text'].str.split().str.len()
# One box per class; outliers hidden.
df.boxplot(
    column='Words Per Tweet',
    by='label_name',
    grid=False,
    showfliers=False,
    color='black',
)
plt.suptitle('')
plt.xlabel('')
plt.show()

In [None]:
# Undo the pandas output format set above.
emotions.reset_format()

# From text to tokens

# Character Tokenization

In [None]:
text = 'Tokenizing text is a core task of NLP.'
# Character-level tokenization: every character, spaces and punctuation included, is a token.
tokenized_text = [ch for ch in text]
print(tokenized_text)

In [None]:
# Map each unique character to an integer id, assigned in sorted order.
token2idx = {ch: idx for idx, ch in enumerate(sorted(set(tokenized_text)))}
print(token2idx)

In [None]:
# NOTE(review): identical to the previous cell; one copy can be removed.
token2idx = {ch: idx for idx, ch in enumerate(sorted(set(tokenized_text)))}
print(token2idx)

In [None]:
# Replace every character with its id.
input_ids = [token2idx[token] for token in tokenized_text]
print(input_ids)

In [None]:
# PyTorch version of the one-hot encoding done with TensorFlow in the next cell (disabled).
# import torch
# import torch.nn.functional as F

# input_ids = torch.tensor(input_ids)
# one_hot_encodings = F.one_hot(input_ids, num_classes=len(token2idx))
# one_hot_encodings.shape

In [None]:
import tensorflow as tf

# NOTE: rebinds input_ids from a Python list to a tensor.
input_ids = tf.constant(input_ids)
# One-hot depth equals the vocabulary size.
one_hot_encodings = tf.one_hot(input_ids, len(token2idx))
print(one_hot_encodings.shape)

In [None]:
# Inspect the first character: raw token, its id, and its one-hot vector.
print(f"Token: {tokenized_text[0]}")
print(f"Tensor index: {input_ids[0]}")
print(f"One-hot: {one_hot_encodings[0]}")

# Word tokenization

In [None]:
# Split on whitespace only, so punctuation stays attached to its word (e.g. 'NLP.').
# NOTE: rebinds tokenized_text, replacing the character-level list.
tokenized_text = text.split()
print(tokenized_text)

# Subword Tokenization

In [None]:
from transformers import AutoTokenizer

# Checkpoint name reused below for the tokenizer and the models.
model_ckpt = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Same checkpoint, loaded through the model-specific tokenizer class instead of the Auto class.
from transformers import DistilBertTokenizer
distilbert_tokenizer = DistilBertTokenizer.from_pretrained(model_ckpt)

In [None]:
# Encode the current value of `text`.
encoded_text = tokenizer(text)
print(encoded_text)

In [None]:
# Map the ids back to token strings.
tokens = tokenizer.convert_ids_to_tokens(encoded_text.input_ids)
print(tokens)

In [None]:
# Join the tokens back into a single string.
tokenizer.convert_tokens_to_string(tokens)

In [None]:
tokenizer.vocab_size

In [None]:
tokenizer.model_max_length

In [None]:
# These names are used later to pick which batch columns are fed to the model.
tokenizer.model_input_names

# Tokenizing the whole dataset

In [None]:
def tokenize(batch):
    """Tokenize the 'text' entries of a batch with the module-level `tokenizer`.

    Padding and truncation are both switched on. Returns whatever the
    tokenizer call returns.
    """
    texts = batch['text']
    return tokenizer(texts, padding=True, truncation=True)

In [None]:
# Sanity-check the function on the first two training examples.
print(tokenize(emotions['train'][:2]))

In [None]:
# Apply tokenize() to every split in batched mode.
# batch_size = None: presumably each split is passed as one batch — confirm against the datasets docs.
emotions_encoded = emotions.map(tokenize, batched = True, batch_size = None)

# Transformers as feature extractors

In [None]:
from transformers import AutoModel
import torch

model_ckpt = 'distilbert-base-uncased'
# Prefer the GPU when one is visible, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the bare encoder, then move its weights to the chosen device.
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)

In [None]:
# NOTE: rebinds `text`, replacing the example sentence used earlier.
text = 'this is a test'
# return_tensors = 'pt' asks the tokenizer for PyTorch tensors.
inputs = tokenizer(text, return_tensors = 'pt')
print(inputs)

In [None]:
# Move every input tensor to the model's device, then run without gradient tracking.
inputs = {k: v.to(device) for k, v in inputs.items()}
with torch.no_grad():
    outputs = model(**inputs)

print(outputs.last_hidden_state.size())

In [None]:
outputs.last_hidden_state.size()

In [None]:
# Token strings for the ids that were fed to the model.
tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

In [None]:
# Index 0 along the second axis — presumably the first token of each sequence; confirm from the size printed above.
outputs.last_hidden_state[:, 0].size()

In [None]:
def extract_hidden_states(batch):
    """Run the module-level `model` on a batch and keep index 0 of the second axis.

    Only the columns listed in `tokenizer.model_input_names` are passed to
    the model, after being moved to `device`. Gradients are not tracked.

    Returns:
        dict with one key, 'hidden_states', holding a NumPy array.
    """
    wanted = tokenizer.model_input_names
    inputs = {}
    for name, tensor in batch.items():
        if name in wanted:
            inputs[name] = tensor.to(device)

    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state

    first_position = hidden[:, 0]
    return {'hidden_states': first_position.cpu().numpy()}

In [None]:
# Return torch tensors for these three columns, since extract_hidden_states calls .to(device) on them.
emotions_encoded.set_format('torch', columns = ['input_ids', 'attention_mask', 'label'])

In [None]:
# Adds a 'hidden_states' column to every split.
emotions_hidden = emotions_encoded.map(extract_hidden_states, batched = True)

In [None]:
emotions_hidden['train'][0]

In [None]:
import numpy as np

# Feature matrix (hidden states) and target vector (labels) for each split.
hidden_train = emotions_hidden['train']
hidden_valid = emotions_hidden['validation']

X_train, y_train = np.array(hidden_train['hidden_states']), np.array(hidden_train['label'])
X_valid, y_valid = np.array(hidden_valid['hidden_states']), np.array(hidden_valid['label'])

# Visualizing the training data

In [None]:
import matplotlib.pyplot as plt
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature to [0, 1] before projecting.
X_scaled = MinMaxScaler().fit_transform(X_train)
# UMAP is stochastic: without a fixed random_state the 2-D layout (and every
# plot built from it) changes on each run. Pin it so Restart & Run All is repeatable.
mapper = UMAP(n_components = 2, metric = 'cosine', random_state = 42).fit(X_scaled)

# One row per training example: its 2-D coordinates plus its integer label.
df_emb = pd.DataFrame(mapper.embedding_, columns = ['X', 'Y'])
df_emb['label'] = y_train
df_emb.head()

In [None]:
# One density panel per class, each with its own colour map.
labels = emotions['train'].features['label']
cmaps = ['Greys', 'Blues', 'Oranges', 'Reds', 'Purples', 'Greens']

fig, ax = plt.subplots(2, 3, figsize=(7, 5))
ax = ax.flatten()

for class_id, (panel, class_name, cmap) in enumerate(zip(ax, labels.names, cmaps)):
    # Keep only the points that belong to this class.
    points = df_emb[df_emb['label'] == class_id]
    panel.hexbin(points['X'], points['Y'], cmap=cmap,
                 gridsize=20, linewidths=(0,))
    panel.set_title(class_name)
    panel.set_xticks([])
    panel.set_yticks([])

plt.tight_layout()
plt.show()

# Training a simple classifier

In [None]:
from sklearn.linear_model import LogisticRegression

# max_iter raised to 3000 — presumably so the solver converges on these features.
lr_clf = LogisticRegression(max_iter = 3000)
lr_clf.fit(X_train, y_train)

In [None]:
# Score the classifier on the validation features.
lr_clf.score(X_valid, y_valid)

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline for comparison: always predicts the most frequent training label.
dummy_clf = DummyClassifier(strategy = 'most_frequent')
dummy_clf.fit(X_train, y_train)
dummy_clf.score(X_valid, y_valid)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

def plot_confusion_matrix(y_preds, y_true, labels):
    """Draw a confusion matrix normalised with normalize='true'.

    Args:
        y_preds: predicted class ids.
        y_true: reference class ids.
        labels: display names for the classes, passed to the plot as tick labels.
    """
    normalized = confusion_matrix(y_true, y_preds, normalize='true')
    _, axis = plt.subplots(figsize=(6, 6))
    display = ConfusionMatrixDisplay(confusion_matrix=normalized,
                                     display_labels=labels)
    display.plot(cmap='Blues', values_format='.2f', ax=axis, colorbar=False)
    plt.title('Normalized confusion matrix')
    plt.show()

# Confusion matrix of the logistic-regression predictions on the validation set.
y_preds = lr_clf.predict(X_valid)
plot_confusion_matrix(y_preds, y_valid, labels.names)

# Fine-tuning Transformers

In [None]:
from transformers import AutoModelForSequenceClassification

model_ckpt = 'distilbert-base-uncased'
# Take the class count from the dataset's label feature instead of hard-coding 6,
# so the classification head always matches the data.
num_labels = emotions['train'].features['label'].num_classes
# NOTE: rebinds `model`; the feature-extraction encoder loaded earlier is replaced.
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt,
                                                           num_labels = num_labels).to(device)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 from a prediction object.

    Args:
        pred: object exposing `label_ids` (reference labels) and
            `predictions` (scores; argmax over the last axis gives the class).

    Returns:
        dict with keys 'accuracy' and 'f1'.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    weighted_f1 = f1_score(y_true, y_hat, average='weighted')
    accuracy = accuracy_score(y_true, y_hat)
    return {'accuracy': accuracy, 'f1': weighted_f1}

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub; training below is configured with push_to_hub = True.
notebook_login()

In [None]:
from transformers import TrainingArguments

batch_size = 64
# Steps in one pass over the training split, so logging happens about once per epoch.
logging_steps = len(emotions_encoded['train']) // batch_size
# Used as the output directory (and, with push_to_hub, presumably the Hub repository name — confirm).
model_name = f'{model_ckpt}-finetuned-emotion'
training_args = TrainingArguments(
    output_dir = model_name,
    num_train_epochs = 2,
    learning_rate = 2e-5,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    weight_decay = 0.01,
    eval_strategy = 'epoch',  # evaluate at the end of every epoch
    disable_tqdm = False,
    logging_steps = logging_steps,
    push_to_hub = True  # requires the Hub login performed before this cell
)

In [None]:
from transformers import Trainer

# NOTE(review): recent transformers releases appear to deprecate `tokenizer=` here
# in favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model = model,
    args = training_args,
    compute_metrics = compute_metrics,
    train_dataset = emotions_encoded['train'],
    eval_dataset = emotions_encoded['validation'],
    tokenizer = tokenizer
)

# Fine-tune; evaluation runs on the validation split according to training_args.
trainer.train()

In [None]:
# Run the fine-tuned model over the validation split.
preds_output = trainer.predict(emotions_encoded['validation'])

In [None]:
preds_output.metrics

In [None]:
# Highest-scoring class per example.
y_preds = np.argmax(preds_output.predictions, axis = 1)

In [None]:
# NOTE: y_valid and labels were defined in the feature-extraction section above.
plot_confusion_matrix(y_preds, y_valid, labels.names)

# Fine-tuning with Keras

In [None]:
from transformers import TFAutoModelForSequenceClassification

# TensorFlow classification model loaded from the same checkpoint and label count as above.
tf_model = (TFAutoModelForSequenceClassification
            .from_pretrained(model_ckpt, num_labels=num_labels))

In [None]:
# Feed the model exactly the columns the tokenizer names as model inputs.
tokenizer_columns = tokenizer.model_input_names

# Only the training data is shuffled; the validation order is kept.
tf_train_dataset = emotions_encoded["train"].to_tf_dataset(
    columns=tokenizer_columns, label_cols=["label"], shuffle=True,
    batch_size=batch_size)
tf_eval_dataset = emotions_encoded["validation"].to_tf_dataset(
    columns=tokenizer_columns, label_cols=["label"], shuffle=False,
    batch_size=batch_size)

In [None]:
import tensorflow as tf

# from_logits=True tells the loss that the model outputs are unnormalised scores.
tf_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy())

tf_model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=2)

# Error analysis

In [None]:
from torch.nn.functional import cross_entropy

def forward_pass_with_labels(batch):
    """Run the module-level `model` on a batch and return per-example loss and prediction.

    Only the columns listed in `tokenizer.model_input_names` are passed to the
    model; `batch['label']` is used as the cross-entropy target.

    Args:
        batch: mapping of column name to torch tensor; must contain 'label'.

    Returns:
        dict with NumPy arrays 'loss' (one unreduced cross-entropy value per
        example) and 'predicted_label' (argmax over the logits).
    """
    inputs = {k: v.to(device) for k, v in batch.items()
              if k in tokenizer.model_input_names}
    targets = batch['label'].to(device)

    # Do not rely on an earlier cell having left the model in eval mode:
    # with dropout active the per-example losses would be noisy.
    model.eval()
    with torch.no_grad():
        output = model(**inputs)
        pred_label = torch.argmax(output.logits, dim=-1)
        # reduction='none' keeps one loss value per example instead of the batch mean.
        loss = cross_entropy(output.logits, targets, reduction='none')

    return {'loss': loss.cpu().numpy(),
            'predicted_label': pred_label.cpu().numpy()}

In [None]:
# Return torch tensors for just the columns the forward pass reads.
emotions_encoded.set_format('torch',
                columns = ['input_ids', 'attention_mask', 'label'])

# Adds per-example 'loss' and 'predicted_label' columns to the validation split.
emotions_encoded['validation'] = emotions_encoded['validation'].map(
    forward_pass_with_labels, batched = True, batch_size = 16
)

In [None]:
# Switch to pandas output and keep only the columns needed for inspection.
emotions_encoded.set_format('pandas')
cols = ['text', 'label', 'predicted_label', 'loss']
df_test = emotions_encoded['validation'][:][cols]
# Replace integer ids with label names for readability.
df_test['label'] = df_test['label'].apply(label_int2str)
df_test['predicted_label'] = df_test['predicted_label'].apply(label_int2str)

In [None]:
# ascending = True puts the lowest-loss rows first; .iloc[1] is the second of them.
df_test.sort_values('loss', ascending= True).iloc[1]['text']

In [None]:
# Ten validation examples with the smallest loss.
df_test.sort_values('loss', ascending= True).head(10)

# Saving and sharing the model

In [None]:
# Push the trainer's model to the Hub with this commit message.
trainer.push_to_hub(commit_message='Training completed!')

In [None]:
from transformers import pipeline

# Hub repository id; its name part matches the output_dir used for training.
model_id = 'Rustam39/distilbert-base-uncased-finetuned-emotion'
classifier = pipeline('text-classification', model = model_id)

In [None]:
tweet = 'i was really scared'
# top_k = 6 asks for six scored labels, the number of classes the model was fine-tuned with.
preds = classifier(tweet, top_k = 6)
preds

In [None]:
preds_df = pd.DataFrame(preds)
# Order rows by class id using the model's own label -> id mapping. A plain
# string sort on 'label' lines up with labels.names only for default
# 'LABEL_<n>' names with fewer than ten classes.
label2id = classifier.model.config.label2id
preds_df_sorted = preds_df.sort_values(
    'label', key = lambda col: col.map(label2id), ascending = True)
# Bars follow class-id order, matching labels.names.
plt.bar(labels.names, 100 * preds_df_sorted['score'])
plt.title(f'"{tweet}"')
plt.ylabel('Prediction probability (%)')
plt.show()