# DistilBERT Classifier

DistilBERT is a lighter, distilled version of the original BERT transformer model.

## Data Processing

In [1]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification

In [None]:
df = pd.read_csv('blooms_combined_data.csv')

# Normalise the text: remove all punctuation, then change to lower case
df['Text'] = df['Text'].str.replace(r'[^\w\s]+', '', regex=True)
df['Text'] = df['Text'].str.lower()

# Encode categories into numbers.
# sort=True makes pd.factorize assign codes in sorted label order, so the
# mapping below holds regardless of the row order in the CSV. Without it,
# codes follow the order of first appearance and could silently disagree
# with `categories` and with the index -> name lookup used for predictions.
# Analyse - 0
# Apply - 1
# Create - 2
# Evaluate - 3
# Remember - 4
# Understand - 5

df['Label'] = pd.factorize(df.Label, sort=True)[0]

# Blooms taxonomy categories, listed in the same order as the integer codes
categories = ['Analyse', 'Apply', 'Create', 'Evaluate', 'Remember', 'Understand']

In [None]:
# Number of rows per encoded label in the full dataset (class balance check)
df['Label'].value_counts()

In [None]:
# Split into Train and Validation data:
# 20% of the rows are held out for validation; random_state=0 fixes the split
train, val = train_test_split(df, test_size=0.2, random_state=0)

In [None]:
# Number of rows per encoded label in the training split
train['Label'].value_counts()

## NLPAug to augment text data for Oversampling (Optional)

For this model, augmentation is **not used**, as it generates too much noise.

In [None]:
import nlpaug.augmenter.word as nlpaw
from tqdm import tqdm


def augment_sentence(sentence, aug):
    """
    Constructs a new sentence via text augmentation.

    Input:
        - sentence:     A string of text
        - aug:          An augmentation object exposing an `augment(text)` method
                        (e.g. an augmenter from the nlpaug library)

    Output:
        - A string of text that has been augmented. If the augmenter yields
          no result, the original sentence is returned unchanged.
    """
    augmented = aug.augment(sentence)

    # Some augmenters hand back the augmented text directly as a string;
    # indexing it with [0] would return only its first character.
    if isinstance(augmented, str):
        return augmented

    # Otherwise a sequence of candidates is expected: take the first one,
    # falling back to the input when nothing was produced.
    if not augmented:
        return sentence
    return augmented[0]
    


def augment_data(df, aug, target_count):
    """
    Takes a pandas DataFrame and augments its text data to a target count

    Every label with fewer than `target_count` rows is topped up with
    augmented copies of randomly chosen rows (sampled with replacement)
    from that label. Labels already at or above the target are untouched.

    Input:
        - df:            A pandas DataFrame with 'Text' and 'Label' columns
        - aug:           Augmentation object defined by the nlpaug library.
        - target_count:  Integer, the number of rows each label should reach
    Output:
        - df:            Copy of the same pandas DataFrame with augmented data
                         appended to it and with rows randomly shuffled.
    """
    # Collect the new rows and concatenate once at the end: DataFrame.append
    # was removed in pandas 2.0, and growing a frame inside a loop is quadratic.
    augmented_frames = []

    for category in tqdm(df['Label'].unique()):
        # gets the existing data
        existing_text = df[df['Label'] == category]

        # number of extra augmented data to be generated
        num_to_gen = target_count - len(existing_text)

        # do not need to generate more as target count has been obtained
        if num_to_gen <= 0:
            continue

        # randomly select required number of text from current sample
        data_to_aug = existing_text.sample(n=num_to_gen, replace=True)

        # augment the data
        data_to_aug['Text'] = data_to_aug['Text'].apply(augment_sentence, aug=aug)

        augmented_frames.append(data_to_aug)

    # Only rebuild the frame (and its index) when something was generated
    if augmented_frames:
        df = pd.concat([df, *augmented_frames], ignore_index=True)

    # shuffle samples and return
    return df.sample(frac=1, random_state=0)

In [None]:
# Use wordnet to replace words with synonyms (at most 3 augmentations per text)
# Only needed by the commented-out augment_data call below
aug = nlpaw.SynonymAug(aug_src='wordnet',aug_max=3)

# get label with the highest count from training set
# (would be the per-label target size for oversampling)
max_count = max(list(train['Label'].value_counts()))

# augment all training datasets to max_count
# balanced__train = augment_data(train, aug, target_count=max_count)

# skip augmenting step: the training split is used as-is,
# so balanced__train is just another name for train
balanced__train = train

In [None]:
# Number of rows per encoded label in the data actually used for training
balanced__train['Label'].value_counts()

## Tokenizing data into BERT input format

In [None]:
# Pull the raw texts and integer labels out of the frames as plain lists
train_texts = balanced__train['Text'].tolist()
train_labels = balanced__train['Label'].tolist()

val_texts = val['Text'].tolist()
val_labels = val['Label'].tolist()

# Pretrained uncased DistilBERT vocabulary
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize each split separately, with truncation and padding enabled
train_encodings = tokenizer(
    train_texts,
    truncation=True,
    padding=True,
)
val_encodings = tokenizer(
    val_texts,
    truncation=True,
    padding=True,
)

In [None]:
# Wrap the encodings and labels into tf.data datasets of
# (feature-dict, label) pairs, one element per example.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    train_labels
))

# Same structure for the held-out validation split
val_dataset = tf.data.Dataset.from_tensor_slices((
    dict(val_encodings),
    val_labels
))

# Model Building

In [None]:
# Training hyper-parameters

LEARNING_RATE = 1e-6  # learning rate handed to the Adam optimizer
BATCH_SIZE = 24       # examples per batch for both training and validation
EPOCHS = 50           # maximum number of training epochs

In [None]:
# Pretrained DistilBERT with a sequence-classification head;
# num_labels=6 matches the six Bloom's taxonomy categories
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6)

# Adam with the learning rate from the Model Params cell
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)

# Use the model's own hf_compute_loss as the Keras loss and track accuracy
model.compile(optimizer=optimizer, loss=model.hf_compute_loss, metrics=['accuracy'])

## Training the Model

In [None]:
# Stop when val_loss has not improved for 3 consecutive epochs and
# restore the weights from the best epoch seen
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

# fit() ignores `shuffle` and must not be given `batch_size` when the input
# is a tf.data.Dataset, so shuffling and batching are done on the datasets.
# A buffer as large as the training set gives a full reshuffle every epoch.
train_batches = train_dataset.shuffle(buffer_size=len(train_labels)).batch(BATCH_SIZE)
val_batches = val_dataset.batch(BATCH_SIZE)

history = model.fit(train_batches, epochs=EPOCHS,
          validation_data=val_batches, callbacks=[callback])

# EPOCH 40: loss: 0.0773 - accuracy: 0.9820 - val_loss: 0.3031 - val_accuracy: 0.9109

## Model Evaluation

In [None]:
# Per-epoch accuracy recorded by Keras during training
acc_train = history.history['accuracy']
acc_val = history.history['val_accuracy']

# EarlyStopping can end training before EPOCHS is reached, so the x-axis
# length must come from the recorded history, not the configured maximum
# (otherwise plt.plot fails on mismatched x/y lengths).
epochs = range(1, len(acc_train) + 1)

plt.plot(epochs, acc_train, 'g', label='Training accuracy')
plt.plot(epochs, acc_val, 'b', label='validation accuracy')
plt.title('Training and Validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Per-epoch loss recorded by Keras during training
loss_train = history.history['loss']
loss_val = history.history['val_loss']

# EarlyStopping can end training before EPOCHS is reached, so the x-axis
# length must come from the recorded history, not the configured maximum
# (otherwise plt.plot fails on mismatched x/y lengths).
epochs = range(1, len(loss_train) + 1)

plt.plot(epochs, loss_train, 'g', label='Training loss')
plt.plot(epochs, loss_val, 'b', label='validation loss')
plt.title('Training and Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

## Save Model

In [None]:
# Directory that receives both the fine-tuned model and its tokenizer
save_directory = "saved_models"

In [None]:
# Save model and tokenizer into the same directory so they can be
# reloaded together with from_pretrained
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

## Load Model

In [2]:
# Directory the fine-tuned model and tokenizer are loaded from.
# NOTE(review): this differs from save_directory ("saved_models") -
# presumably the saved folder was renamed or moved; confirm before running.
load_directory = "BloomBERT_model"

In [None]:
# Reload the tokenizer and classifier from disk; the prediction helper
# below uses these loaded_* objects
loaded_tokenizer = DistilBertTokenizer.from_pretrained(load_directory)
loaded_model = TFDistilBertForSequenceClassification.from_pretrained(load_directory)

## Testing Predictions

In [15]:
# Lookup from predicted class index to Bloom's taxonomy category name
category_dict = dict(enumerate(
    ['Analyse', 'Apply', 'Create', 'Evaluate', 'Remember', 'Understand']
))

In [16]:
def predict_blooms(text):
    """
    Predicts the Bloom's taxonomy category of a piece of text.

    Input:
        - text:  A string of text to classify

    Output:
        - The category name looked up in category_dict for the
          highest-scoring class
    """
    # Encode the text as a TensorFlow tensor of token ids
    token_ids = loaded_tokenizer.encode(
        text,
        truncation=True,
        padding=True,
        return_tensors="tf",
    )

    # First element of the model output holds the per-class scores
    scores = loaded_model(token_ids)[0]

    # Index of the best class for the single input row
    best_index = tf.argmax(scores, axis=1).numpy()[0]
    return category_dict[best_index]

In [18]:
# Smoke test: classify one hand-written example sentence
test_text = "remember talking points of a meeting"

print(test_text)
print("Predicted Class:", predict_blooms(test_text))

remember talking points of a meeting
Predicted Class: Remember


## Model Report

In [None]:
test_report = pd.read_csv('sample_data.csv')

# Same text normalisation as the training data, done in a single chain:
# strip punctuation first, then lower-case
test_report['Text'] = (
    test_report['Text']
    .str.replace(r'[^\w\s]+', '', regex=True)
    .str.lower()
)

# Numeric label encoding is currently disabled for this data
# test_report['Label'] = pd.factorize(test_report.Label)[0]

In [None]:
# Inspect the cleaned sample data before predicting
test_report

In [None]:
from tqdm import tqdm
# for loading bar: registers progress_apply on pandas objects
tqdm.pandas()

# predict a label for every text in the sample data (sample_data.csv)
# and store the category name in a new 'Predictions' column
test_report['Predictions'] = test_report['Text'].progress_apply(predict_blooms)

In [None]:
# Sample data with the new 'Predictions' column alongside the text
test_report