AI - TP3_8

Bastien SAUVAT et Bastien FAISANT

# Exercise 1 : Explorer the transformers with Keras

In [1]:
import os
from collections import defaultdict
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


## Implement a Transformer block as a layer

In [2]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [
                layers.Dense(ff_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

## Implement embedding layer

Two seperate embedding layers, one for tokens, one for token index (positions).

In [3]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

## Data parsing

In [4]:
def get_info(path: str):
    data = list(os.walk(path))[1:]
    files = []
    for d in data:
        folder_name = d[0]
        for file in d[2]:
            files.append((folder_name.split('/')[-1], os.path.join(folder_name, file)))

    d = defaultdict(int)
    texts = defaultdict(list)
    for (cate, file) in files:
        with open(file, 'r') as outfile:
            text = outfile.read()
            texts[cate].append(text)
            words = text_to_word_sequence(text)
            for word in words:
                d[word] += 1
    words = sorted(d.items(), key=lambda x: x[1], reverse=True)
    return (texts, words)

In [5]:
training_texts, training_words = get_info("../TP1-2/data/ohsumed-first-20000-docs/training/")
test_texts, test_words = get_info("../TP1-2/data/ohsumed-first-20000-docs/test/")

In [6]:
def get_df(dataset: defaultdict[any, list]):
    classes = []
    texts = []
    for classe, liste_texts in dataset.items():
        for text in liste_texts:
            texts.append(text)
            classes.append(classe)

    df = pd.DataFrame({'Classes': classes, 'Texts': texts})
    return df


In [7]:
train_set = get_df(training_texts)
test_set = get_df(test_texts)

## Pre-processing

In [8]:
english_stops = set(stopwords.words('english'))

In [9]:
def convert_classes_to_integers(classes):
    unique_classes = classes.unique()
    class_mapping = {cls: int(cls[1:]) for cls in unique_classes}
    return classes.replace(class_mapping)

In [10]:
def load_dataset(texts: defaultdict[any, list]):
    stemmer = PorterStemmer()
    df = get_df(texts)

    x_data = df['Texts']
    y_data = df['Classes']

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case
    x_data = x_data.apply(lambda review: [stemmer.stem(w) for w in review]) # perform stemming
    

    # Replace class name by their number
    y_data = convert_classes_to_integers(y_data)

    return x_data, y_data

In [11]:
x_train, y_train = load_dataset(training_texts)
x_test, y_test = load_dataset(test_texts)

In [12]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [13]:
# ENCODE REVIEW
token = Tokenizer(lower=False)
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

## Create classifier model using transformer layer

Transformer layer outputs one vector for each time step of our input sequence.
Here, we take the mean across all time steps and
use a feed forward network on top of it to classify text.

In [73]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(max_length,))
embedding_layer = TokenAndPositionEmbedding(max_length, total_words, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.2)(x)
outputs = layers.Dense(24, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

print(model.summary())

Model: "model_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_22 (InputLayer)       [(None, 112)]             0         
                                                                 
 token_and_position_embeddin  (None, 112, 32)          641824    
 g_21 (TokenAndPositionEmbed                                     
 ding)                                                           
                                                                 
 transformer_block_21 (Trans  (None, 112, 32)          10656     
 formerBlock)                                                    
                                                                 
 global_average_pooling1d_20  (None, 32)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dropout_64 (Dropout)        (None, 32)                0  

## Train and Evaluate

In [70]:
model.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

In [71]:
history = model.fit(
    x_train, y_train, batch_size=64, epochs=5
)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [72]:
loss, accuracy = model.evaluate(x_test, y_test)
print('Loss: {}'.format(loss))
print('Accuracy: {}'.format(accuracy))

Loss: 1.9133960008621216
Accuracy: 0.40312573313713074


When comparing the Transformer-based model and the LSTM-based classifier, no significant improvement in accuracy was obtained, compared to different configurations of the Transformer model applied to the Ohsumed dataset. similar whether for the training set (between 0.44 and 0.51) and the test set (between 0.38 and 0.42)<br>

Upon systematic analysis of the impact of hyperparameter changes on the Transformer model, various adjustments like altering batch size, the number of attention heads, the embedding size and the dropout displayed marginal variations in performance.<br>
For instance, modifying batch sizes did not significantly affect the model's accuracy, which remained approximately around 0.42 for the test set.<br>
Additionally, manipulating the number of attention heads between 1 and 10 did not lead to substantial improvements in accuracy, staying within the range of 0.38 to 0.42. However, a negative point is that increasing the number of attentionheads increases the number of trainable parameters of the model and therefore the training time.<br>
Furthermore, if we increase the embedding size from 32 to 64, the number of trainable parameters increases drastically and therefore directly influences the training time of the model.

#### Transformer-based model performance
- With batch_size = 32 :
  - training set accuracy=0.47
  - test set accuracy=0.42
- With batch_size = 128
  - training set accuracy=0.48
  - test set accuracy=0.42
- Decrease attention heads to 1 :
  - training set accuracy=0.44
  - test set accuracy=0.38
- Increase attention heads à 3 :
  - training set accuracy=0.44
  - test set accuracy=0.39
- Increase attention heads to 4 :
  - training set accuracy=0.45
  - test set accuracy=0.38
- Increase attention heads to 10 :
  - training set accuracy=0.47
  - test set accuracy=0.39
- Increase dropout to 0.2 :
  - training set accuracy=0.50
  - test set accuracy=0.41
- Increase embedding size to 64 :
  - training set accuracy=0.50
  - test set accuracy=0.40
- Increase dropout to 0.4 :
  - training set accuracy=0.49
  - test set accuracy=0.40