The model used in this project is originally from https://keras.io/examples/nlp/semantic_similarity_with_bert/#train-the-entire-model-endtoend

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
import glob
from sklearn.utils import shuffle
import spacy
from sklearn.model_selection import train_test_split
import re
import json
import matplotlib.pyplot as plt

In [2]:
# Token length every encoded text is padded/truncated to; also the width of
# the three model input layers.
max_text_length = 100
# Default batch size for BertSemanticDataGenerator and the generators built in fit_model.
batch_size = 128
# NOTE(review): fit_model does not read this value — confirm whether it is still needed.
epochs = 10

# NOTE(review): this list is not used by the training code, and the prediction
# cell later re-binds `labels` to np.asarray([0, 1]) — confirm the intended order.
labels = [1, 0]

In [6]:

# Load the train/validation/test splits for three dataset variants.
# Downstream code reads the "text" and "label" columns of these frames.
# NOTE(review): the suffixes presumably mean no preprocessing ("no"),
# alphabetic-only text ("alpha") and lemmatized text ("lemma") — confirm
# against the preprocessing notebook.
train_data_no = pd.read_csv('train_data_no.csv')
val_data_no = pd.read_csv('val_data_no.csv')
test_data_no = pd.read_csv('test_data_no.csv')

train_data_alpha = pd.read_csv('train_data_alpha.csv')
val_data_alpha = pd.read_csv('val_data_alpha.csv')
test_data_alpha = pd.read_csv('test_data_alpha.csv')

train_data_lemma = pd.read_csv('train_data_lemma.csv')
val_data_lemma = pd.read_csv('val_data_lemma.csv')
test_data_lemma = pd.read_csv('test_data_lemma.csv')

In [3]:
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that tokenizes raw text into BERT/ALBERT inputs per batch.

    Args:
        review_text: NumPy array of input strings. It is indexed with an
            integer index array, so it must support fancy indexing.
        labels: Array of targets aligned with ``review_text``. May be ``None``
            when ``include_targets`` is ``False``.
        batch_size: Number of samples per batch.
        shuffle: Shuffle the sample order at construction and after each epoch.
        include_targets: When ``True`` a batch is ``(inputs, labels)``;
            otherwise only ``inputs`` is returned (prediction mode).
        albert: Use the ``albert-base-v2`` tokenizer instead of
            ``bert-base-uncased``.

    Each batch's ``inputs`` is the list
    ``[input_ids, attention_masks, token_type_ids]`` of int32 arrays, padded or
    truncated to ``max_text_length`` tokens.
    """

    def __init__(
        self,
        review_text,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
        albert=False
    ):
        # Newer Keras versions expect the base-class initialiser to run; on
        # older versions this resolves to object.__init__ and is a no-op.
        super().__init__()
        self.review_text = review_text
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load the tokenizer that matches the pretrained encoder in use.
        if not albert:
            self.tokenizer = transformers.BertTokenizer.from_pretrained(
                "bert-base-uncased", do_lower_case=True
            )
        else:
            self.tokenizer = transformers.AlbertTokenizer.from_pretrained(
                "albert-base-v2", do_lower_case=True
            )
        self.indexes = np.arange(len(self.review_text))
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch, including a final partial batch."""
        # Round up: floor division silently dropped the last
        # ``len % batch_size`` samples from training, evaluation and prediction.
        return int(np.ceil(len(self.review_text) / self.batch_size))

    def __getitem__(self, idx):
        """Tokenize and return batch number ``idx``."""
        # Slicing tolerates the final batch being shorter than batch_size.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        review_text = self.review_text[indexes]

        encoded = self.tokenizer.batch_encode_plus(
            review_text.tolist(),
            add_special_tokens=True,
            max_length=max_text_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            # Replaces the deprecated ``pad_to_max_length=True``.
            padding="max_length",
            return_tensors="tf",
            truncation=True
        )

        # Convert batch of encoded features to numpy arrays.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Targets are only attached when the generator feeds training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        """Reshuffle the sample order in place when ``shuffle`` is enabled."""
        # A new RandomState(42) is created on every call, so the same fixed
        # permutation is applied to the current index order each time.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

In [8]:
def get_LSTM_layer(input, units):
    """Stack the BiLSTM classification head on top of a sequence tensor.

    Args:
        input: Keras tensor holding one feature vector per time step.
        units: Number of units in each direction of the LSTM.

    Returns:
        The output tensor of a 2-way softmax ``Dense`` layer.
    """
    layers = tf.keras.layers

    recurrent = layers.Bidirectional(layers.LSTM(units, return_sequences=True))
    sequence_features = recurrent(input)

    # Hybrid pooling: summarise the sequence with both its mean and its max
    # over time, then join the two summaries.
    pooled = layers.concatenate(
        [
            layers.GlobalAveragePooling1D()(sequence_features),
            layers.GlobalMaxPooling1D()(sequence_features),
        ]
    )
    regularised = layers.Dropout(0.3)(pooled)

    return layers.Dense(2, activation="softmax")(regularised)

def get_BERT_LSTM_model(
    lstm_size,
    albert=False,
    weights_path='checkpoints/albert_64_alpha_final_tests2',
):
    """Build and compile the (AL)BERT + BiLSTM classifier.

    Args:
        lstm_size: Units per direction of the BiLSTM head.
        albert: Use ``albert-base-v2`` instead of ``bert-base-uncased`` as encoder.
        weights_path: Checkpoint passed to ``model.load_weights`` before
            compiling. Pass ``None`` to start from the pretrained encoder and
            a freshly initialised head. The default keeps the path that used
            to be hard-coded here.
            NOTE(review): the default file name suggests ALBERT weights for a
            64-unit head — confirm it is appropriate when ``albert`` is False
            or ``lstm_size`` differs.

    Returns:
        ``(model, bert_model)``: the compiled Keras model and the underlying
        transformer encoder.
    """
    strategy = tf.distribute.MirroredStrategy()

    with strategy.scope():
        input_ids = tf.keras.layers.Input(
            shape=(max_text_length,), dtype=tf.int32, name="input_ids"
        )

        # Attention masks indicate to the model which tokens should be attended to.
        attention_masks = tf.keras.layers.Input(
            shape=(max_text_length,), dtype=tf.int32, name="attention_masks"
        )

        # Token type ids are binary masks identifying different sequences in the model.
        token_type_ids = tf.keras.layers.Input(
            shape=(max_text_length,), dtype=tf.int32, name="token_type_ids"
        )

        # Loading pretrained BERT or ALBERT model.
        if not albert:
            bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased")
        else:
            bert_model = transformers.TFAlbertModel.from_pretrained("albert-base-v2")

        # The encoder is left trainable, so it is fine-tuned together with the head.

        # Index 0 of the encoder output is the per-token sequence output.
        bert_output = bert_model(
            input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
        )[0]

        # Add a trainable bidirectional LSTM layer and Dense with softmax.
        LSTM_layer = get_LSTM_layer(bert_output, lstm_size)

        model = tf.keras.models.Model(
            inputs=[input_ids, attention_masks, token_type_ids], outputs=LSTM_layer
        )
        # Restoring a checkpoint is optional so the same builder serves both
        # training from scratch and resuming/evaluating.
        if weights_path is not None:
            model.load_weights(weights_path)
        model.compile(
            optimizer=tf.keras.optimizers.Adam(),
            loss="categorical_crossentropy",
            metrics=["acc"],
        )

    return model, bert_model



In [4]:

def fit_model(lstm_size, train_data, val_data, save_file, albert=False, n_epochs=3):
    """Train an (AL)BERT + BiLSTM classifier and save its training history.

    Args:
        lstm_size: Units per direction of the BiLSTM head.
        train_data: DataFrame with ``"text"`` and ``"label"`` columns used for training.
        val_data: DataFrame with the same columns used for validation.
        save_file: File name (relative to ``history/``) for the history CSV.
        albert: Use ALBERT instead of BERT for both tokenizer and encoder.
        n_epochs: Number of training epochs. Defaults to 3, the value that
            used to be hard-coded here.

    Returns:
        ``(model, bert_model)`` as returned by ``get_BERT_LSTM_model`` after training.
    """
    import os

    # Make sure the output directory exists before training starts, so the
    # history write cannot fail after the run has finished.
    history_path = 'history/' + save_file
    os.makedirs(os.path.dirname(history_path), exist_ok=True)

    # One-hot targets to match the 2-unit softmax / categorical cross-entropy.
    y_train = tf.keras.utils.to_categorical(train_data["label"].values, num_classes=2)
    y_val = tf.keras.utils.to_categorical(val_data["label"].values, num_classes=2)

    training_data = BertSemanticDataGenerator(
        train_data["text"].values.astype("str"),
        y_train,
        batch_size=batch_size,
        shuffle=True,
        albert=albert
    )
    validation_data = BertSemanticDataGenerator(
        val_data["text"].values.astype("str"),
        y_val,
        batch_size=batch_size,
        shuffle=False,
        albert=albert
    )

    model, bert_model = get_BERT_LSTM_model(lstm_size, albert)
    # The former ``use_multiprocessing=True, workers=-1`` pair was dropped:
    # -1 is not an "all cores" value for Keras, and as far as I recall TF 2.x
    # only starts a worker pool for positive counts, so the arguments had no
    # effect. Newer Keras ``fit`` does not accept them at all.
    history = model.fit(
        training_data,
        validation_data=validation_data,
        epochs=n_epochs,
    )

    pd.DataFrame(history.history).to_csv(history_path)

    return model, bert_model


In [None]:
# Train one configuration at a time; the commented lines below are the other
# (lstm_size, dataset variant, encoder) combinations that were run by swapping
# the active line. Later cells use the `model` / `bert_model` left by this cell.
# NOTE(review): get_BERT_LSTM_model restores a checkpoint by default — confirm
# that checkpoint matches the encoder and lstm_size selected here.
model, bert_model = fit_model(64, train_data_no, val_data_no, "bert_64_no.csv")
#model, bert_model = fit_model(128, train_data_no, val_data_no, "albert_128_no.csv", albert=True)
#model, bert_model = fit_model(32, train_data_no, val_data_no, "albert_32_no.csv", albert=True)
#model, bert_model = fit_model(128, train_data_no, val_data_no, "bert_128_no.csv")
#model, bert_model = fit_model(32, train_data_no, val_data_no, "bert_32_no.csv")
#model, bert_model = fit_model(64, train_data_alpha, val_data_alpha, "albert_64_alpha.csv", albert=True)
#model, bert_model = fit_model(64, train_data_lemma, val_data_lemma, "albert_64_lemma.csv", albert=True)
#model, bert_model = fit_model(64, train_data_no, val_data_no, "albert_64_no.csv", albert=True)
#model, bert_model = fit_model(64, train_data_alpha, val_data_alpha, "bert_64_alpha.csv")
#model, bert_model = fit_model(64, train_data_lemma, val_data_lemma, "bert_64_lemma.csv")

#model, bert_model = fit_model(32, train_data_no, val_data_no, "bert_32_lemma.csv")
#model, bert_model = fit_model(32, train_data_alpha, val_data_alpha, "bert_32_alpha.csv")
#model, bert_model = fit_model(128, train_data_alpha, val_data_alpha, "bert_128_alpha.csv")
#model, bert_model = fit_model(128, train_data_lemma, val_data_lemma, "bert_128_lemma.csv")
#model, bert_model = fit_model(32, train_data_alpha, val_data_alpha, "albert_32_alpha.csv", albert=True)
#model, bert_model = fit_model(128, train_data_alpha, val_data_alpha, "albert_128_alpha.csv", albert=True)
#model, bert_model = fit_model(32, train_data_lemma, val_data_lemma, "albert_32_lemma.csv", albert=True)
#model, bert_model = fit_model(128, train_data_lemma, val_data_lemma, "albert_128_lemma.csv", albert=True)

In [42]:
# Fine-tuning pass on the "alpha" split with a smaller batch size (16).
# Relies on `model` and `bert_model` already existing in the kernel from an
# earlier cell; nothing in this cell creates them.
y_train = tf.keras.utils.to_categorical(train_data_alpha["label"].values, num_classes=2)
y_val = tf.keras.utils.to_categorical(val_data_alpha["label"].values, num_classes=2)

# NOTE(review): both generators use the BERT tokenizer (albert=False), while
# the captured summary below lists a TFAlbertModel layer and the prediction
# cells use albert=True — confirm the tokenizer matches the loaded encoder.
training_data2 = BertSemanticDataGenerator(
    train_data_alpha["text"].values.astype("str"),
    y_train,
    batch_size=16,
    shuffle=True,
    albert=False
)
validation_data2 = BertSemanticDataGenerator(
    val_data_alpha["text"].values.astype("str"),
    y_val,
    batch_size=16,
    shuffle=False,
    albert=False
)


# Mark the encoder trainable and recompile so the change takes effect.
# NOTE(review): Adam() runs at its default learning rate here; the Keras
# example this notebook is based on lowers it for end-to-end fine-tuning — confirm.
bert_model.trainable = True
model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss="categorical_crossentropy",
        metrics=["acc"],
    )
model.summary()

history2 = model.fit(
    training_data2,
    validation_data=validation_data2,
    epochs=1
)

# Persisting the history and weights of this run is currently disabled.
#df_history = pd.DataFrame(history2.history)
#df_history.to_csv('history/albert_64_alpha_final_finetuning_test.csv')
#model.save_weights('checkpoints/albert_64_alpha_final_finetuning_test')

Model: "model_8"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 100)]        0                                            
__________________________________________________________________________________________________
attention_masks (InputLayer)    [(None, 100)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 100)]        0                                            
__________________________________________________________________________________________________
tf_albert_model_8 (TFAlbertMode ((None, 100, 768), ( 11683584    input_ids[0][0]                  
____________________________________________________________________________________________

KeyboardInterrupt: 

In [76]:
from sklearn.metrics import classification_report

# Score the alpha test split chunk by chunk with the `model` left in the
# kernel, then print per-class precision/recall/F1.
test_data = pd.read_csv('test_data_alpha.csv')
labels = np.asarray([0, 1])  # maps an argmax column index to its class id
pred_batch_size = 256

preds = []
while len(preds) < len(test_data):
    # Resume right after the last row that already has a prediction.
    i = len(preds)
    # Slicing past the end simply returns the remaining rows, so the final
    # (shorter) chunk needs no separate branch.
    review = np.array(
        [str(text) for text in test_data["text"][i:i + pred_batch_size]]
    )
    print(len(review))

    # One generator per chunk, sized so the whole chunk forms a single batch.
    test_review = BertSemanticDataGenerator(
        review,
        labels=None,
        batch_size=len(review),
        shuffle=False,
        include_targets=False,
        albert=True,
    )
    proba = model.predict(test_review)
    indexes = np.argmax(proba, 1).astype("int32")
    preds.extend(labels[indexes].tolist())
    print(i, ": ", labels[indexes])


print(classification_report(test_data['label'], preds))

              precision    recall  f1-score   support

           0       0.98      0.94      0.96      3772
           1       0.95      0.98      0.96      3728

    accuracy                           0.96      7500
   macro avg       0.96      0.96      0.96      7500
weighted avg       0.96      0.96      0.96      7500



In [28]:
# Evaluate the `model` left in the kernel on the alpha test split using the
# ALBERT tokenizer.
test_data = pd.read_csv('test_data_alpha.csv')

# Despite the name, `y_val` holds the one-hot *test* labels here.
y_val = tf.keras.utils.to_categorical(test_data["label"].values, num_classes=2)
test_review = BertSemanticDataGenerator(
        test_data["text"].values.astype("str"),
        y_val,
        batch_size=128,
        shuffle=False,
        albert=True
    )
# `proba` receives the result of evaluate() (the compiled loss and "acc"
# metric), not class probabilities.
# NOTE(review): only len(test_review) batches are scored — check that this
# covers every row of test_data.
proba = model.evaluate(test_review)

  ...
    to  
  ['...']
