In [1]:
#import neccessary modules
import numpy as np
import tensorflow as tf
import pandas as pd

from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from sklearn import metrics

tf.random.set_seed(1209)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Load train and test data into pandas df
train_df = np.load('train/data.npz')
test_df = np.load('test/test.npz')

train = pd.DataFrame([train_df['a'], train_df['b']]).T
train_data, train_vald_data = train_test_split(train, test_size = 0.2, shuffle = True, random_state = 1209)
train_data.columns = ['DATA_COLUMN', 'LABEL_COLUMN']
train_vald_data.columns = ['DATA_COLUMN', 'LABEL_COLUMN']

test = pd.DataFrame([test_df['a'], test_df['b']]).T
test.columns = ['DATA_COLUMN', 'LABEL_COLUMN']

In [3]:
#Functions to convert words into bytes for Bert models
#Code provided by Steve
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN): 
  train_InputExamples = train.apply(lambda x: InputExample(guid=None, 
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)

  validation_InputExamples = test.apply(lambda x: InputExample(guid=None, 
                                                          text_a = x[DATA_COLUMN], 
                                                          text_b = None,
                                                          label = x[LABEL_COLUMN]), axis = 1)
  
  return train_InputExamples, validation_InputExamples
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    features = [] # -> will hold InputFeatures to be converted later

    for e in examples:
        # Documentation is really strong for this method, so please take a look at it
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.float64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


In [4]:
DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

In [5]:
#Download the Bert models
model = TFBertForSequenceClassification.from_pretrained("google/bert_uncased_L-12_H-768_A-12 ", from_pt=True)
tokenizer = BertTokenizer.from_pretrained("google/bert_uncased_L-12_H-768_A-12 ")
model.summary()

OSError: google/bert_uncased_L-12_H-768_A-12  is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and pass `use_auth_token=True`.

In [None]:
#Create train_data and validation_data set
train_InputExamples, validation_InputExamples = convert_data_to_examples(train_data, train_vald_data, DATA_COLUMN, LABEL_COLUMN)

training_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
training_data = training_data.shuffle(100, seed = 1209).batch(32).repeat(2)

validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)

In [None]:
#Compile and run model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

model.fit(training_data, epochs=1, validation_data=validation_data)

In [None]:
#Create test sets
test_InputExamples, test_validation_InputExamples = convert_data_to_examples(test, train_vald_data, DATA_COLUMN, LABEL_COLUMN)

testing_data = convert_examples_to_tf_dataset(list(test_InputExamples), tokenizer)
testing_data = testing_data.batch(32)

In [None]:
#Predict results from testing data
pred_labels = model.predict(testing_data)
pred_results = np.argmax(pred_labels.logits, axis=1)
actual_lables = test_df['b'].astype(np.float)

In [None]:
#Get the scure and confusion matrix
confusion = metrics.confusion_matrix(actual_lables, pred_results)
acc = metrics.accuracy_score(actual_lables, pred_results)
precision = metrics.precision_score(actual_lables, pred_results)
recall = metrics.recall_score(actual_lables, pred_results)
F1 = metrics.f1_score(actual_lables, pred_results)
print("Accuracy score: " + str(acc))
print("Precision score: "+ str(precision))
print("Recall score: " + str(recall))
print("F1-score: " + str(F1))