In [1]:
import os
# GPU selection must be configured before TensorFlow is imported below,
# because the CUDA runtime reads these variables when it initialises.
# Enumerate devices by PCI bus ID so the numbering is stable across runs.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# CUDA_VISIBLE_DEVICES expects comma-separated device indices (or GPU UUIDs).
# A TensorFlow device name such as "XLA_GPU:0" is not a valid entry and makes
# CUDA hide every GPU, so select the first device by its index instead.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification, BertModel
from transformers import InputExample, InputFeatures
import tensorflow as tf
import pandas as pd
import os
import shutil
from sklearn import metrics


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the .npz archives that hold the train and test splits
train_df = np.load('train/data.npz')
test_df = np.load('test/test.npz')

# Stack arrays 'a' and 'b' as two rows, then transpose so each sample is one row
train = pd.DataFrame([train_df['a'], train_df['b']]).transpose()
test = pd.DataFrame([test_df['a'], test_df['b']]).transpose()

# Hold out 20% of the training rows for validation; the fixed seed keeps the split repeatable
train_data, train_vald_data = train_test_split(
    train,
    test_size=0.2,
    shuffle=True,
    random_state=1209,
)

# Label the columns of the three frames used downstream ('a' -> text, 'b' -> label)
for frame in (train_data, train_vald_data, test):
    frame.columns = ['DATA_COLUMN', 'LABEL_COLUMN']

In [3]:
# Print the test DataFrame to inspect its text column and labels
print(test)

                                             DATA_COLUMN LABEL_COLUMN
0      I found this movie really hard to sit through,...          1.0
1      The movie starts off with Reeve (Ekin) and his...          1.0
2      I had a VERY hard time sitting through this fi...          1.0
3      I'm not a big fan of musicals, but I was alway...          0.0
4      I honestly fail to understand why people love ...          1.0
...                                                  ...          ...
24995  I was surprised that I liked this movie. But i...          0.0
24996  I saw this movie in 1956 and again on Cable a ...          1.0
24997  This is a movie that was probably made to ente...          1.0
24998  Okay, make no mistake - this is a pretty awful...          1.0
24999  UK-born Australian helmer Alex Frayne calls fo...          0.0

[25000 rows x 2 columns]


In [4]:
def _to_input_examples(frame, DATA_COLUMN, LABEL_COLUMN):
    """Wrap every row of ``frame`` in a single-sentence ``InputExample``."""
    return frame.apply(
        lambda row: InputExample(
            guid=None,
            text_a=row[DATA_COLUMN],
            text_b=None,  # single-sentence task: there is no second segment
            label=row[LABEL_COLUMN],
        ),
        axis=1,
    )


def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN):
    """Convert two DataFrames into pandas Series of ``InputExample`` objects.

    The frames passed in are the ones converted; the function no longer
    reaches for the notebook-level ``train_data`` / ``train_vald_data``
    globals, so it works with any pair of frames.

    Args:
        train: DataFrame holding the training rows.
        test: DataFrame holding the held-out (validation) rows.
        DATA_COLUMN: name of the column containing the input text.
        LABEL_COLUMN: name of the column containing the label.

    Returns:
        A ``(train_InputExamples, validation_InputExamples)`` tuple. Each
        element is the row-wise ``apply`` result for the corresponding frame,
        indexed like that frame.
    """
    train_InputExamples = _to_input_examples(train, DATA_COLUMN, LABEL_COLUMN)
    validation_InputExamples = _to_input_examples(test, DATA_COLUMN, LABEL_COLUMN)

    return train_InputExamples, validation_InputExamples
  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize ``examples`` and expose them as a ``tf.data.Dataset``.

    Args:
        examples: iterable of objects exposing ``text_a`` (the text to encode)
            and ``label`` attributes, e.g. ``InputExample`` instances.
        tokenizer: tokenizer providing ``encode_plus``.
        max_length: length every encoded sequence is padded or truncated to.

    Returns:
        A dataset built with ``tf.data.Dataset.from_generator`` that yields
        ``(inputs, label)`` pairs, where ``inputs`` is a dict with int32
        ``input_ids``, ``attention_mask`` and ``token_type_ids`` vectors and
        ``label`` is a float64 scalar.
    """
    features = []  # every example is encoded eagerly, before the dataset is created

    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,
            return_token_type_ids=True,
            return_attention_mask=True,
            # `padding="max_length"` replaces the deprecated
            # `pad_to_max_length=True`; both pad each sequence up to `max_length`.
            padding="max_length",
            truncation=True,  # cut sequences that are longer than `max_length`
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Replays the pre-computed features each time the dataset is iterated.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.float64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


In [5]:
# Column labels shared by the train, validation and test frames
DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

# Wrap the rows of the train and validation frames as InputExample objects
train_InputExamples, validation_InputExamples = convert_data_to_examples(
    train_data,
    train_vald_data,
    DATA_COLUMN,
    LABEL_COLUMN,
)

In [6]:
# One checkpoint name drives both the classifier and its tokenizer so they cannot drift apart
PRETRAINED_CHECKPOINT = "google/bert_uncased_L-4_H-256_A-4"

# from_pt=True: initialise the TensorFlow model from the checkpoint's PyTorch weights
model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_CHECKPOINT, from_pt=True)
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

Downloading: 100%|██████████| 383/383 [00:00<00:00, 383kB/s]
Downloading: 100%|██████████| 43.0M/43.0M [00:18<00:00, 2.44MB/s]
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 1.22MB/s]


In [7]:
# Show the model's layers and parameter counts
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  11170560  
_________________________________________________________________
dropout_13 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  514       
Total params: 11,171,074
Trainable params: 11,171,074
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Rebuild the InputExample series for the train and validation splits
train_InputExamples, validation_InputExamples = convert_data_to_examples(
    train_data, train_vald_data, DATA_COLUMN, LABEL_COLUMN
)

# Training pipeline: tokenize, shuffle through a 100-element buffer,
# group into batches of 32, then iterate over the data twice
training_data = (
    convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
    .shuffle(100)
    .batch(32)
    .repeat(2)
)

# Validation pipeline: tokenize and batch only (no shuffling, single pass)
validation_data = (
    convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
    .batch(32)
)



In [None]:
# Training configuration: Adam with gradient-norm clipping, a sparse
# cross-entropy loss computed on raw logits, and accuracy as the metric
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
logits_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(
    optimizer=adam_optimizer,
    loss=logits_loss,
    metrics=[accuracy_metric],
)

# A single Keras epoch; the training dataset itself already repeats the data
model.fit(training_data, epochs=1, validation_data=validation_data)