Fine-tuning a small BERT model

**Imports**

In [1]:

import os
import pathlib
import pickle

import keras
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_text #Needed for the preprocessor; even though not directly called
import tensorflow_hub as tfHub


import shared_code_path
import standard_pipes as stdPipeHelp


2023-04-25 15:50:50.935603: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-25 15:50:50.936819: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-04-25 15:50:50.960265: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-04-25 15:50:50.961000: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


**Configuration Variables**

In [2]:
#
_rawDataFolder = os.path.abspath( os.path.join("..","..","..","raw_data") )
TRAIN_PATH = os.path.abspath( os.path.join(_rawDataFolder, "train.csv") )
TEST_PATH = os.path.abspath( os.path.join(_rawDataFolder, "test.csv") )

#
SAVE_FOLDER = os.path.abspath( "bert_small_finetune" )
SUBMISSIONS_PATH = os.path.join(SAVE_FOLDER, "submissions.csv")
MODEL_SAVE_PATH = os.path.join(SAVE_FOLDER, "checkpoint_model")
HISTORY_SAVE_PATH = os.path.join(SAVE_FOLDER,"history")

#BERT urls
#Note: We're using a "small BERT" model, which means slightly less accuracy than BERT base but....
URL_PREPROC = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
URL_BERT = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/2"

#
REFIT = True #refit the BERT model if set to true
BATCH_SIZE = 32
NUMB_EPOCHS = 3
INIT_LEARN_RATE = 3e-5


#
RANDOM_SEED = 523423

In [3]:
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)
keras.utils.set_random_seed(RANDOM_SEED)

**Import the data**

In [4]:
RAW_TRAIN = pd.read_csv(TRAIN_PATH)

**Create save folder**

In [5]:
pathlib.Path(SAVE_FOLDER).mkdir(exist_ok=True)

**Import raw data**

In [6]:
RAW_TRAIN = pd.read_csv(TRAIN_PATH)
RAW_TRAIN.head(2)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1


**Apply a standard text cleaning pipeline**

In [7]:
cleanPipe = stdPipeHelp.loadTextPreprocPipeA()
PROC_DATA = cleanPipe.fit_transform(RAW_TRAIN)

**Shuffle the frame**

In [8]:
PROC_DATA = PROC_DATA.sample(frac=1.0)

In [9]:
PROC_DATA.head(2)

Unnamed: 0,id,keyword,location,text,target
663,958,blaze,Mo.City,@beautiful_juic1 just letting you know,0
2086,2998,dead,,@atchisonsean he is dead,0


**Split into train/val data**

In [10]:
TRAIN_DATA = PROC_DATA.sample(frac=0.8)
VAL_DATA = PROC_DATA.drop(TRAIN_DATA.index)

**Load the BERT preprocessor**

In [11]:
_bertPreProc = tfHub.KerasLayer(URL_PREPROC)

2023-04-25 15:50:52.627308: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-04-25 15:50:52.627543: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-04-25 15:50:52.627601: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-04-25 15:50:52.627798: I tensorflow/core/

**Generate training/validation inputs and labels**

We will pass the processed text directly into the model, rather than making the preprocessing part of the actual model

In [12]:
TRAIN_INPUTS = _bertPreProc(TRAIN_DATA["text"].to_list())
VAL_INPUTS = _bertPreProc(VAL_DATA["text"].to_list())

In [13]:
TRAIN_LABELS = np.array( [ [x] for x in TRAIN_DATA["target"].to_list() ] )
VAL_LABELS = np.array( [[x] for x in VAL_DATA["target"].to_list() ] )

In [14]:
TRAIN_INPUTS

{'input_type_ids': <tf.Tensor: shape=(5580, 128), dtype=int32, numpy=
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>,
 'input_mask': <tf.Tensor: shape=(5580, 128), dtype=int32, numpy=
 array([[1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>,
 'input_word_ids': <tf.Tensor: shape=(5580, 128), dtype=int32, numpy=
 array([[ 101, 2161, 2093, ...,    0,    0,    0],
        [ 101, 2093, 5611, ...,    0,    0,    0],
        [ 101, 1001, 4911, ...,    0,    0,    0],
        ...,
        [ 101, 7929, 3521, ...,    0,    0,    0],
        [ 101, 1030, 4830, ...,    0,    0,    0],
        [ 101, 2324, 8945, ...,    0,    0,    0]], dtype=int32)>}

**Load a pretrained BERT model**

In [15]:
_BERT_ENCODER = tfHub.KerasLayer(URL_BERT, trainable=True, name="BERT_encoder")

**Add a single final layer so we get a single output (probability tweet is disaster in this case)**

Also a small dropout layer....because

In [16]:
#1)The input and encoder layer
_encoderInputs = {"input_type_ids": tf.keras.Input(shape=(128), dtype=tf.int32, name="input_type_ids"),
                  "input_mask": tf.keras.Input(shape=(128), dtype=tf.int32, name="input_mask"),
                  "input_word_ids": tf.keras.Input(shape=(128), dtype=tf.int32, name="input_word_ids")
                 }
_encOutputs = _BERT_ENCODER(_encoderInputs)

#2)The dropout and dense output layer
_netInput = _encOutputs["pooled_output"]
_dropoutLayer = keras.layers.Dropout(0.1)(_netInput)
_netOutput = tf.keras.layers.Dense(1, activation=None, name="classifier")(_dropoutLayer)

#3) The combination of the two
MODEL = keras.Model(_encoderInputs, _netOutput)

2023-04-25 15:50:54.761643: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype int32 and shape [?,128]
	 [[{{node inputs}}]]
2023-04-25 15:50:54.761687: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_2' with dtype int32 and shape [?,128]
	 [[{{node inputs_2}}]]
2023-04-25 15:50:54.779287: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder_1' with dtype int32 and shape [?,128]
	 [[{{node Placeholder_1}}]

In [17]:
MODEL.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_mask (InputLayer)        [(None, 128)]        0           []                               
                                                                                                  
 input_type_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 input_word_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 BERT_encoder (KerasLayer)      {'pooled_output': (  28763649    ['input_mask[0][0]',             
                                None, 512),                       'input_type_ids[0][0]',     

Quickly check the model gives the expected size output from the expected input format (the actual output value means essentially nothing at this point)

In [18]:
_testInput = _bertPreProc(["HERE IS TEXT"])
MODEL(_testInput)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.05235735]], dtype=float32)>

In [19]:
_testInput

{'input_type_ids': <tf.Tensor: shape=(1, 128), dtype=int32, numpy=
 array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
       dtype=int32)>,
 'input_mask': <tf.Tensor: shape=(1, 128), dtype=int32, numpy=
 array([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 

**Setup optimizer/loss/metrics**

In [20]:
#Create a learning rate scheduler; so we start with a high rate then reduce it slowly
_nTrain = TRAIN_INPUTS["input_type_ids"].shape[0]
stepPerEpoch = BATCH_SIZE * _nTrain
_currArgs = [INIT_LEARN_RATE, stepPerEpoch*NUMB_EPOCHS]
_currKwargs = {"end_learning_rate":0.0}
learnRateScheduler =tf.keras.optimizers.schedules.PolynomialDecay(*_currArgs, **_currKwargs)

#Use this scheduler with the standard Adam optimizer
optimizer = keras.optimizers.Adam(learning_rate=learnRateScheduler)


In [21]:
loss = keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()
MODEL.compile(optimizer=optimizer, loss=loss, metrics=[metrics])

**Train the model**

In [22]:
_modelSaver = keras.callbacks.ModelCheckpoint(MODEL_SAVE_PATH, save_best_only=True)

In [23]:
%%prun
#27948313 function calls (27401180 primitive calls) in 422.583 seconds

if REFIT:
    history = MODEL.fit(x=TRAIN_INPUTS, y=TRAIN_LABELS, batch_size=BATCH_SIZE,
                        validation_data= (VAL_INPUTS, VAL_LABELS),
                        epochs=NUMB_EPOCHS,
                        callbacks=_modelSaver)
    
    with open(HISTORY_SAVE_PATH, "wb") as f:
        pickle.dump(history.history, f)
    loadedHistory = history.history
    
else:
    with open(HISTORY_SAVE_PATH, "rb") as f:
        loadedHistory = pickle.load(f)
        

#Want to load the best model regardless
MODEL = keras.models.load_model(MODEL_SAVE_PATH)

Epoch 1/3


2023-04-25 15:50:55.208834: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype int32 and shape [?,128]
	 [[{{node inputs}}]]
2023-04-25 15:50:55.208875: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_2' with dtype int32 and shape [?,128]
	 [[{{node inputs_2}}]]
2023-04-25 15:50:56.499224: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/bert_encoder/StatefulPartitionedCall_grad/bert_encoder/StatefulPart

2023-04-25 15:50:56.793916: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_3' with dtype float and shape [?,?,512]
	 [[{{node gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_3}}]]
2023-04-25 15:50:56.793966: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_4' with dtype float and shape [?,?,512]
	 [[{{node gradients/StatefulPartitionedCall_grad/StatefulPartitionedCall_4}}]]
2023-04-25 15:50:56.793987: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) 



2023-04-25 15:53:15.530815: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype int32 and shape [?,128]
	 [[{{node inputs}}]]
2023-04-25 15:53:15.530858: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_2' with dtype int32 and shape [?,128]
	 [[{{node inputs_2}}]]
2023-04-25 15:53:15.556983: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'input_type_ids' with dtype int32 and shape [?,128]
	 [[{{node input_type_ids}

2023-04-25 15:53:16.000934: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype int32 and shape [?,128]
	 [[{{node inputs}}]]
2023-04-25 15:53:16.000986: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_2' with dtype int32 and shape [?,128]
	 [[{{node inputs_2}}]]
2023-04-25 15:53:16.087223: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'input_type_ids' with dtype int32 and shape [?,128]
	 [[{{node input_type_ids}

INFO:tensorflow:Assets written to: /home/richard/Work/random/learning_ml/kaggle_comps/dis_tweets_test_gh/src/notebooks/models/bert_small_finetune/checkpoint_model/assets


INFO:tensorflow:Assets written to: /home/richard/Work/random/learning_ml/kaggle_comps/dis_tweets_test_gh/src/notebooks/models/bert_small_finetune/checkpoint_model/assets


Epoch 2/3

2023-04-25 15:55:38.413536: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'input_type_ids' with dtype int32 and shape [?,128]
	 [[{{node input_type_ids}}]]
2023-04-25 15:55:38.413582: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'input_word_ids' with dtype int32 and shape [?,128]
	 [[{{node input_word_ids}}]]
2023-04-25 15:55:38.458396: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype int32 and shape [?,128]
	 [[{{

2023-04-25 15:55:38.875454: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'input_type_ids' with dtype int32 and shape [?,128]
	 [[{{node input_type_ids}}]]
2023-04-25 15:55:38.875503: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'input_word_ids' with dtype int32 and shape [?,128]
	 [[{{node input_word_ids}}]]
2023-04-25 15:55:38.919316: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_input_type_ids' with dtype int32 and shape

INFO:tensorflow:Assets written to: /home/richard/Work/random/learning_ml/kaggle_comps/dis_tweets_test_gh/src/notebooks/models/bert_small_finetune/checkpoint_model/assets


INFO:tensorflow:Assets written to: /home/richard/Work/random/learning_ml/kaggle_comps/dis_tweets_test_gh/src/notebooks/models/bert_small_finetune/checkpoint_model/assets


Epoch 3/3
 

2023-04-25 15:58:03.743458: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype int32 and shape [?,128]
	 [[{{node inputs}}]]
2023-04-25 15:58:03.743502: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_2' with dtype int32 and shape [?,128]
	 [[{{node inputs_2}}]]
2023-04-25 15:58:03.771592: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder_1' with dtype int32 and shape [?,128]
	 [[{{node Placeholder_1}}]

In [24]:
loadedModel = keras.models.load_model(MODEL_SAVE_PATH)

2023-04-25 15:58:05.682987: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype int32 and shape [?,128]
	 [[{{node inputs}}]]
2023-04-25 15:58:05.683035: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs_2' with dtype int32 and shape [?,128]
	 [[{{node inputs_2}}]]
2023-04-25 15:58:05.700602: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder_1' with dtype int32 and shape [?,128]
	 [[{{node Placeholder_1}}]

**Look at history; best val-loss should be what our loaded model corresponds to**

In [25]:
loadedHistory

{'loss': [0.4780053496360779, 0.3710975646972656, 0.29011771082878113],
 'binary_accuracy': [0.7763440608978271,
  0.8405017852783203,
  0.8808243870735168],
 'val_loss': [0.4241355359554291, 0.4124630391597748, 0.4797154664993286],
 'val_binary_accuracy': [0.823655903339386,
  0.8229390978813171,
  0.811469554901123]}

**Double check I can correctly predict the validation set**

In [26]:
def _getLabelsFromModel(modelInput):
    _logits = MODEL.predict(modelInput)
    labelProbabilities = tf.sigmoid( _logits ).numpy()
    outLabels = np.where(labelProbabilities>=0.5, 1, 0)
    return outLabels

In [27]:
predLabels = _getLabelsFromModel(VAL_INPUTS)
actLabels = VAL_LABELS



In [28]:
nWrong = np.sum( abs( VAL_LABELS - predLabels ) )
nTotal = len(VAL_LABELS)
binAcc = (nTotal-nWrong) / nTotal
print("I  calculate binAcc as {:.3f} for the validation set".format(binAcc))

I  calculate binAcc as 0.833 for the validation set


**Predict for the test set**

In [29]:
RAW_TEST = pd.read_csv(TEST_PATH)
RAW_TEST

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


In [30]:
testCleanPipe = stdPipeHelp.loadTextPreprocPipeA(removeDuplicateTweets=False)
testCleanPipe.fit(RAW_TRAIN)

#
PROC_TEST = testCleanPipe.transform(RAW_TEST)
PROC_TEST

Unnamed: 0,id,keyword,location,text
0,0,,,just happened a terrible car crash
1,2,,,"heard about #earthquake is different cities , ..."
2,3,,,"there is a forest fire at spot pond , geese ar..."
3,9,,,apocalypse lighting . #spokane #wildfires
4,11,,,typhoon soudelor kills 28 in china and taiwan
...,...,...,...,...
3258,10861,,,earthquake safety los angeles ûò safety faste...
3259,10865,,,storm in ri worse than last hurricane . my cit...
3260,10868,,,green line derailment in chicago
3261,10874,,,meg issues hazardous weather outlook (hwo)


In [31]:
rawTestInput = _bertPreProc(PROC_TEST["text"].to_list())
testLabels = _getLabelsFromModel(rawTestInput)



**Write the test set predictions to a file**

Kaggle Score - 0.81734

In [32]:
OUT_FRAME = pd.DataFrame(data={"id":PROC_TEST["id"], "target":[x[0] for x in testLabels]})
OUT_FRAME

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [33]:
OUT_FRAME.to_csv(SUBMISSIONS_PATH, index=False)