In [None]:
import tensorflow as tf
import numpy as np
import random
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import pandas as pd
from transformers import TFBertModel
import os
import tensorflow_addons as tfa
import keras_tuner as kt

In [None]:
# Load the labelled training set.
# The location can be overridden with the WATSON_TRAIN_PATH environment
# variable; otherwise the original local download location is used. It may
# point either at the CSV itself or at a folder containing "train.csv".
# The working directory is deliberately left untouched (no %cd), so later
# cells that write to '.' are not silently redirected.
train_source = os.environ.get("WATSON_TRAIN_PATH", r"C:\Users\siddh\Downloads\train_watson.csv")
if os.path.isdir(train_source):
    train_source = os.path.join(train_source, "train.csv")
train_df = pd.read_csv(train_source)
train_df

In [None]:
# Load the unlabelled test set.
# The location can be overridden with the WATSON_TEST_PATH environment
# variable; otherwise the original local download location is used. It may
# point either at the CSV itself or at a folder containing "test.csv".
# The working directory is deliberately left untouched (no %cd).
test_source = os.environ.get("WATSON_TEST_PATH", r"C:\Users\siddh\Downloads\test.csv~")
if os.path.isdir(test_source):
    test_source = os.path.join(test_source, "test.csv")
test_df = pd.read_csv(test_source)
test_df

In [None]:
# Number of training rows per label value
train_df["label"].value_counts()

In [None]:
# Class names, positioned so that the list index is the integer label
# (0, 1, 2 in that order).
classes = [
    "entailment",
    "neutral",
    "contradiction",
]

In [None]:
# Print five consecutive training rows starting at a random offset.
# The upper bound keeps the five-row window inside the frame.
random_index = random.randint(0, len(train_df) - 5)
label_text = {0: "{entailment}", 1: "{neutral}", 2: "{contradictory}"}
window = train_df[["premise", "hypothesis", "label"]].iloc[random_index:random_index + 5]

for premise, hypothesis, label in window.itertuples(index=False):
    # Labels outside 0/1/2 get no label line, only the text below.
    if label in label_text:
        print(f"label: {label}", label_text[label])
    print(f"-> Premise: {premise}\n-> Hypothesis: {hypothesis}\n")
    print("-" * 30, "\n")

In [None]:
# Hold out 10% of the labelled rows; the split is reproducible via random_state.
train_data, test_data, train_labels, test_labels = train_test_split(
    train_df.loc[:, ["premise", "hypothesis"]].to_numpy(),
    train_df.loc[:, "label"].to_numpy(),
    random_state=42,
    test_size=0.1,
)

In [None]:
# First (premise, hypothesis) pair of the split together with its label
(train_data[0], train_labels[0])

In [None]:
# The tokenizer checkpoint must be the same one the encoder is loaded from
# ('bert-base-multilingual-cased' further down in this notebook). The uncased
# vocabulary assigns different ids to tokens, so ids produced with it would
# index the wrong rows of the cased model's embedding table.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

def encode_sentence(s):
    """Tokenize one sentence and return its vocabulary ids followed by [SEP].

    Args:
        s: the sentence text.

    Returns:
        A list of token ids for the word pieces of `s`, with the id of the
        '[SEP]' token appended. No '[CLS]' token is added.
    """
    tokens = [*tokenizer.tokenize(s), '[SEP]']
    return tokenizer.convert_tokens_to_ids(tokens)

In [None]:
# Quick check of encode_sentence on the first training premise

first_premise = train_df.premise[0]
print('Original sentence: ' + first_premise + '\n')
print('Tokenized sentence: ' + str(encode_sentence(first_premise)))

In [None]:
# Encode sentence pairs for the BERT model, padded to a fixed length

def bert_encode(hypotheses, premises, tokenizer, max_length=100):
    """Encode sentence pairs into fixed-length BERT input tensors.

    The two text arguments are encoded as a pair: each element of
    `hypotheses` becomes the first segment and the matching element of
    `premises` the second segment. (The callers in this notebook pass the
    premise column first, so premises end up as the first segment.)

    Args:
        hypotheses: a string or a sequence/array of strings (first segment).
        premises: a string or a sequence/array of strings (second segment),
            the same length as `hypotheses`.
        tokenizer: a callable HuggingFace-style tokenizer.
        max_length: every encoded row is truncated or padded to exactly this
            many tokens.

    Returns:
        dict with int32 tensors 'input_word_ids', 'input_mask' and
        'input_type_ids', each of shape (number of pairs, max_length).

    Raises:
        ValueError: if the two inputs do not contain the same number of texts.
    """
    # atleast_1d lets a single string be treated as a batch of one.
    first_segments = np.atleast_1d(hypotheses).tolist()
    second_segments = np.atleast_1d(premises).tolist()
    if len(first_segments) != len(second_segments):
        raise ValueError(
            f"got {len(first_segments)} first-segment texts but "
            f"{len(second_segments)} second-segment texts"
        )

    # Passing the texts as a pair lets the tokenizer insert [SEP] itself and
    # emit segment ids that distinguish the two sentences. Padding to
    # max_length (instead of the longest row in this call) keeps the width
    # constant across train / test / single-sample calls.
    encoded = tokenizer(first_segments,
                        second_segments,
                        padding='max_length',
                        truncation=True,
                        max_length=max_length)

    inputs = {
          'input_word_ids': tf.constant(encoded['input_ids'], dtype=tf.int32),
          'input_mask': tf.constant(encoded['attention_mask'], dtype=tf.int32),
          'input_type_ids': tf.constant(encoded['token_type_ids'], dtype=tf.int32)}

    return inputs

In [None]:
# Encode every training pair (premise column first, hypothesis column second)
train_input = bert_encode(
    train_df["premise"].values,
    train_df["hypothesis"].values,
    tokenizer,
)

train_input

In [None]:
# Encode every test pair (premise column first, hypothesis column second)
test_input = bert_encode(
    test_df["premise"].values,
    test_df["hypothesis"].values,
    tokenizer,
)

test_input

In [None]:
# Pick a distribution strategy: a TPU when one can be resolved, otherwise
# TensorFlow's default strategy (CPU / single GPU).
# NOTE(review): no later cell visible in this notebook uses `strategy`; to
# actually train on the TPU the model would have to be built under
# `strategy.scope()` -- confirm intent.

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Non-experimental name; the tf.distribute.experimental alias is deprecated.
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    # A ValueError is treated here as "no TPU available".
    strategy = tf.distribute.get_strategy()

# Report the replica count for whichever strategy was selected, not only
# for the fallback path.
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Sequence length expected by the model's input layers
max_len = 100

# Dummy Weights & Biases key, set to quiet a warning
os.environ["WANDB_API_KEY"] = "0"

# Pretrained multilingual BERT used as the base of the classifier
bert_encoder = TFBertModel.from_pretrained('bert-base-multilingual-cased')

In [None]:
def build_model(hp,base_model=bert_encoder):
    '''
    Build and compile the tunable classifier for Keras Tuner.

    A frozen BERT encoder feeds an LSTM followed by a tunable number of
    Dropout + Dense(relu) layers and a 3-way softmax head.

    Hyperparameters read from `hp`:
        lstm_            -- LSTM units (50..100)
        num_dense_layers -- number of Dropout+Dense pairs (1..5)
        dropout_         -- dropout rate, shared by all pairs (0.0 / 0.1 / 0.2)
        dense_           -- Dense units, shared by all pairs (50..100)

    Args:
        hp: the keras_tuner HyperParameters object for the current trial.
        base_model: the encoder whose first output is used as the token
            sequence; it is set to non-trainable here.

    Returns:
        A compiled tf.keras.Model taking the inputs 'input_word_ids',
        'input_mask' and 'input_type_ids', each of shape (max_len,).
    '''
    base_model.trainable = False

# INPUTS
    input_word_ids = tf.keras.Input(shape=(max_len,),dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,),dtype=tf.int32, name="input_mask")
    input_type_ids = tf.keras.Input(shape=(max_len,),dtype=tf.int32, name="input_type_ids")

    x = base_model([input_word_ids, input_mask, input_type_ids])[0]
    x = tf.keras.layers.LSTM(units=hp.Int('lstm_', min_value=50, max_value=100), return_sequences=True)(x)

    # 'dropout_' and 'dense_' are single hyperparameters: asking for the same
    # name on every loop iteration returns the same value, so they are read
    # once here (in the same registration order as before).
    num_dense_layers = hp.Int('num_dense_layers', 1, 5)
    dropout_rate = hp.Choice('dropout_', values=[0.0, 0.1, 0.2])
    dense_units = hp.Int('dense_', min_value=50, max_value=100)
    for _ in range(num_dense_layers):
        x = tf.keras.layers.Dropout(dropout_rate)(x)
        x = tf.keras.layers.Dense(units=dense_units, activation='relu')(x)

# OUTPUT
    # NOTE(review): only timestep 0 of the sequence is classified; with a
    # forward LSTM that position has processed just the first token's BERT
    # vector -- confirm this is the intended pooling.
    output = tf.keras.layers.Dense(3, activation='softmax')(x[:,0,:])

# BUILD THE FRAMEWORK
    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)

# OPTIMIZER
    # Step-wise decay of both learning rate and weight decay. The schedules
    # are driven by the optimizer's own iteration counter; the previous code
    # evaluated the schedule once on a variable that was never advanced, so
    # neither value ever decayed.
    boundaries = [5000, 7500, 10000, 12500]
    factors = [1e-0, 5e-1, 1e-1, 5e-2, 1e-2]
    lr_schedule = tf.optimizers.schedules.PiecewiseConstantDecay(
        boundaries, [1e-3 * factor for factor in factors])
    wd_schedule = tf.optimizers.schedules.PiecewiseConstantDecay(
        boundaries, [1e-4 * factor for factor in factors])

    # The lambda is only invoked when gradients are applied, by which time
    # `optimizer` is bound.
    optimizer = tfa.optimizers.AdamW(learning_rate=lr_schedule,
                                     weight_decay=lambda: wd_schedule(optimizer.iterations))

# COMPILE
    # The former `f1` metric referred to a name that is not defined anywhere
    # in this notebook and raised NameError; accuracy is kept as the metric.
    model.compile(optimizer=optimizer,
                  loss= tf.keras.losses.SparseCategoricalCrossentropy(),
                  metrics=["accuracy"])

    return model

In [None]:
# Bayesian hyperparameter search over build_model, ranked by validation loss

tuner = kt.tuners.BayesianOptimization(
    build_model,
    objective='val_loss',
    max_trials=5,
    seed=42,
    directory='.',
    project_name="My_dear_Watson1",
)

# A quarter of the training rows is held out for validation in each trial.
tuner.search(
    train_input,
    train_df.label.values,
    batch_size=64,
    epochs=12,
    validation_split=0.25,
    verbose=1,
)

In [None]:
# Take the top-ranked model from the search and show its architecture
best_model = tuner.get_best_models(num_models=1)[0]
best_model.summary()

In [None]:
# List each layer of the best model with its index and trainable flag
for layer_number in range(len(best_model.layers)):
    layer = best_model.layers[layer_number]
    print(f"{layer_number}  {layer.name} : {layer.trainable}")

In [None]:
# Hyperparameter set of the top-ranked trial
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]
best_hyperparameters

In [None]:
# Continue training the best model on the full training set,
# holding out a quarter of the rows for validation.
best_model.fit(
    train_input,
    train_df.label.values,
    batch_size=64,
    epochs=15,
    validation_split=0.25,
    verbose=1,
)

In [None]:
def pred_class(sample_premise,sample_hypothesis,tokenizer):
    '''
    Predict the class name(s) for sample premise/hypothesis input.

    Args:
        sample_premise: one premise string, or a sequence of premise strings.
        sample_hypothesis: one hypothesis string, or a sequence of the same
            length as `sample_premise`.
        tokenizer: tokenizer forwarded to bert_encode.

    Returns:
        A single class name (an entry of `classes`) when both inputs are
        plain strings, otherwise a list with one class name per pair.
    '''
    single_pair = isinstance(sample_premise, str) and isinstance(sample_hypothesis, str)
    # Always hand bert_encode a batch, even for a single pair.
    premises = [sample_premise] if isinstance(sample_premise, str) else list(sample_premise)
    hypotheses = [sample_hypothesis] if isinstance(sample_hypothesis, str) else list(sample_hypothesis)

    sample_input = bert_encode(premises, hypotheses, tokenizer)
    # `best_model` is the trained model of this notebook; the bare name
    # `model` only exists inside build_model.
    pred = best_model.predict(sample_input)

    # argmax yields one label index per row; a Python list cannot be indexed
    # by a tensor, so each index is mapped to its class name individually.
    predicted = [classes[int(label)] for label in np.argmax(pred, axis=1)]
    return predicted[0] if single_pair else predicted