In [None]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

# Check available input files
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Read the training data
train = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")
# A bare `train.head()` in the middle of a cell is never rendered, so print it.
print(train.head())

# Print premise, hypothesis, and relationship of the second training example
print('Premise:')
print(train['premise'].iloc[1])
print('\nHypothesis:')
print(train['hypothesis'].iloc[1])
print('\nRelationship:')
print(train['label'].iloc[1])

# Find the longest pair, measured in whitespace-separated words.
# Vectorised string ops replace a row-by-row iterrows() loop.
premise_word_counts = train['premise'].str.split().str.len()
hypothesis_word_counts = train['hypothesis'].str.split().str.len()
# The constant 3 presumably accounts for the [CLS] and two [SEP] special tokens.
pair_lengths = 3 + premise_word_counts + hypothesis_word_counts

# idxmax returns the index *label* of the first maximum (same tie-breaking as
# a strict `>` scan), so every lookup below uses .loc rather than .iloc.
idx = pair_lengths.idxmax()
longest_len = int(pair_lengths.loc[idx])

# NOTE: this is a word count, not a WordPiece token count, hence the margin.
max_len = longest_len + 100

print('Longest Length:', longest_len)
print('Index:', idx)
print('Premise:')
print(train['premise'].loc[idx])
print(f"Length: {len(train['premise'].loc[idx].split())}")
print('Hypothesis:')
print(train['hypothesis'].loc[idx])
print(f"Length: {len(train['hypothesis'].loc[idx].split())}")

def bert_encode(hypotheses, premises, max_len=None):
    """Tokenize hypothesis/premise pairs into the three BERT input tensors.

    Each pair is encoded by the tokenizer as a single sentence-pair sequence
    (hypothesis first, premise second) and padded to the longest sequence in
    the batch.

    Args:
        hypotheses: Sequence of hypothesis strings.
        premises: Sequence of premise strings, same length as ``hypotheses``.
        max_len: Optional cap on the encoded sequence length; longer pairs are
            truncated. When ``None`` the tokenizer's own model maximum is used.

    Returns:
        Dict with keys ``'input_word_ids'``, ``'input_mask'`` and
        ``'input_type_ids'`` holding the token ids, attention mask and segment
        ids returned by the tokenizer.

    Raises:
        ValueError: If ``hypotheses`` and ``premises`` differ in length.
    """
    num_examples = len(hypotheses)
    # Validate before loading the tokenizer so bad input fails fast.
    if num_examples != len(premises):
        raise ValueError(
            f'hypotheses and premises must have the same length, '
            f'got {num_examples} and {len(premises)}'
        )

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    print(f'Encoding {num_examples} pairs of hypotheses and premises as inputs...')

    # Let the tokenizer build the pair encoding. Encoding each sentence
    # separately with special tokens and concatenating by hand duplicates
    # [CLS], leaves padding in the middle of the sequence, and (with an
    # all-ones mask) makes the model attend to padding tokens.
    encoded = tokenizer(
        list(hypotheses),
        list(premises),
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors='tf',
    )

    # Key names match the tf.keras.Input layer names used by the model.
    inputs = {
        'input_word_ids': encoded['input_ids'],
        'input_mask': encoded['attention_mask'],
        'input_type_ids': encoded['token_type_ids'],
    }

    print('Finished')

    return inputs

# Encode every training pair, then dump the token ids of the first example
# and of the longest example (position `idx`) as a quick sanity check.
train_input = bert_encode(
    hypotheses=train['hypothesis'].values,
    premises=train['premise'].values,
)
for example_pos in (0, idx):
    print(train_input['input_word_ids'][example_pos])

def build_model():
    """Build and compile a BERT-based 3-way classifier.

    The model takes three variable-length int32 inputs named
    ``input_word_ids``, ``input_mask`` and ``input_type_ids``, runs them
    through a pretrained ``bert-base-uncased`` encoder, and applies a 3-unit
    softmax layer to the hidden state at sequence position 0.

    Returns:
        A compiled ``tf.keras.Model`` using Adam (learning rate 1e-5),
        sparse categorical cross-entropy loss and an accuracy metric.
    """
    bert_encoder = TFBertModel.from_pretrained('bert-base-uncased')

    input_word_ids = tf.keras.Input(
        shape=(None,),
        dtype=tf.int32,
        name="input_word_ids")
    input_mask = tf.keras.Input(
        shape=(None,),
        dtype=tf.int32,
        name="input_mask")
    input_type_ids = tf.keras.Input(
        shape=(None,),
        dtype=tf.int32,
        name="input_type_ids")

    # First encoder output is indexed as (batch, position, hidden) below.
    sequence_output = bert_encoder([input_word_ids, input_mask, input_type_ids])[0]
    # Position 0 (the leading [CLS] token) serves as the pair-level summary.
    cls_embedding = sequence_output[:, 0, :]
    output = tf.keras.layers.Dense(3, activation='softmax')(cls_embedding)

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)
    # `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
    # releases no longer accept.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

# Instantiate the classifier and show its layer summary.
model = build_model()
model.summary()

# Fine-tune on the full training set.
model.fit(
    x=train_input,
    y=train['label'].values,
    batch_size=16,
    epochs=5,
    verbose=1,
)

# Load the test set and encode it the same way as the training data.
test = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")
# Not the last expression of the cell, so this preview is not rendered.
test.head()
test_input = bert_encode(test['hypothesis'].values, test['premise'].values)

# The predicted class is the index of the highest softmax score per row.
predictions = list(np.argmax(model.predict(test_input), axis=-1))
submission = test[['id']].copy()
submission['prediction'] = predictions
submission.head()


In [None]:
# Persist the predictions without the DataFrame index column.
submission.to_csv(path_or_buf="submission.csv", index=False)