In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/contradictory-my-dear-watson/sample_submission.csv
/kaggle/input/contradictory-my-dear-watson/train.csv
/kaggle/input/contradictory-my-dear-watson/test.csv


In [None]:
from transformers import BertTokenizer, TFBertModel
import matplotlib.pyplot as plt
import tensorflow as tf

In [None]:
# Pick a distribution strategy: use the TPU when one can be resolved, otherwise
# fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # The TPU setup raised ValueError (presumably no TPU attached to this session).
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count for whichever strategy was selected. This line used
# to sit inside the except branch, so it was silent on the TPU path.
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Load the labelled training pairs; later cells read the `premise`,
# `hypothesis` and `label` columns of this frame.
train = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")

# Preview the first rows.
train.head()

In [None]:
# Pull the three fields of the training row labelled 1 and show them as a
# sample premise / hypothesis / label triple.
premise, hypothesis, relationship = (
    train.loc[1, column] for column in ('premise', 'hypothesis', 'label')
)

for heading, value in (
    ('Premise:', premise),
    ('\nHypothesis:', hypothesis),
    ('\nRelationship:', relationship),
):
    print(heading)
    print(value)


In [None]:
# Extra positions reserved on top of the longest whitespace-split pair.
# Whitespace word counts only approximate the tokenizer's sub-word counts, so
# this margin is a safety buffer rather than an exact bound.
# NOTE(review): confirm max_len stays within the encoder's position limit.
LENGTH_MARGIN = 100

# Whitespace-delimited word counts per row, computed column-wise instead of
# with a Python-level iterrows() loop.
premise_words = train['premise'].str.split().str.len()
hypothesis_words = train['hypothesis'].str.split().str.len()

# The constant 3 presumably accounts for the special tokens added at encoding
# time (one [CLS] plus one [SEP] after each sentence).
pair_lengths = 3 + premise_words + hypothesis_words

# idxmax returns the first row reaching the maximum, matching a strict ">" scan.
idx = pair_lengths.idxmax()
longest_len = int(pair_lengths.max())

# Fixed sequence width used when padding the encoded inputs.
max_len = longest_len + LENGTH_MARGIN

print('Longest Length:', longest_len)
print('Index:', idx)
print('Premise:')
print(train.loc[idx, 'premise'])
print(f"Length: {len(train.loc[idx, 'premise'].split())}")
print('Hypothesis:')
print(train.loc[idx, 'hypothesis'])
print(f"Length: {len(train.loc[idx, 'hypothesis'].split())}")


In [None]:
# Name of the pretrained checkpoint; the same name is used to load both the
# tokenizer here and the encoder in build_model().
model_name = 'bert-base-multilingual-cased'
# Tokenizer matching the checkpoint above.
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
def encode_sentence(s):
    """Tokenize ``s`` with the module-level ``tokenizer`` and return its token ids.

    A ``'[SEP]'`` token is appended after the sentence's own tokens before the
    conversion to ids, so the returned list always ends with the id of [SEP].
    """
    pieces = [*tokenizer.tokenize(s), '[SEP]']
    return tokenizer.convert_tokens_to_ids(pieces)

def demo_encode():
    """Print the tokens and token ids produced for a fixed example sentence.

    Uses the module-level ``tokenizer`` and appends ``'[SEP]'`` to the tokens,
    mirroring what ``encode_sentence`` does.
    """
    sample = 'Hello world!'
    pieces = [*tokenizer.tokenize(sample), '[SEP]']
    print('Demo using:\n\"%s\"\n' % sample)
    print('Tokens:\n', pieces)
    print('IDs:\n', tokenizer.convert_tokens_to_ids(pieces))

# Run the demo so the example tokens and ids are printed in the cell output.
demo_encode()

In [None]:
# Sanity check on a short sentence.
print(encode_sentence("Hello World!"))

# Encode the premise of the longest pair once and reuse the result for both
# the printout and the length (it was previously tokenized twice).
print("Encode longest:")
long_encode = encode_sentence(train.premise.values[idx])
print(long_encode)
print(f"Length: {len(long_encode)}")

In [None]:
def bert_encode(hypotheses, premises, tokenizer, max_seq_len=None):
    """Encode sentence pairs into fixed-width word-id, mask and segment-id tensors.

    Every pair is laid out as ``[CLS] first [SEP] second [SEP]`` where ``first``
    comes from ``hypotheses`` and ``second`` from ``premises``.

    NOTE(review): the calls visible in this notebook pass the premise column as
    the first argument and the hypothesis column as the second, so the parameter
    names are the reverse of the data they actually receive. The names are kept
    unchanged to avoid breaking callers.

    Args:
        hypotheses: sequence of strings placed first in each pair (segment id 0).
        premises: sequence of strings placed second in each pair (segment id 1).
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
            It is used for every token, including the sentence text.
        max_seq_len: width of the returned tensors. Defaults to the module-level
            ``max_len``. Shorter rows are zero-padded; rows longer than this are
            expected to be truncated by ``to_tensor`` (per the TensorFlow docs).

    Returns:
        dict with keys ``'input_word_ids'``, ``'input_mask'`` and
        ``'input_type_ids'``, each a tensor of shape
        ``[num_examples, max_seq_len]``.
    """
    if max_seq_len is None:
        # Preserve the original behavior of reading the notebook-level setting.
        max_seq_len = max_len

    num_examples = len(hypotheses)
    print(f'Encoding {num_examples} pairs of hypotheses and premises as inputs...')

    def _encode(sentence):
        # Use the tokenizer passed by the caller rather than a global one, so
        # the sentence ids and the [CLS] id always come from the same vocabulary.
        tokens = list(tokenizer.tokenize(sentence))
        tokens.append('[SEP]')
        return tokenizer.convert_tokens_to_ids(tokens)

    sentence1 = tf.ragged.constant([_encode(s) for s in np.array(hypotheses)])
    sentence2 = tf.ragged.constant([_encode(s) for s in np.array(premises)])

    print(sentence1[0])

    # One [CLS] id per example, prepended to every pair.
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*sentence1.shape[0]
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)
    print(input_word_ids[0])

    target_shape = [num_examples, max_seq_len]

    # 1 for every real token, 0 for the padding added by to_tensor.
    input_mask = tf.ones_like(input_word_ids).to_tensor(shape=target_shape)

    # Segment ids: 0 for [CLS] and the first sentence, 1 for the second sentence.
    type_cls = tf.zeros_like(cls)
    type_s1 = tf.zeros_like(sentence1)
    type_s2 = tf.ones_like(sentence2)
    input_type_ids = tf.concat([type_cls, type_s1, type_s2], axis=-1).to_tensor(shape=target_shape)

    inputs = {
        'input_word_ids': input_word_ids.to_tensor(shape=target_shape),
        'input_mask': input_mask,
        'input_type_ids': input_type_ids
    }

    print('Finished')

    return inputs

In [None]:
# Encode the training pairs. Note the argument order: the premise column is
# bound to bert_encode's first parameter (named `hypotheses`) and the hypothesis
# column to its second (named `premises`), so premises are placed first in each pair.
train_input = bert_encode(train.premise.values, train.hypothesis.values, tokenizer)

In [None]:
# Inspect the encoded word ids: the first example as ids and as tokens, then
# the ids of the longest pair found earlier.
word_ids = train_input['input_word_ids']
first_example_ids = word_ids[0]
print(first_example_ids)
print(tokenizer.convert_ids_to_tokens(first_example_ids))
print(word_ids[idx])

In [None]:
def build_model():
    """Build and compile a three-class classifier on top of the pretrained encoder.

    The model takes three variable-length int32 inputs named ``input_word_ids``,
    ``input_mask`` and ``input_type_ids``, feeds them to the encoder loaded from
    ``model_name``, and applies a 3-unit softmax layer to the encoder output at
    sequence position 0 (where ``bert_encode`` places the [CLS] token).

    Returns:
        A compiled ``tf.keras.Model`` using Adam (learning rate 1e-5),
        sparse categorical cross-entropy loss and an accuracy metric.
    """
    bert_encoder = TFBertModel.from_pretrained(model_name)

    input_word_ids = tf.keras.Input(
        shape=(None,),
        dtype=tf.int32,
        name="input_word_ids")
    input_mask = tf.keras.Input(
        shape=(None,),
        dtype=tf.int32,
        name="input_mask")
    input_type_ids = tf.keras.Input(
        shape=(None,),
        dtype=tf.int32,
        name="input_type_ids")

    # First encoder output; presumably the per-token hidden states
    # (batch, seq, hidden) — confirm against the transformers docs.
    embedding = bert_encoder([input_word_ids, input_mask, input_type_ids])[0]
    # Classify from the vector at position 0 of each sequence.
    output = tf.keras.layers.Dense(3, activation='softmax')(embedding[:, 0, :])

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    model.compile(
        tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'])

    return model

In [None]:
# Create and compile the model inside the distribution strategy's scope so its
# variables are created under that strategy, then print the layer summary.
with strategy.scope():
    model = build_model()
    model.summary()
    
# Write a diagram of the model graph (with tensor shapes) to a PNG file.
tf.keras.utils.plot_model(model, "three-input-bert-model.png", show_shapes=True)

In [None]:
# Train for a single epoch on the encoded pairs, with labels from train.label
# and a 0.2 fraction of the data set aside for validation.
model.fit(train_input, train.label.values, epochs=1, verbose=1, batch_size=64, validation_split=0.2)

In [None]:
# Load the unlabelled test pairs and encode them with the same argument order
# used for the training data (premise column first, hypothesis column second).
test = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")
test_input = bert_encode(test.premise.values, test.hypothesis.values, tokenizer)
# Preview the first rows.
test.head()

In [None]:
# Predicted class per test example: index of the largest of the three softmax
# outputs in each row, collected into a plain list.
class_probabilities = model.predict(test_input)
predictions = list(np.argmax(class_probabilities, axis=-1))

In [None]:
# Assemble the submission table: the test ids alongside the predicted class.
submission = test[['id']].copy()
submission['prediction'] = predictions

# Preview the first rows.
submission.head()