# Simple RoBERTa Training

Hello, this notebook is a simple baseline using HuggingFace and Keras that scores ~0.90 on the LB in fewer than 150 lines of code while trying not to cheat. <br/>
Our work is inspired by many public notebooks, so a big thank you to everyone who shared a notebook in this little competition. <br/>
We tried our best to avoid cheating, but there may still be room for improvement. If you have any suggestions, please let us know in the comment section!

The co-authors of this work are:
* [Nicolas Vincent](https://www.kaggle.com/nicovincx2)
* [Adrien Argento](https://www.kaggle.com/adrienargento)
* [Hela Siala](https://www.kaggle.com/helasiala)
* [Théo Boyer](https://www.kaggle.com/wolfy73)

In [None]:
"""
    Imports
"""

import numpy as np
import pandas as pd

from transformers import TFAutoModel, AutoTokenizer
from tensorflow.keras.layers import Dense, Input, GlobalAveragePooling1D

import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.keras.backend as K

In [None]:
"""
    Let's use the TPU if possible
"""

# Pick the distribution strategy: a TPU strategy when a TPU cluster can be
# resolved, otherwise whatever strategy TensorFlow currently has in place.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # A ValueError here is treated as "no TPU available"
    strategy = tf.distribute.get_strategy() # for CPU and single GPU
# Report the replica count for whichever strategy was selected (this used to
# be printed only in the fallback branch, so TPU runs stayed silent)
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
"""
    Parameters
"""

# Name of the pre-trained checkpoint; the tokenizer and the encoder are both
# loaded from it
model_name = 'jplu/tf-xlm-roberta-large'
# Number of training epochs and batch size handed to model.fit
EPOCHS, BATCH_SIZE = 5, 64

In [None]:
"""
    A function that tries to avoid cheat
"""

def clean_data(train, val):
    """
        Return train without the examples that are in val.

        Two examples are considered the same when their lower-cased
        premise and their lower-cased hypothesis are both equal.

        Parameters:
            train: DataFrame with 'premise' and 'hypothesis' text columns.
                   It is left untouched.
            val:   DataFrame with 'premise' and 'hypothesis' text columns.

        Returns:
            A new DataFrame with the rows of train that were kept, in their
            original order and with a fresh 0..n-1 index. A one-line summary
            of how many rows were removed is printed.
    """
    n = len(train)
    # One lower-cased "premise | hypothesis" key per example in each dataset
    train_keys = train['premise'].str.lower() + ' | ' + train['hypothesis'].str.lower()
    val_keys = val['premise'].str.lower() + ' | ' + val['hypothesis'].str.lower()
    # Flag the train rows whose key also occurs in val. A boolean mask replaces
    # the former merge + astype(np.int) round-trip: np.int no longer exists in
    # recent NumPy releases, and the mask needs no index bookkeeping.
    leaked = train_keys.isin(val_keys).to_numpy()
    # Positional mask, so a non-unique or non-default train index is fine;
    # drop=True avoids clobbering a real column that happens to be named 'index'
    train_clean = train[~leaked].reset_index(drop=True)
    n_removed = n - len(train_clean)
    # An empty train set would otherwise divide by zero
    pct_removed = 100 * n_removed / n if n else 0.0
    print("Cleaned ! removed {} examples ({:.4f}% of the total dataset)".format(n_removed, pct_removed))
    return train_clean

In [None]:
"""
    Let's use a little class to manage our data
"""

class WatsonAugmentedDataset:
    """
        Loads, cleans, balances and tokenizes every dataset used by the notebook.

        Attributes set by the constructor:
            tokenizer:        tokenizer loaded from the module-level model_name
            maxlen:           token length of the tokenized test set; the
                              validation and training sets are padded/truncated to it
            xnli, mnli:       the cleaned extra DataFrames (mnli after sub-sampling)
            train_data:       tokenized, shuffled concatenation of mnli and xnli
            validation_data:  tokenized competition train.csv
            test_data:        tokenized competition test.csv
            train_label:      labels matching train_data
            val_label:        labels matching validation_data
    """

    def __init__(self):
        self.maxlen = None
        # Tokenizer from huggingface
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        ## The competition datasets
        # We are gonna validate on the training data of the competition
        self.validation_data = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")
        self.test_data = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")
        # XNLI: in the languages we need, but everything is balanced
        self.xnli = self._remove_leaks("XNLI", pd.read_csv("../input/watsoncsvda/xnli.csv"))

        ## Now we want to balance the languages in the dataset.
        # Count the number of examples in each language
        test_counts = self.test_data['language'].value_counts().values
        xnli_counts = self.xnli['language'].value_counts().values

        # MNLI: All examples are in english
        self.mnli = self._remove_leaks("MNLI", pd.read_csv("../input/watsoncsvda/mnli.csv").dropna())
        # We sample from MNLI the number of examples we need to balance our training set
        n_mnli = self._n_english_to_add(test_counts, xnli_counts, len(self.mnli))
        self.mnli = self.mnli.sample(n_mnli)
        # Concatenate everything and shuffle the rows
        self.train_data = pd.concat([self.mnli, self.xnli], ignore_index=True).sample(frac=1)
        # Save the labels before the DataFrames are replaced by token tensors
        self.train_label = self.train_data.label.values
        self.val_label = self.validation_data.label.values
        # Tokenize the test set first: maxlen is still None at that point, so it
        # is padded to its longest example and that length becomes the reference
        self.test_data = self.df_to_token_tensor(self.test_data)
        maxlen_test = self.test_data['input_ids'].shape[-1]
        self.maxlen = maxlen_test
        self.validation_data = self.df_to_token_tensor(self.validation_data)
        maxlen_val = self.validation_data['input_ids'].shape[-1]
        self.train_data = self.df_to_token_tensor(self.train_data)
        maxlen_train = self.train_data['input_ids'].shape[-1]

        assert maxlen_val == maxlen_test == maxlen_train, "Inconsistent maxlen between val, train and test ({}, {}, {})".format(maxlen_val, maxlen_test, maxlen_train)

    def _remove_leaks(self, name, frame):
        """
            Return frame without the examples found in the validation or test data.

            name is only used in the progress messages.
        """
        print("Cleaning {} from train examples".format(name))
        frame = clean_data(frame, self.validation_data)
        print("Cleaning {} from test examples".format(name))
        return clean_data(frame, self.test_data)

    @staticmethod
    def _n_english_to_add(test_counts, xnli_counts, n_available):
        """
            Number of english examples to add to XNLI to end up with a similar
            share of english as in the test set, limited to what is available.

            Parameters:
                test_counts:  array of per-language example counts of the test set
                xnli_counts:  array of per-language example counts of XNLI
                n_available:  number of extra english examples that can be sampled

            Returns:
                An int between 0 and n_available.

            NOTE(review): the largest count of each array is taken to be the
            english one; the language names themselves are never looked at —
            confirm this holds for the data.
        """
        # Number of examples in english in XNLI
        n_en_xnli = xnli_counts.max()
        # Number of examples that are not in english in XNLI
        n_others_xnli = np.sum(xnli_counts) - n_en_xnli
        # Fraction of the test set in english
        p_en_test = test_counts.max() / np.sum(test_counts)
        if p_en_test >= 1:
            # Single-language test set: the formula below would divide by zero,
            # so take every example there is
            return n_available
        # Solve (n_en_xnli + m) / (n_en_xnli + n_others_xnli + m) = p_en_test for m
        n_wanted = int(round(((p_en_test - 1) * n_en_xnli + p_en_test * n_others_xnli) / (1 - p_en_test)))
        # DataFrame.sample rejects a negative size or one larger than the pool
        n_usable = max(0, min(n_wanted, n_available))
        if n_usable != n_wanted:
            print("Wanted {} MNLI examples, using {} instead".format(n_wanted, n_usable))
        return n_usable

    def df_to_token_tensor(self, data):
        """
            Tokenize the (premise, hypothesis) pairs of data.

            While self.maxlen is None the batch is padded to its longest
            example; afterwards every batch is padded/truncated to self.maxlen.
            The tokenizer is asked for TensorFlow tensors and for no attention mask.
        """
        return self.tokenizer(
            data[["premise", "hypothesis"]].astype(str).values.tolist(),
            return_attention_mask=False,
            #return_token_type_ids=False,
            padding='longest' if self.maxlen is None else 'max_length', 
            truncation=True, 
            return_tensors='tf', 
            max_length=self.maxlen
        )
        

# Build the dataset object once; its constructor reads the CSV files, cleans
# and balances them, and tokenizes the train/validation/test sets
data = WatsonAugmentedDataset()

In [None]:
"""
    Build the model
"""

with strategy.scope(): # Using the predefined hardware
    # We load the pre-trained model from its name (how cool is that)
    bert_encoder = TFAutoModel.from_pretrained(model_name)
    # This is a symbolic tensor that represents our input data
    input_ids = tf.keras.Input(shape=(data.maxlen,), dtype=tf.int32, name="input_ids")
    # We feed the data to the pretrained model and keep its first output
    features = bert_encoder(input_ids)[0]
    # Pooling to reduce dimensions
    # NOTE(review): no attention mask is passed to the encoder or the pooling,
    # so padding positions presumably take part in the average — confirm this is intended
    features = GlobalAveragePooling1D()(features)
    # Classification layer: one probability per class (3 classes)
    output = Dense(3, activation='softmax')(features)
    # Make the model
    model = tf.keras.Model(inputs=input_ids, outputs=output)
    # Compile it with a little learning rate. `learning_rate` is the supported
    # argument name; the old `lr` alias is deprecated and rejected by newer
    # Keras optimizers.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    model.summary()


In [None]:
"""
    And now we can train the model !
"""
# Checkpoint callback: only the weights are written, and only when the
# validation loss reaches a new minimum
mcp_save = tf.keras.callbacks.ModelCheckpoint(
    filepath='best_model.hdf5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
)
# Fit on the augmented training set and validate on the competition training data
model.fit(
    x=data.train_data.values(),
    y=data.train_label,
    validation_data=(data.validation_data.values(), data.val_label),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[mcp_save],
    verbose=1,
)

In [None]:
"""
    Now we can make the predictions on the test set for the submission
"""

import os

# Training checkpoints the weights with the lowest validation loss to this
# file; restore them so the submission comes from the best epoch instead of
# whatever the last epoch left in memory. If no checkpoint was written, the
# in-memory weights are used as before.
best_weights_path = 'best_model.hdf5'
if os.path.exists(best_weights_path):
    model.load_weights(best_weights_path)

# One row of class probabilities per test example -> index of the most likely class
probabilities = model.predict(data.test_data.values())
predictions = np.argmax(probabilities, axis=-1).tolist()

# Fill the sample submission with our predictions and write it out
submission = pd.read_csv("../input/contradictory-my-dear-watson/sample_submission.csv")
submission['prediction'] = predictions
submission.to_csv("submission.csv", index = False)