In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import keras
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input mount and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))


In [None]:
# Load the competition's train/test splits and report their (rows, columns) shapes.
train_csv = "/kaggle/input/contradictory-my-dear-watson/train.csv"
test_csv = "/kaggle/input/contradictory-my-dear-watson/test.csv"
traindf, testdf = pd.read_csv(train_csv), pd.read_csv(test_csv)
print("Number of rows and columns in train data : ", traindf.shape)
print("Number of rows and columns in test data : ", testdf.shape)

In [None]:
traindf.head()

In [None]:
traindf["lang_abv"].value_counts()

In [None]:
# Count plot of the training labels, with the bars laid out in ascending label order.
plt.figure(figsize=(10, 5))
label_counts = traindf['label'].value_counts().sort_index()
sns.countplot(
    x='label',
    data=traindf,
    order=list(label_counts.index),
    color='black',
)

The label distribution is well balanced.

In [None]:
# Report how many distinct languages the training set contains, then plot the
# number of rows per language with the bars in alphabetical order.
print(f'Number of different Languages: {len(traindf["language"].unique())}')
plt.figure(figsize=(20,5))
language_order = list(traindf['language'].value_counts().sort_index().index)
sns.countplot(x='language', data=traindf,
                   order=language_order,
                   color='black')

In [None]:
# Count plot of rows per language, split by label.
# The columns are passed by keyword (x=/hue=/data=), matching the other count
# plots in this notebook; passing the Series positionally is not accepted by
# newer seaborn releases.
plt.figure(figsize=(20,5))
sns.countplot(x='language', hue='label', data=traindf,
                   color='black')

Labels are also well distributed across the different languages.

In [None]:
def _language_counts(frame, split):
    """Return a Name/Count/Key table of rows per language for one data split."""
    counts = frame.language.value_counts()
    return pd.DataFrame({'Name': counts.index, 'Count': counts.values, 'Key': split})


# Per-language row counts for both splits, stacked and plotted as grouped bars.
langdf = _language_counts(traindf, 'train')
langdf_test = _language_counts(testdf, 'test')

DF = pd.concat([langdf, langdf_test], keys=['train', 'test'])
DF.groupby(['Name', 'Key']).sum().unstack('Key').plot(kind='bar', figsize=(20, 5), color='black')

In [None]:
# Relabel the "zh" language code as "zh-tw" in both splits.
for frame in (traindf, testdf):
    frame["lang_abv"] = frame["lang_abv"].replace("zh", "zh-tw")

<h5>Using the Google Translate API to handle data in different languages</h5>
For more information, see https://stackabuse.com/text-translation-with-google-translate-api-in-python/

In [None]:
pip install googletrans

In [None]:
import googletrans
print(googletrans.LANGUAGES)

In [None]:
from googletrans import Translator

# Translate one sample phrase and print each field of the returned result object.
translator = Translator()
result = translator.translate('Main acha hoon', src='hi')
for field in ('src', 'dest', 'origin', 'text', 'pronunciation'):
    print(getattr(result, field))

In [None]:
def To_English(language,textstring):
    """Return `textstring` in English.

    Text whose `language` code is already "en" is returned untouched; anything
    else is sent through a fresh googletrans `Translator` with dest="en".
    """
    if language == "en":
        return textstring
    return Translator().translate(textstring, dest="en").text

In [None]:
import tensorflow as tf
# Build the distribution strategy used later when the model is created:
# try to resolve, connect to and initialise a TPU; otherwise fall back to the
# strategy TensorFlow currently reports.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    # The cluster must be connected and the TPU system initialised before the strategy is built.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # NOTE(review): presumably ValueError is what TPUClusterResolver raises when no TPU
    # is reachable — confirm against the installed TensorFlow version.
    strategy = tf.distribute.get_strategy() 

In [None]:
strategy

In [None]:
def Translate(x):
    """Translate `x` with a fresh googletrans `Translator` (default settings) and return the text."""
    client = Translator()
    translated = client.translate(x)
    return translated.text

In [None]:
#traindf.premise[traindf.lang_abv!= 'en']=traindf.premise[traindf.lang_abv!= 'en'].apply(lambda x: Translate(x))

In [None]:
#traindf.hypothesis[traindf.lang_abv!= 'en']=traindf.hypothesis[traindf.lang_abv!= 'en'].apply(lambda x: Translate(x))

In [None]:
#traindf.to_csv("traindf.csv",index=True)

In [None]:
#testdf.hypothesis[testdf.lang_abv!= 'en']=testdf.hypothesis[testdf.lang_abv!= 'en'].apply(lambda x: Translate(x))

In [None]:
#testdf.premise[testdf.lang_abv!= 'en']=testdf.premise[testdf.lang_abv!= 'en'].apply(lambda x: Translate(x))

In [None]:
#testdf.to_csv("testdf.csv",index=True)

Please find the translated data <a href="https://www.kaggle.com/krsna540/translated-data">here</a>

In [None]:
Updatedtraindf = pd.read_csv("../input/translated-data/traindf.csv")
Updatedtestdf = pd.read_csv("../input/translated-data/testdf.csv")

In [None]:
Updatedtraindf.head()

In [None]:
# Keep only the two text columns of the translated training data for inspection.
LangDf = Updatedtraindf[['premise', 'hypothesis']].copy()

In [None]:
import nltk
from nltk.corpus import stopwords
stop_words=stopwords.words('english')

In [None]:
print(stop_words)

In [None]:
LangDf['premise'][0]

For a given sentence in the training set, we randomly choose and perform one of the following operations:
1. Synonym Replacement (SR): <br/>
Randomly choose n words from the sentence that are not stop words. Replace each of these words with one of its synonyms chosen at random.<br/>
2. Random Insertion (RI): <br/>
Find a random synonym of a random word in the sentence that is not a stop word. Insert that synonym into a random position in the sentence. Do this n times.<br/>
3. Random Swap (RS):<br/>
Randomly choose two words in the sentence and swap their positions. Do this n times.<br/>
4. Random Deletion (RD):<br/>
Randomly remove each word in the sentence with probability p.

In [None]:
import random
from random import shuffle
random.seed(1)

# import these modules 
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import sent_tokenize, word_tokenize
#cleaning up text
import re
def Preprocess_text(line):
    """Normalise a sentence to lowercase a-z words and lemmatize it.

    Steps: drop apostrophes, turn hyphens/tabs/newlines into spaces, lowercase,
    replace every character that is not a-z or a space with a space, collapse
    runs of spaces, drop a leading space, then pass the result to
    `LemmaSentence`.

    Note: stop words are NOT removed here; `LemmaSentence` only lemmatizes.

    Args:
        line: the raw sentence (str).

    Returns:
        Whatever `LemmaSentence` returns for the cleaned sentence. Input with
        no a-z characters is cleaned to an empty (or single-space) string
        instead of raising IndexError.
    """
    # Apostrophes are deleted (not spaced) so contractions stay a single token.
    line = line.replace("’", "").replace("'", "")
    line = line.replace("-", " ").replace("\t", " ").replace("\n", " ")
    line = line.lower()

    # Anything outside a-z / space becomes a space, then runs of spaces collapse to one.
    clean_line = re.sub('[^a-z ]', ' ', line)
    clean_line = re.sub(' +', ' ', clean_line)

    # startswith() instead of clean_line[0]: safe when the cleaned text is empty.
    if clean_line.startswith(' '):
        clean_line = clean_line[1:]

    # Convert words to their base forms.
    return LemmaSentence(clean_line)

def LemmaSentence(sentence):
    """Tokenize `sentence` and return its lemmatized tokens as one string.

    Every token from `word_tokenize` is lemmatized with `WordNetLemmatizer`
    and followed by a single space, so a non-empty result ends with a
    trailing space. No tokens are filtered out.
    """
    # One lemmatizer for the whole sentence instead of one per token.
    lemmatizer = WordNetLemmatizer()
    return "".join(lemmatizer.lemmatize(word) + " " for word in word_tokenize(sentence))

In [None]:
########################################################################
# Synonym replacement
# Replace n words in the sentence with synonyms from wordnet
########################################################################

#for the first time you use wordnet
#import nltk
#nltk.download('wordnet')
from nltk.corpus import wordnet 

def synonym_replacement(words, n):
    """Return a copy of `words` with some non-stop-words swapped for WordNet synonyms.

    Distinct words not in `stop_words` are visited in random order. A word that
    has at least one synonym (per `get_synonyms`) has every occurrence replaced
    by one randomly chosen synonym. The loop stops once the number of replaced
    words reaches `n`; the check runs after each candidate.
    """
    result = words.copy()
    candidates = list(set([word for word in words if word not in stop_words]))
    random.shuffle(candidates)

    replaced = 0
    for candidate in candidates:
        options = get_synonyms(candidate)
        if options:
            replacement = random.choice(list(options))
            result = [replacement if word == candidate else word for word in result]
            replaced += 1
        if replaced >= n:  # only replace up to n words
            break

    # A synonym may itself contain spaces; re-splitting makes each list item one word again.
    return ' '.join(result).split(' ')

def get_synonyms(word):
    """Return the distinct WordNet lemma names for `word`, excluding `word` itself.

    Each lemma name has underscores and hyphens turned into spaces, is
    lowercased, and is stripped of every character other than a-z and space.
    """
    found = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            name = lemma.name().replace("_", " ").replace("-", " ").lower()
            found.add("".join(ch for ch in name if ch in ' qwertyuiopasdfghjklzxcvbnm'))
    found.discard(word)
    return list(found)

########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################

def random_deletion(words, p):
    """Randomly drop words from `words`, each with probability `p`.

    Args:
        words: list of word strings.
        p: deletion probability; a word is kept when a uniform draw from
            [0, 1] is greater than `p`.

    Returns:
        `words` itself when it holds zero or one word (nothing sensible can
        be deleted); otherwise a new list of the surviving words in their
        original order. If every word was deleted, a list holding one randomly
        chosen original word is returned instead.
    """
    # Empty input is returned as-is as well; previously it fell through to
    # random.randint(0, -1) and raised ValueError.
    if len(words) <= 1:
        return words

    kept = [word for word in words if random.uniform(0, 1) > p]

    # If everything was deleted, fall back to a single random word.
    if not kept:
        return [random.choice(words)]
    return kept

########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################

def random_swap(words, n):
    """Return a copy of `words` after `n` random swap attempts (see `swap_word`).

    The input list is not modified.
    """
    new_words = words.copy()
    for _ in range(n):
        new_words = swap_word(new_words)
    return new_words

def swap_word(new_words):
    """Swap two randomly chosen positions of `new_words` in place and return it.

    Lists with fewer than two words are returned untouched; previously an
    empty list raised ValueError from random.randint(0, -1), and a one-word
    list spent several random draws on a swap that cannot happen.

    The second index is redrawn while it equals the first. After the fourth
    draw the function gives up and returns without swapping.
    """
    if len(new_words) < 2:
        return new_words

    first = random.randint(0, len(new_words)-1)
    second = first
    attempts = 0
    while second == first:
        second = random.randint(0, len(new_words)-1)
        attempts += 1
        if attempts > 3:
            return new_words
    new_words[first], new_words[second] = new_words[second], new_words[first]
    return new_words

########################################################################
# Random insertion
# Randomly insert n words into the sentence
########################################################################

def random_insertion(words, n):
    """Return a copy of `words` after `n` calls to `add_word` on that copy.

    The input list is not modified.
    """
    new_words = words.copy()
    for _ in range(n):
        add_word(new_words)
    return new_words

def add_word(new_words):
    """Insert a synonym of one randomly picked word into `new_words`, in place.

    A random word is drawn until one with at least one synonym (per
    `get_synonyms`) is found; after 10 unsuccessful draws nothing is inserted.
    The first synonym in the returned list is inserted at a random existing
    index. An empty list is left untouched; previously it raised ValueError
    from random.randint(0, -1).

    Returns:
        None.
    """
    if not new_words:
        return

    synonyms = []
    attempts = 0
    while len(synonyms) < 1:
        random_word = new_words[random.randint(0, len(new_words)-1)]
        synonyms = get_synonyms(random_word)
        attempts += 1
        if attempts >= 10:
            return
    random_synonym = synonyms[0]
    random_idx = random.randint(0, len(new_words)-1)
    new_words.insert(random_idx, random_synonym)

########################################################################
# main data augmentation function
########################################################################

def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=7):
    """Generate augmented variants of `sentence` with the four EDA operations.

    Args:
        sentence: space-separated text.
        alpha_sr, alpha_ri, alpha_rs: fraction of the word count used as the
            `n` argument of synonym replacement, random insertion and random
            swap respectively (at least 1 each).
        p_rd: per-word deletion probability for random deletion.
        num_aug: number of augmented sentences to keep. Values below 1 are
            treated as a keep-probability scale rather than a count.

    Returns:
        A list of augmented sentences in random order, with the original
        `sentence` always appended as the LAST element. A sentence with no
        words yields just `[sentence]`.
    """
    # `!=` rather than `is not`: identity comparison against a string literal
    # is unreliable and triggers a SyntaxWarning on current Python versions.
    words = [word for word in sentence.split(' ') if word != '']

    # Nothing to augment; the helpers below cannot work on an empty word list.
    if not words:
        return [sentence]

    num_words = len(words)
    num_new_per_technique = int(num_aug/4)+1
    n_sr = max(1, int(alpha_sr*num_words))
    n_ri = max(1, int(alpha_ri*num_words))
    n_rs = max(1, int(alpha_rs*num_words))

    # Applied in this fixed order: sr, ri, rs, rd.
    operations = (
        (synonym_replacement, n_sr),
        (random_insertion, n_ri),
        (random_swap, n_rs),
        (random_deletion, p_rd),
    )
    augmented_sentences = []
    for operation, amount in operations:
        for _ in range(num_new_per_technique):
            augmented_sentences.append(' '.join(operation(words, amount)))

    # Qualified call: the bare name `shuffle` is re-imported from sklearn later
    # in this notebook, and that function does not shuffle in place.
    random.shuffle(augmented_sentences)

    # Trim so that we have the desired number of augmented sentences.
    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:num_aug]
    else:
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

    # Append the original sentence.
    augmented_sentences.append(sentence)
    return augmented_sentences

In [None]:
# Show one preprocessed premise next to the augmented variants generated from it.
sample_text = Preprocess_text(LangDf['premise'][1])
print(f"Actual text - {sample_text}")
print("Augmented text -")
for augmented in eda(sample_text):
    print(augmented)

In [None]:
# Drop the identifier/language columns and the stray CSV index column.
Updatedtraindf = Updatedtraindf.drop(columns=["id", "language", "lang_abv", "Unnamed: 0"])

In [None]:
print(Updatedtraindf.shape)
Updatedtraindf.head()

In [None]:
augmenteddf = pd.DataFrame(columns = ['premise', 'hypothesis', 'label']) 

In [None]:
# Accumulates [premise, hypothesis, label] records for the augmented training set.
rows=[]
def augment_data(premise,hypothesis,label):
    """Append the original pair plus its augmented variants to the global `rows`.

    Adds, in order: the original (premise, hypothesis) pair; one row per
    augmented premise paired with the original hypothesis; one row per
    augmented hypothesis paired with the original premise. Every row carries
    `label`.
    """
    rows.append([premise,hypothesis,label])
    # eda() returns its input sentence as the final element; slice it off so
    # the original pair is not stored two more times.
    rows.extend([variant, hypothesis, label] for variant in eda(premise)[:-1])
    rows.extend([premise, variant, label] for variant in eda(hypothesis)[:-1])

In [None]:
# Start from an empty buffer so re-running this cell does not append a second
# copy of every row to the global `rows` list.
rows.clear()
for premise, hypothesis, label in zip(Updatedtraindf['premise'],
                                      Updatedtraindf['hypothesis'],
                                      Updatedtraindf['label']):
    augment_data(Preprocess_text(premise), Preprocess_text(hypothesis), label)

augmenteddf=pd.DataFrame(rows, columns=['premise', 'hypothesis', 'label'])

In [None]:
print(Updatedtraindf.shape)
print(augmenteddf.shape)


In [None]:
# Imported under an alias so the `shuffle` name taken from `random` earlier in
# the notebook is not rebound; seeded so the row order is reproducible.
from sklearn.utils import shuffle as sk_shuffle
augmenteddf = sk_shuffle(augmenteddf, random_state=1)

In [None]:
augmenteddf.to_csv("AugmentedTrain.csv",index=False)

Please find the augmented data <a href="https://www.kaggle.com/krsna540/translated-data"> here </a>

In [None]:
def bert_encode(hypotheses, premises, tokenizer):
    """Encode sentence pairs into the three input tensors the BERT model expects.

    Each example is laid out as [CLS] + first-sentence tokens + [SEP] +
    second-sentence tokens + [SEP]. Sentences from `hypotheses` fill the first
    segment (type id 0) and sentences from `premises` the second (type id 1).
    Note that the calls in this notebook pass the premise values first.

    Args:
        hypotheses: sequence of strings for the first segment.
        premises: sequence of strings for the second segment.
        tokenizer: object providing `tokenize` and `convert_tokens_to_ids`.
            It is now used for the sentences as well as for the [CLS] id;
            previously the sentences were encoded with the module-level
            `tokenizer` regardless of this argument.

    Returns:
        dict with keys 'input_word_ids', 'input_mask' and 'input_type_ids',
        each a dense tensor produced with `.to_tensor()`.
    """
    def _encode(text):
        # Token ids for one sentence, terminated by [SEP].
        tokens = list(tokenizer.tokenize(text))
        tokens.append('[SEP]')
        return tokenizer.convert_tokens_to_ids(tokens)

    sentence1 = tf.ragged.constant([_encode(s) for s in np.array(hypotheses)])
    sentence2 = tf.ragged.constant([_encode(s) for s in np.array(premises)])

    # One [CLS] id per example, prepended to every row.
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*sentence1.shape[0]
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

    # Ones at every real token position of the ragged rows.
    input_mask = tf.ones_like(input_word_ids).to_tensor()

    type_cls = tf.zeros_like(cls)
    type_s1 = tf.zeros_like(sentence1)
    type_s2 = tf.ones_like(sentence2)
    input_type_ids = tf.concat(
        [type_cls, type_s1, type_s2], axis=-1).to_tensor()

    return {
        'input_word_ids': input_word_ids.to_tensor(),
        'input_mask': input_mask,
        'input_type_ids': input_type_ids}


In [None]:
def encode_sentence(s):
    """Return the token ids of `s` followed by the id of '[SEP]', using the module-level `tokenizer`."""
    pieces = list(tokenizer.tokenize(s)) + ['[SEP]']
    return tokenizer.convert_tokens_to_ids(pieces)

In [None]:
os.environ["WANDB_API_KEY"] = "0" ## to silence warning

In [None]:
from transformers import BertTokenizer, TFBertModel
import matplotlib.pyplot as plt
import tensorflow as tf
model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
train_input = bert_encode(augmenteddf.premise.values, augmenteddf.hypothesis.values, tokenizer)

In [None]:
max_len = 30

def build_model():
    """Build and compile the 3-class classifier on top of the pretrained BERT encoder.

    The model takes three int32 inputs of length `max_len` (word ids, mask,
    type ids), runs them through `TFBertModel`, and feeds the first position
    of the encoder's first output into a 3-unit softmax layer. It is compiled
    with Adam (learning rate 1e-5), sparse categorical cross-entropy and
    accuracy.

    Returns:
        The compiled `tf.keras.Model`.
    """
    bert_encoder = TFBertModel.from_pretrained(model_name)
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    input_type_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_type_ids")
    
    embedding = bert_encoder([input_word_ids, input_mask, input_type_ids])[0]
    # Classify from the representation at position 0 of each sequence.
    output = tf.keras.layers.Dense(3, activation='softmax')(embedding[:,0,:])
    
    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    model.compile(tf.keras.optimizers.Adam(learning_rate=1e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [None]:
with strategy.scope():
    model = build_model()
    model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping=EarlyStopping(monitor='val_accuracy',mode='max',patience=5,min_delta=0.01)
model.fit(train_input, augmenteddf.label.values, epochs = 10, verbose = 1, batch_size = 64, validation_split = 0.3,callbacks=[early_stopping])

In [None]:
test_input=bert_encode(Updatedtestdf.premise.values, Updatedtestdf.hypothesis.values, tokenizer)

In [None]:
# Index of the highest-probability class for every test row.
predictions = list(np.argmax(model.predict(test_input), axis=1))

In [None]:
submission = Updatedtestdf.id.copy().to_frame()
submission['prediction'] = predictions

In [None]:
submission.head()

In [None]:
submission.to_csv('submission.csv', header=True, index=False) 

<center><H1> In Progress </h1></center>

<h5>References:</h5>

* https://arxiv.org/pdf/1901.11196.pdf
* https://github.com/jasonwei20/eda_nlp
* https://arxiv.org/abs/1706.03762
* https://github.com/google-research/bert
* https://openai.com/blog/better-language-models/
* https://arxiv.org/pdf/1906.08237.pdf
* https://blog.einstein.ai/introducing-a-conditional-transformer-language-model-for-controllable-generation/
* https://developer.nvidia.com/blog/training-bert-with-gpus/
* https://www.microsoft.com/en-us/research/blog/turing-nlg-a-17-billion-parameter-language-model-by-microsoft/