#### Preprocessing for Models

In [1]:
import os
import pandas
import re

def create_directory(subfolder):
    """Make sure ``Datasets/<subfolder>`` exists and return its relative path.

    Args:
        subfolder: Name of the folder to create underneath ``Datasets``.

    Returns:
        The relative path ``"Datasets/<subfolder>"`` as a string.
    """
    outdir = f"Datasets/{subfolder}"
    # makedirs also creates a missing "Datasets" parent, and exist_ok=True
    # removes the race between a separate existence check and the creation.
    os.makedirs(outdir, exist_ok=True)

    return outdir


def _clean_text_column(column):
    """Lower-case a text column, then strip special characters and digits."""
    spec_char_regex = r"[.!?,:;\"{}\[\]\(\)\~<>|\\/+*/@#$%^]+"
    numbers_regex = r"[0-9]+"
    return (
        column.str.lower()
        .replace(to_replace=spec_char_regex, value="", regex=True)
        .replace(to_replace=numbers_regex, value="", regex=True)
    )


def _build_stop_words_regex(text_columns, min_count=1000):
    """Compile a regex matching words that occur more than ``min_count`` times.

    Returns ``None`` when no word (longer than one character) is that frequent.
    """
    # Join every column with a space so that the last word of one column is
    # not fused with the first word of the next one.
    corpus = " ".join(" ".join(column.dropna()) for column in text_columns)
    words = pandas.Series(corpus.split(" "), dtype=str)
    words = words[words != ""]
    counts = words.str.lower().str.strip().value_counts(ascending=False)
    frequent = counts[counts > min_count].index
    frequent = frequent[frequent.str.len() > 1]
    if len(frequent) == 0:
        return None
    # Escape each word so characters with a regex meaning are matched literally.
    pattern = "|".join("\\b" + re.escape(word) + "\\b" for word in frequent)
    return re.compile(pattern)


def create_csv_file_for_models(language_selection="en", dataset_split="train"):
    """Preprocess one split of the source data and write the per-model CSV files.

    Reads ``Datasets/Source/<dataset_split>.csv``, cleans the ``premise`` and
    ``hypothesis`` columns (lower-casing, removal of special characters, digits
    and very frequent words), optionally keeps only English rows, and writes
    the result as ``<dataset_split>_<language_selection>.csv``.

    For the ``"train"`` split a ``label_name`` column is added and the rows are
    split in half: one half is written under ``Datasets/BERT`` and the other
    under ``Datasets/ELECTRA``. Any other split is written under
    ``Datasets/Test``.

    Args:
        language_selection: ``"en"`` keeps only rows whose ``lang_abv`` is
            ``"en"``; any other value keeps every row. The value is also used
            in the output file name.
        dataset_split: Name of the source CSV (without extension) to process.

    Returns:
        None. The results are written to disk.
    """
    dataset = pandas.read_csv(f"Datasets/Source/{dataset_split}.csv")
    text_columns = ["premise", "hypothesis"]

    # Clean only the text columns. Running the regexes over the whole frame
    # would also strip the digits out of the hexadecimal ``id`` values.
    for column in text_columns:
        dataset[column] = _clean_text_column(dataset[column])

    # Remove the most frequent words (computed over both text columns).
    stop_words_regex = _build_stop_words_regex([dataset[column] for column in text_columns])
    if stop_words_regex is not None:
        for column in text_columns:
            dataset[column] = dataset[column].replace(
                to_replace=stop_words_regex, value="", regex=True
            )

    if language_selection == "en":
        # Copy so the column assignment below does not write into a view.
        dataset = dataset[dataset.lang_abv == "en"].copy()

    is_train = dataset_split == "train"

    if is_train:
        dataset["label_name"] = "neutral"
        dataset.loc[dataset["label"] < 1, "label_name"] = "entailment"
        dataset.loc[dataset["label"] > 1, "label_name"] = "contradiction"

    outname = f"{dataset_split}_{language_selection}.csv"

    if is_train:
        bert_rows = dataset.sample(frac=0.5, random_state=42)
        # Take the complement by row index rather than by ``id`` so the two
        # halves always partition the data, even if ids are not unique.
        in_bert = dataset.index.isin(bert_rows.index)
        bert_sample = bert_rows.reset_index(drop=True)
        electra_sample = dataset[~in_bert].reset_index(drop=True)

        bert_sample.to_csv(os.path.join(create_directory("BERT"), outname))

        electra_sample.to_csv(os.path.join(create_directory("ELECTRA"), outname))
    else:
        dataset.to_csv(os.path.join(create_directory("Test"), outname))

In [None]:
# Produce the output files for every split / language combination:
# train first (English, then all languages), followed by test.
for split_name in ("train", "test"):
    for language_choice in ("en", "all_languages"):
        create_csv_file_for_models(
            language_selection=language_choice,
            dataset_split=split_name,
        )

#### Testing for Preprocessing

In [87]:
# Load the raw training split so each preprocessing step can be inspected.
dataset = pandas.read_csv("Datasets/Source/train.csv")
dataset.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,3931fbe82a,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.,fr,French,0
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1


In [88]:
# Strip special characters and digits from both text columns, then lower-case them.
spec_char_regex = r"[.!?,:;\"{}\[\]\(\)\~<>|\\/+*/@#$%^]+"
numbers_regex = r"[0-9]+"

for text_column in ("premise", "hypothesis"):
    stripped = dataset[text_column].replace(to_replace=spec_char_regex, value="", regex=True)
    stripped = stripped.replace(to_replace=numbers_regex, value="", regex=True)
    dataset[text_column] = stripped.apply(lambda text: text.lower())

dataset.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,the rules developed in the interim were put to...,en,English,0
1,5b72532a0b,these are issues that we wrestle with in pract...,practice groups are not permitted to work on t...,en,English,2
2,3931fbe82a,des petites choses comme celles-là font une di...,j'essayais d'accomplir quelque chose,fr,French,0
3,5622f0c60b,you know they can't really defend themselves l...,they can't defend themselves because of their age,en,English,0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1


In [89]:
# Build a regex that matches every word (longer than one character) that
# occurs more than 1000 times across the premise and hypothesis columns.
joined_premise_values = " ".join(dataset["premise"])
joined_hypothesis_values = " ".join(dataset["hypothesis"])
# Keep a space between the two texts; without it the last premise word and
# the first hypothesis word would be counted as one fused word.
all_words = joined_premise_values + " " + joined_hypothesis_values
word_list = all_words.split(" ")
word_series = pandas.Series(word_list, dtype=str)
word_series = word_series[word_series != ""]
unique_value_counts = word_series.str.lower().str.strip().value_counts(ascending=False)
target_words = unique_value_counts[(unique_value_counts > 1000)]
target_words = target_words.index[target_words.index.str.len() > 1]
# Escape each word so characters with a regex meaning are matched literally,
# and wrap it in word boundaries so only whole words are matched.
target_word_list = ["\\b" + re.escape(word) + "\\b" for word in target_words]
stop_words_regex_expr = "|".join(target_word_list)
print(stop_words_regex_expr)

\bthe\b|\bof\b|\bto\b|\band\b|\bin\b|\bis\b|\bthat\b|\bit\b|\bfor\b|\bwas\b|\byou\b|\bde\b|\bare\b|\bon\b|\bbe\b|\bwith\b|\bthey\b|\bhave\b|\bnot\b|\bas\b


In [90]:
# Compile the stop-word pattern once and blank out its matches in both text columns.
stop_words_regex = re.compile(stop_words_regex_expr)
for text_column in ("premise", "hypothesis"):
    dataset[text_column] = dataset[text_column].replace(
        to_replace=stop_words_regex, value="", regex=True
    )

In [91]:
# Show rows 50-99 (by position) to check the text after stop-word removal.
dataset.iloc[50:100]

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
50,b0c2761b43,no i exclaimed astonished,no i cried out shock,en,English,0
51,61f15fd66d,mnamo mei tunapaswa kuhitimisha chaguzi za up...,kuna nafasi kwa wanachama kutengeza upya uana...,sw,Swahili,1
52,12840a5edd,电视晚餐带来了糟糕的污点。,电视上的人只吃早餐和午餐 。,zh,Chinese,2
53,714a367262,die unfähigkeit zur kommunikation war ein krit...,es war schwierig für die leute im world trade ...,de,German,0
54,37c01740c6,bauerstein had been at styles fatal night a...,styles responsible what happened,en,English,1
55,3d3149ec45,long ago--or away or whatever--there a world ...,erath only world has ever existed,en,English,2
56,6c12f1e611,then all time spill vase mrs inglethorp'...,so hidden another country impossible us l...,en,English,2
57,9832b523a0,because i always had do so i just pay someo...,i never developed a love gardening,en,English,1
58,71bcd59dd6,' she gets a little obsessive about her sauce,her sauce so complicated she's obsessed per...,en,English,1
59,aab0894630,rather kids today only little bundles joy b...,while kids today symbols success status th...,en,English,1
