In [1]:
import os
import pandas
import re

def create_directory(subfolder):
    outdir = f"Datasets/{subfolder}"
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    return outdir


def create_csv_file_for_models(language_selection="en", dataset_split="train"):
    dataset = pandas.read_csv(f"Datasets/Source/{dataset_split}.csv")

    # Lower case
    dataset["premise"] = dataset["premise"].apply(lambda x: x.lower())

    dataset["hypothesis"] = dataset["hypothesis"].apply(lambda x: x.lower())

    # Clear out punctuation and special characters
    spec_char_regex = r"[.!?,:;\"{}\[\]\-~<>|\\/+*/@#$%^`]+"
    dataset = dataset.replace(to_replace=spec_char_regex, value="", regex=True)

    # Clear out numbers
    numbers_regex = r"[0-9]+"
    dataset = dataset.replace(to_replace=numbers_regex, value="", regex=True)

    if language_selection.__eq__("en"):
        dataset = dataset[dataset.lang_abv == "en"]

    if dataset_split.__eq__("train"):
        dataset["label_name"] = "neutral"
        dataset.loc[dataset['label'] < 1, 'label_name'] = "entailment"
        dataset.loc[dataset['label'] > 1, 'label_name'] = "contradiction"

    outname = f"{dataset_split}_{language_selection}.csv"

    if dataset_split.__eq__("train"):
        bert_sample = dataset.sample(frac=0.5, random_state=42).reset_index(drop=True)
        electra_sample = dataset[~dataset.id.isin(bert_sample.id)].reset_index(drop=True)

        bert_sample.to_csv(os.path.join(create_directory("BERT"), outname))

        electra_sample.to_csv(os.path.join(create_directory("ELECTRA"), outname))
    else:
        dataset.to_csv(os.path.join(create_directory("Test"), outname))

In [None]:
create_csv_file_for_models()

create_csv_file_for_models("all_languages")

create_csv_file_for_models(dataset_split="test")

create_csv_file_for_models(language_selection="all_languages", dataset_split="test")

In [14]:
dataset = pandas.read_csv(f"Datasets/Source/train.csv")
dataset.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,3931fbe82a,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.,fr,French,0
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1


In [15]:
spec_char_regex = r"[.!?,:;\"{}\[\]\-~<>|\\/+*/@#$%^`]+"
dataset = dataset.replace(to_replace=spec_char_regex, value="", regex=True)

numbers_regex = r"[0-9]+"
dataset = dataset.replace(to_replace=numbers_regex, value="", regex=True)

dataset["premise"] = dataset["premise"].apply(lambda x: x.lower())

dataset["hypothesis"] = dataset["hypothesis"].apply(lambda x: x.lower())

dataset.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,fdcb,comments considered formulating interim r...,rules developed interim put together com...,en,English,0
1,bab,issues wrestle practice groups law firm...,practice groups permitted work issues,en,English,2
2,fbea,des petites choses comme celleslà font une dif...,j'essayais d'accomplir quelque chose,fr,French,0
3,fcb,know ' really defend like somebody grown uh...,' defend age,en,English,0
4,aaab,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1


In [16]:
joined_premise_values = "".join(dataset["premise"])
print(joined_premise_values)

