## Extracting datat from xnli and rte3, cleaning and combining them in on big dataset

In [1]:
import pandas as pd 
import os
import xml
from tqdm import tqdm

data_path = "../data/"
rte3_path = data_path + "rte3/"
xnli_path = data_path + "xnli/"
mnli_path = data_path + "mnli/"

preprocessed_path = data_path + "preprocessed/"
rte3_preprocessed_path = os.path.join(preprocessed_path, "rte3.tsv")
xnli_preprocessed_path = os.path.join(preprocessed_path, "xnli.tsv")
mnli_preprocessed_path = os.path.join(preprocessed_path, "mnli.tsv")

### Process RTE3

In [2]:
columns=["sentenceA", "sentenceB", "label"]
dataframe = pd.DataFrame(columns=columns)

import xml.etree.ElementTree as ET
devel_file = os.path.join(rte3_path, "german_social_media_DEVEL.xml")
test_file = os.path.join(rte3_path, "german_social_media_TEST.xml")


def parse_and_collect_rte(filepath, rte3_dataframe):
    dom = ET.parse(filepath)
    for pair in dom.iter("pair"):
        if pair.tag == "pair":
            label = pair.attrib["entailment"]
            sentenceA = pair[0].text
            sentenceB = pair[1].text

            rte3_dataframe = rte3_dataframe.append({"sentenceA": sentenceA, "sentenceB": sentenceB, "label": label.lower().replace("nonentailment","contradiction")}, ignore_index=True)
    return rte3_dataframe

dataframe = parse_and_collect_rte(devel_file, dataframe)
dataframe = parse_and_collect_rte(test_file, dataframe)
dataframe.to_csv(os.path.join(data_path + "preprocessed", "rte3.tsv"),sep="\t", encoding="utf8", index=False)

dataframe.head(), len(dataframe)

(                                           sentenceA  \
 0  Hallo Folgendes Problem, der CPU ist bei 100%....   
 1  Hallo, habe mir ein Asus P5Q geholt und am Anf...   
 2  hallo liebe mods, ich schreibe diesen beitrag ...   
 3  Hallo, ich habe Probleme mit dem Trojaner '' T...   
 4  Hallo, ich habe Probleme mit dem Trojaner '' T...   
 
                                            sentenceB       label  
 0  Der Computer ist durch Malware verseucht, der ...  entailment  
 1  Die USB-Anschlüsse funktionieren bei dem neuen...  entailment  
 2  Beim Aufrufen des p5q deluxe Sammelthreads bek...  entailment  
 3            Ein Trojaner kann nicht entfernt werden  entailment  
 4  Mein Virenprogramm AntiVir meldet immer wieder...  entailment  ,
 3014)

### Process XNLI

In [3]:
def parse_and_collect_nli(filepath, xnli_dataframe, nrows=30000):
    xnli_data = pd.read_csv(filepath, encoding="utf8", sep='\t', error_bad_lines=False, low_memory=True, dtype='unicode', nrows=nrows)
    for idx, row in tqdm(xnli_data.iterrows(), total=len(xnli_data)):
        language_key ="language"
        if (language_key in row and row[language_key] == "de") or language_key not in row:
            xnli_dataframe = xnli_dataframe.append({"sentenceA": row["sentence1"], "sentenceB": row["sentence2"], "label": row["gold_label"].lower()}, ignore_index=True)
    return xnli_dataframe

In [4]:
columns=["sentenceA", "sentenceB", "label"]
dataframe = pd.DataFrame(columns=columns)

devel_file = os.path.join(xnli_path, "xnli.dev.tsv")
test_file = os.path.join(xnli_path, "xnli.test.tsv")
    
dataframe = parse_and_collect_nli(devel_file, dataframe)
dataframe = parse_and_collect_nli(test_file, dataframe)
dataframe.to_csv(os.path.join(data_path + "preprocessed", "xnli.tsv"), sep="\t", encoding="utf8", index=False)

dataframe.head(), len(dataframe)

100%|██████████████████████████████████| 30000/30000 [00:06<00:00, 4301.08it/s]
100%|██████████████████████████████████| 30000/30000 [00:12<00:00, 2352.02it/s]


(                                           sentenceA  \
 0            und er hat gesagt, Mama ich bin daheim.   
 1            und er hat gesagt, Mama ich bin daheim.   
 2            und er hat gesagt, Mama ich bin daheim.   
 3  Ich wusste nicht was ich vorhatte oder so, ich...   
 4  Ich wusste nicht was ich vorhatte oder so, ich...   
 
                                            sentenceB          label  
 0  Er rief seine Mutter an, sobald er aus dem Sch...        neutral  
 1                                Er sagte kein Wort.  contradiction  
 2  Er sagte seiner Mutter, er sei nach Hause geko...     entailment  
 3  Ich war noch nie in Washington, deshalb habe i...        neutral  
 4  Ich wusste genau, was ich tun musste, als ich ...  contradiction  ,
 7500)

### Combin both and build sets

In [5]:
xnli_dataframe = pd.read_csv(rte3_preprocessed_path, encoding="utf8", sep='\t')
rte3_dataframe = pd.read_csv(xnli_preprocessed_path, encoding="utf8", sep='\t')

all_data = pd.concat([xnli_dataframe, rte3_dataframe])
all_data = all_data.sample(frac=1)

train_size=0.8
train_set = all_data.sample(frac=train_size, random_state=0)
test_set = all_data.drop(train_set.index)

train_set.to_csv(os.path.join(preprocessed_path, "train.tsv"), sep="\t", encoding="utf8", index=False)
test_set.to_csv(os.path.join(preprocessed_path, "test.tsv"), sep="\t", encoding="utf8", index=False)

## Concat, translate and prepare

In [6]:
dataframe = pd.DataFrame(columns=columns)

dataframe = parse_and_collect_nli(os.path.join(mnli_path, "multinli_1.0_dev_matched.txt"), dataframe)
dataframe = parse_and_collect_nli(os.path.join(mnli_path, "multinli_1.0_dev_mismatched.txt"), dataframe)
dataframe.to_csv(os.path.join(preprocessed_path, "mnli_dev.tsv"), sep="\t", encoding="utf8", index=False)

dataframe = pd.DataFrame(columns=columns)
dataframe = parse_and_collect_nli(os.path.join(mnli_path, "multinli_1.0_train.txt"), dataframe, nrows=10000)
dataframe.to_csv(os.path.join(preprocessed_path, "mnli_train.tsv"), sep="\t", encoding="utf8", index=False)
len(dataframe)

100%|█████████████████████████████████████| 9897/9897 [00:22<00:00, 449.70it/s]
100%|███████████████████████████████████| 10000/10000 [00:28<00:00, 356.14it/s]
100%|███████████████████████████████████| 10000/10000 [00:22<00:00, 435.10it/s]


10000

## Translate train to german

In [7]:
import os
import csv
import six
from tqdm import tqdm
from google.cloud import translate_v2 as translate

# set creds env
cwd = os.getcwd()
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = "D:/Repositories/semantic-match-classifier/coronacheck-ai-9172d153d26a.json"
demo_text = "This is a medical fact."

translate_client = translate.Client()

def do_translate(text):
    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Text can also be a sequence of strings, in which case this method
    # will return a sequence of results for each text.
    result = translate_client.translate(
        text, target_language="de", source_language="en")
    return result['translatedText']

dataframe_translated = pd.DataFrame(columns=columns)
dataframe = pd.read_csv(os.path.join(preprocessed_path, "mnli_train.tsv"), sep="\t", encoding="utf8")
for idx, row in tqdm(dataframe.iterrows(), total=len(dataframe)):
    if not pd.isna(row["sentenceA"]) and not pd.isna(row["sentenceB"]):
        sentence_a_trans = do_translate(row["sentenceA"])
        sentence_b_trans = do_translate(row["sentenceB"])
        dataframe_translated = dataframe_translated.append({"sentenceA": sentence_a_trans, "sentenceB": sentence_b_trans, "label": row["label"]}, ignore_index=True)
    else:
        print(row["sentenceA"], row["sentenceB"])
dataframe_translated.to_csv(os.path.join(preprocessed_path, "mnli_train_translated.tsv"), sep="\t", encoding="utf8", index=False, quoting=csv.QUOTE_ALL)

  1%|▍                                     | 117/10000 [00:12<16:30,  9.98it/s]

Native  'Me win, me passum heap big law ... nan


 33%|████████████▎                        | 3318/10000 [18:27<34:46,  3.20it/s]

Saint-Germain-des-Pr??s nan


100%|████████████████████████████████████| 10000/10000 [55:44<00:00,  2.99it/s]
