### The goals of this notebook are to:

##### 1. Avoid loading large files into memory.
##### 2. Build training and validation sets with balanced classes.
##### 3. Make the validation set the same size as the test set.
##### 4. Avoid loading samples that exist in the test set.
##### 5. Preprocess the test set.


In [17]:
from glob import glob
import pandas as pd
import re

In [5]:
# Parse the test file. The first whitespace-separated token of every line is
# the language label; the remainder is the sentence.
test_lines = []
test_labels = []
# Read as UTF-8 explicitly instead of relying on the platform default encoding:
# the sentences contain non-ASCII scripts (e.g. Cyrillic).
with open("europarl.test", "r", encoding="utf-8") as f:
    for line in f:
        tokens = line.split()
        if not tokens:
            # A blank line carries no label; indexing tokens[0] would raise IndexError.
            continue
        test_labels.append(tokens[0])
        # Remove <...> markup, lowercase, drop the leading label token and
        # collapse every run of whitespace into a single space.
        words = re.sub(r'<.*?>', '', line).lower().strip().split()[1:]
        test_lines.append(" ".join(words))


In [6]:
# Build the test frame with the column order stated explicitly.
# Constructing the frame as {"text": ..., "lang": ...} and then overwriting
# `.columns` with ["lang", "text"] only renames by position: whenever pandas keeps
# the dict's insertion order, that puts the sentences under "lang" and the labels
# under "text". Passing the data under its real names avoids any positional rename.
test = pd.DataFrame(
    {"lang": test_labels, "text": test_lines},
    columns=["lang", "text"],
)
# Distinct language labels, in order of first appearance in the test set.
labels = test["lang"].unique()

In [7]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,lang,text
0,bg,"""европа 2020"" не трябва да стартира нов конкур..."
1,bg,(cs) най-голямата несправедливост на сегашната...
2,bg,"(de) г-жо председател, г-н член на комисията, ..."
3,bg,"(de) г-н председател, бих искал да започна с к..."
4,bg,"(de) г-н председател, въпросът за правата на ч..."


In [8]:
# Count the test samples per language label.
test["lang"].value_counts()

et    1000
it    1000
hu    1000
lv    1000
en    1000
es    1000
bg    1000
pt    1000
fi    1000
cs    1000
sl    1000
ro    1000
de    1000
sk    1000
lt    1000
pl    1000
sv    1000
da    1000
el    1000
fr    1000
nl    1000
Name: lang, dtype: int64

The 5 shortest samples 

## Create a validation set that matches the size of the test set.
For the training set we will sample 4000 lines for each class; together with the 1000 validation lines, that is 5000 lines collected per language.

In [9]:
# Per-language sample budget. The validation split is as large as the test
# split, and the training split is four times that size.
test_size = 1000
valid_size = test_size
train_size = 4 * test_size
# Number of lines to collect per language before splitting into train/valid.
data_size = train_size + valid_size

In [18]:
# glob returns matches in arbitrary, filesystem-dependent order. Sort them so
# the files are always processed in the same order and the collected samples are
# reproducible from one run (or machine) to the next.
lang_files = sorted(glob("europarl-v7*"))

In [10]:
# Collect up to `data_size` cleaned sentences per corpus file, then split them:
# the first `train_size` go to the training set, the remainder to validation.
train_texts = []
train_lang = []
valid_texts = []
valid_lang = []

# Hash-set copy of the test sentences: the leakage check becomes a constant-time
# lookup instead of a scan over the whole column for every candidate line.
test_texts = set(test["text"])

# Languages whose full quota has already been collected. Further files carrying
# the same two-letter suffix are skipped instead of being read all over again.
finished_langs = set()

for file in lang_files:
    # The language code is taken from the last two characters of the file name.
    lang = file[-2:]
    if lang in finished_langs:
        continue

    texts = []
    # Stream the file line by line (never load it whole) and decode it as UTF-8
    # explicitly rather than with the platform default encoding.
    with open(file, encoding="utf-8") as f:
        for line in f:
            if len(texts) >= data_size:
                # Quota reached: stop reading instead of scanning the rest of the file.
                break
            # We notice that the files contain non-target-language examples put in
            # parentheses; lines opening with "-" and lines of fewer than three
            # tokens are skipped as well.
            if line[0] in ("(", "-") or len(line.split()) <= 2:
                continue
            line = re.sub(r'<.*?>', '', line).lower().strip()
            # `not line` guards against lines that are empty once the markup is
            # removed; indexing line[-1] on them would raise IndexError.
            if not line or line[-1] == "." or "report" in line:
                continue
            # Test sentences were built by joining tokens with single spaces, so
            # compare the whitespace-normalised form to catch the same sentence
            # written with different spacing.
            if " ".join(line.split()) in test_texts:
                continue
            texts.append(line)

    if len(texts) >= data_size:
        finished_langs.add(lang)

    train_part = texts[:train_size]
    valid_part = texts[train_size:]
    train_texts.extend(train_part)
    train_lang.extend([lang] * len(train_part))
    valid_texts.extend(valid_part)
    valid_lang.extend([lang] * len(valid_part))
            

In [11]:
# Wrap the collected sentences and their language codes in two frames, each with
# a "text" column followed by a "lang" column.
train = pd.DataFrame(dict(text=train_texts, lang=train_lang))
valid = pd.DataFrame(dict(text=valid_texts, lang=valid_lang))

In [13]:
# Keep at most `train_size` / `valid_size` rows labelled "en"; rows beyond that
# quota are removed in place.
# NOTE(review): presumably needed because more than one corpus file ends in "en" — confirm.
extra_train_en = train.index[(train["lang"] == "en").values][train_size:]
extra_valid_en = valid.index[(valid["lang"] == "en").values][valid_size:]
train.drop(extra_train_en, inplace=True)
valid.drop(extra_valid_en, inplace=True)

In [16]:
# Write the three splits to CSV files, leaving the row index out of the output.
for frame, csv_path in (
    (train, "train_lang.csv"),
    (valid, "valid_lang.csv"),
    (test, "test_lang.csv"),
):
    frame.to_csv(csv_path, index=False)

In [15]:
# Count the validation samples per language label.
valid["lang"].value_counts()

et    1000
it    1000
hu    1000
lv    1000
en    1000
es    1000
bg    1000
pt    1000
fi    1000
cs    1000
sl    1000
ro    1000
de    1000
sk    1000
lt    1000
pl    1000
sv    1000
da    1000
el    1000
fr    1000
nl    1000
Name: lang, dtype: int64

In [12]:
# Count the training samples per language label.
train["lang"].value_counts()

sk    4000
nl    4000
en    4000
fi    4000
pl    4000
et    4000
it    4000
lv    4000
fr    4000
de    4000
da    4000
el    4000
sl    4000
lt    4000
pt    4000
bg    4000
hu    4000
ro    4000
cs    4000
sv    4000
es    4000
Name: lang, dtype: int64