In [42]:
import numpy as np
import os
import sys

# tokenize the dataset

In [43]:
def tokenize(path, file, save=True):
    """
    Tokenize a text file to characters and save it to a new file.
    """
    data_set_type = path.split('/')[1]
    with open(f'{path}/{file}', 'r') as f:
        text = f.read()
        text = text.lower()
        # split the text into characters
        text = list(text)
        # reconstruct the text
        text = ' '.join(text)
        if save:
            with open(f'{path}/{data_set_type}_tok/{file}.tok', 'w') as f:
                f.write(text)
        return f'{path}/{data_set_type}_tok/{file}.tok'

# load all train corpora
all_train_text = os.listdir('Europarl/train')
# remove unwanted files
all_train_text.remove('Icon_')
all_train_text.remove('train_tok')

# tokenize all train corpora
for file in all_train_text:
    tokenize("Europarl/train",file)

# Train models

I applied heavy smoothing so it can be more robust to unseen grams of the the same language test. 

Lower order is chosen given that the task is not to predict the next word, but to predict the language, which is not order and structure of the sentence.

In [44]:
import subprocess
import time
def train(train_base):
    corpus, lang = train_base.split(".")[:2]
    command = f"ngram-count -text {train_base} -order 2 -lm LMs/euro_LMs/{lang}.lm -addsmooth 10"
    os.system(command)
    
        
train_base = os.listdir('Europarl/train/train_tok')
for base in train_base:
    train(f'Europarl/train/train_tok/{base}')

# Predictions On Dev

In [45]:
# predict the language of a given text

def predict(text):
    # get the language models
    lms = os.listdir('LMs/euro_LMs')
    dir1, dir2 = text.split('/')[:2]
    path = f'{dir1}/{dir2}'
    file = text.split('/')[-1]
    text = tokenize(path,file, save=True)
    time.sleep(1)
    ppls = []
    for lm in lms:
        # get the language
        lang = lm.split('.')[0]
        # get the ppl of the text given the language model
        ppl = subprocess.check_output(f"ngram -lm LMs/euro_LMs/{lm} -ppl {text}", shell=True)
        # get the ppl
        ppl = ppl.decode('utf-8').split('ppl=')[1].split(' ')[1]
    
        # append the ppl and language to the list
        ppls.append((ppl, lang))
    # sort the list by the pplabilities
    ppls.sort(key=lambda x: x[0])
    # return the language with the highest ppl
    return ppls[0][1]


# load all dev corpora
devs = os.listdir('Europarl/dev')

# remove unwanted files
devs.remove('dev.gold')
devs.remove('dev_tok')

# sort to maintain order
devs = sorted(devs, key=lambda x: int(x.split('.')[1]))

y_pred = []
for dev in devs:
    y_pred.append(predict(f'Europarl/dev/{dev}'))
    
# load labels
y_true = open('Europarl/dev/dev.gold', 'r').read().split('\n')
y_true = [x.split('\t')[1] for x in y_true]

# calculate accuracy
from sklearn.metrics import accuracy_score
print('Accuracy: ')
print(accuracy_score(y_true, y_pred))

Accuracy: 
0.8181818181818182


In [46]:
for i,j in zip(y_true, y_pred):
    print(i,j)

it bg
lt lt
et et
fr fr
hu hu
lv lv
cs cs
en en
da da
de el
mt mt
nl nl
pl pl
fi fi
pt pt
ro ro
sk sk
sl sl
sv sv
bg el
el sl
es es


# Predictions On Test

In [49]:

# load all test corpora
test_files = os.listdir('Europarl/test')
test_files.remove('test_tok')
test_files = sorted(test_files, key=lambda x: int(x.split('.')[1]))

y_pred = []
for test in test_files:
    y_pred.append(predict(f'Europarl/test/{test}'))
    
with open('Europarl/test/test.pred', 'w') as f:
    for label, test_file in zip(y_pred, test_files):
        f.write(f'{test_file}\t{label}\n')
        