In [17]:
import numpy as np
import pandas as pd
import re
import unicodedata

In [18]:
def normalize_text(file):
    with open(file, encoding="utf8") as f:
        paragraphs = f.readlines()
        
    sentences = []
    for p in paragraphs:
         sentences += re.split('\. |\! \? ', p)

    for i in range(len(sentences)):
        sentences[i] = re.sub('\.', ' ', sentences[i])
        sentences[i] = sentences[i].lower()
        sentences[i] = unicodedata.normalize('NFD', sentences[i])
        sentences[i] = re.sub('[^a-z0 ]', '', sentences[i])
        sentences[i] = sentences[i].strip()
        #sentences[i] = re.sub('(?<=0)[^0]{1}', '\1 ', sentences[i])
       
    sentences = [s for s in sentences if s]
    return sentences

In [16]:
sentences = normalize_text('data/train.en.txt')
len(sentences)

5820

In [43]:
def make_model(sentences, discount):
    
    characters = list(set(''.join(sentences)))
    
    for i, s in enumerate(sentences):
        sentences[i] = '[[' + s + ']'  

    trigrams = {}
    s = sentences[0]
    for c1 in (characters + ['[']):
        for c2 in characters + ['['] + [']'] :
            for c3 in (characters + [']']):
                trigrams[c1+c2+c3] = 0

    n_trigrams = 0
    for s in sentences:
        for i in range(len(s)-2):
            trigram = s[i:i+3]
            if trigrams[trigram] == 0:
                n_trigrams += 1
            trigrams[trigram] += 1            

    charity = discount * n_trigrams / (len(trigrams)-n_trigrams)
    for trigram, count in trigrams.items():

        if count == 0:
            trigrams[trigram] = charity
        else:
            trigrams[trigram] -= discount
            
    bigrams = {}
    for trigram, count in trigrams.items():
        bigram = trigram[:2]
        if bigram in bigrams:
            bigrams[bigram] += count
        else:
            bigrams[bigram] = count
            
    return (trigrams, bigrams)

In [39]:
def generate(model):
    trigrams, bigrams = model
    gen = '[['
    while gen[-1] != ']':
        filtered_trigrams = {}
        for trigram, count in trigrams.items():
            if trigram[0:2] == gen[-2:]:
                filtered_trigrams[trigram] = count
        chars = [k[-1] for k in filtered_trigrams.keys()]
        values = np.asarray(list(filtered_trigrams.values()))
        gen += np.random.choice(chars, p=values/bigrams[gen[-2:]])
    gen = gen[2:-1]
    print(gen)

In [21]:
def perplexity(model, sentences):
    trigrams, bigrams = model
    
    P = 0
    for s in sentences:
        for i in range(2, len(s)):
            P += np.log2(trigrams[s[i-2:i+1]]/bigrams[s[i-2:i]])
    T = len(''.join(sentences))
    return np.exp2((-1/T)*P)

In [22]:
def format_validation(file):
    with open(file, encoding="utf8") as f:
        lines = f.readlines()        
    sentences = []
    for l in lines:
        sentences += ['[['+l[:-2]+']']
    return sentences

In [100]:
# make model
model = make_model(normalize_text('data/train.en.txt'), 0.5)

In [102]:
# show conditional probabilities for trigrams starting in 'th'
filtered_trigrams = {}
for trigram, count in model[0].items():
    if trigram[0:2] == 'th':
        filtered_trigrams[trigram] = np.round(count / model[1]['th'], 5)
dict(sorted(filtered_trigrams.items(), key=lambda item: item[1], reverse = True))

{'the': 0.68113,
 'th ': 0.09821,
 'tha': 0.08313,
 'thi': 0.04468,
 'tho': 0.03815,
 'thr': 0.03384,
 'thn': 0.00455,
 'ths': 0.00407,
 'th]': 0.00293,
 'thu': 0.00232,
 'thy': 0.00192,
 'thc': 0.00192,
 'thl': 0.00091,
 'thw': 0.00064,
 'thd': 0.00057,
 'thm': 0.00051,
 'tht': 0.00024,
 'thp': 0.0001,
 'thf': 3e-05,
 'thb': 3e-05,
 'thh': 3e-05,
 'thg': 1e-05,
 'thx': 1e-05,
 'thq': 1e-05,
 'thj': 1e-05,
 'th0': 1e-05,
 'thk': 1e-05,
 'thz': 1e-05,
 'thv': 1e-05}

In [75]:
generate(model)

soodersosstrastrop volstitste para relnalanaan delsen die omgcds nap die genent not die bes let dinge dier n 0 hoeksie pubkom nolke laantigsfis d bacoeie ste terned


In [91]:
# Perplexities
files = ['data/val.af.txt', 'data/val.en.txt', 'data/val.nl.txt',\
         'data/val.xh.txt', 'data/val.zu.txt']  
languages = ['Afrikaans', 'English', 'Dutch', 'isiXhosa', 'isiZulu']
perplexities = []

for i in range(5):
    print(languages[i] +': '+ str(np.round(perplexity(model, format_validation(files[i])),2))) 

Afrikaans: 27.62
English: 18.64
Dutch: 27.14
isiXhosa: 9.34
isiZulu: 8.37


In [104]:
# alpha tuning
train_files = ['data/train.af.txt', 'data/train.en.txt', 'data/train.nl.txt',\
               'data/train.xh.txt', 'data/train.zu.txt']

val_files = ['data/val.af.txt', 'data/val.en.txt', 'data/val.nl.txt',\
             'data/val.xh.txt', 'data/val.zu.txt']  
models = []
best_discounts = []
for i in range(5):
    min_p = 999
    for discount in np.arange(0.1, 1.0, 0.1):
        model = make_model(normalize_text(train_files[i]), discount)
        p = perplexity(model, format_validation(val_files[i]))
        if p < min_p:
            min_p = p
            best_model = model
            best_discount = discount
    models += [best_model]
    best_discounts += [best_discount]

In [250]:
best_discounts

[0.4, 0.5, 0.4, 0.4, 0.5]

In [255]:
# Perplexities (tuned models)
files = ['data/val.af.txt', 'data/val.en.txt', 'data/val.nl.txt',\
         'data/val.xh.txt', 'data/val.zu.txt']  
languages = ['Afrikaans', 'English', 'Dutch', 'isiXhosa', 'isiZulu']
perplexities = []

for i in range(5):
    print(languages[i] +': '+ str(perplexity(models[1], format_validation(files[i])))) 

Afrikaans: 19.490588251071255
English: 7.5738326934989795
Dutch: 19.344719613124187
isiXhosa: 41.106944956814445
isiZulu: 44.45049600247382


In [122]:
# format
test_file = 'data/test.lid.txt'
with open(test_file, encoding="utf8") as f:
    lines = f.readlines()
labels = []
sentences = []
languages = ['af', 'en', 'nl', 'xh', 'zu']
for l in lines:
    labels += [languages.index(l[:2])]  
    sentences += ['[['+l[3:-2]+']']

# classify
classes = []
for s in sentences:  
    if len(s) == 0:
        continue
    perplexities = []
    for m in models:
        perplexities += [perplexity(m, [s])]
    classes += [np.argmin(perplexities)]
acc = (np.asarray(labels) == np.asarray(classes)).mean()
print (str(acc*100) + '% accuracy on test data')

91.60000000000001% accuracy on test data


In [125]:
# show mistakes
for i in range(len(labels)):
    if labels[i] != classes[i]:
        print(labels[i], classes[i], sentences[i])

4 3 [[umbhalo]
2 0 [[zuidafrika]
4 3 [[brough amamormon labo ababebhapathizwa 000 000 ukuqothulwa kwesizwe izisulu kwakhe abangu eshukunyiswa uthando nesihe futhi abazange baqonde isenzo sabo kungase kubacasule amajuda]
4 3 [[iquran ayikaze ikhulume ngoyise kajona kodwa isiko lamasulumane lifundisa ukuthi ujona wayevela esizweni sakwabenjamini nokuthi ubaba wakhe kwakungu amittai]
2 0 [[bevolking]
4 3 [[ukubaluleka kanye neziqu eziyisisekelo]
4 3 [[ukulungiselela]
4 3 [[kodwake ukusebenzela imishini akudingeki ukuze uqhubeke ubulungu besonto]
4 0 [[amaallergener]
4 3 [[emashumini amabili edlule isonto lds likhuphule ikhwelo lalo lokuba izithunywa zevangeli esezikhulile]
4 3 [[okukhulu kunakho konke lokhu yi mormonism ejwayelekile echazwe ubuholi bethe church of jesus christ of latterday saints lds church ukunyakaza okubili okubanzi ngaphandle kwamamormonism ajwayelekile yi mormon basicism kanye neliberal reformist mormonism]
4 3 [[smith]
0 2 [[hewiston]
1 0 [[rfc 0000 p]
3 4 [[ulalise 

In [127]:
# byte-pair encoding
token_sets = []
for file in train_files:
    sentences = normalize_text(file)
    tokens = []

    for iteration in range(100):
        merges = {}

        for i in range(len(sentences)):
            sentences[i] = [*sentences[i]]

        for s in sentences:
            for t in range(len(s)-1):
                merge = ''.join(s[t:t+2])
                if merge in merges:
                    merges[merge] += 1
                else:
                    merges[merge] = 1

        new_token = max(merges, key=merges.get)
        tokens += [new_token]

        for s in sentences:
            for t in range(len(s)-1):     
                if ''.join(s[t:t+2]) == new_token:
                    s[t] = new_token
                    s.pop(t+1)

        if (iteration % 25 == 24):
            print(iteration)
    token_sets += [tokens]


24
49
74
99
24
49
74
99
24
49
74
99
24
49
74
99
24
49
74
99


In [121]:
# number of common subword units
len(list(set(token_sets[3]) & set(token_sets[4])))

81