In [1]:
import pandas as pd
import numpy as np
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [2]:
# Load the tab-separated sentence-pair file (relative path) into a DataFrame.
# NOTE(review): the preview below shows an extra 'Unnamed: 0' column next to
# 'finnish' and 'macedonian' — presumably an index that was saved into the
# file earlier; confirm whether it should be dropped.
df = pd.read_csv('../data/fin-mk.txt', sep='\t')
df.head()  # last expression of the cell: displays the first rows

Unnamed: 0,finnish,macedonian
0,chaplinin poika,чарли чаплин во детето
1,"kuva hymyllä, ja ehkä kyyneleen kera.","слика со насмевка и можеби, солза."
2,hyväntekeväisyyssairaala,добротворна болница
3,"nainen, jonka synti oli äitiys.",жената чии грев беше мајчинството.
4,mies.,мажот.


In [3]:
# Pull the raw values of each language column out of the DataFrame.
# NOTE(review): `fin` and `mk` are not referenced by any later cell visible
# here — confirm whether they are still needed.
fin = df['finnish'].values
mk = df['macedonian'].values

In [4]:
# Tokenize every sentence of both language columns. Missing cells are replaced
# with "" first so the tokenizer always receives a string.
# NOTE(review): word_tokenize is called without a `language` argument, so its
# default applies to both Finnish and Macedonian — confirm that is acceptable.
fin_tok, mk_tok = (
    df[column].fillna("").map(nltk.word_tokenize).values
    for column in ("finnish", "macedonian")
)

In [5]:
def remove_non_ascii(vocab):
    """Delete every non-ASCII character from each token.

    Args:
        vocab: Iterable of sentences, each an iterable of string tokens.

    Returns:
        A list of lists with the same layout as ``vocab``. Each token has all
        characters outside the 7-bit ASCII range removed; a token made up
        only of such characters becomes an empty string and stays in place.
    """
    return [
        [re.sub(r'[^\x00-\x7f]', r'', token) for token in sentence]
        for sentence in vocab
    ]

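# Left disabled: the pattern above deletes every non-ASCII character, which
# would strip Finnish letters such as 'ä' and all Cyrillic (Macedonian) text.
# no_ascii_fin = remove_non_ascii(fin_tok)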

In [6]:
def remove_dash(vocab):
    """Replace each hyphen inside every token with a single space.

    Args:
        vocab: Iterable of sentences, each an iterable of string tokens.

    Returns:
        A list of lists with the same layout as ``vocab``. The number of
        tokens per sentence is unchanged, so a token that was just '-' turns
        into a one-space string rather than being dropped.
    """
    return [
        [token.replace('-', ' ') for token in sentence]
        for sentence in vocab
    ]

# First cleaning pass: swap hyphens for spaces in the tokenized sentences of
# both languages.
clean_fin, clean_mk = (remove_dash(tokens) for tokens in (fin_tok, mk_tok))

In [7]:
def remove_three_dots(vocab):
    """Remove every '...' sequence from each token.

    Args:
        vocab: Iterable of sentences, each an iterable of string tokens.

    Returns:
        A list of lists with the same layout as ``vocab``. A token that was
        exactly '...' becomes an empty string and is kept in the sentence.
    """
    def _strip_ellipsis(sentence):
        # Per-sentence pass; token count is preserved.
        return [token.replace('...', '') for token in sentence]

    return list(map(_strip_ellipsis, vocab))

# Strip three-dot ellipses from the already dash-cleaned tokens of both languages.
clean_fin, clean_mk = map(remove_three_dots, (clean_fin, clean_mk))

In [8]:
def remove_two_dots(vocab):
    """Remove every '..' sequence from each token.

    Args:
        vocab: Iterable of sentences, each an iterable of string tokens.

    Returns:
        A list of lists with the same layout as ``vocab``. Matches are
        removed left to right without overlap, so '...' is reduced to '.'
        and a token that was exactly '..' becomes an empty string.
    """
    cleaned = []
    for sentence in vocab:
        cleaned.append([token.replace('..', '') for token in sentence])
    return cleaned

# Remove leftover two-dot sequences from the tokens of both languages.
clean_fin, clean_mk = (remove_two_dots(tokens) for tokens in (clean_fin, clean_mk))

In [9]:
def remove_dollar(vocab):
    """Delete every '$' character from each token.

    Args:
        vocab: Iterable of sentences, each an iterable of string tokens.

    Returns:
        A list of lists with the same layout as ``vocab``. A token that was
        only '$' becomes an empty string and remains in the sentence.
    """
    return list(
        map(lambda sentence: [token.replace('$', '') for token in sentence], vocab)
    )

# Drop dollar signs from the tokens of both languages.
clean_fin, clean_mk = map(remove_dollar, (clean_fin, clean_mk))

In [10]:
def remove_digits(vocab):
    """Remove every run of digits from each token.

    Args:
        vocab: Iterable of sentences, each an iterable of string tokens.

    Returns:
        A list of lists with the same layout as ``vocab``. A token consisting
        solely of digits becomes an empty string and is kept in place.
    """
    stripped = []
    for sentence in vocab:
        stripped.append([re.sub(r'\d+', '', token) for token in sentence])
    return stripped

# Last token-level pass: remove digit runs from the tokens of both languages.
clean_fin, clean_mk = (remove_digits(tokens) for tokens in (clean_fin, clean_mk))

In [43]:
def detokenize_data(vocab):
    """Join each tokenized sentence back into a single string.

    Args:
        vocab: Iterable of sentences, each a list of string tokens.

    Returns:
        A list with one detokenized string per sentence, in input order.
    """
    # Build the detokenizer once and reuse it; creating a new instance for
    # every sentence was loop-invariant work.
    detokenizer = TreebankWordDetokenizer()
    return [detokenizer.detokenize(sentence) for sentence in vocab]

# Turn the cleaned token lists of both languages back into sentence strings.
final_fin, final_mk = map(detokenize_data, (clean_fin, clean_mk))

In [44]:
# Spot-check: display the cleaned token list of the second sentence (index 1).
clean_fin[1]

['kuva', 'hymyllä', ',', 'ja', 'ehkä', 'kyyneleen', 'kera', '.']

In [45]:
# Assemble the detokenized sentences into a two-column frame and save it as a
# tab-separated file without the index column.
# NOTE(review): the target path is the same '../data/fin-mk.txt' that this
# notebook reads as its input, so the raw file is overwritten by the cleaned
# output — confirm this is intended, or write to a separate file so the
# notebook can be re-run from the original data.
data = pd.DataFrame({'finnish': final_fin, 'macedonian': final_mk})
data.to_csv('../data/fin-mk.txt', index=False, sep='\t')