In [1]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import re
from multiprocessing import Pool
from scipy import spatial
from sklearn.decomposition import PCA
import xml.etree.ElementTree as ET



In [2]:
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

Using TensorFlow backend.


In [3]:
def readtags(doc):
    tree = ET.parse(doc)
    root = tree.getroot()
    name = doc.replace('.xml','')
    name = []
    for description in root.iter('seg'):
        p = description.text
        p=p.replace('\t','')
        p=p.replace('\n','')
        p = [p]
        name.append(p)
    return name

def readother(doc):
    tree = ET.parse(doc)
    root = tree.getroot()
    name = doc.replace('.xml','')
    name = []
    for description in root.iter('seg'):
        p = [description.text]
        name.append(p)
    return name

In [4]:
chinese = readtags('Chinese.xml')
arabic = readtags('Arabic.xml')
danish = readtags('Danish.xml')
dutch = readother('Dutch.xml')
english = readtags('English.xml')

In [5]:
english[0]

['In the beginning God created the heaven and the earth.']

In [6]:
for i in range(len(english)):
    english[i] = [word.lower() for word in english[i] if re.match('^[a-zA-Z]+', word)]
    english[i] = (' ').join(english[i])
    english[i] = [(re.sub(r'[^\w\s]','',english[i]))]
    english[i] = (' ').join(english[i])

In [7]:
english[0]

'in the beginning god created the heaven and the earth'

In [8]:
df = pd.DataFrame()
lan = ["English"] * len(english)
data_1 = pd.DataFrame({"Text": english,"Language": lan})
df.append(data_1)
data_1.head(1)

Unnamed: 0,Language,Text
0,English,in the beginning god created the heaven and th...


In [9]:
for i in range(len(arabic)):
    arabic[i] = (' ').join(arabic[i]).replace('.','')

In [10]:
arabic[:2]

['في البدء خلق الله السموات والارض',
 'وكانت الارض خربة وخالية وعلى وجه الغمر ظلمة وروح الله يرف على وجه المياه']

In [11]:
df = pd.DataFrame()
lan = ["Arabic"] * len(arabic)
data_2 = pd.DataFrame({"Text": arabic,"Language": lan})
df.append(data_2)
data_2.head(1)

Unnamed: 0,Language,Text
0,Arabic,في البدء خلق الله السموات والارض


In [12]:
for i in range(len(dutch)):
    dutch[i] = [x for x in dutch[i] if x is not None]
    dutch[i] = [word.lower() for word in dutch[i]]
    dutch[i] = ('').join(dutch[i]).replace('.','')
    dutch[i] = (re.sub(r'[^\w\s]','',dutch[i]))

In [13]:
df = pd.DataFrame()
lan = ["Dutch"] * len(dutch)
data_3 = pd.DataFrame({"Text": dutch,"Language": lan})
df.append(data_3)
data_3.head(1)

Unnamed: 0,Language,Text
0,Dutch,in het begin heeft god de hemelen en de aarde ...


In [14]:
for i in range(len(danish)):
    danish[i] = [word.lower() for word in danish[i]]
    danish[i] = [x for x in danish[i] if x is not None]
    danish[i] = ('').join(danish[i]).replace('.','')
    danish[i] = (re.sub(r'[^\w\s]','',danish[i]))

In [15]:
df = pd.DataFrame()
lan = ["Danish"] * len(danish)
data_4 = pd.DataFrame({"Text": danish,"Language": lan})
df.append(data_4)
data_4.head(1)

Unnamed: 0,Language,Text
0,Danish,i begyndelsen skabte gud himmelen og jorden


In [16]:
for i in range(len(chinese)):
    chinese[i] = ('').join(chinese[i])
    chinese[i] = (re.sub(r'[^\w\s]','',chinese[i]))

In [17]:
df = pd.DataFrame()
lan = ["Chinese"] * len(chinese)
data_5 = pd.DataFrame({"Text": chinese,"Language": lan})
df.append(data_5)
data_5.head(1)

Unnamed: 0,Language,Text
0,Chinese,起初 神 創造 天地


In [18]:
data = pd.concat([data_1, data_2,data_3,data_4,data_5], ignore_index=True)

In [19]:
data['Language'].value_counts()

Danish     31103
English    31102
Arabic     31102
Chinese    31101
Dutch      29098
Name: Language, dtype: int64

In [20]:
from sklearn.utils import shuffle
data = shuffle(data)

In [21]:
train_size = int(len(data) * .8)
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(data) - train_size))

Train size: 122804
Test size: 30702


In [22]:
train = data['Text'][:train_size]
train_p = data['Language'][:train_size]

test = data['Text'][train_size:]
test_p = data['Language'][train_size:]

In [23]:
max_words = 1000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)

In [24]:
tokenize.fit_on_texts(train) # only fit on train
x_train = tokenize.texts_to_matrix(train)
x_test = tokenize.texts_to_matrix(test)

In [25]:
encoder = LabelEncoder()
encoder.fit(train_p)
y_train = encoder.transform(train_p)
y_test = encoder.transform(test_p)

In [26]:
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

In [27]:
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (122804, 1000)
x_test shape: (30702, 1000)
y_train shape: (122804, 5)
y_test shape: (30702, 5)


In [28]:
batch_size = 32
epochs = 1
# Build the model
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Train on 110523 samples, validate on 12281 samples
Epoch 1/1


In [29]:
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.019945913208747872
Test accuracy: 0.9940720474236207


In [31]:
text_labels = encoder.classes_ 
for i in range(10):
    prediction = model.predict(np.array([x_test[i]]))
    predicted_label = text_labels[np.argmax(prediction)]
    print(test.iloc[i][:100])
    print('Actual label:' + test.iloc[i])
    print("Predicted label: " + predicted_label + "\n")

og de kastede lod imellem dem og loddet faldt på matthias og han blev regnet sammen med de elleve ap
Actual label:og de kastede lod imellem dem og loddet faldt på matthias og han blev regnet sammen med de elleve apostle
Predicted label: Danish

and he shall take the cedar wood and the hyssop and the scarlet and the living bird and dip them in 
Actual label:and he shall take the cedar wood and the hyssop and the scarlet and the living bird and dip them in the blood of the slain bird and in the running water and sprinkle the house seven times
Predicted label: English

de overige apostelen en de volgelingen van jezus die in israël woonden hoorden dat nu ook mensen van
Actual label:de overige apostelen en de volgelingen van jezus die in israël woonden hoorden dat nu ook mensen van een ander volk het woord van god hadden aangenomen
Predicted label: Dutch

men denne har efter at have ofret eet offer for synderne sat sig for bestandig ved guds højre hånd
Actual label:men denne har efter at ha