In [0]:
# Colab setup cell: installs/upgrades PyDrive quietly.
# NOTE(review): PyDrive is not imported in any of the visible cells — confirm it is still needed.
!pip install -U -q PyDrive

In [0]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import re
from multiprocessing import Pool
from scipy import spatial
from sklearn.decomposition import PCA
import xml.etree.ElementTree as ET

In [4]:
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

Using TensorFlow backend.


In [0]:
def readtags(doc):
    """Return the text of every ``<seg>`` element with tabs and newlines removed.

    Parameters
    ----------
    doc : str, path-like or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    list of list of str
        One single-element list per ``<seg>``, in document order. An empty
        ``<seg/>`` (whose ``text`` is ``None``) yields ``['']`` so that the
        result stays aligned with the segments of the file.
    """
    root = ET.parse(doc).getroot()
    segments = []
    for description in root.iter('seg'):
        # ElementTree reports ``None`` for an element without text; fall back
        # to an empty string instead of failing on ``None.replace``.
        raw = description.text or ''
        segments.append([raw.replace('\t', '').replace('\n', '')])
    return segments

def readother(doc):
    """Return the raw text of every ``<seg>`` element, without any cleaning.

    Parameters
    ----------
    doc : str, path-like or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    list of list
        One single-element list per ``<seg>``, in document order. The element
        is the segment's text exactly as parsed, or ``None`` when the segment
        has no text; callers are expected to filter those out.
    """
    root = ET.parse(doc).getroot()
    return [[description.text] for description in root.iter('seg')]

In [8]:
# Mount Google Drive under /content/gdrive so the corpus files can be read by path.
# This import only exists inside Google Colab; the call prompts for an authorization code.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code
Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
# Folder on the mounted Drive that holds one XML corpus per language.
# Change it here if the files live somewhere else.
DATA_DIR = '/content/gdrive/My Drive'

chinese = readtags(DATA_DIR + '/Chinese.xml')
arabic = readtags(DATA_DIR + '/Arabic.xml')
danish = readtags(DATA_DIR + '/Danish.xml')
# Dutch goes through readother, which keeps each segment's text untouched
# (possibly None); the Dutch cleaning step drops the None entries itself.
dutch = readother(DATA_DIR + '/Dutch.xml')
english = readtags(DATA_DIR + '/English.xml')

In [15]:
# Peek at the first parsed English segment (a one-element list at this stage).
english[0]

['In the beginning God created the heaven and the earth.']

In [0]:
def _clean_english(segment):
    """Lower-case one English segment and strip its punctuation.

    ``segment`` is the one-element list produced by ``readtags`` on the first
    run, or an already cleaned string when the cell is executed again. Treating
    a string as a single part keeps the cell idempotent: iterating over the
    string would otherwise put a space between every character.

    NOTE(review): a part that does not start with an ASCII letter is dropped
    entirely, so such a segment becomes ''. Confirm that this is intended.
    """
    parts = [segment] if isinstance(segment, str) else segment
    kept = [part.lower() for part in parts if re.match('^[a-zA-Z]+', part)]
    # Remove everything that is neither a word character nor whitespace.
    return re.sub(r'[^\w\s]', '', ' '.join(kept))


for i in range(len(english)):
    english[i] = _clean_english(english[i])

In [17]:
# After cleaning, the entry is a plain lower-case string without punctuation.
english[0]

'in the beginning god created the heaven and the earth'

In [18]:
# Pair every cleaned English segment with its language label.
# The previous `df = pd.DataFrame()` / `df.append(data_1)` lines were removed:
# `append` returned a new frame that was thrown away (so `df` stayed empty and
# unused), and the method no longer exists in pandas >= 2.0.
lan = ["English"] * len(english)
data_1 = pd.DataFrame({"Text": english, "Language": lan})
data_1.head(1)

Unnamed: 0,Language,Text
0,English,in the beginning god created the heaven and th...


In [0]:
for i in range(len(arabic)):
    segment = arabic[i]
    # First run: each entry is a one-element list from readtags. On a re-run
    # the entry is already a string, and ' '.join over a string would insert a
    # space between every character, so only join real lists.
    if not isinstance(segment, str):
        segment = ' '.join(segment)
    # Full stops are the only punctuation removed for Arabic.
    arabic[i] = segment.replace('.', '')

In [20]:
# Pair every cleaned Arabic segment with its language label.
# The previous `df = pd.DataFrame()` / `df.append(data_2)` lines were removed:
# `append` returned a new frame that was thrown away (so `df` stayed empty and
# unused), and the method no longer exists in pandas >= 2.0.
lan = ["Arabic"] * len(arabic)
data_2 = pd.DataFrame({"Text": arabic, "Language": lan})
data_2.head(1)

Unnamed: 0,Language,Text
0,Arabic,في البدء خلق الله السموات والارض


In [21]:
# First two cleaned Arabic segments.
arabic[:2]

['في البدء خلق الله السموات والارض',
 'وكانت الارض خربة وخالية وعلى وجه الغمر ظلمة وروح الله يرف على وجه المياه']

In [0]:
# Normalise every Dutch segment to a lower-case string without punctuation.
for idx, segment in enumerate(dutch):
    # readother may hand back None for a segment without text; skip those parts.
    lowered = ''.join(part.lower() for part in segment if part is not None)
    without_stops = lowered.replace('.', '')
    dutch[idx] = re.sub(r'[^\w\s]', '', without_stops)

In [23]:
# Pair every cleaned Dutch segment with its language label.
# The previous `df = pd.DataFrame()` / `df.append(data_3)` lines were removed:
# `append` returned a new frame that was thrown away (so `df` stayed empty and
# unused), and the method no longer exists in pandas >= 2.0.
lan = ["Dutch"] * len(dutch)
data_3 = pd.DataFrame({"Text": dutch, "Language": lan})
data_3.head(1)

Unnamed: 0,Language,Text
0,Dutch,in het begin heeft god de hemelen en de aarde ...


In [0]:
for i in range(len(danish)):
    # Drop missing parts BEFORE lower-casing: calling .lower() first would
    # raise on None and make the filter pointless (same order as the Dutch cell).
    present = [x for x in danish[i] if x is not None]
    lowered = ''.join(word.lower() for word in present)
    # Strip full stops, then every remaining non-word, non-space character.
    danish[i] = re.sub(r'[^\w\s]', '', lowered.replace('.', ''))

In [25]:
# Pair every cleaned Danish segment with its language label.
# The previous `df = pd.DataFrame()` / `df.append(data_4)` lines were removed:
# `append` returned a new frame that was thrown away (so `df` stayed empty and
# unused), and the method no longer exists in pandas >= 2.0.
lan = ["Danish"] * len(danish)
data_4 = pd.DataFrame({"Text": danish, "Language": lan})
data_4.head(1)

Unnamed: 0,Language,Text
0,Danish,i begyndelsen skabte gud himmelen og jorden


In [0]:
# Collapse each Chinese segment to one string and strip punctuation;
# whitespace between tokens is kept.
for idx, segment in enumerate(chinese):
    chinese[idx] = re.sub(r'[^\w\s]', '', ''.join(segment))

In [27]:
# Pair every cleaned Chinese segment with its language label.
# The previous `df = pd.DataFrame()` / `df.append(data_5)` lines were removed:
# `append` returned a new frame that was thrown away (so `df` stayed empty and
# unused), and the method no longer exists in pandas >= 2.0.
lan = ["Chinese"] * len(chinese)
data_5 = pd.DataFrame({"Text": chinese, "Language": lan})
data_5.head(1)

Unnamed: 0,Language,Text
0,Chinese,起初 神 創造 天地


In [0]:
# Stack the five per-language frames into one dataset with a fresh 0..n-1 index.
data = pd.concat(
    [data_1, data_2, data_3, data_4, data_5],
    ignore_index=True,
)

In [30]:
# Class-balance check: number of segments per language.
data['Language'].value_counts()

Danish     31103
Arabic     31102
English    31102
Chinese    31101
Dutch      29098
Name: Language, dtype: int64

In [0]:
from sklearn.utils import shuffle

# The frames were concatenated language by language, so rows must be shuffled
# before the positional train/test split. A fixed seed keeps the split, and
# therefore every reported metric, reproducible across runs.
data = shuffle(data, random_state=42)

In [32]:
# 80 % of the rows are used for training; the remainder is held out for testing.
train_size = int(.8 * len(data))
print ("Train size: %d" % train_size)
print ("Test size: %d" % (len(data) - train_size))

Train size: 122804
Test size: 30702


In [0]:
# Positional split of the rows: the first train_size go to training, the rest to testing.
text_col, label_col = data['Text'], data['Language']

train, test = text_col.iloc[:train_size], text_col.iloc[train_size:]
train_p, test_p = label_col.iloc[:train_size], label_col.iloc[train_size:]

In [0]:
# Vocabulary size limit for the word-level tokenizer; it also fixes the width
# of the bag-of-words vectors fed to the network.
max_words = 1000
tokenize = text.Tokenizer(
    num_words=max_words,
    char_level=False,
)

In [0]:
# Build the vocabulary from the training texts only, so the test set does not
# influence which words are kept.
tokenize.fit_on_texts(train) # only fit on train
# Encode every text as one fixed-width row; the printed shapes further down
# show max_words columns per row.
x_train = tokenize.texts_to_matrix(train)
x_test = tokenize.texts_to_matrix(test)

In [0]:
# Map language names to integer ids; the mapping is learnt on the training labels only.
encoder = LabelEncoder()
y_train = encoder.fit_transform(train_p)
y_test = encoder.transform(test_p)

In [0]:
# Take the class count from the fitted encoder rather than from max(y_train):
# y_train is overwritten below, so on a re-run the old expression would see
# one-hot rows and report 2 classes.
num_classes = len(encoder.classes_)
# Re-derive the integer ids from the label Series so the cell gives the same
# one-hot matrices no matter how often it is executed.
y_train = utils.to_categorical(encoder.transform(train_p), num_classes)
y_test = utils.to_categorical(encoder.transform(test_p), num_classes)

In [38]:
# Sanity check: feature and label matrices must agree on the number of rows.
for _name, _matrix in (('x_train', x_train), ('x_test', x_test),
                       ('y_train', y_train), ('y_test', y_test)):
    print(_name + ' shape:', _matrix.shape)

x_train shape: (122804, 1000)
x_test shape: (30702, 1000)
y_train shape: (122804, 5)
y_test shape: (30702, 5)


In [39]:
batch_size = 32
epochs = 1

# Build the model: one hidden ReLU layer with dropout on top of the
# bag-of-words input, followed by a softmax over the language classes.
model = Sequential([
    Dense(512, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# A tenth of the training rows is set aside for validation during fitting.
history = model.fit(
    x_train,
    y_train,
    validation_split=0.1,
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

Train on 110523 samples, validate on 12281 samples
Epoch 1/1


In [0]:
# Display the object returned by model.fit.
history

history


In [40]:
# Evaluate on the held-out rows; the result holds the loss first and the accuracy second.
score = model.evaluate(
    x_test,
    y_test,
    batch_size=batch_size,
    verbose=1,
)
for _caption, _value in zip(('Test score:', 'Test accuracy:'), score):
    print(_caption, _value)

Test score: 0.020060542720871922
Test accuracy: 0.9941046185799743


In [45]:
text_labels = encoder.classes_

# Preview at most ten test samples without indexing past a smaller test set.
n_preview = min(10, len(x_test))
# One batched forward pass instead of a separate predict call per sample.
predictions = model.predict(x_test[:n_preview]) if n_preview else []

for i in range(n_preview):
    # Keep the (1, num_classes) shape the per-sample call used to produce.
    prediction = predictions[i:i + 1]
    predicted_label = text_labels[np.argmax(prediction)]
    # Only the first 100 characters of the text are shown.
    print(test.iloc[i][:100])
    print('Actual label:' + test_p.iloc[i])
    print("Predicted label: " + predicted_label + "\n")

جعبتهم كقبر مفتوح كلهم جبابرة
Actual label:Arabic
Predicted label: Arabic

and the lord said unto moses speak unto the priests the sons of aaron and say unto them there shall 
Actual label:English
Predicted label: English

then herod when he had privily called the wise men inquired of them diligently what time the star ap
Actual label:English
Predicted label: English

d t is de reden u bent kinderen van de duivel en doet met plezier wat hij wil hij is al sinds het be
Actual label:Dutch
Predicted label: Dutch

للعلوقة بنتان هات هات ثلاثة لا تشبع اربعة لا تقول كفا
Actual label:Arabic
Predicted label: Arabic

耶和華 阿  求 你 記念 我 在 你 面前 怎 樣 存 完全 的 心  按 誠實 行事  又 作 你 眼中 所 看 為善 的  希西家 就 痛哭了
Actual label:Chinese
Predicted label: Chinese

men lærer lignelsen af figentræet når dets gren allerede er bleven blødog bladene skyde frem da skøn
Actual label:Danish
Predicted label: Danish

da lagde trællen sin hånd under sin herre abrahams lænd og svor ham eden
Actual label:Danish
Predicted label: Danish



**Let's test the model on text that mixes several languages**

In [0]:
# Probe: a Danish sentence followed by a few Chinese tokens.
# The text is held in its own variable so the `data` DataFrame is not
# overwritten; texts_to_matrix takes the list directly, so the former
# pd.Series / np.array wrappers are not needed.
mixed_text = ['da lagde trællen sin hånd under sin herre abrahams lænd og svor ham eden 耶和華 阿  求 你 記念 我 在 你 面前']
val = tokenize.texts_to_matrix(mixed_text)
prediction = model.predict(val)
predicted_label = text_labels[np.argmax(prediction)]

In [141]:
# Class probabilities for the mixed sample, in the order of text_labels.
prediction

array([[9.4595826e-14, 9.9866128e-01, 1.3387678e-03, 1.0105591e-10,
        6.0815852e-10]], dtype=float32)

In [139]:
# Label order used by the probability vectors (index 0 to 4).
tuple(text_labels[i] for i in range(5))

('Arabic', 'Chinese', 'Danish', 'Dutch', 'English')



1.   CHINESE ≈ 99.87 %
2.   DANISH ≈ 0.13 %

ALL OTHER PROBABILITIES ARE NEGLIGIBLE



In [145]:
# Probe: an English sentence with one foreign word appended.
# The text is held in its own variable so the `data` DataFrame is not
# overwritten; texts_to_matrix takes the list directly, so the former
# pd.Series / np.array wrappers are not needed.
mixed_text = ['god is now here and he is good trællen']
val = tokenize.texts_to_matrix(mixed_text)
prediction = model.predict(val)
predicted_label = text_labels[np.argmax(prediction)]
print("LABEL",predicted_label)
print("PROBABILITIES",prediction)

LABEL English
PROBABILITIES [[3.5602514e-07 6.8739092e-07 2.6488141e-07 3.4640183e-05 9.9996400e-01]]


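**Only one non-English word was added here (`trællen`, taken from the Danish sample above). English still dominates; Dutch — not Danish — receives the second-highest probability, although it remains tiny. NOTE: `trællen` is probably outside the 1000-word vocabulary kept by the tokenizer and therefore ignored — to be confirmed.**


1.   ENGLISH ≈ 99.996 %
2.   DUTCH ≈ 0.0035 %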





In [148]:
# Probe: English, Danish, Chinese and Arabic fragments in a single string.
# The text is held in its own variable so the `data` DataFrame is not
# overwritten; texts_to_matrix takes the list directly, so the former
# pd.Series / np.array wrappers are not needed.
mixed_text = ['god is now here trællen sin hånd under sin herre 耶和華 阿  求 你 記念 فقالت لها الكرمة أاترك مسطاري الذي يفرح الله والناس']
val = tokenize.texts_to_matrix(mixed_text)
prediction = model.predict(val)
predicted_label = text_labels[np.argmax(prediction)]
print("LABEL",predicted_label)
print("PROBABILITIES",prediction)

LABEL Chinese
PROBABILITIES [[1.4163397e-04 9.9834299e-01 4.1349628e-04 3.0543900e-04 7.9654140e-04]]


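**The Chinese class receives by far the highest probability (≈ 99.83 %) — presumably because the Chinese tokens weigh most heavily in the bag-of-words input (the original explanation: the Chinese part contributes more characters). All other languages end up with similarly small probabilities, each below 0.1 %.**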