In [None]:
# imports
import os
import re
import random
import keras
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.losses import categorical_crossentropy
from keras.models import Sequential
from keras.layers import Dense
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from sklearn.feature_extraction.text import CountVectorizer
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
%matplotlib inline

In [None]:
# loading English-Kurdish dataset
# Directory holding both CSV files. Defaults to the original location; set the
# LANGUAGE_DETECTION_DATA_DIR environment variable to run on another machine.
DATA_DIR = os.environ.get(
    'LANGUAGE_DETECTION_DATA_DIR',
    '/Users/sivarazadi/Programming_Projects/Language_Detection',
)

# read_csv already returns a DataFrame, so no pd.DataFrame(...) wrapper is needed
Kurdish_df = pd.read_csv(os.path.join(DATA_DIR, 'wordlist2.csv'))
Kurdish_df = Kurdish_df.iloc[12:]  # deleting multiple 'A' translations to make things simpler
print(Kurdish_df)

# loading Languages dataset
Languages_df = pd.read_csv(os.path.join(DATA_DIR, 'Language Detection.csv'))
print(Languages_df)

|       | wordId  |english   |kurdish  |english_length  |kurdish_length|
| :---        |    :----:   |          ---: |    :----:   |          ---: |         ---: |
| 12          |  13         |    abandon    |    berdan    |         7.0    |         6.0|
|  13          |  14         |    abate      |    kêmkirin   |          5.0   |          8.0|
|  14          |  15         |    abide      |    ragirtin    |         5.0    |         8.0|
|  15          |  16         |    about      |    der barê     |        5.0     |        8.0|
|  16          |  17         |    above      |     jor          |   5.0          |   3.0|
|  ...         |  ...        |      ...           |     ...      |       ...      |       ...|
|  4324        |  4330       |     your           |  yên we       |      4.0       |     6.0|
|  4325        |  4331       |    youth           |  ciwanî        |     5.0        |     6.0|
|  4326        |  4332       |    youth           | xortanî         |    5.0         |    7.0|
| 4327        |  4333       |     zinc           |   tûtya          |   4.0          |   5.0|
|  4328        |  4334       |     zone           |   cîwar           |  4.0           |  5.0|

  [4317 rows x 5 columns] 


|       | Text | Language|
| :---        |    :----:   |          ---: |                                          
|0      | Nature, in the broadest sense, is the natural... | English|
|1      |"Nature" can refer to the phenomena of the phy...  |English|
|2      |The study of nature is a large, if not the onl...  |English|
|3      |Although humans are part of nature, human acti... | English|
|4      |[1] The word nature is borrowed from the Old F... | English|
|...    |                                              ...  |    ...|
|10332  |ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...  |Kannada|
|10333  |ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...  |Kannada|
|10334  |ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...|  Kannada|
|10335  |ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...  |Kannada|
|10336  |ಟೆರ್ರಿ ನೀವು ನಿಜವಾಗಿಯೂ ಆ ದೇವದೂತನಂತೆ ಸ್ವಲ್ಪ ಕಾಣು...  | Kannada|

[10337 rows x 2 columns]

In [None]:
# cleaning English-Kurdish dataset and adding it to Languages dataset
# Keep only the Kurdish words, rename the column to match Languages_df, and label
# every row. Done as one chain so a fresh frame is produced instead of assigning a
# column into a subset of the sliced frame (which raises SettingWithCopyWarning).
Kurdish_df = (
    Kurdish_df[['kurdish']]
    .rename(columns={'kurdish': 'Text'})
    .assign(Language='Kurdish')
)

Languages_df = pd.concat([Languages_df, Kurdish_df])
print(Languages_df)

|       | Text | Language|
| :---        |    :----:   |          ---: |                                          
|0     | Nature, in the broadest sense, is the natural...  |English|
|1     |"Nature" can refer to the phenomena of the phy...  |English|
|2     |The study of nature is a large, if not the onl... | English|
|3     |Although humans are part of nature, human acti...  |English|
|4     |[1] The word nature is borrowed from the Old F...  |English|
|...    |                                             ...  |    ...|
|4324    |                                         yên we  |Kurdish|
|4325     |                                        ciwanî  |Kurdish|
|4326      |                                      xortanî  |Kurdish|
|4327       |                                       tûtya  |Kurdish|
|4328        |                                      cîwar  |Kurdish|

[14654 rows x 2 columns]

In [None]:
# shuffled dataset and split into X and y for training
# random_state pins the shuffle so re-running the notebook yields the same row order
shuffled_Language_df = Languages_df.sample(frac=1, random_state=42)
X = shuffled_Language_df['Text']
y = shuffled_Language_df['Language']

In [None]:
# distinct language labels, followed by how many there are
unique_languages = y.unique()
print(unique_languages)
print(len(unique_languages))

['English' 'Greek' 'Turkish' 'Spanish' 'Arabic' 'Dutch' 'Kurdish' <br/>
 'Portugeese' 'Sweedish' 'Kannada' 'Danish' 'French' 'Russian' 'Malayalam' <br/>
 'Tamil' 'Italian' 'German' 'Hindi'] <br/>
 
18

In [None]:
# turn the language names into integer class ids for training;
# the fitted encoder is kept so ids can be mapped back to names later
encoder = LabelEncoder().fit(y)
y = encoder.transform(y)

In [None]:
# getting rid of special characters and upper-case characters
# Patterns are compiled once instead of being re-parsed for every row.
_SPECIAL_CHARS = re.compile(r'[!@#$(),\n"%^*?\:;~`0-9]')
# Matches a single '[' or ']'. The brackets must be escaped inside the character
# class: the unescaped form `[[]]` only matches the literal two-character text "[]".
_SQUARE_BRACKETS = re.compile(r'[\[\]]')

X_list = []
for text in X:
    text = _SPECIAL_CHARS.sub(' ', text)
    text = _SQUARE_BRACKETS.sub(' ', text)
    X_list.append(text.lower())


In [None]:
# vectorizing and splitting data
vectorizer = CountVectorizer()

# bag-of-words counts, converted from a sparse matrix to a dense array
X = vectorizer.fit_transform(X_list).toarray()

# random_state makes the 1% hold-out selection repeatable for a given row order
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)

In [None]:
# making sure label data is categorical
# Take the number of classes from the fitted encoder instead of hard-coding 18,
# so the one-hot width follows the labels actually present in the data.
num_classes = len(encoder.classes_)
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

In [None]:
# building model

# Layer sizes are read from the data rather than hard-coded: one input feature
# per column of the vectorized text, one output unit per encoded language.
num_features = X_train.shape[1]
num_classes = len(encoder.classes_)

model = Sequential()

# input_shape excludes the batch axis, so one bag-of-words vector is (num_features,).
# Declaring it as (None, num_features) would add an extra axis and make every
# layer report a rank-3 output shape.
model.add(Dense(128, activation='relu', input_shape=(num_features,)))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))


model.compile(optimizer='adam', loss=categorical_crossentropy, metrics=['accuracy'])

model.summary()

Model: "sequential" <br/>
|      Layer (type) | Output Shape  | Param #|
| :---        |    :----   |          ---: |                                          
| dense (Dense)     |          (None, None, 128)    |     5326208   |                                                            
| dense_1 (Dense) |            (None, None, 128)    |     16512     |                                                                
| dense_2 (Dense)  |           (None, None, 64)     |     8256      |                                                                
| dense_3 (Dense)   |          (None, None, 18)     |     1170      |
|Total params: 5,352,146|
|Trainable params: 5,352,146|
|Non-trainable params: 0|


In [None]:
# checkpoint for best training epoch
# Weights are written only when val_accuracy reaches a new maximum; the file
# name template is filled with the epoch number and that epoch's val_accuracy.
checkpoint_filepath = "weights-improvement-{epoch:02d}-{val_accuracy:.2f}.hdf5"

checkpoint_options = dict(
    filepath=checkpoint_filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)
model_checkpoint_callback = ModelCheckpoint(**checkpoint_options)

callbacks_list = [model_checkpoint_callback]

In [None]:
# training model with validation
# validation_data is given as an (inputs, targets) tuple, the form Keras documents;
# a list is not guaranteed to be unpacked the same way across versions.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=5,
    validation_data=(X_test, y_test),
    callbacks=callbacks_list,
)

Epoch 1/5 <br/>

454/454 [==============================] - 15s 32ms/step - loss: 0.5743 - accuracy: 0.8711 - val_loss: 0.1396 - val_accuracy: 0.9592 <br/>

Epoch 2/5 <br/>

454/454 [==============================] - 14s 30ms/step - loss: 0.0518 - accuracy: 0.9887 - val_loss: 0.1892 - val_accuracy: 0.9728 <br/>

Epoch 3/5 <br/>

454/454 [==============================] - 14s 31ms/step - loss: 0.0143 - accuracy: 0.9973 - val_loss: 0.1948 - val_accuracy: 0.9796 <br/>

Epoch 4/5 <br/>

454/454 [==============================] - 13s 30ms/step - loss: 0.0105 - accuracy: 0.9978 - val_loss: 0.2121 - val_accuracy: 0.9796 <br/>

Epoch 5/5 <br/>

454/454 [==============================] - 13s 29ms/step - loss: 0.0086 - accuracy: 0.9977 - val_loss: 0.3963 - val_accuracy: 0.8980 <br/>

In [None]:
# plotting acc

def plot_accuracy(acc, val_acc):
    """Draw two accuracy curves on a new figure and show it.

    Args:
        acc: sequence of accuracy values, drawn first and labelled 'Train'.
        val_acc: sequence of accuracy values, drawn second and labelled 'Test'.
    """
    _, ax = plt.subplots()
    for series in (acc, val_acc):
        ax.plot(series)
    ax.set_title('Model Accuracy')
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Test'], loc='upper left')
    plt.show()

training_log = history.history
plot_accuracy(training_log['accuracy'], training_log['val_accuracy'])

<img src="./Accuracy_Plot.png" style="width:600px;height:400px;">

In [None]:
# highest validation accuracy recorded during training
val_accuracy_per_epoch = history.history['val_accuracy']
best_score = max(val_accuracy_per_epoch)
print('best val_accuracy score: ', best_score, sep='')

best val_accuracy score: 0.9795918464660645

In [None]:
# Rebuild the checkpoint file name for the best epoch from the training history
# instead of hard-coding the name written by one particular run.
val_accuracy_history = history.history['val_accuracy']
# With save_best_only=True and mode='max' a file is written only when val_accuracy
# improves, so the best weights belong to the first epoch that reached the maximum.
# list.index returns that first position; epoch numbers in the file name are 1-based.
best_epoch = val_accuracy_history.index(max(val_accuracy_history)) + 1
best_weights_path = checkpoint_filepath.format(
    epoch=best_epoch,
    val_accuracy=val_accuracy_history[best_epoch - 1],
)
model.load_weights(filepath=best_weights_path)

In [None]:
def predict(text):
    """Print the language the trained model predicts for ``text``.

    Args:
        text: the string to classify.
    """
    x = vectorizer.transform([text]).toarray()  # converting text to bag of words model (Vector)
    probabilities = model.predict(x)  # softmax scores, one per language class
    # inverse_transform expects integer class ids, not a row of scores, so pick
    # the highest-scoring class first; ravel() flattens the result to 1-D.
    class_ids = np.argmax(probabilities, axis=-1).ravel()
    lang = encoder.inverse_transform(class_ids)  # finding the language corresponding to the predicted class id
    print("The langauge is in", lang[0])  # printing the language

In [None]:
# quick check on a single short word (the recorded output for this cell is English)
predict('yes')

The langauge is in English

In [None]:
# quick check on a short multi-word phrase (the recorded output for this cell is Dutch)
predict('welgemanierd heeft het woord')

The langauge is in Dutch