# Language detection
## Data loading and learning the net

In [1]:
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow
from keras import models
from keras import layers
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("data/sentences_50k.csv", delimiter=",", encoding='utf8', index_col=0)
df.head()

Unnamed: 0,id,lan_code,sentence,unicode_length
0,1,cmn,我們試試看！,6
1,2,cmn,我该去睡觉了。,7
2,3,cmn,你在干什麼啊？,7
3,4,cmn,這是什麼啊？,6
4,5,cmn,今天是６月１８号，也是Muiriel的生日！,22


In [3]:
langs = list(df["lan_code"].unique())

In [4]:
lang_to_num = {lan: i for i, lan in enumerate(langs)}
print(lang_to_num)

num_to_lang = {i: lan for lan, i in lang_to_num.items()}
print(num_to_lang)

{'cmn': 0, 'deu': 1, 'rus': 2, 'fra': 3, 'eng': 4, 'jpn': 5, 'spa': 6, 'ita': 7, 'nld': 8, 'epo': 9, 'por': 10, 'tur': 11, 'heb': 12, 'hun': 13, 'fin': 14, 'ukr': 15, 'ces': 16, 'pol': 17, 'dan': 18, 'lit': 19, 'mkd': 20, 'kab': 21, 'ber': 22, 'mar': 23}
{0: 'cmn', 1: 'deu', 2: 'rus', 3: 'fra', 4: 'eng', 5: 'jpn', 6: 'spa', 7: 'ita', 8: 'nld', 9: 'epo', 10: 'por', 11: 'tur', 12: 'heb', 13: 'hun', 14: 'fin', 15: 'ukr', 16: 'ces', 17: 'pol', 18: 'dan', 19: 'lit', 20: 'mkd', 21: 'kab', 22: 'ber', 23: 'mar'}


In [5]:
df["lang_num"] = df["lan_code"].map(lang_to_num).astype(int)

In [9]:
import numpy as np

def sentence_to_unicode_list(sentence):
    max_len = 29
    unicode_list = [ord(i) for i in sentence]
    if len(unicode_list) >= max_len:
        unicode_list = unicode_list[:max_len]
    else:
        len_diff = max_len - len(unicode_list)
        unicode_list.extend([0 for _ in range(len_diff)])
    return np.array(unicode_list).astype(np.int32)

df['unicode'] = df['sentence'].apply(sentence_to_unicode_list)


In [10]:
def set_labels(lang):
    labels = [0 for _ in range(lang_count)]
    labels[lang] = 1
    return np.array(labels)

lang_count = len(df['lang_num'].unique())
df['labels'] = df['lang_num'].apply(set_labels).astype(np.int32)

In [17]:
df.dropna()
df.sample(n=(10))


Unnamed: 0,id,lan_code,sentence,unicode_length,lang_num,unicode,labels
6002273,6380486,epo,Mi forveturos al Parizo je la deka horo.,40,9,"[77, 105, 32, 102, 111, 114, 118, 101, 116, 11...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
3360768,3579131,ita,Tu sei sgarbata.,16,7,"[84, 117, 32, 115, 101, 105, 32, 115, 103, 97,...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
8915621,9353644,kab,Ad teddum ɣer Tinzawatin.,25,21,"[65, 100, 32, 116, 101, 100, 100, 117, 109, 32...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4871815,5198643,tur,O onunla yakındır.,18,11,"[79, 32, 111, 110, 117, 110, 108, 97, 32, 121,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
7710196,8121176,tur,Sohbet ettik.,13,11,"[83, 111, 104, 98, 101, 116, 32, 101, 116, 116...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
3012273,3198500,rus,Какой они выберут?,18,2,"[1050, 1072, 1082, 1086, 1081, 32, 1086, 1085,...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4216257,4489940,deu,Ich werde sie morgen anrufen.,29,1,"[73, 99, 104, 32, 119, 101, 114, 100, 101, 32,...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1823583,1929170,tur,Raporla ilgili yorumlarınızı sabırsızlıkla bek...,54,11,"[82, 97, 112, 111, 114, 108, 97, 32, 105, 108,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1411650,1493637,tur,Ben burada kalmaya karar verdim.,32,11,"[66, 101, 110, 32, 98, 117, 114, 97, 100, 97, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
2667243,2827497,ita,Ripetetemelo.,13,7,"[82, 105, 112, 101, 116, 101, 116, 101, 109, 1...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."


In [22]:
x_train, x_test, y_train, y_test = train_test_split(df["unicode"], df["lang_num"], test_size=0.2, shuffle=True)

x_train=np.asarray(x_train).astype(np.int)
y_train=np.asarray(y_train).astype(np.int)
x_test=np.asarray(x_test).astype(np.int)
y_test=np.asarray(y_test).astype(np.int)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  x_train=np.asarray(x_train).astype(np.int)


In [19]:

lang_count = len(df['lang_num'].unique())
print(lang_count)

model = models.Sequential()
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(lang_count, activation='softmax'))
model.build(input_shape=(1, 29))
model.summary()

24
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (1, 32)                   960       
                                                                 
 dropout_4 (Dropout)         (1, 32)                   0         
                                                                 
 dense_6 (Dense)             (1, 64)                   2112      
                                                                 
 dropout_5 (Dropout)         (1, 64)                   0         
                                                                 
 dense_7 (Dense)             (1, 64)                   4160      
                                                                 
 dropout_6 (Dropout)         (1, 64)                   0         
                                                                 
 dense_8 (Dense)             (1, 64)               

In [20]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
batch_size = 64
epochs = 30


hist = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test))

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).

In [37]:
sentences = df['sentence'].values

In [38]:
sentences

array(['我們試試看！', '我该去睡觉了。', '你在干什麼啊？', ..., 'Los botaron.',
       'Los hicieron correr.', 'Los corrieron.'], dtype=object)

In [39]:
sentences_lang_num = df["lang_num"].values.T[0]

In [40]:
sentences_lang_num.shape

()

In [41]:
del df