# Language detection
## Data loading and learning the net

In [1]:
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow
from keras import models
from keras import layers
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("data/sentences_50k.csv", delimiter=",", encoding='utf8', index_col=0)
df.head()

Unnamed: 0,id,lan_code,sentence,unicode_length
0,1,cmn,我們試試看！,6
1,2,cmn,我该去睡觉了。,7
2,3,cmn,你在干什麼啊？,7
3,4,cmn,這是什麼啊？,6
4,5,cmn,今天是６月１８号，也是Muiriel的生日！,22


In [3]:
langs = list(df["lan_code"].unique())

In [4]:
lang_to_num = {lan: i for i, lan in enumerate(langs)}
print(lang_to_num)

num_to_lang = {i: lan for lan, i in lang_to_num.items()}
print(num_to_lang)

{'cmn': 0, 'deu': 1, 'rus': 2, 'fra': 3, 'eng': 4, 'jpn': 5, 'spa': 6, 'ita': 7, 'nld': 8, 'epo': 9, 'por': 10, 'tur': 11, 'heb': 12, 'hun': 13, 'fin': 14, 'ukr': 15, 'ces': 16, 'pol': 17, 'dan': 18, 'lit': 19, 'mkd': 20, 'kab': 21, 'ber': 22, 'mar': 23}
{0: 'cmn', 1: 'deu', 2: 'rus', 3: 'fra', 4: 'eng', 5: 'jpn', 6: 'spa', 7: 'ita', 8: 'nld', 9: 'epo', 10: 'por', 11: 'tur', 12: 'heb', 13: 'hun', 14: 'fin', 15: 'ukr', 16: 'ces', 17: 'pol', 18: 'dan', 19: 'lit', 20: 'mkd', 21: 'kab', 22: 'ber', 23: 'mar'}


In [5]:
df["lang_num"] = df["lan_code"].map(lang_to_num).astype(int)

In [6]:
import numpy as np

def sentence_to_unicode_list(sentence):
    max_len = 29
    unicode_list = [ord(i) for i in sentence]
    if len(unicode_list) >= max_len:
        unicode_list = unicode_list[:max_len]
    else:
        len_diff = max_len - len(unicode_list)
        unicode_list.extend([0 for _ in range(len_diff)])
    return np.asarray(unicode_list).astype(np.int32)

def set_labels(lang):
    labels = [0 for _ in range(lang_count)]
    labels[lang] = 1
    return np.asarray(labels).astype(np.int32)

lang_count = len(df['lang_num'].unique())

In [7]:
df['unicode'] = df['sentence'].apply(sentence_to_unicode_list)
df['labels'] = df['lang_num'].apply(set_labels)

In [8]:
df.dropna()
df.sample(n=(10))


Unnamed: 0,id,lan_code,sentence,unicode_length,lang_num,unicode,labels
5636310,5990778,por,Não tenho permissão para dizer isso.,36,10,"[78, 227, 111, 32, 116, 101, 110, 104, 111, 32...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."
1851069,1958596,ber,Tella seg-went tid ay yessawalen tajapunit?,43,22,"[84, 101, 108, 108, 97, 32, 115, 101, 103, 45,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8322083,8750266,ber,Asirem n ugdud yennejla ur d-yeǧǧi tansa.,41,22,"[65, 115, 105, 114, 101, 109, 32, 110, 32, 117...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1584477,1678365,nld,Ze leest graag.,15,8,"[90, 101, 32, 108, 101, 101, 115, 116, 32, 103...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
4501849,4798953,deu,Ich mag Französisch wahnsinnig gern.,36,1,"[73, 99, 104, 32, 109, 97, 103, 32, 70, 114, 9...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2098655,2225239,spa,Ella tiene un problema al corazón.,34,6,"[69, 108, 108, 97, 32, 116, 105, 101, 110, 101...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5336403,5688079,fra,Je me demande quel est le poids moyen d'une fr...,51,3,"[74, 101, 32, 109, 101, 32, 100, 101, 109, 97,...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3596968,3827414,fin,"On mahdollista, ettei Tom tiedä, että meillä o...",77,14,"[79, 110, 32, 109, 97, 104, 100, 111, 108, 108...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
4664492,4972794,eng,Tom told me that today.,23,4,"[84, 111, 109, 32, 116, 111, 108, 100, 32, 109...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9817602,10266728,fin,Olet sopivin henkilö tehtävään.,31,14,"[79, 108, 101, 116, 32, 115, 111, 112, 105, 11...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."


In [9]:
df.isnull().any()

id                False
lan_code          False
sentence          False
unicode_length    False
lang_num          False
unicode           False
labels            False
dtype: bool

In [10]:
x = np.array(df['unicode'])
y = np.array(df['labels'])


In [22]:
np.unique(np.array([type(i) for i in x]))

TypeError: '<' not supported between instances of 'type' and 'type'

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True)


In [18]:
model = models.Sequential()
model.add(layers.Dense(32, activation='relu'))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(lang_count, activation='softmax'))
model.build(input_shape=(1, 29))
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (1, 32)                   960       
                                                                 
 dropout_4 (Dropout)         (1, 32)                   0         
                                                                 
 dense_6 (Dense)             (1, 64)                   2112      
                                                                 
 dropout_5 (Dropout)         (1, 64)                   0         
                                                                 
 dense_7 (Dense)             (1, 64)                   4160      
                                                                 
 dropout_6 (Dropout)         (1, 64)                   0         
                                                                 
 dense_8 (Dense)             (1, 64)                  

In [19]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [20]:
batch_size = 64
epochs = 30


hist = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test))

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).

In [37]:
sentences = df['sentence'].values

In [38]:
sentences

array(['我們試試看！', '我该去睡觉了。', '你在干什麼啊？', ..., 'Los botaron.',
       'Los hicieron correr.', 'Los corrieron.'], dtype=object)

In [39]:
sentences_lang_num = df["lang_num"].values.T[0]

In [40]:
sentences_lang_num.shape

()

In [41]:
del df