In [1]:
import numpy as np 
import pandas as pd
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
import nltk
import tensorflow as tf
import keras
import seaborn as sns
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [3]:
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22000 entries, 0 to 21999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Text      22000 non-null  object
 1   language  22000 non-null  object
dtypes: object(2)
memory usage: 343.9+ KB


In [5]:
df["Text"] = df["Text"].str.lower()
df.head()

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [6]:
max_words = 50000
max_len = 100

In [7]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(list(df['Text']))
train_df = tokenizer.texts_to_sequences(list(df['Text']))

In [8]:
train_df = tf.keras.preprocessing.sequence.pad_sequences(train_df,maxlen = max_len)

In [9]:
train_df

array([[    0,     0,     0, ...,  2170,    82,  3638],
       [    0,     0,     0, ...,    16,  4068, 26452],
       [    0,     0,     0, ...,     0,     0, 16664],
       ...,
       [    0,     0,     0, ...,   118,  1435,   168],
       [    0,     0,     0, ...,    16,  2029,   471],
       [    0,     0,     0, ...,   178,  6523,     1]])

In [10]:
max_len

100

In [11]:
df.head()

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [12]:
len(tokenizer.word_index)

273967

In [13]:
Y = df['language']
Y

0        Estonian
1         Swedish
2            Thai
3           Tamil
4           Dutch
           ...   
21995      French
21996        Thai
21997     Spanish
21998     Chinese
21999    Romanian
Name: language, Length: 22000, dtype: object

In [14]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
le = preprocessing.LabelEncoder()

In [15]:
le.fit(Y)

LabelEncoder()

In [16]:
list(le.classes_)

['Arabic',
 'Chinese',
 'Dutch',
 'English',
 'Estonian',
 'French',
 'Hindi',
 'Indonesian',
 'Japanese',
 'Korean',
 'Latin',
 'Persian',
 'Portugese',
 'Pushto',
 'Romanian',
 'Russian',
 'Spanish',
 'Swedish',
 'Tamil',
 'Thai',
 'Turkish',
 'Urdu']

In [17]:
Y2 = le.fit_transform(Y)

In [18]:
total_languages = df['language'].nunique()

In [19]:
Y2 = keras.utils.to_categorical(Y2,num_classes=total_languages)

In [20]:
np.shape(Y2)

(22000, 22)

In [21]:
X_train,X_test,Y_train,Y_test = train_test_split(train_df,Y2)

In [22]:
embedding_dims = 500
vocab_size = len(tokenizer.word_index)+1

In [23]:
model = tf.keras.Sequential([tf.keras.layers.Embedding(input_dim = vocab_size,output_dim = embedding_dims,input_length = max_len),
                            tf.keras.layers.Flatten(),
                            tf.keras.layers.Dense(total_languages,activation=tf.nn.softmax)])

In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 500)          136984000 
_________________________________________________________________
flatten (Flatten)            (None, 50000)             0         
_________________________________________________________________
dense (Dense)                (None, 22)                1100022   
Total params: 138,084,022
Trainable params: 138,084,022
Non-trainable params: 0
_________________________________________________________________


In [25]:
model.compile(optimizer ='adam',loss = 'categorical_crossentropy',metrics=['accuracy'])

In [26]:
model.fit(np.array(X_train),np.array(Y_train),epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1e59728f6c8>

In [27]:
model.evaluate(np.array(X_test),np.array(Y_test))



[0.18050016462802887, 0.9283636212348938]

In [28]:
print("English ",le.transform(['English']))
print("French ",le.transform(['French']))
print("Dutch ",le.transform(['Dutch']))
print("Swedish ",le.transform(['Swedish']))

English  [3]
French  [5]
Dutch  [2]
Swedish  [17]


In [29]:
text = ["När du väl känner till alla element är det inte svårt att ta ihop en mening."]

In [30]:
test_text = tokenizer.texts_to_sequences(text)
test_text = tf.keras.preprocessing.sequence.pad_sequences(test_text, maxlen=max_len)

In [31]:
predictions = model.predict(test_text)

In [32]:
out = predictions.argmax()
print(le.inverse_transform([out]))
print(predictions)

['Swedish']
[[0.00460986 0.00354061 0.01812505 0.00367499 0.10478529 0.04788649
  0.0022314  0.00525728 0.00312098 0.00927859 0.01942791 0.00288372
  0.00228522 0.00659773 0.00426855 0.00878443 0.0227633  0.6852943
  0.01182444 0.00749918 0.02020724 0.00565349]]


In [33]:
text = ["Als je eenmaal alle elementen kent, is het niet moeilijk om een zin samen te stellen."] 

In [34]:
test_text = tokenizer.texts_to_sequences(text)
test_text = tf.keras.preprocessing.sequence.pad_sequences(test_text, maxlen=max_len)
predictions = model.predict(test_text)
out = predictions.argmax()
print(le.inverse_transform([out]))
print(predictions)

['Dutch']
[[4.2824217e-04 3.9052330e-03 9.7590297e-01 2.0321617e-03 5.8098754e-04
  9.1462507e-04 4.2296271e-04 5.6428456e-04 1.1451823e-03 8.5905666e-04
  3.7567401e-03 5.5750256e-04 5.2631856e-04 6.9798296e-04 1.0926087e-03
  7.5387012e-04 8.8619842e-04 8.4890058e-04 1.0052517e-03 6.5756607e-04
  1.7267717e-03 7.3449896e-04]]


In [35]:
text =["Une fois que vous connaissez tous les éléments, il n'est pas difficile de rassembler une phrase."] 

In [36]:
test_text = tokenizer.texts_to_sequences(text)
test_text = tf.keras.preprocessing.sequence.pad_sequences(test_text, maxlen=max_len)
predictions = model.predict(test_text)
out = predictions.argmax()
print(le.inverse_transform([out]))
print(predictions)

['French']
[[1.7250060e-03 3.2590129e-03 6.3769654e-03 3.1836331e-04 3.2968489e-03
  9.1603339e-01 1.2083271e-03 1.5092584e-03 5.1418347e-03 1.3601506e-03
  2.6525250e-02 6.2732428e-04 4.3938202e-03 1.0846098e-03 6.8623493e-03
  1.6464377e-03 6.7546349e-03 1.2418219e-03 1.3037481e-03 4.1088136e-03
  2.9359031e-03 2.2859932e-03]]


In [37]:
text =["Una vez que conoces todos los elementos, no es difícil armar una oración."] 
test_text = tokenizer.texts_to_sequences(text)
test_text = tf.keras.preprocessing.sequence.pad_sequences(test_text, maxlen=max_len)
predictions = model.predict(test_text)
out = predictions.argmax()
print(le.inverse_transform([out]))
print(predictions)

['Spanish']
[[1.95185316e-03 3.96771915e-02 2.35758931e-03 2.61158799e-03
  3.93907679e-03 2.81071011e-03 8.33446102e-04 1.50642660e-03
  1.13486955e-02 2.19362974e-03 2.28924304e-02 1.18379109e-03
  1.81045700e-02 2.45337514e-03 1.75806996e-03 5.42546110e-03
  8.45849216e-01 1.31060544e-03 3.53968237e-03 2.05407124e-02
  2.78571108e-03 4.92619677e-03]]


In [38]:
def detect(text):
    test_text = tokenizer.texts_to_sequences(text)
    test_text = tf.keras.preprocessing.sequence.pad_sequences(test_text, maxlen=max_len)
    predictions = model.predict(test_text)
    out = predictions.argmax()
    return le.inverse_transform([out])