In [1]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
data= pd.read_csv('SA_official_languages.csv')

In [4]:
data

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...
32996,sot,modise mosadi na o ntse o sa utlwe hore thaban...
32997,eng,closing date for the submission of completed t...
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...


In [5]:
data['lang_id'].value_counts()


lang_id
xho    3000
eng    3000
nso    3000
ven    3000
tsn    3000
nbl    3000
zul    3000
ssw    3000
tso    3000
sot    3000
afr    3000
Name: count, dtype: int64

In [6]:

texts = data['text'].values
labels = data['lang_id'].values

In [7]:
texts

array(['umgaqo-siseko wenza amalungiselelo kumaziko axhasa ulawulo lwesininzi kunye nokuthath inxaxheba kwabafazi ezi ziquka phakathi kwezinye zazo ikomishoni yokulingana ngokwesini ikomishoni yamalungelo oluntu lomzantsi afrika',
       'i-dha iya kuba nobulumko bokubeka umsebenzi naphi na kwisebe ngokusekwe kwiimfuno zokusebenza zalo emva kokubonana nomsebenzi kunye okanye imanyano yakhe ukuba ulandulo lomntu onjalo alufanelekanga i-dha mayibize uncedo olufanelekileyo elungelweni layo',
       'the province of kwazulu-natal department of transport invites tenders from established contractors experienced in bridge construction for the construction of the kwajolwayo tugela river pedestrian bridge near tugela ferry the duration of the project will be months',
       ...,
       'closing date for the submission of completed tenders is august at h no late submissions will be considered submissions must be enclosed in a sealed envelope and addressed to the chief executive officer at the ab

In [8]:
labels

array(['xho', 'xho', 'eng', ..., 'eng', 'xho', 'sot'], dtype=object)

In [9]:

# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index


In [10]:
word_index

{'ya': 1,
 'a': 2,
 'le': 3,
 'ka': 4,
 'go': 5,
 'e': 6,
 'na': 7,
 'u': 8,
 'die': 9,
 'wa': 10,
 'the': 11,
 'ba': 12,
 'o': 13,
 'ho': 14,
 'ku': 15,
 'of': 16,
 'nga': 17,
 'ke': 18,
 'in': 19,
 'di': 20,
 'sa': 21,
 'ha': 22,
 'tsa': 23,
 'se': 24,
 'van': 25,
 'vha': 26,
 'mo': 27,
 'to': 28,
 'hi': 29,
 'and': 30,
 'la': 31,
 'is': 32,
 'en': 33,
 'tse': 34,
 'n': 35,
 'i': 36,
 'kha': 37,
 'ga': 38,
 'tša': 39,
 'eka': 40,
 'gore': 41,
 'yo': 42,
 'afrika': 43,
 'tla': 44,
 'kanye': 45,
 'va': 46,
 'te': 47,
 'xa': 48,
 'ta': 49,
 're': 50,
 'ukuba': 51,
 'wat': 52,
 'bona': 53,
 'noma': 54,
 'word': 55,
 'as': 56,
 'mme': 57,
 'be': 58,
 'kana': 59,
 'om': 60,
 'ye': 61,
 'or': 62,
 'hore': 63,
 'dza': 64,
 'by': 65,
 'tshi': 66,
 'nie': 67,
 'fa': 68,
 'for': 69,
 'swi': 70,
 'uma': 71,
 'ri': 72,
 'uri': 73,
 'bo': 74,
 'molao': 75,
 'zwa': 76,
 'kapa': 77,
 'kumbe': 78,
 'kutsi': 79,
 'ge': 80,
 'vir': 81,
 'okanye': 82,
 'goba': 83,
 'yi': 84,
 'that': 85,
 'kwa': 86,
 'l

In [11]:
# Mapping from dataset language IDs to Google Translator language codes
language_code_map = {
    'xho': 'xh',
    'eng': 'en',
    'nso': 'nso',  # Northern Sotho is the same as 'nso' in Google Translator
    'ven': 've',   # Venda
    'sot': 'st',   # Southern Sotho
    'tsn': 'tn',   # Tswana
    'afr': 'af',   # Afrikaans
    'zul': 'zu',   # Zulu
    'ssw': 'ss',   # Swati
    'tso': 'ts',   # Tsonga
    # Add other mappings as needed
}

# Applying the mapping to the dataset
def map_language_id(lang_id):
    return language_code_map.get(lang_id, lang_id)  # Default to the original if not found

# Apply the mapping
data['lang_id'] = data['lang_id'].apply(map_language_id)

# Print the updated DataFrame
print(data)

      lang_id                                               text
0          xh  umgaqo-siseko wenza amalungiselelo kumaziko ax...
1          xh  i-dha iya kuba nobulumko bokubeka umsebenzi na...
2          en  the province of kwazulu-natal department of tr...
3         nso  o netefatša gore o ba file dilo ka moka tše le...
4          ve  khomishini ya ndinganyiso ya mbeu yo ewa maana...
...       ...                                                ...
32995      tn  popo ya dipolateforomo tse ke go tlisa boetele...
32996      st  modise mosadi na o ntse o sa utlwe hore thaban...
32997      en  closing date for the submission of completed t...
32998      xh  nawuphina umntu ofunyenwe enetyala phantsi kwa...
32999      st  mafapha a mang le ona a lokela ho etsa ditlale...

[33000 rows x 2 columns]


In [12]:

# Pad sequences
maxlen = 100   
data = pad_sequences(sequences, maxlen=maxlen)

# Convert labels to numerical format
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)
label_sequences = np.array(label_tokenizer.texts_to_sequences(labels)) - 1

# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, label_sequences, test_size=0.2, random_state=42)

In [13]:
y_test[7]

array([0])

In [14]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, BatchNormalization, Dropout
from tensorflow.keras.models import Sequential

# Define model
model = Sequential()
model.add(Embedding(input_dim=len(word_index) + 1, output_dim=128, input_length=maxlen))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(64)))
model.add(BatchNormalization())
model.add(Dropout(0.5))
model.add(Dense(len(label_tokenizer.word_index), activation='softmax'))

# Compile model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=50, batch_size=40, validation_split=0.2)


Epoch 1/50




[1m528/528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 165ms/step - accuracy: 0.7215 - loss: 0.9239 - val_accuracy: 0.9805 - val_loss: 0.0632
Epoch 2/50
[1m528/528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 176ms/step - accuracy: 0.9816 - loss: 0.0740 - val_accuracy: 0.9900 - val_loss: 0.0293
Epoch 3/50
[1m528/528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 174ms/step - accuracy: 0.9952 - loss: 0.0195 - val_accuracy: 0.9941 - val_loss: 0.0199
Epoch 4/50
[1m528/528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 176ms/step - accuracy: 0.9985 - loss: 0.0072 - val_accuracy: 0.9943 - val_loss: 0.0212
Epoch 5/50
[1m528/528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 177ms/step - accuracy: 0.9994 - loss: 0.0046 - val_accuracy: 0.9873 - val_loss: 0.0400
Epoch 6/50
[1m528/528[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 179ms/step - accuracy: 0.9989 - loss: 0.0070 - val_accuracy: 0.9854 - val_loss: 0.0705
Epoch 7/50
[1m528/52

In [20]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Loss: {loss}')
print(f'Accuracy: {accuracy}')


[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 30ms/step - accuracy: 0.9901 - loss: 0.0877
Loss: 0.06670039147138596
Accuracy: 0.9913636445999146


In [24]:
!pip install keras
!pip install tensorflow




In [26]:
import numpy as np
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assuming 'model', 'tokenizer', and 'label_tokenizer' have been defined and loaded properly
# Assuming maxlen is defined based on the training data

# Mapping from the predicted labels to Google Translator language codes
language_code_map = {
    'xho': 'xh',
    'eng': 'en',
    'nso': 'nso',
    'ven': 've',
    'sot': 'st',
    'tsn': 'tn',
    'afr': 'af',
    'zul': 'zu',
    'ssw': 'ss',
    'tso': 'ts',
    # Add any additional mappings as necessary
}

# Function to predict the language of a given text
def predict_language(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=maxlen)
    prediction = model.predict(padded_sequence)
    label = label_tokenizer.index_word[np.argmax(prediction) + 1]
    # Map to Google Translator language code
    google_lang_code = language_code_map.get(label, label)
    return google_lang_code

# Example prediction
print(predict_language("umgaqo-siseko wenza amalungiselelo kumaziko"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
xh


In [27]:
3.4042077e-07

3.4042077e-07

In [28]:
# Save the model in HDF5 format
model.save('language_detection_model.h5')




In [19]:
import pickle

# Save the text tokenizer
with open('text_tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save the label tokenizer
with open('label_tokenizer.pkl', 'wb') as handle:
    pickle.dump(label_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
