In [1]:
import pandas as pd
import re

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, GRU, Dense, Dropout
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import ModelCheckpoint
import joblib




In [2]:
df = pd.read_csv("Language Detection.csv")
df

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English
...,...,...
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada


In [3]:
df['Language'].unique()

array(['English', 'Malayalam', 'Hindi', 'Tamil', 'Portugeese', 'French',
       'Dutch', 'Spanish', 'Greek', 'Russian', 'Danish', 'Italian',
       'Turkish', 'Sweedish', 'Arabic', 'German', 'Kannada'], dtype=object)

In [4]:
df['Text'] = df['Text'].str.lower()

In [5]:
def preprocess_multilingual_text(text):
    text = re.sub(r'[^\w\s\u0900-\u097F\u0D00-\u0D7F\u0B80-\u0BFF\u0600-\u06FF]', '', text)
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['cleaned_text'] = df['Text'].apply(preprocess_multilingual_text)

In [6]:
df

Unnamed: 0,Text,Language,cleaned_text
0,"nature, in the broadest sense, is the natural...",English,nature in the broadest sense is the natural ph...
1,"""nature"" can refer to the phenomena of the phy...",English,nature can refer to the phenomena of the physi...
2,"the study of nature is a large, if not the onl...",English,the study of nature is a large if not the only...
3,"although humans are part of nature, human acti...",English,although humans are part of nature human activ...
4,[1] the word nature is borrowed from the old f...,English,1 the word nature is borrowed from the old fre...
...,...,...,...
10332,ನಿಮ್ಮ ತಪ್ಪು ಏನು ಬಂದಿದೆಯೆಂದರೆ ಆ ದಿನದಿಂದ ನಿಮಗೆ ಒ...,Kannada,ನಮಮ ತಪಪ ಏನ ಬದದಯದರ ಆ ದನದದ ನಮಗ ಒಳಳಯದನನ ನಡಣ
10333,ನಾರ್ಸಿಸಾ ತಾನು ಮೊದಲಿಗೆ ಹೆಣಗಾಡುತ್ತಿದ್ದ ಮಾರ್ಗಗಳನ್...,Kannada,ನರಸಸ ತನ ಮದಲಗ ಹಣಗಡತತದದ ಮರಗಗಳನನ ಬದಲಯಸದಳ ಆದರ ನಧನವ...
10334,ಹೇಗೆ ' ನಾರ್ಸಿಸಿಸಮ್ ಈಗ ಮರಿಯನ್ ಅವರಿಗೆ ಸಂಭವಿಸಿದ ಎ...,Kannada,ಹಗ ನರಸಸಸಮ ಈಗ ಮರಯನ ಅವರಗ ಸಭವಸದ ಎಲಲವನನ ಹಳದ ಮತತ ಅವ...
10335,ಅವಳು ಈಗ ಹೆಚ್ಚು ಚಿನ್ನದ ಬ್ರೆಡ್ ಬಯಸುವುದಿಲ್ಲ ಎಂದು ...,Kannada,ಅವಳ ಈಗ ಹಚಚ ಚನನದ ಬರಡ ಬಯಸವದಲಲ ಎದ ನನ ess ಹಸದದನ


In [7]:
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(df['cleaned_text'])

sequences = tokenizer.texts_to_sequences(df['cleaned_text'])
X_padded = pad_sequences(sequences, maxlen=200, padding='post')

In [8]:
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 128

# GRU Model
gru_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=200),
    Bidirectional(GRU(64, return_sequences=True)),
    Dropout(0.3),
    GRU(64),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(17, activation='softmax')  # 17 languages
])

gru_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
gru_model.summary()




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 128)          53632     
                                                                 
 bidirectional (Bidirection  (None, 200, 128)          74496     
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 200, 128)          0         
                                                                 
 gru_1 (GRU)                 (None, 64)                37248     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense (Dense)               (None, 32)                2080      
                                                      

In [9]:
# TF-IDF Features
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), max_features=5000)
X_tfidf = vectorizer.fit_transform(df['cleaned_text']).toarray()

# MLP Model
mlp_model = Sequential([
    Dense(512, activation='relu', input_shape=(X_tfidf.shape[1],)),
    Dropout(0.3),
    Dense(256, activation='relu'),
    Dropout(0.3),
    Dense(17, activation='softmax')  # 17 languages
])

mlp_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
mlp_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_2 (Dense)             (None, 512)               2560512   
                                                                 
 dropout_2 (Dropout)         (None, 512)               0         
                                                                 
 dense_3 (Dense)             (None, 256)               131328    
                                                                 
 dropout_3 (Dropout)         (None, 256)               0         
                                                                 
 dense_4 (Dense)             (None, 17)                4369      
                                                                 
Total params: 2696209 (10.29 MB)
Trainable params: 2696209 (10.29 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  0


In [11]:
# Ensure GPU usage
print("Training on GPU" if tf.config.list_physical_devices('GPU') else "Training on CPU")

# Encode the target labels
y = df['Language'].factorize()[0]  # Label encode

# Split data for GRU
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Define and Compile GRU Model
gru_model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=200),
    Bidirectional(GRU(64, return_sequences=True)),
    Dropout(0.3),
    GRU(64),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(17, activation='softmax')
])
gru_model.compile(optimizer='adam',
                  loss=SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

# Save GRU model after training
gru_checkpoint = ModelCheckpoint("gru_model.h5", save_best_only=True, monitor="val_accuracy", mode="max")
gru_history = gru_model.fit(X_train, y_train, validation_data=(X_test, y_test),
                            epochs=10, batch_size=32, callbacks=[gru_checkpoint])

# Print GRU Accuracy
gru_accuracy = gru_model.evaluate(X_test, y_test, verbose=0)[1]
print(f"GRU Test Accuracy: {gru_accuracy * 100:.2f}%")

# Train and Save MLP Model
from sklearn.neural_network import MLPClassifier

# Split data for MLP
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Define and Train MLP Classifier
mlp_model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=20, random_state=42)
mlp_model.fit(X_train_tfidf, y_train)

# Save the MLP model
joblib.dump(mlp_model, 'mlp_model.pkl')

# Print MLP Accuracy
mlp_accuracy = mlp_model.score(X_test_tfidf, y_test)
print(f"MLP Test Accuracy: {mlp_accuracy * 100:.2f}%")


Training on CPU
Epoch 1/10



  output, from_logits = _get_logits(



Epoch 2/10
  2/259 [..............................] - ETA: 18s - loss: 2.2937 - accuracy: 0.1875

  saving_api.save_model(


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
GRU Test Accuracy: 90.52%
MLP Test Accuracy: 98.45%


