In [1]:
import numpy as np
import pandas as pd

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

/kaggle/input/lingualsense/merged_dataset.csv


In [2]:
# Read the merged language dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/kaggle/input/lingualsense/merged_dataset.csv')

# Show the first five rows as a quick sanity check of the columns.
print(df.head(n=5))

                                                Text  Language
0  klement gottwaldi surnukeha palsameeriti ning ...  Estonian
1  sebes joseph pereira thomas  på eng the jesuit...   Swedish
2  ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...      Thai
3  விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...     Tamil
4  de spons behoort tot het geslacht haliclona en...     Dutch


In [3]:
from sklearn.preprocessing import LabelEncoder

# Lower-case the two column names and drop rows missing either value.
# Written as one chained expression; `df` is rebound to the resulting frame.
df = (
    df.rename(columns={'Text': 'text', 'Language': 'language'})
      .dropna(subset=['text', 'language'])
)

# Fit the encoder on the language column and store the codes it returns
# in a new `language_encoded` column.
label_encoder = LabelEncoder()
df = df.assign(language_encoded=label_encoder.fit_transform(df['language']))

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified by language. If class counts are
# uneven, consider passing `stratify=` -- confirm class sizes first.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['language_encoded'],
    random_state=42,
    test_size=0.2,
)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features limited to 5000 vocabulary entries.
# dtype=np.float32 replaces the float64 default so that the dense arrays
# created by .toarray() below take half the memory.
# NOTE(review): assumes the network trains in float32 (Keras' usual default),
# so no precision the model would use is lost -- confirm if floatx was changed.
tfidf = TfidfVectorizer(max_features=5000, dtype=np.float32)

# Fit the vocabulary/IDF weights on the training texts only, then reuse the
# fitted vectorizer for the test texts so no test information leaks into it.
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding, Dropout, Input

# Define the model: a feed-forward classifier over the TF-IDF feature vectors.
# The input width is declared with an explicit Input layer instead of passing
# `input_shape=` to the first Dense layer; the latter is what produced the
# warning shown in this cell's output.
model = Sequential([
    Input(shape=(X_train_tfidf.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    # One output unit per class known to the label encoder.
    Dense(len(label_encoder.classes_), activation='softmax')
])

# The targets are integer class codes (not one-hot vectors), hence the
# sparse variant of categorical cross-entropy.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [7]:
# Train for 10 epochs in mini-batches of 32, reporting metrics on the
# held-out split after every epoch.
# NOTE(review): the test split doubles as validation data here, so the val_*
# numbers are not an independent estimate if they are used for tuning --
# confirm this is intended.
model.fit(
    x=X_train_tfidf,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_test_tfidf, y_test),
)

Epoch 1/10
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.6325 - loss: 1.6653 - val_accuracy: 0.9266 - val_loss: 0.2740
Epoch 2/10
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9325 - loss: 0.2638 - val_accuracy: 0.9298 - val_loss: 0.2503
Epoch 3/10
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9394 - loss: 0.2161 - val_accuracy: 0.9309 - val_loss: 0.2506
Epoch 4/10
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9463 - loss: 0.1784 - val_accuracy: 0.9300 - val_loss: 0.2475
Epoch 5/10
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9499 - loss: 0.1689 - val_accuracy: 0.9258 - val_loss: 0.2519
Epoch 6/10
[1m809/809[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9511 - loss: 0.1589 - val_accuracy: 0.9289 - val_loss: 0.2672
Epoch 7/10
[1m809/809[0m 

<keras.src.callbacks.history.History at 0x7bb429d550f0>

In [8]:
import pickle

# Write the trained network to disk.
# NOTE(review): the file name says "gru", but the model built in this notebook
# contains only Dense and Dropout layers -- name kept as-is in case other code
# loads it by this path.
model.save('language_detection_gru.h5')

# Pickle the fitted TF-IDF vectorizer and the label encoder so the same
# preprocessing and label mapping can be reloaded later.
for artifact_path, artifact in (
    ('tfidf_vectorizer.pkl', tfidf),
    ('label_encoder.pkl', label_encoder),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)