## Load The Dataset

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import tensorflow as tf

# Load the dataset (replace with actual dataset file)
df = pd.read_csv('/content/Languages.csv')

# Check the first few rows
print(df.head())

                                                text  language
0      কাৰাবন্দীৰ পৰা গৃহবন্দী হৈ উভতিল আং ছান ছু কী  Assamese
1  চীন কোৰিয়া ইণ্ডোনেছিয়াৰ পৰা আমদানি কৰা অপটিক...  Assamese
2  গ্ৰেপ্তাৰ হল ইমৰান খানদুৰ্নীতিৰ গোচৰত তিনিবছৰৰ...  Assamese
3  এইবাৰ পাকিস্তানৰ যুৱকৰ সৈতে বিয়াত বহিল দুই সন...  Assamese
4  নিষেধাজ্ঞা নেওচি ভাৰতলৈ নেপালৰ বিলাহীঘোচ লৈ চা...  Assamese


##  Preprocessing

In [20]:
df['text'] = df['text'].astype(str)
df['text'] = df['text'].str.replace('[^a-zA-Z]', ' ', regex=True).str.lower()

# Encode language labels into numerical form
lang_map = {lang: idx for idx, lang in enumerate(df['language'].unique())}
df['language'] = df['language'].map(lang_map)

# Tokenize the text
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['text'])

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(df['text'])

# Pad sequences for uniform input size
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post')

print(df['language'].shape)
print(padded_sequences.shape)

(1559993,)
(1559993, 100)


In [21]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['language'], test_size=0.2, random_state=42)

# Convert labels to categorical format (One-hot encoding)
y_train = tf.keras.utils.to_categorical(y_train, num_classes=len(lang_map))
y_test = tf.keras.utils.to_categorical(y_test, num_classes=len(lang_map))

In [25]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Parameters
embedding_dim = 128  # Dimensionality of the embedding layer
lstm_units = 128     # Number of units in LSTM layer
num_classes = len(lang_map)  # Number of unique languages

# Create a new model instance
model = Sequential()

# Define the embedding layer
model.add(Embedding(input_dim=10000, output_dim=embedding_dim, input_length=100))

# Add LSTM layer
model.add(LSTM(lstm_units))

# Add dropout for regularization
model.add(Dropout(0.3))

# Add output layer with softmax activation
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print model summary
model.summary()

In [23]:
# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/5
[1m 3921/39000[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1:49:50[0m 188ms/step - accuracy: 0.5425 - loss: 1.3482

KeyboardInterrupt: 

In [None]:
# Evaluate the model on the test data
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_accuracy:.4f}')

In [None]:
# Save the trained model
model.save("language_detection_lstm_model.h5")