In [8]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load data
df = pd.read_csv("/content/Language Detection.csv.zip")
X = df["Text"]
y = df["Language"]

# Encode labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)
num_classes = len(le.classes_)

# Tokenization
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=max_len)

# Split
X_train, X_test, y_train, y_test = train_test_split(X_pad, y_encoded, test_size=0.2, random_state=42)

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

lstm_model = Sequential([
    Embedding(max_words, 128, input_length=max_len),
    LSTM(64, return_sequences=False),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))



Epoch 1/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.2907 - loss: 2.4358 - val_accuracy: 0.8013 - val_loss: 1.0674
Epoch 2/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.8502 - loss: 0.7657 - val_accuracy: 0.9313 - val_loss: 0.3250
Epoch 3/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9505 - loss: 0.2508 - val_accuracy: 0.9512 - val_loss: 0.2204
Epoch 4/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9674 - loss: 0.1559 - val_accuracy: 0.9608 - val_loss: 0.1684
Epoch 5/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9747 - loss: 0.1095 - val_accuracy: 0.9613 - val_loss: 0.1575


<keras.src.callbacks.history.History at 0x78b2d6973410>

In [10]:
from tensorflow.keras.layers import Input, RepeatVector
from tensorflow.keras.models import Model

# Encoder
input_layer = Input(shape=(max_len,))
encoded = Embedding(max_words, 64)(input_layer)
encoded = LSTM(32)(encoded)

# Decoder (for reconstruction training)
decoded = RepeatVector(max_len)(encoded)
decoded = LSTM(64, return_sequences=True)(decoded)
decoded = Dense(max_words, activation='softmax')(decoded)

autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Classifier using the 'encoded' bottleneck
classifier_output = Dense(num_classes, activation='softmax')(encoded)
ae_classifier = Model(input_layer, classifier_output)
ae_classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

ae_classifier.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.2400 - loss: 2.5602 - val_accuracy: 0.6596 - val_loss: 1.3550
Epoch 2/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.7791 - loss: 1.0689 - val_accuracy: 0.9038 - val_loss: 0.5594
Epoch 3/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9318 - loss: 0.4379 - val_accuracy: 0.9396 - val_loss: 0.3241
Epoch 4/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9580 - loss: 0.2729 - val_accuracy: 0.9550 - val_loss: 0.2367
Epoch 5/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9734 - loss: 0.1758 - val_accuracy: 0.9565 - val_loss: 0.2036


<keras.src.callbacks.history.History at 0x78b2ccb30d70>

In [12]:
from tensorflow.keras import layers, Sequential, Model
import tensorflow as tf

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim)
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    # FIX: Added training=None as a default parameter
    def call(self, inputs, training=None):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# ViT Architecture for Text
embed_dim = 128
num_heads = 4
ff_dim = 128
max_len = 100 # Ensure this matches your preprocessing
max_words = 10000

inputs = layers.Input(shape=(max_len,))
x = layers.Embedding(input_dim=max_words, output_dim=embed_dim)(inputs)

# Positional Encoding
positions = tf.range(start=0, limit=max_len, delta=1)
pos_encoding = layers.Embedding(input_dim=max_len, output_dim=embed_dim)(positions)
x = x + pos_encoding

# Transformer Layers - Now this will work without the TypeError
x = TransformerBlock(embed_dim, num_heads, ff_dim)(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)

vit_model = Model(inputs=inputs, outputs=outputs)
vit_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])


vit_model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

print("Starting training...")
history = vit_model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test, y_test),
    verbose=1
)
print("Training complete!")

Starting training...
Epoch 1/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 61ms/step - accuracy: 0.2757 - loss: 2.5175 - val_accuracy: 0.9420 - val_loss: 0.2239
Epoch 2/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.9584 - loss: 0.1521 - val_accuracy: 0.9632 - val_loss: 0.1339
Epoch 3/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.9721 - loss: 0.0918 - val_accuracy: 0.9618 - val_loss: 0.1421
Epoch 4/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.9761 - loss: 0.0781 - val_accuracy: 0.9623 - val_loss: 0.1402
Epoch 5/5
[1m130/130[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.9763 - loss: 0.0748 - val_accuracy: 0.9676 - val_loss: 0.1242
Training complete!


In [13]:
def predict_language(model, text):
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=max_len)


    prediction = model.predict(padded)


    class_index = np.argmax(prediction)
    language = le.inverse_transform([class_index])[0]

    return language

# --- Testing Examples ---
test_text = "Au revoir"

print(f"LSTM Prediction: {predict_language(lstm_model, test_text)}")
print(f"Autoencoder Prediction: {predict_language(ae_classifier, test_text)}")
print(f"ViT Prediction: {predict_language(vit_model, test_text)}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step
LSTM Prediction: French
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step
Autoencoder Prediction: French
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
ViT Prediction: French
