In [None]:
# prompt: mount drive

# Mount Google Drive at /content/drive; the dataset path read later in this
# notebook lives under that mount point, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# prompt: install keras

!pip install keras



In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.multiclass import unique_labels
import re

# 1. Load and preprocess the dataset
# Read the annotated corpus from the mounted Drive folder.
df = pd.read_csv("/content/drive/MyDrive/AI/Error Annotated Corpus.csv")

# Keep only the text/annotation pair, discard rows where either value is
# missing, and give the two columns short working names.
df_cleaned = (
    df[['Error word & consecutive word', 'Annotation']]
    .dropna()
    .rename(columns={
        'Error word & consecutive word': 'text',
        'Annotation': 'label',
    })
)
# Clean Tamil text
def clean_text(text):
    """Reduce a string to Tamil-block characters separated by single spaces.

    Every character that is neither whitespace nor inside the Unicode range
    U+0B80-U+0BFF is deleted; the remaining whitespace runs are collapsed to
    one space each and leading/trailing whitespace is trimmed.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string (empty if nothing in the input survives).
    """
    tamil_and_spaces = re.sub(r'[^\u0B80-\u0BFF\s]', '', text)
    return re.sub(r'\s+', ' ', tamil_and_spaces).strip()

# Run every entry of the text column through clean_text.
df_cleaned['text'] = df_cleaned['text'].map(clean_text)

# Pull both columns out as arrays for the tokenizer and the label encoder.
texts = df_cleaned['text'].values
labels = df_cleaned['label'].values

# Encode labels
# Each distinct annotation value becomes an integer class id; `labels` is
# rebound from the raw annotations to those ids.
label_encoder = LabelEncoder()
labels = label_encoder.fit(labels).transform(labels)

# Tokenize Tamil text with subword tokenization (using SentencePiece)
import sentencepiece as spm

# Dump the cleaned texts, one per line, and train a SentencePiece model on
# them. Training is unconditional: the model files are rebuilt on every run.
# The file is written explicitly as UTF-8 because the platform-default
# encoding is not guaranteed to be able to represent Tamil characters.
with open("tamil_text.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(texts))

spm.SentencePieceTrainer.train(input='tamil_text.txt', model_prefix='tamil', vocab_size=5000)
tokenizer = spm.SentencePieceProcessor(model_file='tamil.model')

# Convert text to sequences of subword ids
sequences = [tokenizer.encode_as_ids(text) for text in texts]

# Set a fixed max length for padding; shorter sequences are padded at the end.
# NOTE(review): padding uses pad_sequences' default fill value; if that id is
# also a real token id in the SentencePiece vocabulary (e.g. the unknown
# token), padding and that token are indistinguishable to the model - confirm.
max_length = 100
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_length, padding='post')

# Split into training and test sets (80/20, fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# 2. Define the model
# Seed the Python, NumPy and TensorFlow random generators before any weights
# are created so that initialisation, dropout and batch shuffling are
# repeatable from run to run (same seed value as the train/test split).
tf.keras.utils.set_random_seed(42)

# The embedding table needs one row per SentencePiece token id.
vocab_size = tokenizer.vocab_size()

# Architecture: token ids -> 256-d embeddings -> bidirectional LSTM (128 units
# per direction, final state only) -> dropout -> softmax over the label classes.
# NOTE(review): the embedding is built without masking, so the recurrent layer
# also steps over the padded positions of every sequence - confirm this is
# intended.
input_layer = Input(shape=(max_length,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=256)(input_layer)
lstm_layer = Bidirectional(LSTM(128, return_sequences=False))(embedding_layer)
dropout_layer = Dropout(0.5)(lstm_layer)
output_layer = Dense(len(label_encoder.classes_), activation="softmax")(dropout_layer)

model = Model(inputs=input_layer, outputs=output_layer)
# The sparse loss matches the integer class ids produced by the label encoder.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# 3. Train the model with early stopping and learning rate scheduler
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Stop once val_loss has gone 5 epochs without improving and restore the best
# weights seen so far.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Halve the learning rate after 3 epochs without a val_loss improvement.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)

model.summary()

# 20% of the training data is held out for validation; both callbacks
# monitor the loss on that held-out part.
fit_options = dict(
    validation_split=0.2,
    epochs=10,
    batch_size=32,
    callbacks=[early_stop, reduce_lr],
)
history = model.fit(X_train, y_train, **fit_options)

# 4. Evaluate the model
# Predicted class id = index of the highest softmax probability per sample.
y_pred = np.argmax(model.predict(X_test), axis=-1)

# Get unique labels in y_test
unique_classes_in_test = np.unique(y_test)

# Generate the classification report with only the present classes.
# zero_division=0 reports 0.0 for classes that were never predicted (the same
# value as the default) without emitting an ill-defined-metric warning per
# metric. Class names are passed through str() so non-string annotation
# values cannot break the report formatting.
print("Classification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=unique_classes_in_test,  # Use only labels present in y_test
    target_names=[str(label_encoder.classes_[i]) for i in unique_classes_in_test],
    zero_division=0,
))

# Rows and columns follow the same class order as the report above.
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=unique_classes_in_test))



Epoch 1/10
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 470ms/step - accuracy: 0.2535 - loss: 2.4827 - val_accuracy: 0.2749 - val_loss: 2.1467 - learning_rate: 0.0010
Epoch 2/10
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 466ms/step - accuracy: 0.3367 - loss: 2.1395 - val_accuracy: 0.4129 - val_loss: 1.8439 - learning_rate: 0.0010
Epoch 3/10
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 460ms/step - accuracy: 0.6323 - loss: 1.3579 - val_accuracy: 0.5386 - val_loss: 1.6503 - learning_rate: 0.0010
Epoch 4/10
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 485ms/step - accuracy: 0.8488 - loss: 0.6253 - val_accuracy: 0.5224 - val_loss: 1.8299 - learning_rate: 0.0010
Epoch 5/10
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 464ms/step - accuracy: 0.9253 - loss: 0.3063 - val_accuracy: 0.5249 - val_loss: 1.9832 - learning_rate: 0.0010
Epoch 6/10
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
