In [None]:
# Attach Google Drive to the Colab VM; the corpus is read from a path under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)


Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import sentencepiece as spm
import re

# Read the annotated corpus from the mounted Drive
file_path = "/content/drive/MyDrive/AI/Error Annotated Corpus.csv"  # Update path as needed
df = pd.read_csv(file_path)

# Keep the three columns used below, discard rows missing any of them,
# and switch to short snake_case column names.
column_aliases = {
    'Error word & consecutive word': 'error_text',
    'Corrected words & its': 'corrected_text',
    'Annotation': 'annotation',
}
df_cleaned = df[list(column_aliases)].dropna().rename(columns=column_aliases)

# Clean Tamil text
def clean_tamil_text(text):
    """Keep only Tamil-block characters and whitespace, then normalise spacing.

    The argument is passed through ``str()`` first, so non-string values are
    accepted. Every non-whitespace character outside U+0B80-U+0BFF is removed,
    runs of whitespace collapse to a single space, and leading/trailing
    whitespace is stripped.

    Returns:
        The cleaned string (possibly empty).
    """
    tamil_only = re.sub(r'[^\u0B80-\u0BFF\s]', '', str(text))
    return re.sub(r'\s+', ' ', tamil_only).strip()

# Run the same Tamil-only filter over both text columns
for text_column in ('error_text', 'corrected_text'):
    df_cleaned[text_column] = df_cleaned[text_column].map(clean_tamil_text)

# Combine all text for tokenizer training: erroneous and corrected sentences
# go into one file, one sentence per line.
all_text = df_cleaned['error_text'].tolist() + df_cleaned['corrected_text'].tolist()
# Write explicitly as UTF-8. Without `encoding`, open() uses the platform's
# locale encoding, which may be unable to represent Tamil characters.
with open("tamil_text.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(all_text))

# Train a SentencePiece tokenizer
# pad_sequences pads with the value 0, but SentencePiece's default id layout
# puts <unk> at id 0 and defines no pad token at all, so padding and unknown
# tokens would share an id. Reserve id 0 for <pad> and shift the other special
# tokens up so that the padding value has its own dedicated id.
spm.SentencePieceTrainer.train(
    input="tamil_text.txt",
    model_prefix="tamil",
    vocab_size=5000,
    model_type="unigram",
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
)
tokenizer = spm.SentencePieceProcessor(model_file="tamil.model")

# Tokenize and pad sequences
max_length = 50
def tokenize_and_pad(texts):
    """Encode each text to SentencePiece ids and pad to a fixed length.

    Args:
        texts: iterable of strings.

    Returns:
        The result of ``pad_sequences`` with ``maxlen=max_length`` and
        padding appended after the tokens.
    """
    token_id_lists = list(map(tokenizer.encode_as_ids, texts))
    return pad_sequences(token_id_lists, maxlen=max_length, padding='post')

# Grammar Correction Task
# Inputs are the erroneous sentences, targets the corrected ones; both are
# encoded with the same tokenizer and padded to max_length. The targets are
# kept as a 2-D matrix of integer ids (no extra trailing axis).
gc_X, gc_y = (
    tokenize_and_pad(df_cleaned[column]) for column in ('error_text', 'corrected_text')
)

# Hold out 20% of the pairs for testing; the seed keeps the split repeatable.
gc_X_train, gc_X_test, gc_y_train, gc_y_test = train_test_split(
    gc_X, gc_y, test_size=0.2, random_state=42
)

# Classification Task
# Map each annotation label to an integer class id.
# NOTE(review): the 'annotation' column is overwritten in place, so re-running
# only this part refits the encoder on the already-encoded integers.
label_encoder = LabelEncoder()
annotation_ids = label_encoder.fit_transform(df_cleaned['annotation'])
df_cleaned['annotation'] = annotation_ids
cls_X = tokenize_and_pad(df_cleaned['error_text'])
cls_y = df_cleaned['annotation']

# Same 80/20 split and seed as the grammar-correction data
cls_X_train, cls_X_test, cls_y_train, cls_y_test = train_test_split(
    cls_X, cls_y, test_size=0.2, random_state=42
)

# Grammar Correction Model (Seq2Seq)
# Embedding -> single LSTM -> softmax over the vocabulary at every time step,
# so output position i is predicted from input positions 0..i.
# NOTE(review): the Embedding is created without mask_zero, so zero-padded
# steps count toward loss and accuracy — presumably inflating the reported
# accuracy; confirm.
gc_vocab_size = tokenizer.vocab_size()
gc_input = Input(shape=(max_length,))
gc_embedding = Embedding(input_dim=gc_vocab_size, output_dim=256)(gc_input)
gc_lstm = LSTM(256, return_sequences=True, return_state=True)
# Only the per-step outputs are used; the two returned state tensors are dropped.
gc_lstm_out = gc_lstm(gc_embedding)[0]
gc_dense = Dense(gc_vocab_size, activation="softmax")(gc_lstm_out)
gc_model = Model(inputs=gc_input, outputs=gc_dense)
gc_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Early stopping and learning rate reduction, both driven by validation loss
gc_early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
gc_reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
gc_callbacks = [gc_early_stop, gc_reduce_lr]

print("Training Grammar Correction Model...")
# 20% of the training split is carved off for validation.
gc_fit_options = dict(
    validation_split=0.2,
    epochs=10,
    batch_size=32,
    callbacks=gc_callbacks,
)
gc_history = gc_model.fit(gc_X_train, gc_y_train, **gc_fit_options)

# Error Classification Model
# Embedding -> bidirectional LSTM -> dropout -> softmax over the annotation classes
cls_vocab_size = tokenizer.vocab_size()
num_error_classes = len(label_encoder.classes_)
cls_input = Input(shape=(max_length,))
cls_embedding = Embedding(input_dim=cls_vocab_size, output_dim=256)(cls_input)
cls_lstm = Bidirectional(LSTM(128))(cls_embedding)
cls_dropout = Dropout(0.5)(cls_lstm)
cls_output = Dense(num_error_classes, activation="softmax")(cls_dropout)
cls_model = Model(inputs=cls_input, outputs=cls_output)
cls_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Stop early / halve the learning rate when validation loss stops improving
cls_early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
cls_reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
cls_callbacks = [cls_early_stop, cls_reduce_lr]

print("Training Classification Model...")
# 20% of the training split is carved off for validation.
cls_fit_options = dict(
    validation_split=0.2,
    epochs=10,
    batch_size=32,
    callbacks=cls_callbacks,
)
cls_history = cls_model.fit(cls_X_train, cls_y_train, **cls_fit_options)

# Evaluate Models on the held-out test splits
print("Evaluating Grammar Correction Model...")
# Most likely token id at every position of every test sentence
gc_y_pred = gc_model.predict(gc_X_test, verbose=0).argmax(axis=-1)
gc_eval = gc_model.evaluate(gc_X_test, gc_y_test, verbose=0)
gc_loss, gc_accuracy = gc_eval[0], gc_eval[1]
print("Grammar Correction Model - Loss: {:.4f}, Accuracy: {:.4f}".format(gc_loss, gc_accuracy))

print("Evaluating Classification Model...")
cls_eval = cls_model.evaluate(cls_X_test, cls_y_test, verbose=0)
cls_loss, cls_accuracy = cls_eval[0], cls_eval[1]
print("Classification Model - Loss: {:.4f}, Accuracy: {:.4f}".format(cls_loss, cls_accuracy))

# Save Models: write both trained networks to HDF5 files in the working directory
for trained_model, h5_path in (
    (gc_model, "grammar_correction_model.h5"),
    (cls_model, "error_classification_model.h5"),
):
    trained_model.save(h5_path)

# Utility to decode sequences
def decode_sequence(sequence):
    """Turn predicted token ids back into text, skipping padding.

    Args:
        sequence: 1-D array-like of integer token ids (a NumPy array, list or
            tuple), e.g. one row of the argmax over the model output. A 2-D
            batch is also accepted and decoded row by row.

    Returns:
        The decoded string for 1-D input, or a list of decoded strings for
        2-D input.
    """
    ids = np.asarray(sequence)
    if ids.ndim > 1:
        return [decode_sequence(row) for row in ids]
    # Id 0 is the value pad_sequences fills with; drop it so padding positions
    # do not show up in the decoded text. Cast to built-in int because the
    # elements of a NumPy array are NumPy scalars, not Python ints.
    token_ids = [int(token_id) for token_id in ids if token_id > 0]
    return tokenizer.decode_ids(token_ids)

# Reload the grammar correction model from the HDF5 file written above,
# replacing the in-memory instance.
gc_model = tf.keras.models.load_model("grammar_correction_model.h5")



Training Grammar Correction Model...
Epoch 1/10
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 843ms/step - accuracy: 0.8811 - loss: 2.8329 - val_accuracy: 0.9297 - val_loss: 0.6304 - learning_rate: 0.0010
Epoch 2/10
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 865ms/step - accuracy: 0.9298 - loss: 0.6089 - val_accuracy: 0.9299 - val_loss: 0.6131 - learning_rate: 0.0010
Epoch 3/10
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 818ms/step - accuracy: 0.9303 - loss: 0.5853 - val_accuracy: 0.9301 - val_loss: 0.6001 - learning_rate: 0.0010
Epoch 4/10
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 845ms/step - accuracy: 0.9305 - loss: 0.5616 - val_accuracy: 0.9301 - val_loss: 0.5795 - learning_rate: 0.0010
Epoch 5/10
[1m101/101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 824ms/step - accuracy: 0.9317 - loss: 0.5291 - val_accuracy: 0.9306 - val_loss: 0.5647 - learning_rate: 0.0010
Epoch 6/10
[1



Grammar Correction Model - Loss: 0.5567, Accuracy: 0.9316
Evaluating Classification Model...




Classification Model - Loss: 1.7751, Accuracy: 0.5299




In [None]:
# Get user input and predict corrected sentence until the user types 'exit'
gc_prompt = "Enter a Tamil sentence with grammatical errors (or 'exit' to quit): "
user_input = input(gc_prompt).strip()
while user_input.lower() != 'exit':
    # Same preprocessing as the training data: clean, encode, pad
    cleaned_input = clean_tamil_text(user_input)
    input_ids = tokenizer.encode_as_ids(cleaned_input)
    tokenized_input = pad_sequences([input_ids], maxlen=max_length, padding='post')

    # Pick the highest-scoring token id at each position and decode the single row
    token_scores = gc_model.predict(tokenized_input, verbose=0)
    predicted_output = np.argmax(token_scores, axis=-1)
    corrected_sentence = decode_sequence(predicted_output[0])
    print("Corrected Sentence:", corrected_sentence)

    user_input = input(gc_prompt).strip()

Enter a Tamil sentence with grammatical errors (or 'exit' to quit): பேசாம ஏன்
Corrected Sentence:  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇  ⁇ 
Enter a Tamil sentence with grammatical errors (or 'exit' to quit): exit


In [None]:
import tensorflow as tf
import sentencepiece as spm
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Restore the artifacts written by the training cell: the SentencePiece
# tokenizer and the saved grammar correction network.
tokenizer = spm.SentencePieceProcessor(model_file="tamil.model")
gc_model = tf.keras.models.load_model("grammar_correction_model.h5")

# Function to decode sequences
def decode_sequence(encoded_sequence):
    """Convert a sequence of token IDs back to a sentence.

    Args:
        encoded_sequence: iterable of integer-like ids. Python ints, NumPy
            integers and scalar tensors are all accepted.

    Returns:
        The decoded text, with id 0 entries left out.
    """
    # Cast every id to a built-in int first: iterating a tensor or a NumPy
    # array yields scalar objects rather than Python ints, which the tokenizer
    # is not guaranteed to accept.
    token_ids = [int(id_) for id_ in encoded_sequence]
    decoded_words = tokenizer.decode_ids([id_ for id_ in token_ids if id_ > 0])
    return decoded_words

# Defined once, outside the input loop, and wrapped in tf.function
@tf.function
def predict_corrected_sentence(padded_input):
    """Run the grammar correction model on a padded batch.

    Calls ``gc_model`` in inference mode, takes the argmax over the last axis
    and returns the first row of the result as a tensor (no NumPy conversion
    happens here).
    """
    token_scores = gc_model(padded_input, training=False)
    return tf.argmax(token_scores, axis=-1)[0]

# Main input loop: keep prompting until the user types 'exit'
max_length = 50
exit_command = 'exit'
while True:
    user_input = input("Enter a Tamil sentence with grammatical errors (or 'exit' to quit): ").strip()
    if user_input.lower() == exit_command:
        break

    # Encode the raw input, pad it to max_length, and wrap it as an int32 batch of one
    tokenized_input = tokenizer.encode_as_ids(user_input)
    padded_input = pad_sequences([tokenized_input], maxlen=max_length, padding='post')
    model_input = tf.constant(padded_input, dtype=tf.int32)

    # Predict token ids and turn them back into text
    corrected_sequence = predict_corrected_sentence(model_input)
    corrected_sentence = decode_sequence(corrected_sequence)
    print("Corrected Sentence:", corrected_sentence)




Enter a Tamil sentence with grammatical errors (or 'exit' to quit): பேசாம ஏன்
Corrected Sentence: 
Enter a Tamil sentence with grammatical errors (or 'exit' to quit): exit
