<a href="https://colab.research.google.com/github/Thishanth11/AI_Grammar_checker/blob/main/Grammar_checker_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Data preprocessing

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from google.colab import drive
drive.mount('/content/drive')

# Load dataset
data = pd.read_csv("/content/drive/MyDrive/tamil_dataset.csv")

# Print the column names to inspect them
print(data.columns)

train_sentences, test_sentences, train_targets, test_targets = train_test_split(
    data['Input'], data['Target'], test_size=0.2, random_state=42)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Index(['Input', 'Target'], dtype='object')


In [20]:
# Tokenize sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(pd.concat([train_sentences, train_targets]))

# Convert text to sequences
train_input_seq = tokenizer.texts_to_sequences(train_sentences)
train_target_seq = tokenizer.texts_to_sequences(train_targets)
test_input_seq = tokenizer.texts_to_sequences(test_sentences)
test_target_seq = tokenizer.texts_to_sequences(test_targets)

# Pad sequences
max_length = max(max(len(seq) for seq in train_input_seq), max(len(seq) for seq in train_target_seq))
train_input_seq = pad_sequences(train_input_seq, maxlen=max_length, padding='pre')
train_target_seq = pad_sequences(train_target_seq, maxlen=max_length, padding='pre')
test_input_seq = pad_sequences(test_input_seq, maxlen=max_length, padding='pre')
test_target_seq = pad_sequences(test_target_seq, maxlen=max_length, padding='pre')

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1


Define and Train the Model

In [27]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

# Define the encoder
encoder_inputs = Input(shape=(max_length,))
encoder_embedding = Embedding(vocab_size, 128, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(128, return_state=True, use_cudnn=False)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Define the decoder
decoder_inputs = Input(shape=(max_length,))
decoder_embedding = Embedding(vocab_size, 128, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(128, return_sequences=True, return_state=True, use_cudnn=False)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Combine into a model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Prepare target data for training
train_target_seq = train_target_seq.reshape(train_target_seq.shape[0], train_target_seq.shape[1], 1)

# Train the model
model.fit([train_input_seq, train_input_seq], train_target_seq, batch_size=32, epochs=25, validation_split=0.2)


Epoch 1/25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.0833 - loss: 2.8343 - val_accuracy: 0.2500 - val_loss: 2.8384
Epoch 2/25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.3333 - loss: 2.8200 - val_accuracy: 0.2500 - val_loss: 2.8430
Epoch 3/25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.5833 - loss: 2.8054 - val_accuracy: 0.2500 - val_loss: 2.8477
Epoch 4/25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.6667 - loss: 2.7905 - val_accuracy: 0.2500 - val_loss: 2.8525
Epoch 5/25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.6667 - loss: 2.7748 - val_accuracy: 0.2500 - val_loss: 2.8576
Epoch 6/25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.7500 - loss: 2.7580 - val_accuracy: 0.2500 - val_loss: 2.8630
Epoch 7/25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x78f946f76830>

 Evaluate the Model

In [28]:
# Evaluate on test data
test_target_seq = test_target_seq.reshape(test_target_seq.shape[0], test_target_seq.shape[1], 1)
loss, accuracy = model.evaluate([test_input_seq, test_input_seq], test_target_seq)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 720ms/step - accuracy: 0.7500 - loss: 0.0000e+00
Test Accuracy: 75.00%


In [29]:
# Function to predict corrected sentences
def predict_sentence(sentence):
    seq = tokenizer.texts_to_sequences([sentence])
    seq = pad_sequences(seq, maxlen=max_length, padding='post')
    prediction = model.predict([seq, seq])
    predicted_sequence = tf.argmax(prediction[0], axis=-1).numpy()
    predicted_sentence = " ".join([tokenizer.index_word.get(idx, "") for idx in predicted_sequence if idx != 0])
    return predicted_sentence

# Test with user input
user_sentence = input("Enter a Tamil sentence: ")
corrected_sentence = predict_sentence(user_sentence)
print("Corrected Sentence:", corrected_sentence)


Enter a Tamil sentence: பார்த்து நான் புத்தகம் வாங்கினேன.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 488ms/step
Corrected Sentence: சமைக்கிறார்
