Data Preprocessing:

In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('grammar_correction_pairs.csv')

# Prepare training data
incorrect_sentences = df['incorrect_sentence'].values
correct_sentences = df['correct_sentence'].values

Tokenization:

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize the sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(incorrect_sentences) + list(correct_sentences))

# Convert text to sequences
X = tokenizer.texts_to_sequences(incorrect_sentences)
y = tokenizer.texts_to_sequences(correct_sentences)

# Pad sequences to make them the same length
X = pad_sequences(X, padding='post')
y = pad_sequences(y, padding='post')

Define the Model:


In [3]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, RepeatVector, TimeDistributed

# Define the model
model = Sequential()

# Define embedding layer
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=256, input_length=X.shape[1]))

# Encoder
model.add(LSTM(256))

# Decoder
model.add(RepeatVector(X.shape[1]))  # Repeat the context vector
model.add(LSTM(256, return_sequences=True))
model.add(TimeDistributed(Dense(len(tokenizer.word_index) + 1, activation='softmax')))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])



Train the Model:

In [5]:
# Train the model
model.fit(X, y, batch_size=64, epochs=20, validation_split=0.2)

Epoch 1/20
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 147ms/step - accuracy: 0.8389 - loss: 0.6173 - val_accuracy: 0.4943 - val_loss: 4.3105
Epoch 2/20
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 140ms/step - accuracy: 0.9022 - loss: 0.4108 - val_accuracy: 0.5163 - val_loss: 4.2136
Epoch 3/20
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 133ms/step - accuracy: 0.9186 - loss: 0.2952 - val_accuracy: 0.5391 - val_loss: 4.2598
Epoch 4/20
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 146ms/step - accuracy: 0.9317 - loss: 0.2260 - val_accuracy: 0.5631 - val_loss: 4.0822
Epoch 5/20
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 127ms/step - accuracy: 0.9468 - loss: 0.1801 - val_accuracy: 0.5682 - val_loss: 4.2124
Epoch 6/20
[1m180/180[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 152ms/step - accuracy: 0.9655 - loss: 0.1304 - val_accuracy: 0.5901 - val_loss: 4.1724
Epoch 7/20

<keras.src.callbacks.history.History at 0x7d24ac218e50>

Inference:


In [6]:
# Function to predict the corrected sentence
def correct_grammar(input_sentence):
    seq_input = tokenizer.texts_to_sequences([input_sentence])
    padded_input = pad_sequences(seq_input, padding='post', maxlen=X.shape[1])

    pred = model.predict(padded_input)

    # Convert prediction to words
    pred_sentence = ' '.join([tokenizer.index_word.get(idx, '') for idx in pred[0].argmax(axis=-1)])
    return pred_sentence

User Input:

In [7]:
# Get user input and correct grammar
user_input = input("Enter a sentence with possible grammar errors: ")
corrected_sentence = correct_grammar(user_input)
print("Corrected sentence:", corrected_sentence)

Enter a sentence with possible grammar errors: මම යන්නෙමු
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 524ms/step
Corrected sentence: මම යන්නෙමි     


Grammar Correction Function for Paragraphs:

In [8]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Function to split paragraph into sentences by period (.)
def split_paragraph_into_sentences(paragraph):
    # Split the paragraph by punctuation mark (.)
    sentences = paragraph.split(".")

    # Clean up sentences (remove empty sentences after split)
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    return sentences

# Function to correct a single sentence (using the trained model)
def correct_grammar_for_sentence(input_sentence, tokenizer, model, max_input_length):
    # Tokenize and pad the input sentence
    seq_input = tokenizer.texts_to_sequences([input_sentence])
    padded_input = pad_sequences(seq_input, padding='post', maxlen=max_input_length)

    # Predict the corrected sentence
    pred = model.predict(padded_input)

    # Convert prediction to words
    pred_sentence = ' '.join([tokenizer.index_word.get(idx, '') for idx in pred[0].argmax(axis=-1)])
    return pred_sentence

# Function to correct grammar in an entire paragraph
def correct_grammar_in_paragraph(paragraph, tokenizer, model, max_input_length):
    # Step 1: Split the paragraph into sentences
    sentences = split_paragraph_into_sentences(paragraph)

    # Step 2: Correct each sentence
    corrected_sentences = [correct_grammar_for_sentence(sentence, tokenizer, model, max_input_length) for sentence in sentences]

    # Step 3: Join the corrected sentences back into a paragraph
    corrected_paragraph = '. '.join(corrected_sentences) + '.' if corrected_sentences else ''

    return corrected_paragraph

Paragraph 01:

In [9]:
user_paragraph = "මම යන්නෙමු. අපි යනවා. මම යවන්නෙමි. මම යවන්නෙහි. මම ගියාය. අපි යනවාලා."
corrected_paragraph = correct_grammar_in_paragraph(user_paragraph, tokenizer, model, X.shape[1])

print("Original Paragraph: ", user_paragraph)
print("Corrected Paragraph: ", corrected_paragraph)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Original Paragraph:  මම යන්නෙමු. අපි යනවා. මම යවන්නෙමි. මම යවන්නෙහි. මම ගියාය. අපි යනවාලා.
Corrected Paragraph:  මම යන්නෙමි     . මම යැවෙමි     . මම යවන්නෙමි     . මම යවන්නෙමි     . මම ගියෙමි     . මම යැවෙමි     .


Paragraph 02:

In [18]:
user_paragraph = "මම යන්නෙමුවා. මම යන්නෙමි. මම යන්නෙහි. මම යන්නෙමි. මම යන්නෝය. මම යන්නෙමි. මම යමු. මම යමි. මම යවති මම යවමි. \
                  මම යවන්නෙහි. මම යවන්නෙමි. මම යවන්නෙමු. මම යවන්නෙමි. මම යව් මම යවමි."
corrected_paragraph = correct_grammar_in_paragraph(user_paragraph, tokenizer, model, X.shape[1])

print("Original Paragraph: ", user_paragraph)
print("Corrected Paragraph: ", corrected_paragraph)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27

Paragraph 03:

In [14]:
user_paragraph = "මම යනවා. මම ගියෙහි මම ගියෙමි. මම යැවෙති මම යැවෙමි. මම යැවවෙයි. මම යැවෙමු. මම යන්නෙමුවා. මම යන්නෙහි."
corrected_paragraph = correct_grammar_in_paragraph(user_paragraph, tokenizer, model, X.shape[1])

print("Original Paragraph: ", user_paragraph)
print("Corrected Paragraph: ", corrected_paragraph)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Original Paragraph:  මම යනවා. මම ගියෙහි මම ගියෙමි. මම යැවෙති මම යැවෙමි. මම යැවවෙයි. මම යැවෙමු. මම යන්නෙමුවා. මම යන්නෙහි.
Corrected Paragraph:  වාහන      . මම ගෙදර ගියෙමි    . මම මම යැවෙමි    . මම යැවෙමි     . මම යැවෙමි     . මම යන්නෙමි     . මම යන්නෙමි     .


Paragraph 04:

In [16]:
user_paragraph = "අපි යන්නෝය. අපි යමු. අපි යන්න. අපි යවමු. අපි ගියෙහු. අපි ගියෙමු. අපි යැවවෙති. අපි යැවෙමු. අපි යනවාලා. අපි යවන්නෙමු."
corrected_paragraph = correct_grammar_in_paragraph(user_paragraph, tokenizer, model, X.shape[1])

print("Original Paragraph: ", user_paragraph)
print("Corrected Paragraph: ", corrected_paragraph)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Original Paragraph:  අපි යන්නෝය. අපි යමු. අපි යන්න. අපි යවමු. අපි ගියෙහු. අපි ගියෙමු. අපි යැවවෙති. අපි යැවෙමු. අපි යනවාලා. අපි යවන්නෙමු.
Corrected Paragraph:  අපි යන්නෙමු     . අපි යන්නෙමු     . මම යැවෙමි     . අපි යවන්නෙමු     . අපි ගියෙමු     . අපි යවමු     . අපි යැවෙමු     . අ

Paragraph 05:

In [17]:
user_paragraph = "මම යවන්නීය. මම යවන්නෙමි. මම යවන්නෙහු මම යවන්නෙමි. මම යමු. මම යවමි. මම ගියෙය. මම ගියෙමු. මම යතී මම යවති."
corrected_paragraph = correct_grammar_in_paragraph(user_paragraph, tokenizer, model, X.shape[1])

print("Original Paragraph: ", user_paragraph)
print("Corrected Paragraph: ", corrected_paragraph)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Original Paragraph:  මම යවන්නීය. මම යවන්නෙමි. මම යවන්නෙහු මම යවන්නෙමි. මම යමු. මම යවමි. මම ගියෙය. මම ගියෙමු. මම යතී මම යවති.
Corrected Paragraph:  මම යමි     . මම යවන්නෙමි     . මම ඔහුට පොතක්    . මම යමි     . මම යවන්නෙමි     . වාහන      . මම ගියෙමි     . මම යමි     .
