In [1]:
import pandas as pd

# Specify the file path
file_path = '/content/grammar_correction_pairs.csv'

# Read the CSV file
df = pd.read_csv(file_path)

In [3]:
df.head()

Unnamed: 0,incorrect_sentence,correct_sentence
0,මම යන්නෙමු,මම යන්නෙමි
1,මම යන්නෙමුවා,මම යන්නෙමි
2,මම යන්නෙහි,මම යන්නෙමි
3,මම යන්නෙහිවා,මම යන්නෙමි
4,මම යන්නේය,මම යන්නෙමි


Model 01 - Regression Model

In [6]:
inputs = df['incorrect_sentence']
targets = df['correct_sentence']

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(inputs, targets, test_size=0.2, random_state=42)


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [9]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_tfidf, y_train)


In [10]:
y_pred = model.predict(X_test_tfidf)
from sklearn.metrics import accuracy_score

print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.32684283727399166


In [11]:
def preprocess_input(sentence, vectorizer):
    # Transform the input sentence using the trained vectorizer
    sentence_tfidf = vectorizer.transform([sentence])
    return sentence_tfidf


In [12]:
def make_prediction(sentence, vectorizer, model):
    # Preprocess the input sentence
    sentence_tfidf = preprocess_input(sentence, vectorizer)
    # Predict the corrected sentence
    prediction = model.predict(sentence_tfidf)
    return prediction[0]


In [13]:
test_sentence = "මම යන්නෙමු"
corrected_sentence = make_prediction(test_sentence, vectorizer, model)
print("Corrected Sentence:", corrected_sentence)


Corrected Sentence: මම යන්නෙමි


In [74]:
test_sentence = "මම වේගයෙන් ගියෙමු"
corrected_sentence = make_prediction(test_sentence, vectorizer, model)
print("Corrected Sentence:", corrected_sentence)


Corrected Sentence: මම වේගයෙන් යැවෙමි


In [77]:
test_sentence = "ලස්සන වාහනය ගෙදර යවමු"
corrected_sentence = make_prediction(test_sentence, vectorizer, model)
print("Corrected Sentence:", corrected_sentence)


Corrected Sentence: ලස්සන වාහනය ගෙදර යවයි


In [79]:
test_sentence = "අපි යැවවේ"
corrected_sentence = make_prediction(test_sentence, vectorizer, model)
print("Corrected Sentence:", corrected_sentence)


Corrected Sentence: අපි යැවෙමු


In [80]:
test_sentence = "වාහන පොත බලා වේගයෙන් ගියෙමි"
corrected_sentence = make_prediction(test_sentence, vectorizer, model)
print("Corrected Sentence:", corrected_sentence)


Corrected Sentence: වාහන පොත බලා වේගයෙන් යවයි


Model 02 - LSTM model

In [39]:
!pip install tensorflow transformers




In [40]:
import pandas as pd

file_path = '/content/grammar_correction_pairs.csv'
data = pd.read_csv(file_path)

# Inspect the dataset
print(data.head())


  incorrect_sentence correct_sentence
0         මම යන්නෙමු       මම යන්නෙමි
1       මම යන්නෙමුවා       මම යන්නෙමි
2         මම යන්නෙහි       මම යන්නෙමි
3       මම යන්නෙහිවා       මම යන්නෙමි
4          මම යන්නේය       මම යන්නෙමි


In [41]:
# Access the 'incorrect_sentence' and 'correct_sentence' columns from the DataFrame 'data'
train_incorrect, test_incorrect, train_correct, test_correct = train_test_split(
    data['incorrect_sentence'], data['correct_sentence'], test_size=0.2, random_state=42
)

In [50]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, TimeDistributed,Bidirectional

In [42]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(train_incorrect + train_correct)

train_incorrect_sequences = tokenizer.texts_to_sequences(train_incorrect)
train_correct_sequences = tokenizer.texts_to_sequences(train_correct)

max_len = max(
    max(len(seq) for seq in train_incorrect_sequences),
    max(len(seq) for seq in train_correct_sequences)
)
train_incorrect_padded = pad_sequences(train_incorrect_sequences, maxlen=max_len, padding='post')
train_correct_padded = pad_sequences(train_correct_sequences, maxlen=max_len, padding='post')

train_correct_padded = train_correct_padded[..., None]

vocab_size = len(tokenizer.word_index) + 1

# Build the LSTM model
lstm_model3 = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    # Add a Bidirectional LSTM layer
    Bidirectional(LSTM(128, return_sequences=True)),
    # Add a Dropout layer for regularization
    Dropout(0.2),
    # Add another LSTM layer for deeper processing
    LSTM(128, return_sequences=True),
    # Add another Dropout layer
    Dropout(0.2),
    TimeDistributed(Dense(vocab_size, activation='softmax'))
])

lstm_model3.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Summarize the model





In [43]:
lstm_model3.fit(
    train_incorrect_padded, train_correct_padded,
    epochs=10,
    batch_size=64,
    validation_split=0.2
)


Epoch 1/10
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 190ms/step - accuracy: 0.2172 - loss: 4.7659 - val_accuracy: 0.2935 - val_loss: 2.9270
Epoch 2/10
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 100ms/step - accuracy: 0.3115 - loss: 2.7952 - val_accuracy: 0.4846 - val_loss: 2.2576
Epoch 3/10
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 160ms/step - accuracy: 0.5364 - loss: 2.0633 - val_accuracy: 0.6758 - val_loss: 1.4778
Epoch 4/10
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 100ms/step - accuracy: 0.6935 - loss: 1.3889 - val_accuracy: 0.7545 - val_loss: 1.0611
Epoch 5/10
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 114ms/step - accuracy: 0.7532 - loss: 1.0549 - val_accuracy: 0.7736 - val_loss: 0.8902
Epoch 6/10
[1m144/144[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 105ms/step - accuracy: 0.7780 - loss: 0.8887 - val_accuracy: 0.8160 - val_loss: 0.7655
Epoch 7/10

<keras.src.callbacks.history.History at 0x78fb62cce620>

In [47]:
# Prepare test data
test_incorrect_sequences = tokenizer.texts_to_sequences(test_incorrect)
test_correct_sequences = tokenizer.texts_to_sequences(test_correct)

test_incorrect_padded = pad_sequences(test_incorrect_sequences, maxlen=max_len, padding='post')
test_correct_padded = pad_sequences(test_correct_sequences, maxlen=max_len, padding='post')[..., None]

# Evaluate the model on test data
results = lstm_model3.evaluate(test_incorrect_padded, test_correct_padded)
print(f"Test Loss: {results[0]}, Test Accuracy: {results[1]}")

# Example prediction
def predict_sentence(input_sentence):
    input_sequence = tokenizer.texts_to_sequences([input_sentence])
    input_padded = pad_sequences(input_sequence, maxlen=max_len, padding='post')
    predictions = lstm_model3.predict(input_padded)
    predicted_sequence = tf.argmax(predictions[0], axis=-1).numpy()
    # Remove <OOV> tokens from the predicted sequence
    predicted_sentence = " ".join(
        [word for word in tokenizer.sequences_to_texts([predicted_sequence])[0].split() if word != "<OOV>"]
    )
    return predicted_sentence


# Test with an example
input_sentence = "වාහන පොත බලා ගෙදර වේගයෙන් යැවෙමු"
predicted_sentence = predict_sentence(input_sentence)
print("Input Sentence:", input_sentence)
print("Predicted Sentence:", predicted_sentence)

[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 48ms/step - accuracy: 0.8837 - loss: 0.4787
Test Loss: 0.47915029525756836, Test Accuracy: 0.8829723596572876
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Input Sentence: වාහන පොත බලා ගෙදර වේගයෙන් යැවෙමු
Predicted Sentence: වාහන පොත බලා ගෙදර වේගයෙන් යැවේ


In [48]:
input_sentence = "මම යන්නෙමු"
predicted_sentence = predict_sentence(input_sentence)
print("Input Sentence:", input_sentence)
print("Predicted Sentence:", predicted_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Input Sentence: මම යන්නෙමු
Predicted Sentence: මම යැවෙමි


In [53]:
input_sentence = "ළමයි යනවා"
predicted_sentence = predict_sentence(input_sentence)
print("Input Sentence:", input_sentence)
print("Predicted Sentence:", predicted_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
Input Sentence: ළමයි යනවා
Predicted Sentence: ළමයි යති


In [57]:
input_sentence = "මම වේගයෙන් ගියෙමු"
predicted_sentence = predict_sentence(input_sentence)
print("Input Sentence:", input_sentence)
print("Predicted Sentence:", predicted_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Input Sentence: මම වේගයෙන් ගියෙමු
Predicted Sentence: මම වේගයෙන් යවමි


In [58]:
input_sentence = "නුබ ඔහුගෙන් පොතක් ගන්වමි"
predicted_sentence = predict_sentence(input_sentence)
print("Input Sentence:", input_sentence)
print("Predicted Sentence:", predicted_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Input Sentence: නුබ ඔහුගෙන් පොතක් ගන්වමි
Predicted Sentence: නුබ ඔහුගෙන් පොතක් ගන්වයි


In [72]:
input_sentence = "ලස්සන වාහනය ගෙදර යවමු"
predicted_sentence = predict_sentence(input_sentence)
print("Input Sentence:", input_sentence)
print("Predicted Sentence:", predicted_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
Input Sentence: ලස්සන වාහනය ගෙදර යවමු
Predicted Sentence: ලස්සන වාහනය ගෙදර යවයි


In [52]:
# Save the entire model to a file
lstm_model3.save("/content/drive/MyDrive/lstm_model3.h5")
print("Model saved successfully as lstm_model3.h5")



Model saved successfully as lstm_model3.h5
