LOAD THE DATA SET

In [1]:
import pandas as pd

# Load the dataset of (incorrect, correct) sentence pairs.
DATA_PATH = 'grammar_correction_pairs.csv'
SOURCE_COLUMN = 'incorrect_sentence'
TARGET_COLUMN = 'correct_sentence'

df = pd.read_csv(DATA_PATH)

# Prepare training data.
# Empty CSV fields are read as NaN (a float), which the text tokenizer cannot
# process, so keep only rows where both sides of the pair are present. `df`
# itself is left untouched so the raw data remains available for inspection.
complete_pairs = df.dropna(subset=[SOURCE_COLUMN, TARGET_COLUMN])

# Coerce to str so that any value parsed as a number is still handled as text.
incorrect_sentences = complete_pairs[SOURCE_COLUMN].astype(str).to_numpy()
correct_sentences = complete_pairs[TARGET_COLUMN].astype(str).to_numpy()

TOKENIZATION

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize the sentences.
# A single tokenizer is fitted on both sides of every pair so that the
# incorrect and the corrected sentences share one word -> integer-id mapping.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(list(incorrect_sentences) + list(correct_sentences))

# Convert text to sequences of integer ids (one list per sentence)
X = tokenizer.texts_to_sequences(incorrect_sentences)
y = tokenizer.texts_to_sequences(correct_sentences)

# Pad both sides to ONE shared length.
# Padding X and y independently gives each its own width (the longest sentence
# on that side); whenever those differ, X.shape[1] != y.shape[1] and a model
# that predicts one token per timestep cannot be trained against y.
# max_len is the longest sequence overall, so nothing is truncated.
max_len = max(len(seq) for seq in X + y)
X = pad_sequences(X, maxlen=max_len, padding='post')
y = pad_sequences(y, maxlen=max_len, padding='post')

DEFINE THE MODEL

In [3]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, RepeatVector, TimeDistributed

# Define the model: an encoder-decoder that reads the incorrect sentence and
# emits a probability distribution over the vocabulary for every target position.
EMBEDDING_DIM = 256
LSTM_UNITS = 256

# +1 because the Keras Tokenizer never assigns id 0 to a word; 0 is the value
# pad_sequences fills with, so valid ids run from 0 to len(word_index).
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()

# Define embedding layer.
# `input_length` is deliberately not passed: it is deprecated in Keras 3, and
# the sequence length is inferred from X the first time the model is called.
model.add(Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM))

# Encoder: compresses the whole input sentence into one context vector
model.add(LSTM(LSTM_UNITS))

# Decoder
# Repeat the context vector once per TARGET timestep, so the output length
# always matches y (the tensor the loss is computed against), even when the
# padded input and target lengths differ.
model.add(RepeatVector(y.shape[1]))
model.add(LSTM(LSTM_UNITS, return_sequences=True))
model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))

# Compile the model.
# The sparse loss is used because y holds integer token ids, not one-hot vectors.
# NOTE(review): padding positions (id 0) are not masked, so they count towards
# both the loss and the reported accuracy — interpret the metrics accordingly.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])



TRAIN THE MODEL

In [4]:
# Train the model
BATCH_SIZE = 64
EPOCHS = 20
VALIDATION_SPLIT = 0.2

# Keep the returned History object so the per-epoch loss/accuracy curves can be
# inspected or plotted later (history.history), instead of only echoing its repr.
# NOTE(review): Keras takes the validation split from the LAST rows of X/y,
# before any shuffling — if the CSV is ordered, confirm the tail is
# representative or shuffle the pairs beforehand.
history = model.fit(
    X,
    y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT,
)

Epoch 1/20
180/180 ━━━━━━━━━━━━━━━━━━━━ 27s 124ms/step - accuracy: 0.3672 - loss: 3.0709 - val_accuracy: 0.4629 - val_loss: 4.0673
Epoch 2/20
180/180 ━━━━━━━━━━━━━━━━━━━━ 39s 115ms/step - accuracy: 0.7980 - loss: 0.8388 - val_accuracy: 0.4239 - val_loss: 4.6955
Epoch 3/20
180/180 ━━━━━━━━━━━━━━━━━━━━ 41s 113ms/step - accuracy: 0.8238 - loss: 0.6603 - val_accuracy: 0.4768 - val_loss: 4.6292
Epoch 4/20
180/180 ━━━━━━━━━━━━━━━━━━━━ 22s 121ms/step - accuracy: 0.8832 - loss: 0.4588 - val_accuracy: 0.5253 - val_loss: 4.3920
Epoch 5/20
180/180 ━━━━━━━━━━━━━━━━━━━━ 22s 121ms/step - accuracy: 0.9140 - loss: 0.3253 - val_accuracy: 0.5280 - val_loss: 4.4326
Epoch 6/20
180/180 ━━━━━━━━━━━━━━━━━━━━ 42s 125ms/step - accuracy: 0.9237 - loss: 0.2595 - val_accuracy: 0.5537 - val_loss: 4.2018
Epoch 7/20

<keras.src.callbacks.history.History at 0x788d709bae00>