# Grammatical Error Correction Using Deep Learning

# Literature Survey 

I have referred to this paper “Grammatical Error Checking Systems: A Review of Approaches and Emerging Directions” to do an extensive Literature Survey, which can be found in the below link.

"https://www.researchgate.net/publication/344160222_Recent_Trends_in_the_Use_of_Deep_Learning_Models_for_Grammar_Error_Handling"

# Understanding the data

Dataset - Lang-8 Corpus of Learner English

Data source - "https://docs.google.com/forms/d/e/1FAIpQLSflRX3h5QYxegivjHN7SJ194OxZ4XN_7Rt0cNpR2YbmNV-7Ag/viewform"

Credits - Tomoya Mizumoto, Mamoru Komachi, Masaaki Nagata and Yuji Matsumoto.
Mining Revision Log of Language Learning SNS for Automated Japanese
Error Correction of Second Language Learners. In Proceedings of the
5th International Joint Conference on Natural Language Processing
(IJCNLP), pp.147-155. Chiang Mai, Thailand, November 2011.

Toshikazu Tajiri, Mamoru Komachi and Yuji Matsumoto. Tense and Aspect
Error Correction for ESL Learners Using Global Context. In Proceedings
of the 50th Annual Meeting of the Association for Computational
Linguistics: Short Papers (oral), pp.198-202. Jeju Island, Korea, July 2012.

The data is in M2 format, which consists of a line followed by S denotes an original sentence while a line followed by A indicates an edit annotation there are more than one annotation for an incorrect sentence

Example data format:

S I heard a sentence last night when I watched TV .
A 8 9|||R:VERB:TENSE|||was watching|||REQUIRED|||-NONE-|||0

S We ‘ve known each other for only half a year, but his lesson was a lot of fun.
A 13 14|||R:NOUN:NUM|||lessons|||REQUIRED|||-NONE-|||0
A 14 15|||R:VERB:SVA|||were|||REQUIRED|||-NONE-|||0

Performence Metric:-

GLEU score: To calculate the GLUE score we take all sub-sequences 1, 2, 3, or 4 tokens in output and target sequence, and then compute a recall, which is the ratio of the number of matching n-grams to the number of total n-grams in the target sequence, and a precision, which is the ratio of the number of matching n-grams to the number of total n-grams in the generated output sequence.
GLEU score is simply the minimum of recall and precision.
It ranges between 0 to 1,

# Exploritory Data Analysis

Importing necessary libraries

In [1]:
import os
import tqdm
import argparse
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb
import pandas as pd
import re
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense,RNN,Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import TimeDistributed
import numpy as np
from wordcloud import WordCloud,STOPWORDS
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
import pickle
import nltk.translate.bleu_score as bleu
from IPython.display import Image

In [2]:
os.chdir(r"D:")
os.getcwd()

'D:\\'

# M2 to CSV Conversion

In [3]:
# Code Source - "https://www.cl.cam.ac.uk/research/nl/bea2019st/data/corr_from_m2.py"
# Apply the edits of a single annotator to generate the corrected sentences.

def main():
    """
    this function stores the correct sentence line by line in as txt file with file name lang8.train.auto.bea19.m2
    """
    
    m2 = open("lang8.train.auto.bea19.m2").read().strip().split("\n\n")
    out = open("lang8.train.auto.bea19.txt", "w")
    # Do not apply edits with these error types
    skip = {"noop", "UNK", "Um"}
    
    for sent in m2:
        sent = sent.split("\n")
        cor_sent = sent[0].split()[1:] # Ignore "S "
        edits = sent[1:]
        offset = 0
        for edit in edits:
            edit = edit.split("|||")
            if edit[1] in skip: continue # Ignore certain edits
            coder = int(edit[-1])
            if coder != 0: continue # Ignore other coders
            span = edit[0].split()[1:] # Ignore "A "
            start = int(span[0])
            end = int(span[1])
            cor = edit[2].split()
            cor_sent[start+offset:end+offset] = cor
            offset = offset-(end-start)+len(cor)
        out.write(" ".join(cor_sent)+"\n")      

In [4]:
# Incorrect Sentences Preprocessing

fl1 = open("lang8.train.auto.bea19.m2","r")
sent1 = fl1.read()

Each_Sent = sent1.split("\n\n")

Incorrect = []
for i in range(len(Each_Sent)):
    temp = Each_Sent[i].split("\n")
    temp = temp[0]
    temp = temp.split(" ")
    temp = temp[1:]# ignore S
    temp = ' '.join(temp)
    Incorrect.append(temp)

In [5]:
# Correct Sentences Preprocessing

fl2 = open("lang8.train.auto.bea19.txt","r")
sent2 = fl2.read()

Correct = sent2.split("\n")

FileNotFoundError: [Errno 2] No such file or directory: 'lang8.train.auto.bea19.txt'

In [None]:
# storing Correct and Incorrect sentence pair into dataframe
df = pd.DataFrame()
df["Correct"] = Correct
df["Incorrect"] = Incorrect

#store into csv file named data.csv
df.to_csv("data.csv",index=False)

In [None]:
# reading data
data = pd.read_csv("data.csv")
data.head()

Data Analysis

In [None]:
index = []
for i in range(len(data.values)):
    if data.values[i][0] == data.values[i][1]:
        index.append(i)
            
data = data.drop(index)
data.shape

# Missing Values Check

In [None]:
data.isnull().values.any()

In [None]:
# Remove rows if contain null value
# data.dropna(inplace=True)
# data.shape

# Duplicate Check

In [None]:
data.duplicated().values.any()

In [None]:
# Remove if any
# data.drop_duplicates(inplace=True)
# data.reset_index(inplace=True,drop=True)
# data.shape

# Cleaning Data

In [None]:
def clean(text):
    """
    takes string as input and
    removes characters inside (),{},[] and <>
    removes characters like -+@#^/|*(){}$~`
    we not not removing ,.!-:;"' as these characters are present in english language 
    """
    text = re.sub('<.*>', '', text)
    text = re.sub('\(.*\)', '', text)
    text = re.sub('\[.*\]', '', text)
    text = re.sub('{.*}', '', text)
    text = re.sub("[-+@#^/|*(){}$~`<>=_]","",text)
    text = text.replace("\\","")
    text = re.sub("\[","",text)
    text = re.sub("\]","",text)
    text = re.sub("[0-9]","",text)
    return text

data["Correct"] = data["Correct"].apply(clean)
data["Incorrect"] = df["Incorrect"].apply(clean)

In [None]:
data.isnull().values.any()

# Length Of Correct Sentences Analysis

In [None]:
def percentile(low,high,step,lst1):
    """
    this function takes low, high, step size as input and prints percentiles accordingly
    """
    for i in np.arange(low,high,step):
        print(i,"percentile is ",np.percentile(lst1, i))

At Charecter Level

For Correct Sentences

In [None]:
def sen_to_char(sen):
    return len([i for i in sen])

Corr_length = data["Correct"].apply(sen_to_char)
Corr_length = list(Corr_length)
len(Corr_length)

In [None]:
percentile(0,101,10,Corr_length)

print("***************************************************************")

percentile(90,101,1,Corr_length)

In [None]:
# removing those data points which have Correct sentence of length more than a certain value
index = []
for i in range(len(Corr_length)):
    if Corr_length[i] > 100:
        index.append(i)
        
data.drop(index,inplace=True)
data.reset_index(inplace=True,drop=True)
print(data.shape)

For Incorrect Sentences

In [None]:
Incorr_length = data["Incorrect"].apply(sen_to_char)
Incorr_length = list(Incorr_length)
len(Incorr_length)

In [None]:
percentile(0,101,10,Incorr_length)

print("***********************************************************")

percentile(90,101,1,Incorr_length)

In [None]:
# removing those data points which have Incorrect sentence of length more than a certain value

index = []
for i in range(len(Incorr_length)):
    if Incorr_length[i] > 100:
        index.append(i)
        
data.drop(index,inplace=True)
data.reset_index(inplace=True,drop=True)
print(data.shape)

# At Word Level

In [None]:
Corr_length = data["Correct"].str.split().apply(len)
Corr_length = list(Corr_length)

In [None]:
percentile(0,101,10,Corr_length)

print("***************************************************************")

percentile(90,101,1,Corr_length)

In [None]:
# removing those data points which have correct sentence of length more than a certain value

index = []
for i in range(len(Corr_length)):
    if Corr_length[i] > 100:
        index.append(i)
        
data.drop(index,inplace=True)
data.reset_index(inplace=True,drop=True)
print(data.shape)

In [None]:
Incorr_length = data["Incorrect"].str.split().apply(len)
Incorr_length = list(Incorr_length)

In [None]:
percentile(0,101,10,Incorr_length)

print("***************************************************************")

percentile(90,101,1,Incorr_length)

In [None]:
# removing those data points which have incorrect sentence of length more than a certain value

index = []
for i in range(len(Incorr_length)):
    if Incorr_length[i] > 100:
        index.append(i)
        
data.drop(index,inplace=True)
data.reset_index(inplace=True,drop=True)
print(data.shape)

# Data Splitting

In [None]:
train_temp, cv = train_test_split(data, test_size=0.15)
train, test = train_test_split(train_temp, test_size=0.15)

In [None]:
# Store dataset into disk

train.to_csv("train_word.csv",index=False)
cv.to_csv("cv_word.csv",index=False)
test.to_csv("test_word.csv",index=False)

# Unique words in the dataset

In [None]:
train_data = pd.read_csv("train_word.csv")
cv_data = pd.read_csv("cv_word.csv")
test_data = pd.read_csv("test_word.csv")

Unique words in train dataset

In [None]:
# for Incorrect sentences

split_sent = np.array(train_data["Incorrect"].str.split())

unique = []
for i in split_sent:
    if type(i) == float:
        continue
    for j in i:
        unique.append(j)

unique_words_train_incorr = set(unique)
print("total number of unique words in Incorrect sentences in train data are",len(unique_words_train_incorr))

In [None]:
# for Correct sentences

split_sent = np.array(train_data["Correct"].str.split())

unique = []
for i in split_sent:
    for j in i:
        unique.append(j)

unique_words_train_corr = set(unique)
print("total number of unique words in Correct sentences in train are",len(unique_words_train_corr))

Unique words in cv dataset

In [None]:
# for Incorrect sentences

split_sent = np.array(cv_data["Incorrect"].str.split())

unique = []
for i in split_sent:
    for j in i:
        unique.append(j)

unique_words_cv_incorr = set(unique)
print("total number of unique words in Incorrect sentences in cv data are",len(unique_words_cv_incorr))

In [None]:
# for Correct sentences

split_sent = np.array(cv_data["Correct"].str.split())

unique = []
for i in split_sent:
    for j in i:
        unique.append(j)

unique_words_cv_corr = set(unique)
print("total number of unique words in Correct sentences in cv data are",len(unique_words_cv_corr))

Unique words in test dataset

In [None]:
# for Incorrect sentences

split_sent = np.array(test_data["Incorrect"].str.split())

unique = []
for i in split_sent:
    for j in i:
        unique.append(j)

unique_words_test_incorr = set(unique)
print("total number of unique words in Incorrect sentences in test data are",len(unique_words_cv_incorr))

In [None]:
# for Correct sentences

split_sent = np.array(test_data["Incorrect"].str.split())

unique = []
for i in split_sent:
    for j in i:
        unique.append(j)

unique_words_test_corr = set(unique)
print("total number of unique words in Correct sentences in test are",len(unique_words_test_corr))

# Model Preparation

In [None]:
train_file = pd.read_csv("train.csv")
cv_file = pd.read_csv("cv.csv")
test_file = pd.read_csv("test.csv")

We will add $ to each sentence which will be input to decoder also We will add @ to each sentence which will be output of decoder

In [None]:
train_file["Correct_inp"] = "$" + train_file["Correct"].astype(str) # $ denotes start of sentence
train_file["Correct_out"] = train_file["Correct"].astype(str) + "@" # @ denotes end of sentence

cv_file["Correct_inp"] = "$" + cv_file["Correct"].astype(str)
cv_file["Correct_out"] = cv_file["Correct"].astype(str) + "@"

In [None]:
# code reference https://stackoverflow.com/questions/45735070/keras-text-preprocessing-saving-tokenizer-object-to-file-for-scoring
# loading saved tokenizer

with open("tokenizer_incorr.pickle","rb") as temp1:
    tokenizer_incorr = pickle.load(temp1)
    
with open("tokenizer_corr_inp.pickle","rb") as temp2:
    tokenizer_corr_inp = pickle.load(temp2)
    
with open("tokenizer_corr_out.pickle","rb") as temp3:
    tokenizer_corr_out = pickle.load(temp3)

Tokenization Sentences For Feeding To Encoder

In [None]:
# tokenizer_incorr = Tokenizer(filters="",char_level=True,lower=False)
#tokenizer_incorr.fit_on_texts(train["incorrect"].values)

Incorr_train = np.array(tokenizer_incorr.texts_to_sequences(train_file["Incorrect"].values))
Incorr_cv = np.array(tokenizer_incorr.texts_to_sequences(cv_file["Incorrect"].values))
print("Vocab size of Incorrrect sentences is",len(tokenizer_incorr.word_index))

Tokenizing Senetence For Feeding To Decoder As Inpput

In [None]:
# tokenizer_corr_inp = Tokenizer(filters="",char_level=True,lower=False)
#tokenizer_corr_inp.fit_on_texts(train["correct_inp"].values)

Corr_train_inp = np.array(tokenizer_corr_inp.texts_to_sequences(train_file["Correct_inp"].values))
Corr_cv_inp = np.array(tokenizer_corr_inp.texts_to_sequences(cv_file["Correct_inp"].values))
print("vocab size of Corrrect sentences is",len(tokenizer_corr_inp.word_index))

Tokenizing Senetence That Will Be Output Of Decoder

In [None]:
# tokenizer_corr_out = Tokenizer(filters="",char_level=True,lower=False)
# tokenizer_corr_out.fit_on_texts(train["correct_out"].values)

Corr_train_out = np.array(tokenizer_corr_out.texts_to_sequences(train_file["Correct_out"].values))
Corr_cv_out = np.array(tokenizer_corr_inp.texts_to_sequences(cv_file["Correct_out"].values))

Padding Train, Cv, Test

In [None]:
Incorr_train = np.array(pad_sequences(Incorr_train,maxlen=110, padding="post", truncating='post'))
Corr_train_inp = np.array(pad_sequences(Corr_train_inp, maxlen=110, padding="post", truncating='post'))
Corr_train_out = np.array(pad_sequences(Corr_train_out, maxlen=110, padding="post", truncating='post'))

Incorr_cv = np.array(pad_sequences(Incorr_cv, maxlen=110, padding="post", truncating='post'))
Corr_cv_inp = np.array(pad_sequences(Corr_cv_inp, maxlen=110, padding="post", truncating='post'))
Corr_cv_out = np.array(pad_sequences(Corr_cv_out, maxlen=110, padding="post", truncating='post'))

# MODELLING

Encoder-Decoder

In [None]:
# code taken from "https://github.com/mridul1012/Grammatical-Error-Correction-with-Neural-Networks/tree/main"

############################## Encoder class #############################################################

class Encoder(tf.keras.layers.Layer):
    '''
    Encoder model -- That takes a input sequence and returns encoder-outputs,encoder_final_state_h,encoder_final_state_c
    '''

    def __init__(self,inp_vocab_size,embedding_size,lstm_size,input_length):
        
        super().__init__()
        self.lstm_size = lstm_size
        self.embedding = Embedding(input_dim=inp_vocab_size, output_dim=embedding_size, input_length=input_length,
                           mask_zero=True,name="embedding_layer_encoder")
        self.lstmcell = tf.keras.layers.LSTMCell(lstm_size)
        self.encoder_lstm = RNN(self.lstmcell,return_sequences=True, return_state=True)


    def call(self,input_sequence,states):
        '''
          This function takes a sequence input and the initial states of the encoder.
          Pass the input_sequence input to the Embedding layer, Pass the embedding layer ouput to encoder_lstm
          returns -- encoder_output, last time step's hidden and cell state
        '''

        output1 = self.embedding(input_sequence)
        enco_output, enco_state_h, enco_state_c = self.encoder_lstm(output1, initial_state=states)
        return enco_output, enco_state_h, enco_state_c

    
    def initialize_states(self,batch_size):

        initial_hidden_state = tf.zeros([batch_size,self.lstm_size])
        initial_cell_state = tf.zeros([batch_size,self.lstm_size])
        
        return [initial_hidden_state,initial_cell_state]

############################## Decoder class #############################################################

class Decoder(tf.keras.layers.Layer):
    '''
    Encoder model -- That takes a input sequence and returns output sequence
    '''

    def __init__(self,out_vocab_size,embedding_size,lstm_size,input_length):

        super().__init__()
        self.lstm_size = lstm_size
        self.embedding = Embedding(input_dim=out_vocab_size, output_dim=embedding_size, input_length=input_length,
                           mask_zero=True,name="embedding_layer_encoder")
        self.lstmcell = tf.keras.layers.LSTMCell(lstm_size)
        self.decoder_lstm = RNN(self.lstmcell,return_sequences=True, return_state=True)

    def call(self,target_sequence,initial_states):
        '''
          This function takes a sequence input and the initial states of the encoder.
          Pass the input_sequence input to the Embedding layer, Pass the embedding layer ouput to decoder_lstm
        
          returns -- decoder_output,decoder_final_state_h,decoder_final_state_c
        '''
        output2 = self.embedding(target_sequence)
        deco_output, deco_state_h, deco_state_c = self.decoder_lstm(output2, initial_state=initial_states)
      
        return deco_output, deco_state_h, deco_state_c

##############################encoder decoder class#############################################################    
    
qw_state = 0
class Encoder_decoder(tf.keras.Model):
    
    def __init__(self,inp_vocab_size,out_vocab_size,embedding_size,lstm_size,input_length,batch_size,*args):
        
        super().__init__()
        self.encoder = Encoder(inp_vocab_size,embedding_size,lstm_size,input_length)
        #print("output vocab size in encoder decoder class",out_vocab_size)
        self.decoder = Decoder(out_vocab_size,embedding_size,lstm_size,input_length)
        self.dense   = Dense(out_vocab_size)#, activation='softmax')
        self.batch = batch_size
    
    
    def call(self,data,*args):
        '''
        A. Pass the input sequence to Encoder layer -- Return encoder_output,encoder_final_state_h,encoder_final_state_c
        B. Pass the target sequence to Decoder layer with intial states as encoder_final_state_h,encoder_final_state_C
        C. Pass the decoder_outputs into Dense layer 
        
        Return decoder_outputs
        '''
        
        input,output = data[0], data[1]
        # initializing initial states of encoder
        l = self.encoder.initialize_states(self.batch)
        qw_state = l
        #print("WE ARE INITIALIZING encoder WITH initial STATES as zeroes :",l[0].shape, l[1].shape)
        #print("hello")
        encoder_output,encoder_final_state_h,encoder_final_state_c = self.encoder(input,l)
        #print("ENCODER ==> OUTPUT SHAPE",encoder_output.shape)
        #print("ENCODER ==> HIDDEN STATE SHAPE",encoder_final_state_h.shape)
        #print("ENCODER ==> CELL STATE SHAPE", encoder_final_state_c.shape)
        #print("hi")
        m = list((encoder_final_state_h,encoder_final_state_c))
        decoder_output,decoder_final_state_h,decoder_final_state_c = self.decoder(output,m)
        #print("decoder OUTPUT SHAPE",decoder_output.shape)
        #print("type of decoder output is ",type(decoder_output))
        #x = self.flatten(decoder_output)
        #print("shape of x ",x.shape)
        qw_output = self.dense(decoder_output)
        #print("FINAL OUTPUT SHAPE",qw_output.shape)
        return qw_output

# Encoder Decoder Model With Character Embedding

In [None]:
inp_vocab_size = 63
out_vocab_size = 64
embedding_dim=100
input_length=110
lstm_size=256
batch_size=1024
#model = Encoder_decoder(inp_vocab_size,out_vocab_size,embedding_dim,lstm_size,input_length,batch_size)
# custom loss function
optimizer = tf.keras.optimizers.Adam()

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
#defining custom loss function which will not consider loss for padded zeroes
# code taken from attention assignment
def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)
#model.compile(optimizer=optimizer,loss=loss_function)

Model Traning

In [None]:
model.fit(x=[Incorr_train, Corr_train_inp], y=Corr_train_out, epochs=10, batch_size=512)

In [None]:
model.summary()

In [None]:
model = Encoder_decoder(inp_vocab_size,out_vocab_size,embedding_dim,lstm_size,input_length,batch_size)
model.compile(optimizer=optimizer,loss=loss_function)
model.train_on_batch([Incorr_train[:1024],Corr_train_inp[:1024]],Corr_train_out[:1024])

# Load the state of the old model

model.load_weights('enco_dec_char')

In [None]:
corr_dict = tokenizer_corr_out.word_index
inv_corr = {v: k for k, v in corr_dict.items()}

def predict(input_sentence):
    """
    this function takes incorrect input sentences s input and retirns correct sentences
    """
    input_sentence = tokenizer_incorr.texts_to_sequences([input_sentence])
    initial_hidden_state = tf.zeros([1,256])
    initial_cell_state = tf.zeros([1,256])
    qwst = [initial_hidden_state,initial_cell_state]
    pred_total = []
    enc_output, enc_state_h, enc_state_c = model.layers[0](np.expand_dims(input_sentence[0],0),qwst)
    states_values = [enc_state_h, enc_state_c]
    pred = []
    sentence = []
    cur_vec = np.array([[16]])#np.ones((1, 1),dtype='int')
    for i in range(110):
        dec_output, dec_state_h, dec_state_c = model.layers[1](cur_vec,states_values)
        infe_output=model.layers[2](dec_output)
        states_values = [dec_state_h, dec_state_c]
        cur_vec = np.reshape(np.argmax(infe_output), (1, 1))
        if inv_corr[cur_vec[0][0]] == '@':
            break
            #print("at time step ",i," the word is ", cur_vec)
        pred.append(cur_vec[0][0])
    for i in pred:
        sentence.append(inv_corr[i])
    #return pred
    return "".join(sentence)

# GLUE Score on Test Data

In [None]:
from nltk.translate.gleu_score import sentence_gleu

gleu_score_test = 0
length = 1000

for i in range(length):
    reference = [test_file["Correct"].values[i:i+1][0].split()]
    candidate = predict(test_file["Incorrect"].values[i:i+1][0]).split()
    gleu_score_test = gleu_score_test + sentence_gleu(reference, candidate)
print("Final GLEU Score on Test data are",gleu_score_test/length)

Result Prediction On Train Data

In [None]:
# Predicted Sentences

for i in train_file["Incorrect"].values[:10]:
  print(predict(i))

In [None]:
# Actual Sentences

train_file["Correct"].values[:10]

Result Prediction On CV Data

In [None]:
# Predicted Sentences

for i in cv_file["Incorrect"].values[:10]:
  print(predict(i))

In [None]:
# Actual Sentences

cv_file["Correct"].values[:10]

Result Prediction On Test Data

In [None]:
# Predicted Sentences

for i in test_file["Incorrect"].values[:10]:
  print(predict(i))

In [None]:
# Actual Sentences

test_file["Correct"].values[:10]