## Import Libraries

In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np

## Master function: convert the M2 file to a DataFrame

In [3]:
# Adapted from https://www.cl.cam.ac.uk/research/nl/bea2019st/data/corr_from_m2.py
def m2_to_df(m2_file_path,id=0):
    '''Convert an M2-format annotation file into a pandas DataFrame.

    Each blank-line-separated block of the file holds one source sentence
    (a line starting with "S ") followed by zero or more edit lines of the
    form "A start end|||type|||correction|||...|||coder". The edits of the
    selected annotator are applied to the source tokens to rebuild the
    corrected sentence.

    Parameters
    ----------
    m2_file_path : str or path-like
        Location of the M2 file. It is read as UTF-8.
    id : int, default 0
        Annotator (coder) id whose edits are applied; edits from other
        coders are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per sentence with two string columns: "correct" (source
        sentence with the edits applied) and "incorrect" (original source
        sentence). An empty file yields an empty frame with both columns.
    '''
    # Context manager closes the handle deterministically; explicit UTF-8 keeps
    # the result independent of the platform's default encoding.
    with open(m2_file_path, encoding="utf-8") as m2_file:
        blocks = m2_file.read().strip().split("\n\n")

    # Do not apply edits with these error types
    skip = {"noop", "UNK", "Um"}

    correct_sent_array = []
    incorrect_sent_array = []

    for block in tqdm(blocks):
        # Drop blank lines so that an empty file or a stray extra newline
        # between blocks neither creates a phantom row nor a malformed edit.
        lines = [line for line in block.split("\n") if line.strip()]
        if not lines:
            continue

        incor_sent = lines[0].split()[1:] # Ignore "S "
        cor_sent = incor_sent.copy()

        # Edit spans refer to positions in the ORIGINAL sentence; `offset`
        # tracks how far earlier edits have shifted the tokens that follow.
        offset = 0
        for edit in lines[1:]:
            fields = edit.split("|||")
            if fields[1] in skip: continue # Ignore certain edits
            coder = int(fields[-1])
            if coder != id: continue # Ignore other coders
            span = fields[0].split()[1:] # Ignore "A "
            start = int(span[0])
            end = int(span[1])
            cor = fields[2].split()
            cor_sent[start+offset:end+offset] = cor
            offset = offset-(end-start)+len(cor)

        incorrect_sent_array.append(' '.join(incor_sent))
        correct_sent_array.append(' '.join(cor_sent))

    return pd.DataFrame({
        "correct": correct_sent_array,
        "incorrect": incorrect_sent_array,
    })

In [4]:
# Lang-8 training split in M2 format (presumably on a Google Drive mounted in Colab).
# NOTE(review): absolute, environment-specific path — adjust before running elsewhere.
m2_file_path = (
    '/content/drive/MyDrive/Self Case studies/CS02 Grammar Error Corrector/'
    'lang8.bea19/lang8.train.auto.bea19.m2'
)
# Build the correct/incorrect sentence-pair table using the default annotator (id=0).
final_df = m2_to_df(m2_file_path)

100%|██████████| 1037561/1037561 [00:06<00:00, 158261.04it/s]


## Check data

In [None]:
# Eyeball five randomly chosen sentence pairs (unseeded, so the rows differ on every run).
final_df.sample(n=5)

Unnamed: 0,correct,incorrect
200156,But I could n't enjoy the beautiful view from ...,But I could n't enjoy the beatuful view from t...
380530,Everyone complained and the others laughed .,Anyone complained and the others laughed .
248871,This comic is very interesting .,This comic is very interesting .
114901,"I have never been there , but I watced about i...","I have never been to there , but I watced on T..."
70067,Please correct my English !,Please correct my English !


In [None]:
def show_random_datapoints(n_samples,df):
    '''Print up to `n_samples` distinct random sentence pairs for inspection.

    Only pairs whose corrected sentence has more than five
    whitespace-separated tokens and differs from the incorrect sentence are
    shown. Rows are visited in a random order (NumPy's global random state)
    until `n_samples` qualifying pairs have been printed, so fewer pairs
    appear only when the frame does not contain enough qualifying rows.

    Parameters
    ----------
    n_samples : int
        Maximum number of pairs to print.
    df : pandas.DataFrame
        Frame with string columns "correct" and "incorrect".

    Returns
    -------
    None
        The pairs are written to standard output.
    '''
    correct_col = df['correct'].to_numpy()
    incorrect_col = df['incorrect'].to_numpy()

    shown = 0
    # A permutation visits every row at most once: no duplicates, no
    # out-of-range position, and an empty frame simply prints nothing.
    for row in np.random.permutation(len(df)):
        if shown >= n_samples:
            break

        correct = correct_col[row]
        incorrect = incorrect_col[row]
        if len(correct.split())>5 and correct != incorrect:
            print(f"CORRE: {correct}")
            print(f"INCOR: {incorrect}")
            print('*'*100)
            shown += 1

In [None]:
# Print at most ten random pairs in which the correction actually changed the sentence.
show_random_datapoints(n_samples=10, df=final_df)

CORRE: so if anyone knows of some good ways to get good sleep , please let me know : )
INCOR: so if anyone knows something good way to good sleep , please let me know : )
****************************************************************************************************
CORRE: Honestly speaking , my weak point is dealing with `` mama 's boys `` .
INCOR: Honestly speaking , my weak point is dealing with `` mama 's boy `` .
****************************************************************************************************
CORRE: I am curious about how they learn to do it , besides having quick reaction , they must have great balance .
INCOR: I am curious about how they practice it , they must have great balance besides quick reaction .
****************************************************************************************************
CORRE: Today , I called the hospital and I was able to get an apointment .
INCOR: Today , I try to call hospital and I was able to get an apointment .
**

In [None]:
# Sanity check: (number of sentence pairs, number of columns).
final_df.shape

(1037561, 2)

In [5]:
# Persist the sentence pairs as CSV, omitting the index column.
# NOTE(review): this is an absolute Windows path, while the M2 input is read from
# /content/drive — confirm which environment this cell is meant to run in.
final_df.to_csv('C:/Users/prash/Desktop/Now100 Assignment/data/final_df_.csv',index=False)