## Import libraries & data

In [1]:
import os
import re
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

In [2]:
# Location of the raw sentence-pair CSV. Set the FINAL_DF_CSV environment variable to point
# at your own copy; the fallback is the original author's local path, so the default
# behaviour is unchanged.
DATA_PATH = os.environ.get('FINAL_DF_CSV', 'C:/Users/prash/Desktop/New_now100/data/final_df_.csv')
final_df = pd.read_csv(DATA_PATH)

In [3]:
final_df.sample(5)

Unnamed: 0,correct,incorrect
1891,2 protoing Brackets Ã¢â¬â This will be my m...,2. Mounting Brackets Ã¢â¬â These will make ...
3776,Especially PP200C Generator Output Frequency: ...,PP200C Generator Output Frequency: 50 Hz Volta...
3695,How many people delay or did energy nothing be...,How many people delayed or did not get care be...
7536,c. Employs less than 150 have fult-time and pa...,c. Employs fewer than 150 full-time and part-t...
9577,"Not blame yourself, this prerequisite touch bu...","Don't blame yourself, this prerequisite touche..."


In [4]:
final_df.shape

(10003, 2)

### Adding length features

In [5]:
# Character length of each sentence, measured on the text as loaded (values are cast to
# str first, exactly as before).
for text_col in ('correct', 'incorrect'):
    final_df[f'{text_col}_char_count'] = final_df[text_col].astype('str').str.len()

In [6]:
# Number of whitespace-separated tokens in each sentence, measured on the text as loaded.
for text_col in ('correct', 'incorrect'):
    final_df[f'{text_col}_word_count'] = final_df[text_col].astype('str').str.split().str.len()

In [7]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
3172,"Definitely Jen, it is so much fun sharing book...","Definitely Jen, it is so much fun sharing book...",65,69,12,13
1533,It was an afternoon when both fans and players...,It was an afternoon when both fans and players...,143,141,26,25
104,"Now, maybe itâs not going to be the greatest...","Now, maybe itâs not going to be the greatest...",163,152,32,30
6890,Maing links from these huge internet sates is ...,Making links from these huge internet sites is...,170,176,27,29
5077,The facials carried out at same Beverly Hills ...,The facials are carried out at the same Beverl...,114,122,16,18


## Preprocessing

### Removing Missing/NA 

In [8]:
# Per-column count of missing values, shown as a one-column table.
final_df.isna().sum().to_frame(name='missing_count')

Unnamed: 0,missing_count
correct,0
incorrect,0
correct_char_count,0
incorrect_char_count,0
correct_word_count,0
incorrect_word_count,0


In [9]:
final_df[final_df.isna().any(axis=1)]

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count


In [10]:
# Drop every row that has a missing value in any column, then renumber the index from 0.
final_df = final_df.dropna().reset_index(drop=True)

In [11]:
final_df.shape

(10003, 6)

In [12]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
2460,"When coming to see tutor, make sure you are pr...","When coming to see a tutor, make sure you are ...",53,55,10,11
3955,arvato systems As global next generation IT sy...,arvato systems As global next generation IT sy...,228,232,34,35
462,"Fit on Mossberg 500, 590 12ga Shotgun.","Fit on Mossberg 500, 590 12ga shotguns.",38,39,7,7
9232,"NEW YORK --(BUSINESS WIRE)-- On March 24, 2015...","NEW YORK --(BUSINESS WIRE)-- On March 24, 2015...",284,285,48,48
1573,Depenty generated in 1.0426 seconds.,Page generated in 1.0426 seconds.,36,33,5,5


### Remove pairs where correct and incorrect are identical

In [13]:
print(f"total number of duplicate pairs: {len(final_df[final_df['correct']==final_df['incorrect']])}")

total number of duplicate pairs: 59


In [14]:
print(f"total number of duplicate pairs: {len(final_df[final_df['correct']==final_df['incorrect']])}")

total number of duplicate pairs: 59


In [15]:
final_df[final_df['correct']==final_df['incorrect']].sample(10)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
6201,Describe one of your passions.,Describe one of your passions.,30,30,5,5
4607,"Toothbrushes, with their high profile, tend to...","Toothbrushes, with their high profile, tend to...",79,79,13,13
5433,"We focus on Chinese medicine, so our therapies...","We focus on Chinese medicine, so our therapies...",68,68,10,10
4318,white squirrels olney il 5 .,white squirrels olney il 5 .,28,28,6,6
9722,There will be a memorial service at Bethel Bap...,There will be a memorial service at Bethel Bap...,96,96,17,17
1857,"particular âmonth of Mayâ (âC.T., Prol.â...","particular âmonth of Mayâ (âC.T., Prol.â...",54,54,7,7
8712,"December 21 at 11:19 p.m., eastern standard time.","December 21 at 11:19 p.m., eastern standard time.",49,49,8,8
7940,This version was contributed by Rosemarie Baro...,This version was contributed by Rosemarie Baro...,75,75,10,10
8905,When is Chronic Ink Tattoo Vancouver Opening?,When is Chronic Ink Tattoo Vancouver Opening?,45,45,7,7
9081,How Does Toro Lawn Mower HORIZON Know-how Work?,How Does Toro Lawn Mower HORIZON Know-how Work?,47,47,8,8


In [16]:
# Keep only the pairs whose 'correct' and 'incorrect' texts differ.
# The index is not reset here, so it keeps gaps where rows were removed.
final_df = final_df[final_df['correct']!=final_df['incorrect']]

In [17]:
final_df.shape

(9944, 6)

In [18]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
4204,To uncompress a zip file click on the file and...,"To un-compress a ZIP file, click on the file a...",104,107,21,20
3068,Is the MPFL another connection with medial ret...,Is the MPFL another term for medial retinaculum ?,56,49,9,9
2287,"Clearly see every stitch, fabric a d thread co...","Clearly see every stitch, fabric and thread co...",78,78,14,13
7185,"In addition to providing such recommendations,...","In addition to providing such recommendations,...",224,211,34,33
8875,"The Central defender Dandy, 17, was brought in...","Central defender Dandy, 17, has been brought i...",139,134,23,23


### Remove Duplicates

In [19]:
print(f'total number of duplicates: {final_df.duplicated().sum()}')

total number of duplicates: 0


In [20]:
final_df[final_df.duplicated(keep=False)].sort_values('correct')

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count


In [21]:
# Remove rows that are exact duplicates of an earlier row (all columns compared),
# then renumber the index from 0.
final_df = final_df.drop_duplicates().reset_index(drop=True)

In [22]:
final_df.shape

(9944, 6)

In [23]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
8360,"Quilt by the piece with 1 1/2"" grids in the la...","Quilted by the piece, with 1 1/2"" grids in the...",121,122,24,24
9120,We have matching mens hawaiian shirt available,We have matching mens hawaiian shirt available.,46,47,7,7
9225,Mike Marley is vice president in World Steel E...,Mike Marley is vice president of World Steel E...,207,216,30,32
3475,"Logan Allison, of living Gallipolis and Breann...","Logan Allison, of Gallipolis, and Breanne Bonn...",168,166,26,25
5707,"See Philip H. Towner, The Letters to Timothy a...","See Philip H. Towner, The Letters to Timothy a...",247,247,34,35


### Remove short sentences



In [24]:
# Count the rows the following filter will drop. That filter keeps only
# incorrect_char_count > 2, so the matching check is <= 2 (the previous "< 2"
# silently missed sentences of exactly 2 characters).
final_df[final_df['incorrect_char_count']<=2].shape

(0, 6)

In [25]:
# Keep rows whose 'incorrect' text is longer than 2 characters (rows with 2 or fewer
# are dropped), then renumber the index from 0.
final_df = final_df[final_df['incorrect_char_count']>2].reset_index(drop=True)

In [26]:
final_df.shape

(9944, 6)

In [27]:
# Count the rows the following filter will drop. That filter keeps only
# correct_char_count > 2, so the matching check is <= 2 (the previous "< 2"
# silently missed sentences of exactly 2 characters).
final_df[final_df['correct_char_count']<=2].shape

(0, 6)

In [28]:
# Left disabled: the check in the previous cell selects 0 rows, and calling .sample(10)
# on an empty frame raises "ValueError: a must be greater than 0 unless no samples are taken".
#final_df[final_df['correct_char_count']<2].sample(10)

ValueError: a must be greater than 0 unless no samples are taken

In [29]:
# Keep rows whose 'correct' text is longer than 2 characters (rows with 2 or fewer
# are dropped), then renumber the index from 0.
final_df = final_df[final_df['correct_char_count']>2].reset_index(drop=True)

In [30]:
final_df.shape

(9944, 6)

### Clean text

In [31]:
#https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/
# Contraction -> expansion table. Entries whose expansion starts with a space
# (" not", " is", ...) are generic suffixes that attach to whatever word precedes them;
# every other entry is a whole-word contraction.
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have","n\'t":" not","\'re":" are","\'s": " is","\'d":" would",
                     "\'ll": " will","\'t":" not","\'ve": " have","\'m":" am"}

# Kept so expand_contractions can tell "caller used the default table" apart from
# "caller passed a custom table" (its parameter shadows the module-level name).
_DEFAULT_CONTRACTIONS = contractions_dict


def _compile_contractions(mapping):
    """Return ``(compiled_regex, lowercase_lookup)`` for a contraction table.

    The regex is case-insensitive and tries longer keys first, so "can't've"
    wins over its prefix "can't". Each key is anchored so it cannot fire in the
    wrong place:

    * suffix entries (expansion starts with a space, e.g. "'s" -> " is") must
      directly follow a word character, so an opening quote as in 'sure' is
      left alone;
    * whole-word entries must not start in the middle of a word, so "let's"
      no longer matches inside "tablet's".

    ``lowercase_lookup`` maps each lower-cased key to its expansion and is used
    for matches whose case differs from the table (e.g. "Can't").
    """
    alternatives = []
    for key in sorted(mapping, key=len, reverse=True):
        lookbehind = r'(?<=\w)' if mapping[key].startswith(' ') else r'(?<!\w)'
        alternatives.append(lookbehind + re.escape(key) + r'(?!\w)')
    pattern = re.compile('(%s)' % '|'.join(alternatives), flags=re.IGNORECASE)
    lowered = {key.lower(): value for key, value in mapping.items()}
    return pattern, lowered


# Regular expression for finding contractions
contractions_re, _contractions_lower = _compile_contractions(contractions_dict)

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    """Replace English contractions in ``text`` with their expanded form.

    Parameters
    ----------
    text : str
        Sentence to process.
    contractions_dict : dict, optional
        Contraction -> expansion table. Defaults to the module-level table; a
        custom table gets its own matcher built from its keys.

    Returns
    -------
    str
        ``text`` with every recognised contraction expanded. Matching ignores
        case, and a contraction that starts with a capital letter yields a
        capitalised expansion ("Can't" -> "Cannot").

    Notes
    -----
    "'s" is always expanded to " is", so possessives ("John's") are rewritten
    as well; that comes from the table itself and is left unchanged here.
    """
    if contractions_dict is _DEFAULT_CONTRACTIONS:
        pattern, lowered = contractions_re, _contractions_lower
    elif not contractions_dict:
        # An empty table would compile to a pattern that matches everywhere.
        return text
    else:
        pattern, lowered = _compile_contractions(contractions_dict)

    def replace(match):
        found = match.group(0)
        # Exact-case hit: use the table entry verbatim (keeps "I'd" -> "I would").
        if found in contractions_dict:
            return contractions_dict[found]
        expansion = lowered.get(found.lower())
        if expansion is None:
            # Unicode case-folding matched something lower() cannot map back; leave it.
            return found
        if found[0].isupper() and expansion[:1].islower():
            expansion = expansion[0].upper() + expansion[1:]
        return expansion
    return pattern.sub(replace, text)

In [32]:
# https://stackoverflow.com/a/47091490/4084039
def clean(text):
    """Strip bracketed asides, symbols and digits from ``text`` and normalise whitespace.

    Steps, in order:

    1. Remove ``<...>``, ``(...)``, ``[...]`` and ``{...}`` spans that are followed by
       whitespace, together with the whitespace around them. The span is replaced by a
       single space so the neighbouring words stay separated ("foo (bar) baz" ->
       "foo baz"); replacing with nothing used to glue them into "foobaz".
    2. Delete the remaining symbol characters, brackets, backslashes and digits.
    3. Collapse every run of whitespace to one space and trim both ends.

    Parameters
    ----------
    text : str
        Raw sentence.

    Returns
    -------
    str
        The cleaned sentence (may be empty).
    """
    bracketed_spans = (r'\s*<.*?>\s', r'\s*\(.*?\)\s', r'\s*\[.*?\]\s', r'\s*\{.*?\}\s')
    for span in bracketed_spans:
        text = re.sub(span, ' ', text)
    # One pass covers every character the separate substitutions used to delete.
    text = re.sub(r'[-+@#^/|*(){}$~<>=_%:;\[\]\\0-9]', '', text)
    return ' '.join(text.split())

In [33]:
# Clean the 'correct' sentences first, then expand contractions on the cleaned text.
# NOTE(review): the *_char_count / *_word_count columns are not recomputed here, so they
# still describe the text as it was before cleaning - confirm that is intended.
final_df['correct'] = (
    final_df['correct']
    .progress_apply(clean)
    .progress_apply(expand_contractions)
)

  0%|          | 0/9944 [00:00<?, ?it/s]

  0%|          | 0/9944 [00:00<?, ?it/s]

In [34]:
# Clean the 'incorrect' sentences first, then expand contractions on the cleaned text.
# NOTE(review): the *_char_count / *_word_count columns are not recomputed here, so they
# still describe the text as it was before cleaning - confirm that is intended.
final_df['incorrect'] = (
    final_df['incorrect']
    .progress_apply(clean)
    .progress_apply(expand_contractions)
)

  0%|          | 0/9944 [00:00<?, ?it/s]

  0%|          | 0/9944 [00:00<?, ?it/s]

In [35]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
3557,The list is too huge and I will be thanking ev...,"The list is huge, and I will be thanking every...",441,465,82,84
2895,There are some places where we can and must ac...,There are some places where we can and must ac...,181,185,31,32
5546,And we wanted all of her patty peeps to be atb...,And we wanted all of her peeps to be able to c...,80,72,18,17
2746,matthew Dietz litigation director of Disabilit...,"Matthew Dietz, litigation director with the Di...",95,106,12,14
7762,If youâre Battling the Idola youâll earn a...,"If youâre Battling the Idola, youâll earn ...",73,75,13,13


In [36]:
final_df.isna().sum()

correct                 0
incorrect               0
correct_char_count      0
incorrect_char_count    0
correct_word_count      0
incorrect_word_count    0
dtype: int64

In [37]:
# final_df.to_csv('/content/drive/MyDrive/Self Case studies/CS02 Grammar Error Corrector/data/final_df_preprocessed_2021111201.csv',index=False)