## Import libraries & data

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import re
from tqdm.auto import tqdm
tqdm.pandas()

In [2]:
final_df = pd.read_csv('C:/Users/prash/Desktop/New_now100/data/final_df_.csv')

In [3]:
final_df.sample(5)

Unnamed: 0,correct,incorrect
2561,When I can I enjoy visiting my family in Mexico.,"When I can, I enjoy visiting my family in Mexico."
5850,If you're tightening your head gear to the poi...,If you're tightening your headgear to the poin...
7023,"My aim is to raise some awareness, a transplan...","My aim in all this is to raise some awareness,..."
7752,REFERENCE TO GENERIC UNIT (Periodica): Harvard...,REFERENCE TO GENERIC UNIT (Periodica): Harvard...
2735,1057 products hot sale 2008 new moodel gold mi...,1057 products . hot sale 2008 new model gold m...


In [4]:
final_df.shape

(10003, 2)

### Adding length features

In [5]:
final_df['correct_char_count'] = final_df['correct'].astype('str').apply(lambda x:len(x))
final_df['incorrect_char_count'] = final_df['incorrect'].astype('str').apply(lambda x:len(x))

In [6]:
final_df['correct_word_count'] = final_df['correct'].astype('str').apply(lambda x:len(x.split()))
final_df['incorrect_word_count'] = final_df['incorrect'].astype('str').apply(lambda x:len(x.split()))

In [7]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
194,Here is the Case I Landing tipes paper.,Here is the Case I Landing Tips Paper.,39,38,8,8
6157,Trump said he's worried that central bank's po...,Trump said he's worried that the central bank'...,162,170,28,30
213,All rates is included 7% of goverment tax.,All rates are inclusive of 7% government tax.,42,45,8,8
6733,This is not the first time Ostean has defended...,This is not the first time Osteen has defended...,222,220,41,41
6057,"During Skill Competition He yiming, through a ...","During the Skill Competition, He Yiming, throu...",204,180,34,28


## Preprocessing

### Removing/Checking Missing/NA 

In [8]:
pd.DataFrame(final_df.isna().sum(),columns=['missing_count'])

Unnamed: 0,missing_count
correct,0
incorrect,0
correct_char_count,0
incorrect_char_count,0
correct_word_count,0
incorrect_word_count,0


In [9]:
final_df[final_df.isna().any(axis=1)]

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count


In [10]:
final_df = final_df.dropna().reset_index(drop=True)

In [11]:
final_df.shape

(10003, 6)

In [12]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
7812,"Anyway, this woman is still drinking the Kool-...","Anyway, this woman is still drinking the Kool-...",238,250,38,40
818,Your dosage and frequency of administration wi...,Your dosage and frequency of administration wi...,113,115,16,17
134,Most all science these days gets done over clo...,Most science these days gets done behind close...,83,88,15,15
7696,"Just: my opinion of course, but the more we su...","Just my opinion of course, but the more we sup...",230,230,40,40
5441,Are being signed up for LIVING FRAGRANCE?,Signed up for LIVING FRAGRANCE?,41,31,7,5


### Keep unique sentence pairs

In [13]:
print(f"total number of duplicate pairs: {len(final_df[final_df['correct']==final_df['incorrect']])}")

total number of duplicate pairs: 59


In [14]:
print(f"total number of duplicate pairs: {len(final_df[final_df['correct']==final_df['incorrect']])}")

total number of duplicate pairs: 59


In [15]:
final_df[final_df['correct']==final_df['incorrect']].sample(10)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
2312,Choice of Black Metal Wooden Leg in 3 colours.,Choice of Black Metal Wooden Leg in 3 colours.,46,46,9,9
3221,Bet that one stopped traffic on Collins Ave.,Bet that one stopped traffic on Collins Ave.,44,44,8,8
8106,"Seeing, feeling and sensing energy.","Seeing, feeling and sensing energy.",35,35,5,5
9791,How to apply + where to learn more.,How to apply + where to learn more.,35,35,8,8
8449,26:43 Offside against Rhode Island.,26:43 Offside against Rhode Island.,35,35,5,5
4607,"Toothbrushes, with their high profile, tend to...","Toothbrushes, with their high profile, tend to...",79,79,13,13
6588,pimple extractor tool new arrival face care st...,pimple extractor tool new arrival face care st...,299,299,42,42
8808,"crawler mobile crushing production line, cone ...","crawler mobile crushing production line, cone ...",55,55,8,8
8412,Whirl Into Winter Giveaway is Finally Here!,Whirl Into Winter Giveaway is Finally Here!,43,43,7,7
7450,Fill someone's hair-dryer with baby powder.,Fill someone's hair-dryer with baby powder.,43,43,6,6


In [16]:
final_df = final_df[final_df['correct']!=final_df['incorrect']]

In [17]:
final_df.shape

(9944, 6)

In [18]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
3937,"wonât be made here will it, because if Elles...","wonât be made here will it, because if Elles...",84,88,15,16
1891,2 protoing Brackets Ã¢â¬â This will be my m...,2. Mounting Brackets Ã¢â¬â These will make ...,137,144,26,28
6237,New Jersey Doo Wop group Harmony Singers Club ...,New Jersey Doo Wop Group Harmony Singers Club ...,57,54,10,10
896,Other revelations about the work of the ambula...,Other revelations about the work of the ambula...,166,176,27,29
5417,The famous view ... Oia on Santorini.,A famous view: Oia on Santorini.,37,32,7,6


### Remove Duplicates

In [19]:
print(f'total number of duplicates: {final_df.duplicated().sum()}')

total number of duplicates: 0


In [20]:
final_df[final_df.duplicated(keep=False)].sort_values('correct')

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count


In [21]:
final_df = final_df.drop_duplicates().reset_index(drop=True)

In [22]:
final_df.shape

(9944, 6)

In [23]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
2134,If any council want to charge more it have to ...,If any council wants to charge more it has to ...,115,112,21,21
2292,"We recognize Dave Long, founder of Staticworx,...","Dave Long, founder of Staticworx, is recognize...",96,97,14,14
4632,Thanks! We have been making for some time to g...,Thanks! We have been trying for some time to g...,73,80,15,16
7274,But when the listed of the participant of the ...,But when the list of the participants of the 2...,91,88,17,17
6123,the non-Judicial human right redress Mechanism...,the non-Judicial human rights redress Mechanis...,247,242,40,38


### Remove Small sentences



In [24]:
final_df[final_df['incorrect_char_count']<2].shape

(0, 6)

In [25]:
final_df = final_df[final_df['incorrect_char_count']>2].reset_index(drop=True)

In [26]:
final_df.shape

(9944, 6)

In [27]:
final_df[final_df['correct_char_count']<2].shape

(0, 6)

In [28]:
#final_df[final_df['correct_char_count']<2].sample(10)

In [29]:
final_df = final_df[final_df['correct_char_count']>2].reset_index(drop=True)

In [30]:
final_df.shape

(9944, 6)

### Clean text

In [31]:
#https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not", 
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have","n\'t":" not","\'re":" are","\'s": " is","\'d":" would",
                     "\'ll": " will","\'t":" not","\'ve": " have","\'m":" am"}


# Regular expression for finding contractions
contractions_re=re.compile('(%s)' % '|'.join(contractions_dict.keys()))

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

In [32]:
# https://stackoverflow.com/a/47091490/4084039
def clean(text):
    text = re.sub('\s*\<.*?\>\s', '', text)
    text = re.sub('\s*\(.*?\)\s', '', text)
    text = re.sub('\s*\[.*?\]\s', '', text)
    text = re.sub('\s*\{.*?\}\s', '', text)
    text = re.sub("[-+@#^/|*(){}$~<>=_%:;]","",text)
    text = text.replace("\\","")
    text = re.sub("\[","",text)
    text = re.sub("\]","",text)
    text = re.sub("\<","",text)
    text = re.sub("\>","",text)
    text = re.sub("\(","",text)
    text = re.sub("\)","",text)
    text = re.sub("[0-9]","",text)
    text = ' '.join(text.split())
    return text

In [33]:
final_df['correct'] = final_df['correct'].progress_apply(clean)
final_df['correct'] = final_df['correct'].progress_apply(expand_contractions)

  0%|          | 0/9944 [00:00<?, ?it/s]

  0%|          | 0/9944 [00:00<?, ?it/s]

In [34]:
final_df['incorrect'] = final_df['incorrect'].progress_apply(clean)
final_df['incorrect'] = final_df['incorrect'].progress_apply(expand_contractions)

  0%|          | 0/9944 [00:00<?, ?it/s]

  0%|          | 0/9944 [00:00<?, ?it/s]

In [35]:
final_df.sample(5)

Unnamed: 0,correct,incorrect,correct_char_count,incorrect_char_count,correct_word_count,incorrect_word_count
4932,"Kumar, A. â Petro Cropresources of Rajasthan...",". Kumar, A. â Petro Crop Resources of Rajast...",163,165,25,26
7849,The arrangement would go on to close out the d...,The arrangement would go on to close out the d...,219,223,33,34
6124,Should I provide my registration information a...,Should I provide my registration information a...,89,81,15,14
3023,Performer of Foreign Private TraderBest is of ...,Prop Trader Cover Letter Best Of Captivating P...,86,65,11,10
8009,We should be celebrating women every day for a...,We should be celebrating women every day for a...,203,221,37,41


In [36]:
final_df.isna().sum()

correct                 0
incorrect               0
correct_char_count      0
incorrect_char_count    0
correct_word_count      0
incorrect_word_count    0
dtype: int64

In [37]:
final_df.to_csv('C:/Users/prash/Desktop/New_now100/data/final_df_preprocessed_2021111201.csv',index=False)