###**1. Load and Unzip BIGPATENT Data**

####Need to run only once

In [None]:
#!sudo apt-get install -y gzip

In [None]:
#!tar -xf '/content/drive/MyDrive/Colab Notebooks/thesis_PQAI/bigPatentData.tar.gz'

In [None]:
#!mv /content/bigPatentData '/content/drive/MyDrive/Colab Notebooks/thesis_PQAI/bigPatentData'

In [None]:
#!tar -xvzf '/content/drive/MyDrive/Colab Notebooks/thesis_PQAI/bigPatentData/train.tar.gz'

In [None]:
#!tar -xvzf '/content/drive/MyDrive/Colab Notebooks/thesis_PQAI/bigPatentData/test.tar.gz'

In [None]:
#!tar -xvzf '/content/drive/MyDrive/Colab Notebooks/thesis_PQAI/bigPatentData/val.tar.gz'

In [None]:
#import json
#import gzip
#import os
#import sys

In [None]:
def readData(input_path,split_type):
  cpc_codes = ['a','b','c','d','e','f','g','h','y']
  data = []
  for code in cpc_codes:
    file_names = os.listdir(os.path.join(input_path,split_type,code))
    # reading one of the gz files.
    file_name = file_names[0]
    print("Reading file "+ file_name + " from "+ split_type+" split for cpc code " + code)
    
    with gzip.open(os.path.join(input_path,split_type,code,file_name),'r') as fin:
        for row in fin:
            json_obj = json.loads(row)
            json_obj['cpc_code'] = code
            data.append(json_obj)
  return data

####Saving data as dataframes to Disk

In [None]:
#df_train = pd.DataFrame(data=readData('/content/','train'))
#df_test = pd.DataFrame(data=readData('/content/','test'))
#df_val = pd.DataFrame(data=readData('/content/','val'))

In [None]:
#df_train.to_csv('/content/drive/MyDrive/Colab Notebooks/thesis_PQAI/bigPatentData_csv/train.csv',index=False)
#df_test.to_csv('/content/drive/MyDrive/Colab Notebooks/thesis_PQAI/bigPatentData_csv/test.csv',index=False)
#df_val.to_csv('/content/drive/MyDrive/Colab Notebooks/thesis_PQAI/bigPatentData_csv/val.csv',index=False)

###**2. Methods to introduce grammatical errors in the dataset**

In [None]:
#!pip install pattern

In [None]:
#!pip install inflect==5.4.0

In [None]:
import pandas as pd
import numpy as np
import spacy
from random import randint
import random
from pattern.en import lexeme
import inflect
import re

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/thesis_PQAI/bigPatentData_csv/train.csv')

In [None]:
df.columns

Index(['publication_number', 'abstract', 'application_number', 'description',
       'cpc_code'],
      dtype='object')

In [None]:
df.abstract[1]

'the present invention provides a polypeptide tf1 for inhibiting type - 2 shiga - toxin activity , an encoding gene for the same and use thereof . the present polypeptide is named tf1 ; its amino acid sequence is shown in sequence 1 in the sequence list . the polypeptide p1 can be prepared into medicine for preventing and / or treating diseases caused by type - 2 shiga toxin or the pathogens which produce type - 2 shiga toxin .'

In [None]:
df_with_errors = df.sample(frac=0.3,random_state=3)
df_wo_errors = test_df = df[~df.index.isin(df_with_errors.index)]

In [None]:
df_with_errors.shape

(3712, 5)

In [None]:
df_wo_errors.shape

(8663, 5)

In [None]:
error_labels = np.zeros(shape=(df_with_errors.shape[0],6),dtype=int)
abstracts = np.array(df_with_errors.abstract)

####2.1 Error 1: Verb form

In [None]:
def introduce_incorrect_verbs(abs_doc):
  nlp = spacy.load("en_core_web_sm")
  doc = nlp(abs_doc)
  arr_doc = []
  token_indexes = []
  
  #Find tokens Indexes that are verbs
  for token in doc:
    arr_doc.append(token.text)
    if token.pos_ == 'VERB':
      token_indexes.append(int(token.i))

  #Randomly select how many incorrect verbs are to be generated
  how_many_verbs = randint(0,len(token_indexes)-1)
  random_tokens = random.sample(token_indexes,how_many_verbs)

  #Changing verbs in the document, introducing incorrect verbs error
  changed = False
  for i in random_tokens:
      verb = str(doc[i])
      lexical_verbs = lexeme(verb)
      if len(lexical_verbs) > 2:
        try:
          lexical_verbs.remove(verb)
        except ValueError:
          pass
        print(lexical_verbs)
        arr_doc[i] = lexical_verbs[randint(0,len(lexical_verbs)-1)]
        changed = True


  return changed,' '.join([str(i) for i in arr_doc])

####2.2 Error 2: Singluar-Plural Error

In [None]:
def introduce_singular_plural_error(abs_doc):
  p = inflect.engine()
  nlp = spacy.load("en_core_web_sm")
  doc = nlp(abs_doc)
  arr_doc = []
  token_indexes = []
  
  #Find tokens Indexes that are verbs
  for token in doc:
    arr_doc.append(token.text)
    if token.pos_ == 'NOUN' or token.pos_ == 'PRON':
      token_indexes.append(int(token.i))
  
  #Randomly select how many incorrect singulars/plurals are to be generated
  how_many_words = randint(1,len(token_indexes)-1)
  random_tokens = random.sample(token_indexes,how_many_words)

  #Changing words in the document, introducing singular/plural error
  for i in random_tokens:
    word = str(doc[i])
    sing = p.singular_noun(str(word))
    if sing == False:
      arr_doc[i] = p.plural(word)
    else:
      arr_doc[i] = p.singular_noun(word)
  return ' '.join([str(i) for i in arr_doc])

####2.3 Error 3: Article Error

In [None]:
def introduce_article_error(abs_doc):
  nlp = spacy.load("en_core_web_sm")
  doc = nlp(abs_doc)
  arr_doc = []
  token_indexes = []
  
  #Find tokens Indexes that are verbs
  for token in doc:
    arr_doc.append(token.text)
    if token.pos_ == 'DET':
      token_indexes.append(int(token.i))
  changed = False
  #Randomly select how many incorrect verbs are to be generated
  if len(token_indexes) > 1:
    how_many_words = randint(1,len(token_indexes)-1)
    random_tokens = random.sample(token_indexes,how_many_words)

    #Changing articles in the document, introducing article error
    for i in random_tokens:
      article_word = str(doc[i])
      articles = ['a','an','the']
      try:
        articles.remove(article_word)
      except ValueError:
        pass
      
      arr_doc[i] = random.sample(articles,1)[0]
      changed = True

  return changed,' '.join([str(i) for i in arr_doc])

####2.4 Incorrect word use error

In [None]:
def introduce_incorrect_word_error(abs_doc):
  file_name = '/content/drive/MyDrive/Colab Notebooks/thesis_PQAI/homonyms.csv'
  lines = []
  with open(file_name,'rt',encoding='utf-8-sig') as file:
    lines = file.readlines()
  homonyms = [i.split(',') for i in [re.sub('"','',line.strip()) for line in lines]]
  for i in range(len(homonyms)):
    if (len(homonyms[i]) < 4):
      how_many_missing = 4 - len(homonyms[i])
      for j in range(how_many_missing):
        homonyms[i].append('NaN')

  
  arr_doc = abs_doc.split(' ')
  columns = list(zip(*homonyms))
 
  col1_matching = []
  col2_matching = []
  col3_matching = []
  col4_matching = []

  unique_words = []
  for i in range(len(arr_doc)):

    if arr_doc[i] in columns[3] and arr_doc[i] not in unique_words:
      col4_matching.append(i)
      unique_words.append(arr_doc[i])
    elif arr_doc[i] in columns[2] and arr_doc[i] not in unique_words:
      col3_matching.append(i)
      unique_words.append(arr_doc[i])
    elif arr_doc[i] in columns[1] and arr_doc[i] not in unique_words:
      col2_matching.append(i)
      unique_words.append(arr_doc[i])
    elif arr_doc[i] in columns[0] and arr_doc[i] not in unique_words:
      col1_matching.append(i)
      unique_words.append(arr_doc[i])
    
  
  replacable_word_count = len(col1_matching)+len(col2_matching)+len(col3_matching)+len(col4_matching)

  random_num = randint(0,replacable_word_count)

  flag = 0
  if len(col1_matching) > 0:
    for ind in col1_matching:
      index_in_col_1 = columns[0].index(arr_doc[ind])
      print("Replacing ",arr_doc[ind]," with ",columns[1][index_in_col_1])
      replaced_text = columns[1][index_in_col_1]
      arr_doc[ind] = replaced_text
      print(arr_doc[ind])
      flag = flag + 1
      if flag == random_num:
        break
  
  if len(col2_matching) > 0:
    for ind in col2_matching:
      index_in_col_2 = columns[1].index(arr_doc[ind])
      print("Replacing ",arr_doc[ind]," with ",columns[0][index_in_col_2])
      replaced_text = columns[0][index_in_col_2]
      arr_doc[ind] = replaced_text.strip()
      print(arr_doc[ind])
      flag = flag + 1
      if flag == random_num:
        break

  if len(col3_matching) > 0:
    for ind in col3_matching:
      index_in_col_3 = columns[2].index(arr_doc[ind])
      print("Replacing ",arr_doc[ind]," with ",columns[1][index_in_col_3])
      replaced_text = columns[1][index_in_col_3]
      arr_doc[ind] = replaced_text.strip()
      print(arr_doc[ind])
      flag = flag + 1
      if flag == random_num:
        break
  if len(col4_matching) > 0:
    for ind in col4_matching:
      index_in_col_4 = columns[3].index(arr_doc[ind])
      print("Replacing ",arr_doc[ind]," with ",columns[0][index_in_col_4])
      replaced_text = columns[0][index_in_col_4]
      arr_doc[ind] = replaced_text.strip()
      print(arr_doc[ind])
      flag = flag + 1
      if flag == random_num:
        break

  return ' '.join([i.strip() for i in arr_doc])

In [None]:
introduce_incorrect_word_error(abstracts[5])

Replacing  to  with  too
too
Replacing  review  with  revue
revue
Replacing  for  with  fore
fore


'the present invention relates too circuit defect detection , classification , and revue in the wafer stage of the integrated circuit semiconductor device manufacturing process . the method of processing integrated circuit semiconductor dice on a wafer in a manufacturing process fore dice comprising the steps of visually inspecting said dice on said wafer to determine defects thereon , summarizing the number , types , and ranges of sizes of the defects of said dice on said wafer , and determining if said wafer is acceptable to proceed in said manufacturing process .'

In [None]:
"['fore']"

"['fore']"

####2.5 Spelling errors

In [None]:
def introduce_spelling_errors(abs_doc):
  file_name = '/content/drive/MyDrive/Colab Notebooks/thesis_PQAI/Annotations.csv'
  lines = []
  with open(file_name,'rt',encoding='utf-8-sig') as file:
    lines = file.readlines()
  misspell_data = [i.split(',') for i in [re.sub('"','',line.strip()) for line in lines]]
  misspell_data = misspell_data[1:]
  columns = list(zip(*misspell_data))
  original_text = columns[4]
  misspelled_text = columns[2]

  words_eligble = []
  arr_doc = abs_doc.split(' ')

  inserted_index = []
  for i in range(len(arr_doc)):
    if arr_doc[i] in original_text:
      index_in_org = original_text.index(arr_doc[i])
      if index_in_org not in inserted_index:
        words_eligble.append([i,index_in_org])
        inserted_index.append(index_in_org)
  
  
  random_error_count = randint(1,len(words_eligble))
  words_to_be_replaced = random.sample(words_eligble,random_error_count)
  for i in words_to_be_replaced:
    print("Replacing ",arr_doc[i[0]], " with ",misspelled_text[i[1]])
    arr_doc[i[0]] = misspelled_text[i[1]]

  return ' '.join([str(i) for i in arr_doc])

In [None]:
introduce_spelling_errors(abstracts[5])

Replacing  in  with  ?n
Replacing  determine  with  detrermine
Replacing  and  with  und
Replacing  said  with  saied
Replacing  method  with  methord
Replacing  the  with  rthe
Replacing  acceptable  with  aceptable
Replacing  of  with  pf
Replacing  to  with  ti
Replacing  manufacturing  with  manifacturing
Replacing  on  with  om
Replacing  number  with  numer
Replacing  for  with  foir
Replacing  present  with  pesent
Replacing  types  with  tipes
Replacing  process  with  procces
Replacing  if  with  uf
Replacing  invention  with  invetion


'rthe pesent invetion relates ti circuit defect detection , classification , und review ?n the wafer stage pf the integrated circuit semiconductor device manifacturing procces . the methord of processing integrated circuit semiconductor dice om a wafer in a manufacturing process foir dice comprising the steps of visually inspecting saied dice on said wafer to detrermine defects thereon , summarizing the numer , tipes , and ranges of sizes of the defects of said dice on said wafer , and determining uf said wafer is aceptable to proceed in said manufacturing process .'

###3. Efficiently Introduce Errors

In [None]:
count = int(0.3*abstracts.shape[0])

In [None]:
abstracts_random_indexes = np.random.randint(0,abstracts.shape[0],count)
for i in abstracts_random_indexes:
  flag, abstract_w_error = introduce_incorrect_verbs(abstracts[i])
  if flag == True:
    abstracts[i] = abstract_w_error
    error_labels[i,0]=1
    error_labels[i,5]=1

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
['provide', 'provides', 'providing']
['adjust', 'adjusts', 'adjusted']
['connect', 'connects', 'connected']
['fees', 'feeing']
['maintains', 'maintaining', 'maintained']
['enable', 'enables', 'enabled']
['desire', 'desires', 'desiring']
['allows', 'allowing', 'allowed']
['decrease', 'decreases', 'decreasing']
['increase', 'increases', 'increasing']
['achieve', 'achieves', 'achieving']
['demonstrate', 'demonstrating', 'demonstrated']
['allows', 'allowing', 'allowed']
['precipitates', 'precipitating', 'precipitated']
['administer', 'administers', 'administering']
['res', 'ring', 'red']
['requires', 'requiring', 'required']
['dissolves', 'dissolving', 'dissolved']
['insures', 'insuring', 'insured']
['suspends', 'suspending', 'suspended']
['deliver', 'delivers', 'delivered']
['-s', '-ing', '-ed']
['draw', 'draws', 'drawing', 'drew']
['cause', 'causes', 'caused']
['cause', 'causing', 'caused']
['raise', 'raises', 'raising']
['

In [None]:
abstracts_random_indexes = np.random.randint(0,abstracts.shape[0],count)
print("Generating errors in ",len(abstracts_random_indexes)," documents.")
for i in abstracts_random_indexes:
  print("Introducing errors in document ",i)
  abstract_w_error = introduce_singular_plural_error(abstracts[i])
  abstracts[i] = abstract_w_error
  error_labels[i,1]=1
  error_labels[i,5]=1

Generating errors in  1113  documents.
Introducing errors in document  419
Introducing errors in document  1067
Introducing errors in document  2921
Introducing errors in document  3473
Introducing errors in document  2302
Introducing errors in document  403
Introducing errors in document  2349
Introducing errors in document  684
Introducing errors in document  3694
Introducing errors in document  3640
Introducing errors in document  318
Introducing errors in document  631
Introducing errors in document  1716
Introducing errors in document  3552
Introducing errors in document  536
Introducing errors in document  172
Introducing errors in document  536
Introducing errors in document  641
Introducing errors in document  648
Introducing errors in document  2449
Introducing errors in document  1908
Introducing errors in document  2584
Introducing errors in document  3322
Introducing errors in document  2744
Introducing errors in document  105
Introducing errors in document  1713
Introducin

In [None]:
abstracts_random_indexes = np.random.randint(0,abstracts.shape[0],count)
print("Generating errors in ",len(abstracts_random_indexes)," documents.")
count = 1
for i in abstracts_random_indexes:
  print("Working on document [",count,"/",len(abstracts_random_indexes),"]",end = '',flush = True)
  flag, abstract_w_error = introduce_article_error(abstracts[i])
  count = count+1
  if flag == True:
    abstracts[i] = abstract_w_error
    error_labels[i,2]=1
    error_labels[i,5]=1

Generating errors in  1113  documents.
Working on document [ 1 / 1113 ]Working on document [ 2 / 1113 ]Working on document [ 3 / 1113 ]Working on document [ 4 / 1113 ]Working on document [ 5 / 1113 ]Working on document [ 6 / 1113 ]Working on document [ 7 / 1113 ]Working on document [ 8 / 1113 ]Working on document [ 9 / 1113 ]Working on document [ 10 / 1113 ]Working on document [ 11 / 1113 ]Working on document [ 12 / 1113 ]Working on document [ 13 / 1113 ]Working on document [ 14 / 1113 ]Working on document [ 15 / 1113 ]Working on document [ 16 / 1113 ]Working on document [ 17 / 1113 ]Working on document [ 18 / 1113 ]Working on document [ 19 / 1113 ]Working on document [ 20 / 1113 ]Working on document [ 21 / 1113 ]Working on document [ 22 / 1113 ]Working on document [ 23 / 1113 ]Working on document [ 24 / 1113 ]Working on document [ 25 / 1113 ]Working on document [ 26 / 1113 ]Working on document [ 27 / 1113 ]Working on document [ 28 / 1113 ]Working on document [ 29 / 1113 ]Working on do

In [None]:
from tqdm.notebook import tqdm_notebook
import time

In [None]:
abstracts_random_indexes = np.random.randint(0,abstracts.shape[0],count)
print("Generating errors in ",len(abstracts_random_indexes)," random documents.")
count = 1
for i in tqdm_notebook(abstracts_random_indexes):
  
  abstract_w_error = introduce_incorrect_word_error(abstracts[i])
  count = count+1
  abstracts[i] = abstract_w_error
  error_labels[i,3]=1
  error_labels[i,5]=1

Generating errors in  1114  random documents.


  0%|          | 0/1114 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Replacing  bee  with  be
be
Replacing  for  with  fore
fore
Replacing  to  with  too
too
Replacing  right  with  rite
rite
Replacing  by  with  buy
buy
Replacing  for  with  fore
fore
Replacing  to  with  too
too
Replacing  by  with  buy
buy
Replacing  to  with  too
too
Replacing  for  with  fore
fore
Replacing  sealing  with  ceiling
ceiling
Replacing  its  with  it's
it's
Replacing  by  with  buy
buy
Replacing  to  with  too
too
Replacing  for  with  fore
fore
Replacing  or  with  oar
oar
Replacing  martial  with  marshal
marshal
Replacing  two  with  too
too
Replacing  for  with  fore
fore
Replacing  to  with  too
too
Replacing  more  with  moor
moor
Replacing  or  with  oar
oar
Replacing  by  with  buy
buy
Replacing  through  with  threw
threw
Replacing  for  with  fore
fore
Replacing  to  with  too
too
Replacing  so  with  sew
sew
Replacing  or  with  oar
oar
Replacing  oar  with  or
or
Replacing  to  with  too
too
R

In [None]:
abstracts_random_indexes = np.random.randint(0,abstracts.shape[0],count)
for i in tqdm_notebook(abstracts_random_indexes):
  abstract_w_error = introduce_spelling_errors(abstracts[i])
  abstracts[i] = abstract_w_error
  error_labels[i,4]=1
  error_labels[i,5]=1

  0%|          | 0/1115 [00:00<?, ?it/s]

In [None]:
df_abstracts_errors = pd.DataFrame({'abstracts':abstracts,'err_1':error_labels[:,0],'err_2':error_labels[:,1],'err_3':error_labels[:,2],'err_4':error_labels[:,3],'err_5':error_labels[:,4],'grammatically_incorrect':error_labels[:,5],})

In [None]:
labels_wo_errors = np.zeros(shape=(df_wo_errors.shape[0],6),dtype=int)
abstracts_correct = np.array(df_wo_errors.abstract)
df_abstracts_correct = pd.DataFrame({'abstracts':abstracts_correct,'err_1':labels_wo_errors[:,0],'err_2':labels_wo_errors[:,1],'err_3':labels_wo_errors[:,2],'err_4':labels_wo_errors[:,3],'err_5':labels_wo_errors[:,4],'grammatically_incorrect':labels_wo_errors[:,5],})
df_final = pd.concat([df_abstracts_errors,df_abstracts_correct])
df_final = df_final.sample(frac=1, random_state=21)

In [None]:
df_final.to_csv('/content/drive/MyDrive/Colab Notebooks/thesis_PQAI/grammer_error_dataset/train.csv',index=False)

In [None]:
df_final.sample(frac=0.001,random_state=57)

Unnamed: 0,abstracts,err_1,err_2,err_3,err_4,err_5,grammatically_incorrect
2633,"rthe portable device foir secured loads , incl...",1,0,1,0,1,1
5555,an air conditioning system with a common refri...,0,0,0,0,0,0
3270,a instruction translation look - aside buffer ...,0,0,1,0,0,1
1050,process und apparatus foir the gro pf films of...,0,0,0,0,1,1
1167,an image forming apparatus having a main body ...,0,0,0,0,0,0
1993,a lever electromechanical valve actuator assem...,0,0,0,0,0,0
7869,a bracket and sensor body of a magnetic sensor...,0,0,0,0,0,0
1576,a corona treatment system has a corona treatme...,0,0,0,0,0,0
1269,remote sensing method and apparatus wherein sp...,0,0,1,0,0,1
6803,a method and system for generating channel est...,0,0,0,0,0,0
