# Perturbation

In this phase i'll face the problem of the data perturbation. 
I take the documents from dataset and perturbate them with different kind of error: 

1. Spelling Errors
2. Grammatical Errors
3. Word - Segmentation Errors
4. Spurious - Character Insertion
5. Mix of previous errors. 

In [1]:
import pymongo
import random

from nltk.tokenize import sent_tokenize
from nltk import word_tokenize
from tqdm.notebook import tqdm

In [2]:
client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["vatican"]
dataset = db['texts']
perturbedDataset = db['perturbedDataset']

docs = list(dataset.find())[:150]

In [3]:
def tokenizeDoc(doc):
    item = {'docId': doc['_id'], 'type': doc['type'], 'sentences': []}
    sentenceList = []

    for j, s in enumerate(sent_tokenize(doc['text'].lower())): 
        sentenceList.append(word_tokenize(s))
    item['sentences'] = sentenceList
    
    return item

In [4]:
sentences = []

for i, d in tqdm(enumerate(docs)): 
    sentences.append(tokenizeDoc(d))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [5]:
errors = {
    'spurious': 0,
    'spelling': 0, 
    'grammatical': 0, 
    'wordSegmentation': 0
}

## Spurious Character Insertion

Insert of some random character in some random position inside some random picking word in the documents. 

In [6]:
# Potrebbe essere necessario diminuire il numero di frasi in cui viene inserito un errore

In [7]:
import string
characters = string.ascii_lowercase

In [8]:
def spurious(s):
    if random.randint(0, 10) % 2 == 0:
        return s

    global errors
    
    posInSentence = random.randint(0, len(s) - 1) 
    posInWord = random.randint(0, len(s[posInSentence]) - 1)
    w = s[posInSentence]
    
    if(len(s[posInSentence]) == 1):
        return s
    
    s[posInSentence] = "".join((w[:posInWord], characters[random.randint(0, len(characters) - 1)], w[posInWord:]))
    
    errors['spurious'] += 1
    
    print('Spurious Character inserted')
    return s

## Spelling Errors

Are recurrents errors during the writing of a word. 

In [9]:
def spelling(s):
    if(random.randint(0, 10) % 2 == 0):
        return s
    
    global errors
    
    posInSentence = random.randint(0, len(s) - 1) 
    posInWord = random.randint(0, len(s[posInSentence]) - 1)
    w = s[posInSentence]

    if(len(s[posInSentence]) == 1):
        return s
    
    s[posInSentence] = "".join((w[:posInWord], characters[random.randint(0, len(characters) - 1)], w[posInWord+1:]))
    
    errors['spelling'] += 1
    
    print('Spelling Error inserted')
    
    return s

## Grammatical Errors

This kind of errors concern articles, aphostrophes and accents ecc. 
I've compiled a list of possible grammatical errors (in form of dictionary). When i found a word / phrase / character that contains the correct version of the error i replace the correct version with the wrong version (obviously not for each occurence). 

In [10]:
def grammaticalErrorsSet():
    gErrorsWords = {
        'ha': ['a'],
        'hanno': ['anno'], 
        'ho': ['o'],
        'hai': ['ai']
        }
    gErrorsChar = {
        'ù': ['u', 'u\''],
        'è': ['e', 'e\''],
        'é': ['e', 'è', 'e\''],
        'ò': ['o', 'o\''], 
        'à': ['a', 'a\''],
        'ì': ['i', 'i\''],
        '\'':  ['']
    }
    
    return gErrorsWords, gErrorsChar

In [11]:
import re
import unicodedata

def grammatical(s):
    global errors
    
    gErrorsWords, gErrorsChar = grammaticalErrorsSet()
    
    for k, w in enumerate(s):
        if(random.randint(0, 100) < 80):
            return s
        
        if len(w) == 1:
            return s
        
        for err in gErrorsWords: 
            if w == err: 
                s[k] = (gErrorsWords.get(err))[random.randint(0, len(gErrorsWords.get(err)) - 1)]
                
                print('Grammatical Error inserted')
                
                errors['grammatical'] += 1

        for err in gErrorsChar:
            ms = re.finditer(unicodedata.normalize('NFKD', err), w)
            mp = [m.start() for m in ms ]

            if len(mp) > 0:
                pos = mp[random.randint(0, len(mp) - 1)] if (len(mp) > 1) else mp[0]
                item = random.randint(0, len(gErrorsChar.get(err)) - 1)

                s[k] = "".join(
                    (w[:pos],
                    gErrorsChar.get(err)[0] if (len(err) == 0) else gErrorsChar.get(err)[item],
                    w[pos+1:])
                )
                
                print('Grammatical Error inserted')
                
                errors['grammatical'] += 1
            
    return s

## Word Segmentation Error




In [12]:
def wordSegmentation(s):
    print('Word Segmentation Error inserted')
    
    #inserire conteggio errori
    
    return s

## Perturbation

In [19]:
for i, d in tqdm(enumerate(sentences)):
    listOfPerturbation = []
    for j, s in enumerate(d['sentences']):
        gT = s.copy()
        wordSegmentation(grammatical(spelling(spurious(s))))
        
        listOfPerturbation.append([s, gT])
    item = {'docId': d['docId'], 'type': d['type'], 'sentences': listOfPerturbation}
    perturbedDataset.insert_one(item)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word

In [15]:
print("ERRORS: {}\nSPURIOUS: {}\nSPELLING: {}\nGRAMMATICAL: {}\nWORD SEGMENTATION: {}".format(
    sum(errors.values()),
    errors.get('spurious'), errors.get('spelling'), errors.get('grammatical'), errors.get('wordSegmentation'))
)

counter = 0

for d in sentences:
    for s in d['sentences']:
        for w in s:
            counter += 1
            
print("Parole totali: ", counter)
print("Error rate: ", sum(errors.values())/counter)

ERRORS: 2915
SPURIOUS: 1404
SPELLING: 1411
GRAMMATICAL: 100
WORD SEGMENTATION: 0
Parole totali:  124198
Error rate:  0.023470587288040065


# DB 

In [16]:
print(perturbedDataset.count())

150


  print(perturbedDataset.count())


In [17]:
if(False):
    perturbedDataset.drop()