# Perturbation

In this phase i'll face the problem of the data perturbation. 
I take the documents from dataset and perturbate them with different kind of error: 

1. Spelling Errors
2. Grammatical Errors
3. Word - Segmentation Errors
4. Spurious - Character Insertion
5. Mix of previous errors. 

In [1]:
import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017")
db = client["vatican"]
dataset = db['texts']
sentenceDataset = db['sentence']
perturbedDataset = db['phrasesDataset']

docs = list(dataset.find())[:150]

In [2]:
#sentenceDataset.drop()
#perturbedDataset.drop()

In [3]:
from nltk.tokenize import sent_tokenize
from nltk import word_tokenize
from tqdm.notebook import tqdm
import random

In [4]:
def tokenizeDoc(doc):
    item = {'docId': doc['_id'], 'type': doc['type'], 'sentences': []}
    sentenceList = []

    for j, s in enumerate(sent_tokenize(doc['text'].lower())): 
        sentenceList.append(word_tokenize(s))
    item['sentences'] = sentenceList
    
    return item

In [5]:
for i, d in tqdm(enumerate(docs)): 
    res = tokenizeDoc(d)
    sentenceDataset.insert_one(res)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [6]:
sentenceDataset.find_one()

{'_id': ObjectId('60e02d38bbdcbc7192b77379'),
 'docId': ObjectId('60b7f1a21effca9356420fc1'),
 'type': 'Homilies',
 'sentences': [['solennità',
   "dell\\'immacolata",
   'concezione',
   'della',
   'beata',
   'vergine',
   'maria',
   'omelia',
   'del',
   'santo',
   'padre',
   'giovanni',
   'xxiii',
   'basilica',
   'vaticana',
   'lunedì',
   ',',
   '8',
   'dicembre',
   '1982',
   'diletti',
   'figli',
   'nostri',
   '!'],
  ['per',
   'ogni',
   'giorno',
   'della',
   'vita',
   ',',
   'il',
   'signore',
   'riserva',
   'alle',
   'anime',
   'nostre',
   'le',
   'sue',
   'consolazioni',
   'e',
   'le',
   'sue',
   'grazie',
   '.'],
  ['questo',
   ',',
   'dedicato',
   'al',
   'culto',
   'di',
   'maria',
   'nel',
   'suo',
   'altissimo',
   'privilegio',
   "dell\\'immacolato",
   'concepimento',
   ',',
   'è',
   'per',
   'il',
   'nostro',
   'spirito',
   'uno',
   'dei',
   'più',
   'fausti',
   'di',
   'questi',
   'ultimi',
   'mesi',
   '

In [7]:
sentenceDataset.count()

  sentenceDataset.count()


150

In [8]:
#sentenceDataset.drop()

In [9]:
errors = {
    'spurious': 0,
    'spelling': 0, 
    'grammatical': 0, 
    'wordSegmentation': 0
}

## Spurious Character Insertion

Insert of some random character in some random position inside some random picking word in the documents. 

In [10]:
# Potrebbe essere necessario diminuire il numero di frasi in cui viene inserito un errore

In [11]:
import string
characters = string.ascii_lowercase

In [12]:
def spurious(s):
    if random.randint(0, 10) % 2 == 0:
        return s

    global errors
    
    posInSentence = random.randint(0, len(s) - 1) 
    posInWord = random.randint(0, len(s[posInSentence]) - 1)
    w = s[posInSentence]
    
    if(len(s[posInSentence]) == 1):
        return s
    
    s[posInSentence] = "".join((w[:posInWord], characters[random.randint(0, len(characters) - 1)], w[posInWord:]))
    
    errors['spurious'] += 1
    
    print('Spurious Character inserted')
    return s

## Spelling Errors

Are recurrents errors during the writing of a word. 

In [13]:
def spelling(s):
    if(random.randint(0, 10) % 2 == 0):
        return s
    
    global errors
    
    posInSentence = random.randint(0, len(s) - 1) 
    posInWord = random.randint(0, len(s[posInSentence]) - 1)
    w = s[posInSentence]

    if(len(s[posInSentence]) == 1):
        return s
    
    s[posInSentence] = "".join((w[:posInWord], characters[random.randint(0, len(characters) - 1)], w[posInWord+1:]))
    
    errors['spelling'] += 1
    
    print('Spelling Error inserted')
    
    return s

## Grammatical Errors

This kind of errors concern articles, aphostrophes and accents ecc. 
I've compiled a list of possible grammatical errors (in form of dictionary). When i found a word / phrase / character that contains the correct version of the error i replace the correct version with the wrong version (obviously not for each occurence). 

In [14]:
def grammaticalErrorsSet():
    gErrorsWords = {
        'ha': ['a'],
        'hanno': ['anno'], 
        'ho': ['o'],
        'hai': ['ai']
        }
    gErrorsChar = {
        'ù': ['u', 'u\''],
        'è': ['e', 'e\''],
        'é': ['e', 'è', 'e\''],
        'ò': ['o', 'o\''], 
        'à': ['a', 'a\''],
        'ì': ['i', 'i\''],
        '\'':  ['']
    }
    
    return gErrorsWords, gErrorsChar

In [15]:
import re
import unicodedata

def grammatical(s):
    global errors
    
    gErrorsWords, gErrorsChar = grammaticalErrorsSet()
    
    for k, w in enumerate(s):
        if(random.randint(0, 100) < 80):
            return s
        
        if len(w) == 1:
            return s
        
        for err in gErrorsWords: 
            if w == err: 
                s[k] = (gErrorsWords.get(err))[random.randint(0, len(gErrorsWords.get(err)) - 1)]
                
                print('Grammatical Error inserted')
                
                errors['grammatical'] += 1

        for err in gErrorsChar:
            ms = re.finditer(unicodedata.normalize('NFKD', err), w)
            mp = [m.start() for m in ms ]

            if len(mp) > 0:
                pos = mp[random.randint(0, len(mp) - 1)] if (len(mp) > 1) else mp[0]
                item = random.randint(0, len(gErrorsChar.get(err)) - 1)

                s[k] = "".join(
                    (w[:pos],
                    gErrorsChar.get(err)[0] if (len(err) == 0) else gErrorsChar.get(err)[item],
                    w[pos+1:])
                )
                
                print('Grammatical Error inserted')
                
                errors['grammatical'] += 1
            
    return s

## Word Segmentation Error




In [16]:
def wordSegmentation(s):
    print('Word Segmentation Error inserted')
    
    #inserire conteggio errori
    
    return s

## Perturbation

In [17]:
docs = sentenceDataset.find()

for i, d in tqdm(enumerate(docs)):
    listOfPerturbation = []
    for j, s in enumerate(d['sentences']):
        gT = s.copy()
        wordSegmentation(grammatical(spelling(spurious(s))))
        
        listOfPerturbation.append([s, gT])
    item = {'docId': d['docId'], 'type': d['type'], 'sentences': listOfPerturbation}
    perturbedDataset.insert_one(item)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Spelling Error ins

Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spelling Error inserted
Grammatical Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word S

Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spelling E

Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted

Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Grammatical Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Wo

Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Grammatical Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Wor

Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Spurious Character inserted
Spelling Error inserted
Word Segmentation Error inserted
Word Segmentation 

In [18]:
print("ERRORS: {}\nSPURIOUS: {}\nSPELLING: {}\nGRAMMATICAL: {}\nWORD SEGMENTATION: {}".format(
    sum(errors.values()),
    errors.get('spurious'), errors.get('spelling'), errors.get('grammatical'), errors.get('wordSegmentation'))
)

docs = sentenceDataset.find()
counter = 0

for d in docs:
    for s in d['sentences']:
        for w in s:
            counter += 1
            
print("Parole totali: ", counter)
print("Error rate: ", sum(errors.values())/counter)

ERRORS: 2890
SPURIOUS: 1411
SPELLING: 1401
GRAMMATICAL: 78
WORD SEGMENTATION: 0
Parole totali:  124198
Error rate:  0.023269295801864764


In [19]:
#perturbedDataset.find_one()

# DB 

In [20]:
print(sentenceDataset.count())
print(perturbedDataset.count())

150
150


  print(sentenceDataset.count())
  print(perturbedDataset.count())


In [21]:
#perturbedDataset.find_one()

In [22]:
#sentenceDataset.drop()
#perturbedDataset.drop()
#perturbedWordsDataset.drop()