# bertchunker: default program

In [4]:
from default import *
import os, sys

## Run the default solution on dev

In [5]:
# Build the tagger provided by default.py, pointing it at ../data/chunker with the
# '.pt' suffix (presumably the saved checkpoint ../data/chunker.pt -- confirm in default.py).
chunker = FinetuneTagger(os.path.join('..', 'data', 'chunker'), modelsuffix='.pt')
# Tag the dev input file; the evaluation cell iterates the result sentence by sentence.
decoder_output = chunker.decode(os.path.join('..', 'data', 'input', 'dev.txt'))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1027/1027 [00:20<00:00, 51.11it/s]


Ignore the warnings from the transformers library. They are expected to occur.

## Evaluate the default output

In [6]:
# Flatten the per-sentence outputs into one list of predicted tags.
flat_output = [ output for sent in decoder_output for output in sent ]
# The parent directory must be on sys.path before conlleval can be imported
# (presumably conlleval.py sits one level up -- confirm against the repo layout).
sys.path.append('..')
import conlleval
# Collect the reference tags for the dev set as one flat list, mirroring flat_output.
true_seqs = []
with open(os.path.join('..', 'data', 'reference', 'dev.out')) as r:
    for sent in conlleval.read_file(r):
        true_seqs += sent.split()
# Score predictions against the reference. As the last expression of the cell, its
# return value is displayed after the report (the tuple matches the overall
# precision, recall and FB1 figures printed in the report).
conlleval.evaluate(true_seqs, flat_output)

processed 23663 tokens with 11896 phrases; found: 13226 phrases; correct: 9689.
accuracy:  87.04%; (non-O)
accuracy:  87.45%; precision:  73.26%; recall:  81.45%; FB1:  77.14
             ADJP: precision:  13.32%; recall:  53.98%; FB1:  21.37  916
             ADVP: precision:  31.16%; recall:  58.79%; FB1:  40.73  751
            CONJP: precision:   0.00%; recall:   0.00%; FB1:   0.00  8
             INTJ: precision:   0.00%; recall:   0.00%; FB1:   0.00  11
              LST: precision:   0.00%; recall:   0.00%; FB1:   0.00  3
               NP: precision:  80.58%; recall:  80.86%; FB1:  80.72  6258
               PP: precision:  95.97%; recall:  86.93%; FB1:  91.23  2211
              PRT: precision:  22.15%; recall:  77.78%; FB1:  34.48  158
             SBAR: precision:  36.12%; recall:  80.17%; FB1:  49.80  526
              UCP: precision:   0.00%; recall:   0.00%; FB1:   0.00  64
               VP: precision:  83.75%; recall:  84.33%; FB1:  84.04  2320


(73.25722062603963, 81.44754539340954, 77.13557837751772)

## Documentation

Write some beautiful documentation of your program here.

As we can see, the current model is not very good at handling noise in the testing data. What we need to do is introduce mechanisms and strategies to
train the model to handle noise so that we can improve our F-score accuracy and create more robust chunking.

## Analysis

Do some analysis of the results. What ideas did you try? What worked and what did not?

The first method we will try is adversarial training: we simulate noise during training so that the model is better equipped to handle noisy
testing data. To do this, we implement six noise functions in the chunker.py file: substitute_character, omit_characters, phonetic_misspelling,
insert_random_character, keyboard_typo, and swap_adjacent_characters. We also introduce a noise probability so that, to start off with, only 12% of
words have noise added to them during training.

In [None]:
import random 

# Function simulates typographical errors by substituting one character for another
def substitute_character(word):
    """Return word with one random position overwritten by a random lowercase letter.

    Words shorter than two characters are returned unchanged. The replacement
    is drawn independently of the original character, so it may be identical.
    """
    if len(word) <= 1:
        return word
    position = random.randint(0, len(word) - 1)
    # Draw from the ASCII lowercase range 'a'..'z'.
    replacement = chr(random.randint(ord('a'), ord('z')))
    prefix, suffix = word[:position], word[position + 1:]
    return prefix + replacement + suffix

# Function simulates missing characters, common in fast typing
def omit_characters(word):
    """Return word with one randomly chosen character removed.

    Words shorter than two characters are returned unchanged, so the result
    is never empty for a non-empty input.
    """
    if len(word) < 2:
        return word
    drop_at = random.randint(0, len(word) - 1)
    # Keep every character except the one at the drawn position.
    return "".join(ch for pos, ch in enumerate(word) if pos != drop_at)

# Function simulates phonetic errors

# Maps a letter or digraph to the spellings it is commonly confused with.
phonetic_similarities = {
    'c': ['k', 's'], 'k': ['c', 'q'], 's': ['c', 'z', 'ss'],
    'q': ['k'], 'g': ['j'], 'j': ['g', 'ge'], 'kn': ['n'],
    'n': ['kn'], 'gh': ['f', 'ph'], 'f': ['gh', 'ph', 'ff', 'v'],
    'ph': ['gh', 'f'], 'wh': ['w'], 'w': ['wh'], 'v': ['f'],
    'a': ['e'], 'e': ['a', 'i'], 'i': ['e'],
}


def phonetic_misspelling(word):
    """Replace the first phonetically confusable unit in word with a random alternative.

    The word is scanned left to right. At each position the two-letter unit is
    tried before the single letter, so the digraph keys of
    phonetic_similarities ('kn', 'gh', 'ph', 'wh') can actually match; a
    character-by-character scan can never reach them.

    Args:
        word: the token to corrupt. Matching is case-sensitive and the table
            only holds lowercase keys.

    Returns:
        The word with its first matching unit replaced, or the word unchanged
        when no unit matches (including the empty string).
    """
    for i in range(len(word)):
        for width in (2, 1):
            unit = word[i:i + width]
            # Near the end of the word the slice can be shorter than width.
            if len(unit) == width and unit in phonetic_similarities:
                replacement = random.choice(phonetic_similarities[unit])
                return word[:i] + replacement + word[i + width:]
    return word

# Function simulates accidental extra character insertion

def insert_random_character(word):
    """Return word with one random lowercase letter inserted before a random position.

    Words shorter than two characters are returned unchanged. The insertion
    point is drawn from the existing indices, so the letter is never appended
    after the last character.
    """
    if len(word) <= 1:
        return word
    position = random.randint(0, len(word) - 1)
    # Draw from the ASCII lowercase range 'a'..'z'.
    extra = chr(random.randint(ord('a'), ord('z')))
    return "".join((word[:position], extra, word[position:]))

# Function simulates accidental keyboard typos

# Neighbouring keys for each lowercase letter, written as compact strings and
# expanded into lists of single characters.
keyboard_adjacent = {
    key: list(neighbours)
    for key, neighbours in (
        ('a', 'swq'), ('b', 'vghn'), ('c', 'xdfv'),
        ('d', 'serfcx'), ('e', 'wsdr'),
        ('f', 'drtgvc'), ('g', 'ftyhbv'),
        ('h', 'gyujnb'), ('i', 'ujko'),
        ('j', 'huikmn'), ('k', 'jiolm'),
        ('l', 'kop'), ('m', 'njk'), ('n', 'bhjm'),
        ('o', 'iklp'), ('p', 'ol'), ('q', 'aw'),
        ('r', 'edft'), ('s', 'awedxz'),
        ('t', 'rfgy'), ('u', 'yhji'), ('v', 'cfgb'),
        ('w', 'qase'), ('x', 'zsdc'), ('y', 'tghu'),
        ('z', 'asx'),
    )
}


def keyboard_typo(word):
    """Return word with one random character replaced by a neighbouring key.

    Words shorter than two characters are returned unchanged, as is any word
    whose randomly drawn character has no entry in keyboard_adjacent
    (uppercase letters, digits, punctuation).
    """
    if len(word) <= 1:
        return word
    position = random.randint(0, len(word) - 1)
    neighbours = keyboard_adjacent.get(word[position])
    if neighbours is None:
        return word
    slipped = random.choice(neighbours)
    return word[:position] + slipped + word[position + 1:]

# Function simulates character swap 

def swap_adjacent_characters(word):
    """Return word with one randomly chosen pair of neighbouring characters exchanged.

    Words shorter than two characters are returned unchanged.
    """
    if len(word) < 2:
        return word
    # Left index of the pair; the last valid choice is the second-to-last character.
    left = random.randint(0, len(word) - 2)
    chars = list(word)
    chars[left], chars[left + 1] = chars[left + 1], chars[left]
    return "".join(chars)

The next thing to do is to write a function that flags whether the model is in training mode, so that we know whether or not to add the simulated
noise.

In [None]:
def model_in_training_mode():
    """Report whether training is under way, judged by the saved model being absent.

    Returns True when ../data/chunker.pt (relative to the current working
    directory) does not exist, and False once that file is present.
    """
    checkpoint = os.path.join("..", "data", "chunker.pt")
    if os.path.exists(checkpoint):
        return False
    return True

From the first training, our output is now: 
processed 23663 tokens with 11896 phrases; found: 11894 phrases; correct: 10786.
accuracy:  94.34%; (non-O)
accuracy:  94.67%; precision:  90.68%; recall:  90.67%; FB1:  90.68
             ADJP: precision:  74.65%; recall:  71.68%; FB1:  73.14  217
             ADVP: precision:  72.80%; recall:  72.61%; FB1:  72.70  397
            CONJP: precision:  44.44%; recall:  57.14%; FB1:  50.00  9
             INTJ: precision: 100.00%; recall: 100.00%; FB1: 100.00  1
               NP: precision:  89.85%; recall:  91.94%; FB1:  90.88  6382
               PP: precision:  97.33%; recall:  94.22%; FB1:  95.75  2363
              PRT: precision:  76.60%; recall:  80.00%; FB1:  78.26  47
             SBAR: precision:  93.23%; recall:  75.53%; FB1:  83.45  192
               VP: precision:  91.03%; recall:  90.32%; FB1:  90.68  2286
dev.out score: 90.6768

The F-score has definitely improved. Now, it is time to further optimize the model.

The next step was to use two different optimizers with different learning rates for the pre-trained encoder layers and the classification head layer. For instance, the classification head parameters might be better learned with an SGD optimizer and a learning rate of 0.1. Thus, we changed the function 
init_model_from_scratch to:

In [None]:
def init_model_from_scratch(self, basemodel, tagset_size, lr):
    """Create the encoder and tagging head, and give each part its own optimizer.

    Args:
        basemodel: name or path handed to AutoModel.from_pretrained.
        tagset_size: number of output units of the linear classification head.
        lr: learning rate for the Adam optimizer over the encoder parameters.
    """
    encoder = AutoModel.from_pretrained(basemodel)
    hidden_dim = encoder.config.hidden_size
    self.encoder = encoder
    self.encoder_hidden_dim = hidden_dim
    self.classification_head = nn.Linear(hidden_dim, tagset_size)
    # TODO initialize self.crf_layer in here as well.

    # Encoder: Adam at the caller-supplied rate. Head: plain SGD at a fixed 0.10.
    self.optimizers = [
        optim.Adam(encoder.parameters(), lr=lr),
        optim.SGD(self.classification_head.parameters(), lr=0.10),
    ]

With this, we also had to consider increasing the number of epochs to 10 from 5 to handle the increase in complexity and diversity given simulated noise in training data. 

Now our output after retraining the model is:
processed 23663 tokens with 11896 phrases; found: 11821 phrases; correct: 10796.
accuracy:  94.17%; (non-O)
accuracy:  94.57%; precision:  91.33%; recall:  90.75%; FB1:  91.04
             ADJP: precision:  77.23%; recall:  69.03%; FB1:  72.90  202
             ADVP: precision:  79.43%; recall:  69.85%; FB1:  74.33  350
            CONJP: precision:  50.00%; recall:  57.14%; FB1:  53.33  8
             INTJ: precision: 100.00%; recall: 100.00%; FB1: 100.00  1
               NP: precision:  90.97%; recall:  92.06%; FB1:  91.51  6312
               PP: precision:  97.33%; recall:  94.06%; FB1:  95.67  2359
              PRT: precision:  75.00%; recall:  80.00%; FB1:  77.42  48
             SBAR: precision:  92.86%; recall:  76.79%; FB1:  84.06  196
               VP: precision:  89.59%; recall:  91.19%; FB1:  90.39  2345
dev.out score: 91.0402

It's getting even better! The next plan is to vary the probability of adding noise to a word, and then to try different learning rates, to see which values further optimize the model. Given that the probability of adding noise to a word was 12%, let's test going to the other extreme and decrease the probability to 0.05%, to see whether we are adding too much noise to the training data.

In [None]:
def read_conll(handle, input_idx=0, label_idx=2, noise_probability=0.0005):
    """Read CoNLL-style column data, optionally corrupting words for training.

    Sentences are separated by blank lines; each non-blank line holds the
    whitespace-separated columns of one token.

    Args:
        handle: open text file object to read from.
        input_idx: column index of the input words.
        label_idx: column index of the labels; a negative value means the
            data is unlabelled and only the words are returned.
        noise_probability: chance that a given word is passed through one
            randomly chosen noise function. Noise is only applied to labelled
            data, and only while model_in_training_mode() is true.

    Returns:
        A list with one entry per sentence: a tuple of words when
        label_idx < 0, otherwise a (words, labels) pair of tuples. Empty
        input yields an empty list.
    """
    conll_data = []
    contents = re.sub(r'\n\s*\n', r'\n\n', handle.read())
    contents = contents.rstrip()

    # The training-mode check hits the filesystem, so evaluate it once per
    # file instead of once per sentence, and never for unlabelled data.
    add_noise = label_idx >= 0 and model_in_training_mode()
    if add_noise:
        noise_functions = [substitute_character, omit_characters, phonetic_misspelling,
                           insert_random_character, keyboard_typo, swap_adjacent_characters]

    for sent_string in contents.split('\n\n'):
        # An empty file or leading blank lines produce an empty chunk; skip it
        # rather than tripping the column assertion below.
        if not sent_string.strip():
            continue
        annotations = list(zip(*[ word_string.split() for word_string in sent_string.split('\n') ]))
        assert(input_idx < len(annotations))

        if label_idx < 0:
            conll_data.append( annotations[input_idx] )
            logging.info("CoNLL: {}".format( " ".join(annotations[input_idx])))
            continue

        assert(label_idx < len(annotations))
        if add_noise:
            # Each word is independently corrupted with the given probability.
            annotations[input_idx] = tuple(
                random.choice(noise_functions)(word)
                if random.random() < noise_probability else word
                for word in annotations[input_idx]
            )

        conll_data.append((annotations[input_idx], annotations[label_idx] ))
        logging.info("CoNLL: {} ||| {}".format( " ".join(annotations[input_idx]), " ".join(annotations[label_idx])))
    return conll_data

Our output is now:
processed 23663 tokens with 11896 phrases; found: 11719 phrases; correct: 10432.
accuracy:  92.49%; (non-O)
accuracy:  93.06%; precision:  89.02%; recall:  87.69%; FB1:  88.35
             ADJP: precision:  74.47%; recall:  61.95%; FB1:  67.63  188
             ADVP: precision:  78.64%; recall:  66.58%; FB1:  72.11  337
            CONJP: precision:  75.00%; recall:  85.71%; FB1:  80.00  8
             INTJ: precision: 100.00%; recall: 100.00%; FB1: 100.00  1
               NP: precision:  88.09%; recall:  89.16%; FB1:  88.62  6313
               PP: precision:  96.91%; recall:  89.96%; FB1:  93.31  2266
              PRT: precision:  72.92%; recall:  77.78%; FB1:  75.27  48
             SBAR: precision:  91.40%; recall:  71.73%; FB1:  80.38  186
               VP: precision:  86.76%; recall:  89.32%; FB1:  88.02  2372
dev.out score: 88.3506

It seems we have decreased the probability too much. Let's try raising it to 4%.

In [None]:
def read_conll(handle, input_idx=0, label_idx=2, noise_probability=0.04):
    """Read CoNLL-style column data, optionally corrupting words for training.

    Sentences are separated by blank lines; each non-blank line holds the
    whitespace-separated columns of one token.

    Args:
        handle: open text file object to read from.
        input_idx: column index of the input words.
        label_idx: column index of the labels; a negative value means the
            data is unlabelled and only the words are returned.
        noise_probability: chance that a given word is passed through one
            randomly chosen noise function. Noise is only applied to labelled
            data, and only while model_in_training_mode() is true.

    Returns:
        A list with one entry per sentence: a tuple of words when
        label_idx < 0, otherwise a (words, labels) pair of tuples. Empty
        input yields an empty list.
    """
    conll_data = []
    contents = re.sub(r'\n\s*\n', r'\n\n', handle.read())
    contents = contents.rstrip()

    # The training-mode check hits the filesystem, so evaluate it once per
    # file instead of once per sentence, and never for unlabelled data.
    add_noise = label_idx >= 0 and model_in_training_mode()
    if add_noise:
        noise_functions = [substitute_character, omit_characters, phonetic_misspelling,
                           insert_random_character, keyboard_typo, swap_adjacent_characters]

    for sent_string in contents.split('\n\n'):
        # An empty file or leading blank lines produce an empty chunk; skip it
        # rather than tripping the column assertion below.
        if not sent_string.strip():
            continue
        annotations = list(zip(*[ word_string.split() for word_string in sent_string.split('\n') ]))
        assert(input_idx < len(annotations))

        if label_idx < 0:
            conll_data.append( annotations[input_idx] )
            logging.info("CoNLL: {}".format( " ".join(annotations[input_idx])))
            continue

        assert(label_idx < len(annotations))
        if add_noise:
            # Each word is independently corrupted with the given probability.
            annotations[input_idx] = tuple(
                random.choice(noise_functions)(word)
                if random.random() < noise_probability else word
                for word in annotations[input_idx]
            )

        conll_data.append((annotations[input_idx], annotations[label_idx] ))
        logging.info("CoNLL: {} ||| {}".format( " ".join(annotations[input_idx]), " ".join(annotations[label_idx])))
    return conll_data

Our output is now: 
processed 23663 tokens with 11896 phrases; found: 12053 phrases; correct: 11081.
accuracy:  95.43%; (non-O)
accuracy:  95.67%; precision:  91.94%; recall:  93.15%; FB1:  92.54
             ADJP: precision:  71.74%; recall:  73.01%; FB1:  72.37  230
             ADVP: precision:  70.74%; recall:  77.14%; FB1:  73.80  434
            CONJP: precision:  71.43%; recall:  71.43%; FB1:  71.43  7
             INTJ: precision: 100.00%; recall: 100.00%; FB1: 100.00  1
               NP: precision:  92.01%; recall:  94.02%; FB1:  93.01  6373
               PP: precision:  96.31%; recall:  97.26%; FB1:  96.78  2465
              PRT: precision:  84.21%; recall:  71.11%; FB1:  77.11  38
             SBAR: precision:  91.12%; recall:  82.28%; FB1:  86.47  214
               VP: precision:  93.32%; recall:  92.80%; FB1:  93.06  2291
dev.out score: 92.5383

That is awesome! It seems like we got a huge boost in accuracy. Let's alter the probability again and see what the output is; this time we set it to 3%.

In [None]:
def read_conll(handle, input_idx=0, label_idx=2, noise_probability=0.03):
    """Read CoNLL-style column data, optionally corrupting words for training.

    Sentences are separated by blank lines; each non-blank line holds the
    whitespace-separated columns of one token.

    Args:
        handle: open text file object to read from.
        input_idx: column index of the input words.
        label_idx: column index of the labels; a negative value means the
            data is unlabelled and only the words are returned.
        noise_probability: chance that a given word is passed through one
            randomly chosen noise function. Noise is only applied to labelled
            data, and only while model_in_training_mode() is true.

    Returns:
        A list with one entry per sentence: a tuple of words when
        label_idx < 0, otherwise a (words, labels) pair of tuples. Empty
        input yields an empty list.
    """
    conll_data = []
    contents = re.sub(r'\n\s*\n', r'\n\n', handle.read())
    contents = contents.rstrip()

    # The training-mode check hits the filesystem, so evaluate it once per
    # file instead of once per sentence, and never for unlabelled data.
    add_noise = label_idx >= 0 and model_in_training_mode()
    if add_noise:
        noise_functions = [substitute_character, omit_characters, phonetic_misspelling,
                           insert_random_character, keyboard_typo, swap_adjacent_characters]

    for sent_string in contents.split('\n\n'):
        # An empty file or leading blank lines produce an empty chunk; skip it
        # rather than tripping the column assertion below.
        if not sent_string.strip():
            continue
        annotations = list(zip(*[ word_string.split() for word_string in sent_string.split('\n') ]))
        assert(input_idx < len(annotations))

        if label_idx < 0:
            conll_data.append( annotations[input_idx] )
            logging.info("CoNLL: {}".format( " ".join(annotations[input_idx])))
            continue

        assert(label_idx < len(annotations))
        if add_noise:
            # Each word is independently corrupted with the given probability.
            annotations[input_idx] = tuple(
                random.choice(noise_functions)(word)
                if random.random() < noise_probability else word
                for word in annotations[input_idx]
            )

        conll_data.append((annotations[input_idx], annotations[label_idx] ))
        logging.info("CoNLL: {} ||| {}".format( " ".join(annotations[input_idx]), " ".join(annotations[label_idx])))
    return conll_data

Our output is now: 
processed 23663 tokens with 11896 phrases; found: 11995 phrases; correct: 11091.
accuracy:  95.60%; (non-O)
accuracy:  95.80%; precision:  92.46%; recall:  93.23%; FB1:  92.85
             ADJP: precision:  70.98%; recall:  80.09%; FB1:  75.26  255
             ADVP: precision:  79.73%; recall:  74.12%; FB1:  76.82  370
            CONJP: precision:  55.56%; recall:  71.43%; FB1:  62.50  9
             INTJ: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
               NP: precision:  92.96%; recall:  94.18%; FB1:  93.56  6319
               PP: precision:  95.57%; recall:  97.21%; FB1:  96.39  2483
              PRT: precision:  74.47%; recall:  77.78%; FB1:  76.09  47
             SBAR: precision:  90.55%; recall:  76.79%; FB1:  83.11  201
               VP: precision:  92.86%; recall:  93.14%; FB1:  93.00  2311
dev.out score: 92.8467

We can see that the F-score increased again. Of the probabilities tested (0.05%, 3%, 4%, and 12%), 3% gives the best result, which suggests that the optimal probability lies somewhere between 0.05% and 4%. The next step is to see which learning rate is optimal for the classification head. Let's begin by dropping the learning rate from 0.1 to 0.01, since a learning rate that is too high can cause the loss to increase or to stop decreasing abruptly.

In [None]:
def init_model_from_scratch(self, basemodel, tagset_size, lr):
    """Create the encoder and tagging head, and give each part its own optimizer.

    Args:
        basemodel: name or path handed to AutoModel.from_pretrained.
        tagset_size: number of output units of the linear classification head.
        lr: learning rate for the Adam optimizer over the encoder parameters.
    """
    encoder = AutoModel.from_pretrained(basemodel)
    hidden_dim = encoder.config.hidden_size
    self.encoder = encoder
    self.encoder_hidden_dim = hidden_dim
    self.classification_head = nn.Linear(hidden_dim, tagset_size)
    # TODO initialize self.crf_layer in here as well.

    # Encoder: Adam at the caller-supplied rate. Head: plain SGD at a fixed 0.01.
    self.optimizers = [
        optim.Adam(encoder.parameters(), lr=lr),
        optim.SGD(self.classification_head.parameters(), lr=0.01),
    ]



The following is our output:

processed 23663 tokens with 11896 phrases; found: 11981 phrases; correct: 11136.
accuracy:  95.74%; (non-O)
accuracy:  95.83%; precision:  92.95%; recall:  93.61%; FB1:  93.28
             ADJP: precision:  75.43%; recall:  77.43%; FB1:  76.42  232
             ADVP: precision:  77.41%; recall:  76.63%; FB1:  77.02  394
            CONJP: precision:  62.50%; recall:  71.43%; FB1:  66.67  8
             INTJ: precision: 100.00%; recall: 100.00%; FB1: 100.00  1
               NP: precision:  93.71%; recall:  94.18%; FB1:  93.95  6268
               PP: precision:  95.59%; recall:  97.75%; FB1:  96.66  2496
              PRT: precision:  72.92%; recall:  77.78%; FB1:  75.27  48
             SBAR: precision:  88.39%; recall:  83.54%; FB1:  85.90  224
               VP: precision:  93.38%; recall:  93.62%; FB1:  93.50  2310
dev.out score: 93.2781

From what we can see, the F-score accuracy has significantly increased now which indicates that we either found the right learning rate or chose a low enough learning rate to decrease the loss function value over the epochs. 

Now, we will change the learning rate to 0.02 to test whether 0.01 is too low or just right.



In [None]:
def init_model_from_scratch(self, basemodel, tagset_size, lr):
    """Create the encoder and tagging head, and give each part its own optimizer.

    Args:
        basemodel: name or path handed to AutoModel.from_pretrained.
        tagset_size: number of output units of the linear classification head.
        lr: learning rate for the Adam optimizer over the encoder parameters.
    """
    encoder = AutoModel.from_pretrained(basemodel)
    hidden_dim = encoder.config.hidden_size
    self.encoder = encoder
    self.encoder_hidden_dim = hidden_dim
    self.classification_head = nn.Linear(hidden_dim, tagset_size)
    # TODO initialize self.crf_layer in here as well.

    # Encoder: Adam at the caller-supplied rate. Head: plain SGD at a fixed 0.02.
    self.optimizers = [
        optim.Adam(encoder.parameters(), lr=lr),
        optim.SGD(self.classification_head.parameters(), lr=0.02),
    ]



Our output is now:

processed 23663 tokens with 11896 phrases; found: 11969 phrases; correct: 11100.
accuracy:  95.66%; (non-O)
accuracy:  95.77%; precision:  92.74%; recall:  93.31%; FB1:  93.02
             ADJP: precision:  75.56%; recall:  75.22%; FB1:  75.39  225
             ADVP: precision:  75.87%; recall:  76.63%; FB1:  76.25  402
            CONJP: precision:  71.43%; recall:  71.43%; FB1:  71.43  7
             INTJ: precision:   0.00%; recall:   0.00%; FB1:   0.00  0
               NP: precision:  93.72%; recall:  94.34%; FB1:  94.03  6278
               PP: precision:  94.78%; recall:  97.42%; FB1:  96.08  2509
              PRT: precision:  71.43%; recall:  66.67%; FB1:  68.97  42
             SBAR: precision:  89.32%; recall:  77.64%; FB1:  83.07  206
               VP: precision:  93.22%; recall:  93.06%; FB1:  93.14  2300
dev.out score: 93.0233

We can observe that the F-score decreases slightly, which may indicate that 0.02 is too high a learning rate. Now, let's decrease the learning rate to 0.009 to test whether 0.01 is itself too high.


In [None]:
def init_model_from_scratch(self, basemodel, tagset_size, lr):
    """Create the encoder and tagging head, and give each part its own optimizer.

    Args:
        basemodel: name or path handed to AutoModel.from_pretrained.
        tagset_size: number of output units of the linear classification head.
        lr: learning rate for the Adam optimizer over the encoder parameters.
    """
    encoder = AutoModel.from_pretrained(basemodel)
    hidden_dim = encoder.config.hidden_size
    self.encoder = encoder
    self.encoder_hidden_dim = hidden_dim
    self.classification_head = nn.Linear(hidden_dim, tagset_size)
    # TODO initialize self.crf_layer in here as well.

    # Encoder: Adam at the caller-supplied rate. Head: plain SGD at a fixed 0.009.
    self.optimizers = [
        optim.Adam(encoder.parameters(), lr=lr),
        optim.SGD(self.classification_head.parameters(), lr=0.009),
    ]

Our output is now:

processed 23663 tokens with 11896 phrases; found: 12037 phrases; correct: 11144.
accuracy:  95.67%; (non-O)
accuracy:  95.78%; precision:  92.58%; recall:  93.68%; FB1:  93.13
             ADJP: precision:  79.04%; recall:  80.09%; FB1:  79.56  229
             ADVP: precision:  73.61%; recall:  76.38%; FB1:  74.97  413
            CONJP: precision:  71.43%; recall:  71.43%; FB1:  71.43  7
             INTJ: precision: 100.00%; recall: 100.00%; FB1: 100.00  1
               NP: precision:  93.59%; recall:  93.92%; FB1:  93.76  6259
               PP: precision:  95.32%; recall:  97.58%; FB1:  96.44  2499
              PRT: precision:  72.92%; recall:  77.78%; FB1:  75.27  48
             SBAR: precision:  89.78%; recall:  85.23%; FB1:  87.45  225
               VP: precision:  92.36%; recall:  94.44%; FB1:  93.39  2356
dev.out score: 93.1266

We can see that 0.009 is a better learning rate than 0.02, although it is also not as good of a learning rate as 0.01. This further implies that 0.01 may be the right learning rate for this classification head and for the model. 

Therefore, based on this trial-and-error process, the best learning rate for the classification head is 0.01, and the best probability for simulating noise in the training data is 3%.