# Assignment 1 - Chen Yongquan [G2002341D]

## Environment

python           3.8.7  
nltk             3.5  
numpy            1.20.0  
scikit-learn     0.23.2  
scipy            1.6.0  
sklearn-crfsuite 0.3.6  
stanza           1.2  
torch            1.7.1+cu110  

CPU: Intel Core i7 3770K  
GPU: NVIDIA GeForce 1080 Ti  
OS:  Windows 10 x64

## Import raw WNUT17 corpus

In [1]:
def WNUT17Reader(file):
  corpus = []
  with open(file, 'r', encoding='utf-8') as reader:
    sentence  = []
    for line in reader:
      _ = line.strip()
      if _ != '':
        a, b = _.split('\t')
        sentence.append(tuple((a, b.split(',')[0])))
      else:
        corpus.append(sentence)
        sentence = []
  return corpus

def WNUT17Writer(file, corpus):
  with open(file, 'w', encoding='utf-8') as writer:
    for sentence in corpus:
      for word in sentence:
        writer.writelines('\t'.join(column for column in word) + '\n')
      writer.write('\n')

In [None]:
wnut17train = WNUT17Reader('wnut17train.conll')
wnut17train[0]

In [None]:
wnut17dev = WNUT17Reader('emerging.dev.conll')
wnut17dev[0]

In [None]:
wnut17test = WNUT17Reader('emerging.test.annotated')
wnut17test[0]

## POS tagging WNUT17 corpus

In [5]:
wnut17train_sents = [[word for word, iob in sentence] for sentence in wnut17train]
wnut17dev_sents = [[word for word, iob in sentence] for sentence in wnut17dev]
wnut17test_sents = [[word for word, iob in sentence] for sentence in wnut17test]

###  NLTK

In [6]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Home\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [7]:
wnut17train_tagged = nltk.pos_tag_sents(wnut17train_sents)
wnut17dev_tagged = nltk.pos_tag_sents(wnut17dev_sents)
wnut17test_tagged = nltk.pos_tag_sents(wnut17test_sents)

In [None]:
wnut17train_tagged_nltk = [
  [word + tuple([wnut17train[idx_sent][idx_word][1]]) for idx_word, word in enumerate(sentence)]
  for idx_sent, sentence in enumerate(wnut17train_tagged)
]
wnut17train_tagged_nltk[0]

In [None]:
wnut17dev_tagged_nltk = [
  [word + tuple([wnut17dev[idx_sent][idx_word][1]]) for idx_word, word in enumerate(sentence)]
  for idx_sent, sentence in enumerate(wnut17dev_tagged)
]
wnut17dev_tagged_nltk[0]

In [None]:
wnut17test_tagged_nltk = [
  [word + tuple([wnut17test[idx_sent][idx_word][1]]) for idx_word, word in enumerate(sentence)]
  for idx_sent, sentence in enumerate(wnut17test_tagged)
]
wnut17test_tagged_nltk[0]

In [11]:
WNUT17Writer('wnut17train.nltk.conll', wnut17train_tagged_nltk)
WNUT17Writer('emerging.dev.nltk.conll', wnut17dev_tagged_nltk)
WNUT17Writer('emerging.test.nltk.annotated', wnut17test_tagged_nltk)

### Stanza/Stanford CoreNLP

In [12]:
import stanza
stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.2.0.json: 128kB [00:00, 11.6MB/s]
2021-02-01 15:44:03 INFO: Downloading default packages for language: en (English)...
2021-02-01 15:44:04 INFO: File exists: C:\Users\Home\stanza_resources\en\default.zip.
2021-02-01 15:44:07 INFO: Finished downloading models and saved to C:\Users\Home\stanza_resources.


In [13]:
nlp = stanza.Pipeline(lang='en', processors='tokenize, pos', tokenize_pretokenized = True)
wnut17train_tagged = nlp(wnut17train_sents)
wnut17dev_tagged = nlp(wnut17dev_sents)
wnut17test_tagged = nlp(wnut17test_sents)

2021-02-01 15:44:07 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |

2021-02-01 15:44:07 INFO: Use device: gpu
2021-02-01 15:44:07 INFO: Loading: tokenize
2021-02-01 15:44:07 INFO: Loading: pos
2021-02-01 15:44:10 INFO: Done loading processors!


In [None]:
wnut17train_tagged_stanza = [
  [(word['text'], word['xpos'], wnut17train[idx_sent][idx_word][1]) for idx_word, word in enumerate(sentence)]
  for idx_sent, sentence in enumerate(wnut17train_tagged.to_dict())]
wnut17train_tagged_stanza[0]

In [None]:
wnut17dev_tagged_stanza = [
  [(word['text'], word['xpos'], wnut17dev[idx_sent][idx_word][1]) for idx_word, word in enumerate(sentence)]
  for idx_sent, sentence in enumerate(wnut17dev_tagged.to_dict())]
wnut17dev_tagged_stanza[0]

In [None]:
wnut17test_tagged_stanza = [
  [(word['text'], word['xpos'], wnut17test[idx_sent][idx_word][1]) for idx_word, word in enumerate(sentence)]
  for idx_sent, sentence in enumerate(wnut17test_tagged.to_dict())]
wnut17test_tagged_stanza[0]

In [17]:
WNUT17Writer('wnut17train.stanza.conll', wnut17train_tagged_stanza)
WNUT17Writer('emerging.dev.stanza.conll', wnut17dev_tagged_stanza)
WNUT17Writer('emerging.test.stanza.annotated', wnut17test_tagged_stanza)

# CRF Classifier

In [18]:
from nltk.corpus.reader import ConllChunkCorpusReader

In [19]:
wnut17 = ConllChunkCorpusReader(
  ".",
  r".*\.conll",
  ("person", "location", "corporation", "product", "creative-work", "group"),
)

## NLTK POS Tagged

In [20]:
train_sents_nltk = wnut17.iob_sents('wnut17train.nltk.conll')
dev_sents_nltk = wnut17.iob_sents('emerging.dev.nltk.conll')
test_sents_nltk = wnut17.iob_sents('emerging.test.nltk.annotated')

## Stanza POS Tagged

In [21]:
train_sents_stanza = wnut17.iob_sents('wnut17train.stanza.conll')
dev_sents_stanza = wnut17.iob_sents('emerging.dev.stanza.conll')
test_sents_stanza = wnut17.iob_sents('emerging.test.stanza.annotated')

## Feature Extraction

Basic Features:
1. Increased window size from 1 to 2
2. Original word  
3. Word stemming using NLTK SnowballStemmer  
4. Word shapes including whether it is all in lower case, upper case, camel case, digits, alphabets or a mix of digits and alphabets  
5. Suffix  
6. POS tag. Changed "postag[:2]" to "postag[0]" as the former is mostly same as "postag" since POS tags are mostly within 3 characters. The latter can focus on the general type of the word (noun, adjective, adverb) using the first character.  

Other tested features:
1. Removed bias feature after tuning as performance dropped with it. Theoretically, it should provide model with more expressibility by adding biases some state feature.  
2. Removed prefix feature as performance dropped. Seems reasonable as suffixes tends to provide more information on the word type (nouns [-man, -ry], inflected verbs [-ing], etc.).

In [None]:
from collections import OrderedDict
from nltk.stem.snowball import SnowballStemmer

parameters = OrderedDict()
parameters['lower'] = True # Boolean variable to control lowercasing of words
parameters['stem'] = True # Boolean variable to control stemming of words, overrides 'lower'

In [22]:
# functions of sentence representations for sequence labelling
def word2features(sent, i):

    word = sent[i][0]
    postag = sent[i][1]

    features = {
        #'bias': 1.0,
        'word': word,
        'word.stem()': SnowballStemmer("english").stem(word) if parameters['stem'] else word.lower(),
        'word.isupper()': word.islower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word.isalnum()': word.isalnum(),
        'word.isalpha()': word.isalpha(),
        'postag': postag,
        'postag[0]': postag[0],
    }

    if len(word) > 3:
#         features['word[:2]'] = word[:2]
        features['word[-2:]'] = word[-2:]
#         features['word[:3]'] = word[:3]
        features['word[-3:]'] = word[-3:]

    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word': word1,
            '-1:word.stem()': SnowballStemmer("english").stem(word1) if parameters['stem'] else word1.lower(),
            '-1:word.islower()': word1.islower(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isdigit()': word1.isdigit(),
            '-1:word.isalnum()': word1.isalnum(),
            '-1:word.isalpha()': word1.isalpha(),
            '-1:postag': postag1,
            '-1:postag[0]': postag1[0],
        })
        if len(word1) > 3:
#             features['-1:word[:2]'] = word1[:2]
            features['-1:word[-2:]'] = word1[-2:]
#             features['-1:word[:3]'] = word1[:3]
            features['-1:word[-3:]'] = word1[-3:]
        if i > 1:
            word1 = sent[i-2][0]
            postag1 = sent[i-2][1]
            features.update({
                '-2:word': word1,
                '-2:word.stem()': SnowballStemmer("english").stem(word1) if parameters['stem'] else word1.lower(),
                '-2:word.islower()': word1.islower(),
                '-2:word.isupper()': word1.isupper(),
                '-2:word.istitle()': word1.istitle(),
                '-2:word.isdigit()': word1.isdigit(),
                '-2:word.isalnum()': word1.isalnum(),
                '-2:word.isalpha()': word1.isalpha(),
                '-2:postag': postag1,
                '-2:postag[0]': postag1[0],
            })
            if len(word1) > 3:
#                 features['-2:word[:2]'] = word1[:2]
                features['-2:word[-2:]'] = word1[-2:]
#                 features['-2:word[:3]'] = word1[:3]
                features['-2:word[-3:]'] = word1[-3:]
    else:
        # Indicate that it is the 'beginning of a document'
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word': word1,
            '+1:word.stem()': SnowballStemmer("english").stem(word1) if parameters['stem'] else word1.lower(),
            '+1:word.islower()': word1.islower(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isdigit()': word1.isdigit(),
            '+1:word.isalnum()': word1.isalnum(),
            '+1:word.isalpha()': word1.isalpha(),
            '+1:postag': postag1,
            '+1:postag[0]': postag1[0],
        })
        if len(word1) > 3:
#             features['+1:word[:2]'] = word1[:2]
            features['+1:word[-2:]'] = word1[-2:]
#             features['+1:word[:3]'] = word1[:3]
            features['+1:word[-3:]'] = word1[-3:]
        if i < len(sent)-2:
            word1 = sent[i+2][0]
            postag1 = sent[i+2][1]
            features.update({
                '+2:word': word1,
                '+2:word.stem()': SnowballStemmer("english").stem(word1) if parameters['stem'] else word1.lower(),
                '+2:word.islower()': word1.islower(),
                '+2:word.isupper()': word1.isupper(),
                '+2:word.istitle()': word1.istitle(),
                '+2:word.isdigit()': word1.isdigit(),
                '+2:word.isalnum()': word1.isalnum(),
                '+2:word.isalpha()': word1.isalpha(),
                '+2:postag': postag1,
                '+2:postag[0]': postag1[0],
            })
            if len(word1) > 3:
#                 features['+2:word[:2]'] = word1[:2]
                features['+2:word[-2:]'] = word1[-2:]
#                 features['+2:word[:3]'] = word1[:3]
                features['+2:word[-3:]'] = word1[-3:]
    else:
        # Features for words that are not at the end of a document
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

#### Sample Features

In [None]:
sample_sentence = " ".join([a for a,b,c in train_sents_nltk[0]])
print(sample_sentence)
print('')
display(word2features(train_sents_nltk[0], 2))
print('')
display(sent2features(train_sents_nltk[0]))

## Dataset Preparation

### NLTK

In [24]:
# sentence representations for sequence labelling
X_train_nltk = [sent2features(s) for s in train_sents_nltk]
y_train_nltk = [sent2labels(s) for s in train_sents_nltk]

X_dev_nltk = [sent2features(s) for s in dev_sents_nltk]
y_dev_nltk = [sent2labels(s) for s in dev_sents_nltk]

X_test_nltk = [sent2features(s) for s in test_sents_nltk]
y_test_nltk = [sent2labels(s) for s in test_sents_nltk]

In [None]:
X_train_nltk[0],y_train_nltk[0]

In [None]:
X_dev_nltk[0],y_dev_nltk[0]

In [None]:
X_test_nltk[0],y_test_nltk[0]

### Stanza

In [28]:
# sentence representations for sequence labelling
X_train_stanza = [sent2features(s) for s in train_sents_stanza]
y_train_stanza = [sent2labels(s) for s in train_sents_stanza]

X_dev_stanza = [sent2features(s) for s in dev_sents_stanza]
y_dev_stanza = [sent2labels(s) for s in dev_sents_stanza]

X_test_stanza = [sent2features(s) for s in test_sents_stanza]
y_test_stanza = [sent2labels(s) for s in test_sents_stanza]

In [None]:
X_train_stanza[0],y_train_stanza[0]

In [None]:
X_dev_stanza[0],y_dev_stanza[0]

In [None]:
X_test_stanza[0],y_test_stanza[0]

## Training

In [32]:
import sklearn_crfsuite

In [33]:
# train CRF model
crf_nltk = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

In [34]:
# train CRF model
crf_stanza = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

In [35]:
crf_nltk.fit(X_train_nltk, y_train_nltk)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [36]:
crf_stanza.fit(X_train_stanza, y_train_stanza)

CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

## Evaluation

In [37]:
from sklearn_crfsuite import metrics

In [38]:
# get label set
labels = list(crf_nltk.classes_)
labels.remove('O')
print(labels)

['B-location', 'I-location', 'B-group', 'B-corporation', 'B-person', 'B-creative-work', 'B-product', 'I-person', 'I-creative-work', 'I-corporation', 'I-group', 'I-product']


### NLTK

In [39]:
# evaluate CRF model
y_train_nltk_pred = crf_nltk.predict(X_train_nltk)
print("Train F1 Score:")
print(metrics.flat_f1_score(y_train_nltk, y_train_nltk_pred, average='weighted', labels=labels))
print("")

y_dev_nltk_pred = crf_nltk.predict(X_dev_nltk)
print("Dev F1 Score:")
print(metrics.flat_f1_score(y_dev_nltk, y_dev_nltk_pred, average='weighted', labels=labels))
print("")

y_test_nltk_pred = crf_nltk.predict(X_test_nltk)
print("Test F1 Score:")
print(metrics.flat_f1_score(y_test_nltk, y_test_nltk_pred, average='weighted', labels=labels))
print("")

Train F1 Score:
0.9937942594137114

Dev F1 Score:
0.1737323793913934

Test F1 Score:
0.1528430312503639



In [40]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test_nltk, y_test_nltk_pred, labels=sorted_labels, digits=3
))

  _warn_prf(average, modifier, msg_start, len(result))


                 precision    recall  f1-score   support

  B-corporation      0.000     0.000     0.000        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.333     0.035     0.064       142
I-creative-work      0.206     0.032     0.056       218
        B-group      0.529     0.055     0.099       165
        I-group      0.471     0.114     0.184        70
     B-location      0.389     0.247     0.302       150
     I-location      0.353     0.128     0.187        94
       B-person      0.534     0.145     0.228       429
       I-person      0.460     0.221     0.299       131
      B-product      0.143     0.008     0.015       127
      I-product      0.250     0.071     0.111       126

      micro avg      0.411     0.103     0.165      1740
      macro avg      0.306     0.088     0.129      1740
   weighted avg      0.370     0.103     0.153      1740



### Stanza

In [42]:
# evaluate CRF model
y_train_stanza_pred = crf_stanza.predict(X_train_stanza)
print("Train F1 Score:")
print(metrics.flat_f1_score(y_train_stanza, y_train_stanza_pred, average='weighted', labels=labels))
print("")

y_dev_stanza_pred = crf_stanza.predict(X_dev_stanza)
print("Dev F1 Score:")
print(metrics.flat_f1_score(y_dev_stanza, y_dev_stanza_pred, average='weighted', labels=labels))
print("")

y_test_stanza_pred = crf_stanza.predict(X_test_stanza)
print("Test F1 Score:")
print(metrics.flat_f1_score(y_test_stanza, y_test_stanza_pred, average='weighted', labels=labels))
print("")

Train F1 Score:
0.9939881221190546

Dev F1 Score:
0.26772025537374367

Test F1 Score:
0.1829594225374353



In [43]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test_stanza, y_test_stanza_pred, labels=sorted_labels, digits=3
))

                 precision    recall  f1-score   support

  B-corporation      0.000     0.000     0.000        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.292     0.049     0.084       142
I-creative-work      0.233     0.046     0.077       218
        B-group      0.357     0.061     0.104       165
        I-group      0.462     0.086     0.145        70
     B-location      0.418     0.253     0.315       150
     I-location      0.448     0.138     0.211        94
       B-person      0.546     0.235     0.329       429
       I-person      0.500     0.298     0.373       131
      B-product      0.000     0.000     0.000       127
      I-product      0.167     0.024     0.042       126

      micro avg      0.437     0.130     0.201      1740
      macro avg      0.285     0.099     0.140      1740
   weighted avg      0.350     0.130     0.183      1740



### Top Features for Stanza tagged corpus

#### Comments

1. We can see that the model has learnt to recognize corporation names in the training data like Twitter and Facebook.  
   However, such memorization is useless when posed with new corporation names in the test and dev dataset where neither Twitter nor Facebook were encountered, resulting in a dismal F1-score of 0.  
   It seems it is hard to recognize corporation names by transition features too, as the model failed to learn any meaningful transition features to aid in this task.  

   ***5.244764 B-corporation word.stem():twitter***  
   ***3.408561 B-corporation word.stem():facebook***

2. Word stemming seems to provide marginally better results than just standardizing words to lowercases. F1 score improved in training and validation set, but dropped in the test set.  
   Probably because the original model does not feature much noise removal from the dataset and inflections of the same word are represented as different state features.  
   However, stemming also has a risk of increasing false positives and worsening precision as there are slight chances that different words may be stemmed to resemble a named entity.  
   For instance, "universal" and "university" both stems to "univers" though the latter is a location while the former is not.  

3. Interestingly, the model seems to have learn that the entities in WNUT17 have a low chance of appearing in the beginning or end of the sentence.  

   ***5.736417 O        EOS***  
   ***5.505679 O        BOS***

4. Model has learnt that adverbs, verbs, pronouns, wh-determiners, wh-pronouns, wh-adverbs, have a low chance of being any of the entity classes based on the first character of the POS tag.  

   ***2.771931 O        postag[0]:P***  
   ***2.640906 O        postag[0]:W***  
   ***2.590986 O        postag[0]:R***  
   ***2.547664 O        postag[0]:V***

5. Surprisingly, word[-3:]:ing nor postag:VBG is not within the top 100 positive and negative state features. This may be because they have been accounted for in postag[0]:V and the model did not learn to recognize these other similar features as non-entities.

In [44]:
from collections import Counter
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf_stanza.state_features_).most_common(100))

print("\nTop negative:")
print_state_features(Counter(crf_stanza.state_features_).most_common()[-100:])

Top positive:
5.736417 O        EOS
5.505679 O        BOS
5.244764 B-corporation word.stem():twitter
3.954554 O        word[-3:]:day
3.408561 B-corporation word.stem():facebook
2.971273 B-product word.stem():iphon
2.771931 O        postag[0]:P
2.673593 B-group  -1:word.stem():vs
2.640906 O        postag[0]:W
2.590986 O        postag[0]:R
2.547664 O        postag[0]:V
2.508714 O        word.stem():rt
2.406397 O        postag:CC
2.384599 B-corporation word:twitter
2.340121 O        word:RT
2.338465 B-group  +1:word.stem():vs
2.080372 B-location word.stem():uk
2.028535 B-group  postag:NNPS
2.012369 B-person word.stem():pope
2.007176 O        postag:NFP
1.997899 B-product word:iPhone
1.960324 B-creative-work -1:word.stem():watch
1.895639 O        postag:IN
1.895639 O        postag[0]:I
1.849964 O        word[-3:]:ary
1.842711 B-person word.stem():taylor
1.824931 O        word.stem():day
1.818309 B-person word.stem():sam
1.811222 B-person word.stem():justin
1.795448 B-group  word.isupper()


## Hyperparameter Optimization

Using Stanza tagged dataset since it provides better F1 score.

In [45]:
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train_stanza, y_train_stanza)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 13.6min finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000015B72518AC0>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000015B7156EB50>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-location', 'I-location', 'B-group', 'B-corporation', 'B-person', 'B-creative-work', 'B-product', 'I-person', 'I-creative-work', 'I-corporation', 'I-group', 'I-product']),
                   verbose=1)

In [46]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

best params: {'c1': 0.16442900131755955, 'c2': 0.1517231507430664}
best CV score: 0.42046694783673244
model size: 1.78M


In [47]:
crf = rs.best_estimator_
y_train_stanza_pred = crf.predict(X_train_stanza)
print("Train F1 Score:")
print(metrics.flat_f1_score(y_train_stanza, y_train_stanza_pred, average='weighted', labels=labels))
print("")

y_dev_stanza_pred = crf.predict(X_dev_stanza)
print("Dev F1 Score:")
print(metrics.flat_f1_score(y_dev_stanza, y_dev_stanza_pred, average='weighted', labels=labels))
print("")

y_test_stanza_pred = crf.predict(X_test_stanza)
print("Test F1 Score:")
print(metrics.flat_f1_score(y_test_stanza, y_test_stanza_pred, average='weighted', labels=labels))
print("")

Train F1 Score:
0.9928338125897982

Dev F1 Score:
0.2891223909259757

Test F1 Score:
0.19060749452674997



#### Comments

An improvement of ~0.02 on dev set and ~0.01 on test set after automated hyperparameter tuning.  
For comparison, scores before were:

Train F1 Score:  
0.9939881221190546

Dev F1 Score:  
0.26772025537374367

Test F1 Score:  
0.182959422

In [48]:
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test_stanza, y_test_stanza_pred, labels=sorted_labels, digits=3
))

                 precision    recall  f1-score   support

  B-corporation      0.000     0.000     0.000        66
  I-corporation      0.000     0.000     0.000        22
B-creative-work      0.304     0.049     0.085       142
I-creative-work      0.244     0.046     0.077       218
        B-group      0.423     0.067     0.115       165
        I-group      0.538     0.100     0.169        70
     B-location      0.408     0.267     0.323       150
     I-location      0.448     0.138     0.211        94
       B-person      0.556     0.245     0.340       429
       I-person      0.506     0.328     0.398       131
      B-product      0.000     0.000     0.000       127
      I-product      0.600     0.024     0.046       126

      micro avg      0.462     0.137     0.212      1740
      macro avg      0.336     0.105     0.147      1740
   weighted avg      0.395     0.137     0.191      1740



# Softmax Classifier

Code for loading checkpoints is at the bottom.

In [49]:
import re

parameters['stem'] = False # Boolean variable to control stemming of words, turned off for softmax classifier as performance is worse

## Build Vocabulary and Label Dictionary

In [51]:
def lower_case(x,lower=False,stem=False):
    if stem:
        return SnowballStemmer("english").stem(x)
    elif lower:
        return x.lower()
    else:
        return x

def create_dico(item_list):
    """
    Create a dictionary of items from a list of list of items.
    """
    assert type(item_list) is list
    dico = {}
    for items in item_list:
        for item in items:
            if item not in dico:
                dico[item] = 1
            else:
                dico[item] += 1
    return dico

def create_mapping(dico):
    """
    Create a mapping (item to ID / ID to item) from a dictionary.
    Items are ordered by decreasing frequency.
    """
    sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0]))
    id_to_item = {i: v[0] for i, v in enumerate(sorted_items)}
    item_to_id = {v: k for k, v in id_to_item.items()}
    return item_to_id, id_to_item

def word_mapping(sentences, lower, stem):
    """
    Create a dictionary and a mapping of words, sorted by frequency.
    """
    words = [[lower_case(token, lower, stem) if (lower or stem) else token for token, pos, iob in s] for s in sentences]
    dico = create_dico(words)
    dico['<PAD>'] = 10000001 #UNK tag for unknown words
    dico['<UNK>'] = 10000000 #UNK tag for unknown words
    word_to_id, id_to_word = create_mapping(dico)
    print("Found %i unique words (%i in total)" % (
        len(dico), sum(len(x) for x in words)
    ))
    return dico, word_to_id, id_to_word

def tag_mapping(sentences):
    """
    Create a dictionary and a mapping of tags, sorted by frequency.
    """
    tags = [[iob for token, pos, iob in s] for s in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag

In [52]:
dico_words,word_to_id,id_to_word = word_mapping(train_sents_stanza, parameters['lower'], parameters['stem'])
dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sents_stanza)

Found 12842 unique words (62730 in total)
Found 13 unique named entity tags


#### Comments

We remove 'O' from the set of labels used for metrics computation in order to achieve parity with the evaluation for the CRF classifier.
Also, we are not interested in the accuracy of the model at predicting the 'O' class and in an unbalanced dataset where 'O' class words outnumber other classes,
including it in the label set would cause the high "accuracy" of predicting the 'O' class of an poor model to overwhelm the low accuracy for the actual informational classes,
resulting in a high weighted average F1-score.

In [None]:
metric_labels = list(tag_to_id.values())
metric_labels.remove(tag_to_id['O'])

### Numericalize raw corpus

In [53]:
def prepare_dataset(sentences, word_to_id, tag_to_id, lower=False, stem=False):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """
    data = []
    for s in sentences:
        str_words = [w[0] for w in s]
        words = [word_to_id[lower_case(w,lower,stem) if lower_case(w,lower,stem) in word_to_id else '<UNK>']
                 for w in str_words]
        tags = [tag_to_id[w[-1]] for w in s]
        data.append({
            'str_words': str_words,
            'words': words,
            'tags': tags,
        })
    return data

In [54]:
train_data = prepare_dataset(
    train_sents_stanza, word_to_id, tag_to_id, parameters['lower'], parameters['stem']
)
dev_data = prepare_dataset(
    dev_sents_stanza, word_to_id, tag_to_id, parameters['lower'], parameters['stem']
)
test_data = prepare_dataset(
    test_sents_stanza, word_to_id, tag_to_id, parameters['lower'], parameters['stem']
)
print("{} / {} / {} sentences in train / dev / test.".format(len(train_data), len(dev_data), len(test_data)))

3394 / 1009 / 1287 sentences in train / dev / test.


### Torch Dataloader

In [55]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from functools import partial
from sklearn_crfsuite import metrics
from IPython.display import clear_output

dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [56]:
def pad_sentence_for_window(sentence, window_size, pad_token=0):
    return [pad_token]*window_size + sentence + [pad_token]*window_size

def my_collate(data, window_size, word_to_id):
    """
    For some chunk of sentences and labels
        -add winow padding
        -pad for lengths using pad_sequence
        -convert our labels to one-hots
        -return padded inputs, one-hot labels, and lengths
    """

    # deal with input sentences as we've seen
    window_padded = [pad_sentence_for_window(sentence['words'], window_size, word_to_id['<PAD>']) for sentence in data]

    # append zeros to each list of token ids in batch so that they are all the same length
    padded = nn.utils.rnn.pad_sequence([torch.LongTensor(t) for t in window_padded], batch_first=True)

    # convert labels to one-hots
    labels = []
    lengths = []
    for sentence in data:
        lengths.append(len(sentence['tags']))
        label = torch.zeros([len(sentence['tags']), len(id_to_tag)])
        label[torch.arange(len(sentence['tags'])),sentence['tags']] = 1
        labels.append(label)
    padded_labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return padded.long(), padded_labels, torch.LongTensor(lengths)

## Model

In [57]:
class SoftmaxWordWindowClassifier(nn.Module):
    """
    A one-layer, binary word-window classifier.
    """
    def __init__(self, config, vocab_size, pad_idx=0):
        super(SoftmaxWordWindowClassifier, self).__init__()
        """
        Instance variables.
        """
        self.window_size = 2*config["half_window"]+1
        self.embed_dim = config["embed_dim"]
        self.hidden_dim = config["hidden_dim"]
        self.hidden_depth = config["hidden_depth"]
        self.num_classes = config["num_classes"]
        self.freeze_embeddings = config["freeze_embeddings"]
        self.dropout = config["dropout"]
        self.sigmoid = config["sigmoid"]

        """
        Embedding layer
        -model holds an embedding for each layer in our vocab
        -sets aside a special index in the embedding matrix for padding vector (of zeros)
        -by default, embeddings are parameters (so gradients pass through them)
        """
        self.embed_layer = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_idx)
        if self.freeze_embeddings:
            self.embed_layer.weight.requires_grad = False

        """
        Hidden layer
        -we want to map embedded word windows of dim (window_size+1)*self.embed_dim to a hidden layer.
        -nn.Sequential allows you to efficiently specify sequentially structured models
            -first the linear transformation is evoked on the embedded word windows
            -next the nonlinear transformation tanh is evoked.
        """
        hidden_layers = []
        hidden_layers.append(
            nn.Sequential(
                nn.Linear(self.window_size*self.embed_dim, self.hidden_dim),
                nn.Sigmoid() if self.sigmoid else nn.Tanh()
            )
        )
        if self.dropout:
              hidden_layers.append(
                  nn.Dropout(p=self.dropout)
              )
        for _ in range(1, self.hidden_depth):
            hidden_layers.append(
                nn.Sequential(
                    nn.Linear(self.hidden_dim, self.hidden_dim),
                    nn.Sigmoid() if self.sigmoid else nn.Tanh()
                )
            )
            if self.dropout:
                hidden_layers.append(
                    nn.Dropout(p=self.dropout)
            )
        self.hidden_layer = nn.Sequential(*hidden_layers)

        """
        Output layer
        -we want to map elements of the output layer (of size self.hidden dim) to a number of classes.
        """
        self.output_layer = nn.Linear(self.hidden_dim, self.num_classes)

        """
        Softmax
        -The final step of the softmax classifier: mapping final hidden layer to class scores.
        -pytorch has both logsoftmax and softmax functions (and many others)
        -since our loss is the negative LOG likelihood, we use logsoftmax
        -technically you can take the softmax, and take the log but PyTorch's implementation
         is optimized to avoid numerical underflow issues.
        """
        self.log_softmax = nn.LogSoftmax(dim=2)

    def forward(self, inputs):
        """
        Let B:= batch_size
            L:= window-padded sentence length
            D:= self.embed_dim
            S:= self.window_size
            H:= self.hidden_dim

        inputs: a (B, L) tensor of token indices
        """
        B, L = inputs.size()

        """
        Reshaping.
        Takes in a (B, L) LongTensor
        Outputs a (B, L~, S) LongTensor
        """
        # Fist, get our word windows for each word in our input.
        token_windows = inputs.unfold(1, self.window_size, 1)
        _, adjusted_length, _ = token_windows.size()

        # Good idea to do internal tensor-size sanity checks, at the least in comments!
        assert token_windows.size() == (B, adjusted_length, self.window_size)

        """
        Embedding.
        Takes in a torch.LongTensor of size (B, L~, S)
        Outputs a (B, L~, S, D) FloatTensor.
        """
        embedded_windows = self.embed_layer(token_windows)

        """
        Reshaping.
        Takes in a (B, L~, S, D) FloatTensor.
        Resizes it into a (B, L~, S*D) FloatTensor.
        -1 argument "infers" what the last dimension should be based on leftover axes.
        """
        embedded_windows = embedded_windows.view(B, adjusted_length, -1)

        """
        Layer 1.
        Takes in a (B, L~, S*D) FloatTensor.
        Resizes it into a (B, L~, H) FloatTensor
        """
        layer_1 = self.hidden_layer(embedded_windows)

        """
        Layer 2
        Takes in a (B, L~, H) FloatTensor.
        Resizes it into a (B, L~, 2) FloatTensor.
        """
        output = self.output_layer(layer_1)

        """
        Softmax.
        Takes in a (B, L~, 2) FloatTensor of unnormalized class scores.
        Outputs a (B, L~, 2) FloatTensor of (log-)normalized class scores.
        """
        output = self.log_softmax(output)

        return output

## Training

In [58]:
def loss_function(outputs, labels, lengths):
    """Computes negative LL loss on a batch of model predictions."""
    B, L, num_classes = outputs.size()
    num_elems = lengths.sum().float()

    # get only the values with non-zero labels
    loss = outputs*labels

    # rescale average
    return -loss.sum() / num_elems

In [59]:
def train_epoch(loss_function, optimizer, model, train_data):
    model.train()
    ## For each batch, we must reset the gradients
    ## stored by the model.
    total_loss = 0
    f1_scores = []
    for batch, labels, lengths in train_data:
        # clear gradients
        optimizer.zero_grad()
        # evoke model in training mode on batch
        outputs = model.forward(batch.to(dev, non_blocking = True))
        # compute loss w.r.t batch
        loss = loss_function(outputs, labels.to(dev, non_blocking = True), lengths.to(dev, non_blocking = True))
        # pass gradients back, startiing on loss value
        loss.backward()
        # update parameters
        optimizer.step()
        total_loss += loss.item()
        pred = torch.argmax(outputs.detach() * torch.sum(labels.to(dev, non_blocking = True), dim = 2).unsqueeze(2), dim=2).cpu().numpy()
        tgts = torch.argmax(labels, dim=2).cpu().numpy()
        f1_scores.append(metrics.flat_f1_score(tgts, pred, average='weighted', labels=metric_labels))
    # return the total to keep track of how you did this time around
    return total_loss, f1_scores

def validate(model, val_data):
    model.eval()
    total_loss = 0
    f1_scores = []
    with torch.no_grad():
        for batch, labels, lengths in val_data:
            outputs = model.forward(batch.to(dev, non_blocking = True))
            loss = loss_function(outputs, labels.to(dev, non_blocking = True), lengths.to(dev, non_blocking = True))
            total_loss += loss.item()

            pred = torch.argmax(outputs.detach() * torch.sum(labels.to(dev, non_blocking = True), dim = 2).unsqueeze(2), dim=2).cpu().numpy()
            tgts = torch.argmax(labels, dim=2).cpu().numpy()
            f1_scores.append(metrics.flat_f1_score(tgts, pred, average='weighted', labels=metric_labels))
    return total_loss, f1_scores

In [60]:
config = {"half_window": 2,
          "embed_dim": 25,
          "hidden_dim": 25,
          "hidden_depth": 3,
          "num_classes": len(tag_to_id),
          "freeze_embeddings": False,
          "dropout": False, # False/Probability[0.0-1.0]
          "sigmoid": True,
          "batch_size": 20,
          "learning_rate": 10.0,
          "num_epochs": 500
         }
model = SoftmaxWordWindowClassifier(config, len(word_to_id)).to(dev, non_blocking = True)

In [61]:
learning_rate = config['learning_rate']
num_epochs = config['num_epochs']
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience = 5, factor = 0.5, threshold = 0.00001, min_lr = 1.0, mode = 'max')
# scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0 = 100, T_mult = 1, eta_min = 1.0 )
# criterion = nn.CrossEntropyLoss()

In [None]:
train_loader = DataLoader(train_data,
                          batch_size=config["batch_size"],
                          shuffle=True,
                          collate_fn=partial(my_collate, window_size=config["half_window"], word_to_id=word_to_id),
                          pin_memory=True)

dev_loader = DataLoader(dev_data,
                        batch_size=len(dev_data),
                        shuffle=False,
                        collate_fn=partial(my_collate, window_size=config["half_window"], word_to_id=word_to_id),
                        pin_memory = True)

In [None]:
losses = []
f1_scores = []
val_losses = []
val_f1_scores = []
for epoch in range(num_epochs):
    epoch_loss, f1_score = train_epoch(loss_function, optimizer, model, train_loader)
    if epoch % 10 == 0 or epoch == num_epochs - 1:
        losses.append(epoch_loss)
        f1_scores.append(np.mean(f1_score))
        val_loss, val_f1_score = validate(model, dev_loader)
        val_losses.append(val_loss)
        val_f1_score = np.mean(val_f1_score)
        val_f1_scores.append(val_f1_score)
        scheduler.step(val_f1_score) # ReduceLROnPlateau
        checkpoint = {
          "config" : config,
          "model": model.state_dict(),
          "scheduler": scheduler.state_dict(),
          "losses": losses,
          "f1_scores": f1_scores,
          "val_losses": val_losses,
          "val_f1_scores": val_f1_scores
        }
        torch.save(checkpoint, "./checkpoints/epoch_" + str(epoch) + ".pt")
    # scheduler.step(epoch) # CosineAnnealingWarmRestarts
    clear_output(wait = True)
    print("Epoch " + str(epoch + 1))
    print(scheduler.state_dict())
    print("\nLoss: " + str(epoch_loss))
    print("\nMean F1 Score: " + str(np.mean(f1_score)))
    print("\n{:25s}|{:25s}|{:25s}|{:25s}".format("Training Losses", "Validation Losses", "Training F1 Scores", "Validation F1 Scores"))
    for i in range(len(losses)):
      print("{:<25.15f}|{:<25.15f}|{:<25.15f}|{:<25.15f}".format(losses[i], val_losses[i], f1_scores[i], val_f1_scores[i]))

## Evaluation

In [None]:
train_loader = DataLoader(train_data,
                          batch_size=len(train_data),
                          shuffle=False,
                          collate_fn=partial(my_collate, window_size=2, word_to_id=word_to_id),
                          pin_memory = True)

In [None]:
model.eval()
with torch.no_grad():
  for batch, labels, lengths in train_loader:
      outputs = model.forward(batch.to(dev, non_blocking = True))
      outputs *= torch.sum(labels.to(dev, non_blocking = True), dim = 2).unsqueeze(2)
      pred = torch.argmax(outputs, dim=2)
      tgts = torch.argmax(labels, dim=2)
      print("Flat F1 Score: " + str(metrics.flat_f1_score(tgts.cpu().numpy(), pred.cpu().numpy(), average='weighted', labels=metric_labels)))

In [None]:
dev_loader = DataLoader(dev_data,
                        batch_size=len(dev_data),
                        shuffle=False,
                        collate_fn=partial(my_collate, window_size=2, word_to_id=word_to_id),
                        pin_memory = True)

In [None]:
model.eval()
with torch.no_grad():
  for batch, labels, lengths in dev_loader:
      outputs = model.forward(batch.to(dev, non_blocking = True))
      outputs *= torch.sum(labels.to(dev, non_blocking = True), dim = 2).unsqueeze(2)
      pred = torch.argmax(outputs, dim=2)
      tgts = torch.argmax(labels, dim=2)
      print("Flat F1 Score: " + str(metrics.flat_f1_score(tgts.cpu().numpy(), pred.cpu().numpy(), average='weighted', labels=metric_labels)))

In [None]:
test_loader = DataLoader(test_data,
                         batch_size=len(test_data),
                         shuffle=False,
                         collate_fn=partial(my_collate, window_size=2, word_to_id=word_to_id),
                         pin_memory = True)

In [None]:
model.eval()
with torch.no_grad():
  for batch, labels, lengths in test_loader:
      outputs = model.forward(batch.to(dev, non_blocking = True))
      outputs *= torch.sum(labels.to(dev, non_blocking = True), dim = 2).unsqueeze(2)
      pred = torch.argmax(outputs, dim=2)
      tgts = torch.argmax(labels, dim=2)
      print("Flat F1 Score: " + str(metrics.flat_f1_score(tgts.cpu().numpy(), pred.cpu().numpy(), average='weighted', labels=metric_labels)))

## Prediction Demo

In [None]:
sentence1 = "@paulwalk It 's the view from where I 'm living for two weeks . Empire State Building = ESB . Pretty bad storm here last evening .".split(" ")
sentence2 = "Engadget 's Stephen Fry is at top of the Empire State Building".split(" ")
sentence3 = "#StarWars #TheCloneWars is rescreening in theaters in Alderwood now".split(" ")
sent_id1 = [word_to_id[lower_case(w,parameters['lower'],parameters['stem']) if lower_case(w,parameters['lower'],parameters['stem']) in word_to_id else '<UNK>'] for w in sentence1]
sent_id2 = [word_to_id[lower_case(w,parameters['lower'],parameters['stem']) if lower_case(w,parameters['lower'],parameters['stem']) in word_to_id else '<UNK>'] for w in sentence2]
sent_id3 = [word_to_id[lower_case(w,parameters['lower'],parameters['stem']) if lower_case(w,parameters['lower'],parameters['stem']) in word_to_id else '<UNK>'] for w in sentence3]
demo_data = [
  {
    "words": sent_id1,
    "tags" : [0] * len(sent_id1)
  },
  {
    "words": sent_id2,
    "tags" : [0] * len(sent_id2)
  },
  {
    "words": sent_id3,
    "tags" : [0] * len(sent_id3)
  }
]

In [None]:
demo_loader = DataLoader(demo_data,
                         batch_size=len(demo_data),
                         shuffle=False,
                         collate_fn=partial(my_collate, window_size=2, word_to_id=word_to_id),
                         pin_memory = True)

In [None]:
with torch.no_grad():
  model.eval()
  for batch, labels, lengths in demo_loader:
      outputs = model.forward(batch.to(dev, non_blocking = True))
      # use generated mask instead of labels in case of test data where labels are not available
      mask = torch.zeros([outputs.shape[0],outputs.shape[1]]).to(dev, non_blocking = True)
      for i in range(len(lengths)):
        mask[i,0:lengths[i]] = 1
      mask = mask.unsqueeze(2)
      outputs *= mask
      # outputs *= torch.sum(labels.to(dev, non_blocking = True), dim = 2).unsqueeze(2)
      pred = torch.argmax(outputs, dim=2)
      tgts = torch.argmax(labels, dim=2)
      pred_tag = [[id_to_tag[id_] for id_ in sent] for sent in pred.cpu().numpy()]
      tgts_tag = [[id_to_tag[id_] for id_ in sent] for sent in tgts.cpu().numpy()]
      for i, j in enumerate([*zip(pred_tag, tgts_tag)]):
        print("Sentence {}:".format(i+1))
        print(re.sub("<PAD>", "", " ".join([id_to_word[id_] for id_ in batch[i].numpy()])).strip())
        print("Prediction: ")
        print(j[0][:lengths[i]])
        print("")

## Load Model

In [None]:
checkpoint = torch.load("./checkpoints/epoch_499.pt")
config = checkpoint['config']
losses = checkpoint['losses']
f1_scores = checkpoint['f1_scores']
val_losses = checkpoint['val_losses']
val_f1_scores = checkpoint['val_f1_scores']

In [None]:
model.load_state_dict(checkpoint['model'])
scheduler.load_state_dict(checkpoint['scheduler'])

## Residual Learning [Experimental]

In [None]:
class ResidualBlock(nn.Module):
    def __init__(self, hidden_dim, sigmoid):
        super(ResidualBlock, self).__init__()
        self.ll = nn.Linear(hidden_dim, hidden_dim)
        self.sigmoid = nn.Sigmoid() if sigmoid else nn.Tanh()

    def forward(self, x):
        identity = x
        out = self.ll(x)
        out += identity
        out = self.sigmoid(out)
        return out

class SoftmaxWordWindowResidualClassifier(nn.Module):
    """
    A one-layer, binary word-window classifier.
    """
    def __init__(self, config, vocab_size, pad_idx=0):
        super(SoftmaxWordWindowResidualClassifier, self).__init__()
        """
        Instance variables.
        """
        self.window_size = 2*config["half_window"]+1
        self.embed_dim = config["embed_dim"]
        self.hidden_dim = config["hidden_dim"]
        self.hidden_depth = config["hidden_depth"]
        self.num_classes = config["num_classes"]
        self.freeze_embeddings = config["freeze_embeddings"]
        self.dropout = config["dropout"]
        self.sigmoid = config["sigmoid"]

        """
        Embedding layer
        -model holds an embedding for each layer in our vocab
        -sets aside a special index in the embedding matrix for padding vector (of zeros)
        -by default, embeddings are parameters (so gradients pass through them)
        """
        self.embed_layer = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_idx)
        if self.freeze_embeddings:
            self.embed_layer.weight.requires_grad = False

        """
        Hidden layer
        -we want to map embedded word windows of dim (window_size+1)*self.embed_dim to a hidden layer.
        -nn.Sequential allows you to efficiently specify sequentially structured models
            -first the linear transformation is evoked on the embedded word windows
            -next the nonlinear transformation tanh is evoked.
        """
        hidden_layers = []

        hidden_layers.append(
            nn.Sequential(
                nn.Linear(self.window_size*self.embed_dim, self.hidden_dim),
                nn.Sigmoid() if self.sigmoid else nn.Tanh()
            )
        )
        if self.dropout:
              hidden_layers.append(
                  nn.Dropout(p=self.dropout)
              )
        for _ in range(1, self.hidden_depth):
            hidden_layers.append(
                ResidualBlock(self.hidden_dim, self.sigmoid)
            )
            if self.dropout:
                hidden_layers.append(
                    nn.Dropout(p=self.dropout)
            )
        self.hidden_layer = nn.Sequential(*hidden_layers)

        """
        Output layer
        -we want to map elements of the output layer (of size self.hidden dim) to a number of classes.
        """
        self.output_layer = nn.Linear(self.hidden_dim, self.num_classes)

        """
        Softmax
        -The final step of the softmax classifier: mapping final hidden layer to class scores.
        -pytorch has both logsoftmax and softmax functions (and many others)
        -since our loss is the negative LOG likelihood, we use logsoftmax
        -technically you can take the softmax, and take the log but PyTorch's implementation
         is optimized to avoid numerical underflow issues.
        """
        self.log_softmax = nn.LogSoftmax(dim=2)

    def forward(self, inputs):
        """
        Let B:= batch_size
            L:= window-padded sentence length
            D:= self.embed_dim
            S:= self.window_size
            H:= self.hidden_dim

        inputs: a (B, L) tensor of token indices
        """
        B, L = inputs.size()

        """
        Reshaping.
        Takes in a (B, L) LongTensor
        Outputs a (B, L~, S) LongTensor
        """
        # Fist, get our word windows for each word in our input.
        token_windows = inputs.unfold(1, self.window_size, 1)
        _, adjusted_length, _ = token_windows.size()

        # Good idea to do internal tensor-size sanity checks, at the least in comments!
        assert token_windows.size() == (B, adjusted_length, self.window_size)

        """
        Embedding.
        Takes in a torch.LongTensor of size (B, L~, S)
        Outputs a (B, L~, S, D) FloatTensor.
        """
        embedded_windows = self.embed_layer(token_windows)

        """
        Reshaping.
        Takes in a (B, L~, S, D) FloatTensor.
        Resizes it into a (B, L~, S*D) FloatTensor.
        -1 argument "infers" what the last dimension should be based on leftover axes.
        """
        embedded_windows = embedded_windows.view(B, adjusted_length, -1)

        """
        Layer 1.
        Takes in a (B, L~, S*D) FloatTensor.
        Resizes it into a (B, L~, H) FloatTensor
        """
        layer_1 = self.hidden_layer(embedded_windows)

        """
        Layer 2
        Takes in a (B, L~, H) FloatTensor.
        Resizes it into a (B, L~, 2) FloatTensor.
        """
        output = self.output_layer(layer_1)

        """
        Softmax.
        Takes in a (B, L~, 2) FloatTensor of unnormalized class scores.
        Outputs a (B, L~, 2) FloatTensor of (log-)normalized class scores.
        """
        output = self.log_softmax(output)

        return output