<div style="text-align: right"><a href="http://ml-school.uni-koeln.de">Virtual Summer School "Deep Learning for
    Language Analysis"</a> <br/><strong>Text Analysis with Deep Learning</strong><br/>Aug 31 — Sep 4, 2020<br/>Nils Reiter<br/><a href="mailto:nils.reiter@uni-koeln.de">nils.reiter@uni-koeln.de</a></div>

# Exercise 3

task description

In [1]:
! if ! [[ -f data/ner/gmb.csv ]]; then curl https://nilsreiter.de/assets/2020-08-31-deep-learning/ner/gmb.csv > data/ner/gmb.csv; fi

In [2]:
import pandas as pd

data = pd.read_csv("data/ner/gmb.csv",encoding = 'latin1')
data = data.fillna(method = 'ffill')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [3]:
def get_sentences(data):
    n_sent=1
    agg_func = lambda s:[(w,p,t) for w,p,t in zip(s["Word"].values.tolist(),
                                                     s["POS"].values.tolist(),
                                                     s["Tag"].values.tolist())]
    grouped = data.groupby("Sentence #").apply(agg_func)
    return [s for s in grouped]

sentences = get_sentences(data)
#sentences[0]

In [4]:
import numpy as np



EMPTY_WORD = "______"

# get a list of all the unique words (the vocabulary)
words = list(set(data["Word"].values))
words.append(EMPTY_WORD)

# get a list of all unique tags
words_tag = list(set(data["Tag"].values))
words_tag.sort()

uniques = np.unique(data["Tag"].values, return_index=True)

# create a mapping from words to numbers
word_idx = {w : i+1 for i, w in enumerate(words)}

# create a mapping from tags to numbers
tag_idx =  {t : i for i, t in enumerate(words_tag)}

# get lengths
num_words = len(words)
num_words_tag = len(words_tag)

In [5]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# find the maximum length for the sentences
max_len = max([len(s) for s in sentences])

# create a data set in which words are replaced by numbers
x = [ [ word_idx[w[0]] for w in s ] for s in sentences ]

# shorter sentences are now padded to same length, using (index of) padding symbol
x = pad_sequences(maxlen = max_len, sequences = x, padding = 'post', value = num_words-1)

# we do the same for the y data
y = [[tag_idx[w[2]] for w in s] for s in sentences]
y = pad_sequences(maxlen = max_len,sequences = y,padding = 'post',value = tag_idx['O'])

# but we also convert the indices to keras categories
y = [to_categorical(i, num_classes = num_words_tag) for i in  y]

In [6]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.1,random_state=1)

In [7]:
y_train[0][0:5]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1.]], dtype=float32)

Things to try:
- Include pretrained embeddings
- Add dropout

In [8]:
from tensorflow.keras import models, layers, optimizers

model = models.Sequential()
model.add(layers.Input(shape = (max_len,)))
model.add(layers.Embedding(input_dim = num_words, output_dim = 20, input_length = max_len))
model.add(layers.SimpleRNN(units = 100, return_sequences = True))
model.add(layers.Dense(num_words_tag, activation = 'softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 104, 20)           703580    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 104, 100)          12100     
_________________________________________________________________
dense (Dense)                (None, 104, 17)           1717      
Total params: 717,397
Trainable params: 717,397
Non-trainable params: 0
_________________________________________________________________


In [9]:
model.compile(optimizer = 'adam',loss = 'categorical_crossentropy',metrics = ['accuracy'])

In [10]:
import numpy as np

history = model.fit(
    x_train, np.array(y_train),
    batch_size = 128,
    epochs = 3,
    verbose = 1
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [11]:
model.evaluate(x_test,np.array(y_test))



[0.04035791754722595, 0.9901941418647766]

## Evaluation by class

In [12]:
from sklearn.metrics import classification_report

Y_test = np.argmax(y_test, axis=2)

y_pred = np.argmax(model.predict(x_test), axis=2)


print(classification_report(Y_test.flatten(), y_pred.flatten(), zero_division=0, target_names=uniques[0]))

              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        48
       B-eve       0.00      0.00      0.00        34
       B-geo       0.78      0.87      0.83      3767
       B-gpe       0.93      0.91      0.92      1607
       B-nat       0.00      0.00      0.00        16
       B-org       0.69      0.51      0.58      1948
       B-per       0.82      0.76      0.79      1653
       B-tim       0.90      0.75      0.82      2118
       I-art       0.00      0.00      0.00        49
       I-eve       0.00      0.00      0.00        30
       I-geo       0.88      0.47      0.62       761
       I-gpe       0.00      0.00      0.00        25
       I-nat       0.00      0.00      0.00         6
       I-org       0.65      0.68      0.66      1629
       I-per       0.82      0.82      0.82      1695
       I-tim       0.92      0.36      0.52       688
           O       1.00      1.00      1.00    482710

    accuracy              

## Challenge

In [21]:
! curl https://nilsreiter.de/assets/2020-08-31-deep-learning/ner/challenge.wb.csv > data/ner/challenge.wb.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1287k  100 1287k    0     0  2002k      0 --:--:-- --:--:-- --:--:-- 2002k


In [22]:
! curl https://nilsreiter.de/assets/2020-08-31-deep-learning/ner/challenge.bc.csv > data/ner/challenge.bc.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  886k  100  886k    0     0  1758k      0 --:--:-- --:--:-- --:--:-- 1754k


In [23]:
! curl https://nilsreiter.de/assets/2020-08-31-deep-learning/ner/challenge.nw.csv > data/ner/challenge.nw.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1547k  100 1547k    0     0  2105k      0 --:--:-- --:--:-- --:--:-- 2102k


In [25]:
challenge = pd.read_csv("data/ner/challenge.wb.csv", header = 0, names=["Sentence #","Word","POS","Tag"])
challenge.head()

sentences = get_sentences(challenge)

In [26]:
x_challenge = [[word_idx.get(w[0],0) for w in s] for s in sentences]
x_challenge = pad_sequences(maxlen = max_len,sequences = x_challenge,padding = 'post',value = num_words-1)
y_challenge = [[tag_idx[w[2]] for w in s] for s in sentences]
y_challenge = pad_sequences(maxlen = max_len,sequences = y_challenge,padding =
                        'post',value = tag_idx['O'])
y_challenge = [to_categorical(i,num_classes = num_words_tag) for i in  y_challenge]

In [27]:
model.evaluate(x_challenge, np.array(y_challenge))



[0.12893670797348022, 0.9777546525001526]

In [30]:
# TODO: fix label handling

Y_test = np.argmax(y_challenge, axis=2)

y_pred = np.argmax(model.predict(x_challenge), axis=2)

print(classification_report(Y_test.flatten(), y_pred.flatten(), zero_division=0, target_names=uniques[0]))

ValueError: Number of classes, 13, does not match size of target_names, 17. Try specifying the labels parameter