<div style="text-align: right"><a href="http://ml-school.uni-koeln.de">Virtual Summer School "Deep Learning for
    Language Analysis"</a> <br/><strong>Text Analysis with Deep Learning</strong><br/>Aug 30 — Sep 3, 2021<br/>Nils Reiter<br/><a href="mailto:nils.reiter@uni-koeln.de">nils.reiter@uni-koeln.de</a></div>

# Exercise 3



In [1]:
import pandas as pd
import numpy as np

In [2]:
# read in CSV file (the corpus is stored latin1-encoded)
data = pd.read_csv("data/ner/gmb.csv", encoding = 'latin1')

# the first column of the file contains the sentence number
# -- but only for the first token of each sentence.
# The following line fills the rows downwards.
# (DataFrame.ffill() replaces the deprecated fillna(method = 'ffill') spelling;
# like the original call it forward-fills every column, not only "Sentence #".)
data = data.ffill()

In [3]:
# create a list of unique words and assign an integer number to it
unique_words, coded_words = np.unique(data["Word"], return_inverse=True)
data["Word_idx"] = coded_words
# the padding word gets the first index after all real words ...
EMPTY_WORD_IDX = len(unique_words)
# ... and is actually appended to the vocabulary, so that
# unique_words[EMPTY_WORD_IDX] resolves to the padding symbol
# (list.append returns None, so its result must not be wrapped in np.array)
unique_words = np.append(unique_words, "_____")
num_words = len(unique_words)

# same scheme for the part-of-speech tags: "_" is the tag used for padding positions
unique_pos_tags, coded_pos_tags = np.unique(data["POS"], return_inverse=True)
data["POS_idx"]  = coded_pos_tags
NO_POS_TAG_IDX = len(unique_pos_tags)
unique_pos_tags = np.append(unique_pos_tags, "_")
num_pos_tags = len(unique_pos_tags)


# create a list of unique tags and assign an integer number to it
unique_ne_tags, coded_ne_tags = np.unique(data["Tag"], return_inverse=True)
data["NE_idx"]  = coded_ne_tags
# no extra padding class for named entities: the existing "O" tag is reused
NO_NE_TAG_IDX = unique_ne_tags.tolist().index("O")
num_ne_tags = len(unique_ne_tags)

# for verification and inspection, we can inspect the table so far
data[1:20]

Unnamed: 0,Sentence #,Word,POS,Tag,Word_idx,POS_idx,NE_idx
1,Sentence: 1,of,IN,O,27700,10,16
2,Sentence: 1,demonstrators,NNS,O,20969,19,16
3,Sentence: 1,have,VBP,O,24218,35,16
4,Sentence: 1,marched,VBN,O,26434,34,16
5,Sentence: 1,through,IN,O,33389,10,16
6,Sentence: 1,London,NNP,B-geo,9684,17,2
7,Sentence: 1,to,TO,O,33464,29,16
8,Sentence: 1,protest,VB,O,29396,31,16
9,Sentence: 1,the,DT,O,33246,7,16
10,Sentence: 1,war,NN,O,34660,16,16


In [4]:
# We are interested in sentence-wise processing.
# Therefore, we use a function that gives us individual sentences.
def get_sentences(data):
  """Split the token table into sentences.

  Args:
    data: DataFrame with the columns "Sentence #", "Word_idx", "POS_idx"
      and "NE_idx", one row per token.

  Returns:
    A list with one entry per distinct "Sentence #" value. Each entry is a
    list of (Word_idx, POS_idx, NE_idx) tuples in row order. Sentences come
    back in the sorted order of their "Sentence #" key (the groupby default),
    which for string keys is lexicographic.
  """
  # Iterating over the groups avoids groupby(...).apply on the whole frame
  # while producing the same sentences in the same order.
  return [
    list(zip(
      s["Word_idx"].values.tolist(),
      s["POS_idx"].values.tolist(),
      s["NE_idx"].values.tolist()))
    for _, s in data.groupby("Sentence #")
  ]

# one list of (Word_idx, POS_idx, NE_idx) triples per sentence
sentences = get_sentences(data)

In [5]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# the longest sentence determines the common (padded) sequence length
max_len = max(len(sentence) for sentence in sentences)

# every token is a (word, pos, ne) index triple; split the triples into
# three parallel lists of index sequences
x = [[token[0] for token in sentence] for sentence in sentences]
y_pos = [[token[1] for token in sentence] for sentence in sentences]
y_ne = [[token[2] for token in sentence] for sentence in sentences]

# pad every sequence at its end up to max_len; inputs are filled with the
# padding word, targets with the respective "no tag" index
x = pad_sequences(sequences = x, maxlen = max_len,
  padding = 'post', value = EMPTY_WORD_IDX)
y_ne = pad_sequences(sequences = y_ne, maxlen = max_len,
  padding = 'post', value = NO_NE_TAG_IDX)
y_pos = pad_sequences(sequences = y_pos, maxlen = max_len,
  padding = 'post', value = NO_POS_TAG_IDX)

# finally turn the target indices into one-hot vectors
y_ne = to_categorical(np.array(y_ne), num_classes = num_ne_tags)
y_pos = to_categorical(np.array(y_pos), num_classes = num_pos_tags)


In [6]:
# split the data into training and test data
from sklearn.model_selection import train_test_split

# The row indices are split together with x and y_ne so that the POS targets
# can be partitioned into exactly the same train/test rows afterwards.
x_train,x_test,y_ne_train,y_ne_test,train_indices,test_indices = train_test_split(x, y_ne, range(len(x)), test_size = 0.1, random_state=1)

y_pos_train = y_pos[train_indices]
y_pos_test = y_pos[test_indices]

# Per-token weights for the NE task: tokens whose gold tag is "O" (this
# includes padding positions) get weight 0.1, all other tokens weight 1.
# The "O" channel is addressed through NO_NE_TAG_IDX instead of assuming
# that "O" is the last class of the one-hot vector.
# NOTE(review): these weights are not passed to model.fit in the code visible
# here -- confirm whether they are meant to be used as sample weights.
y_train_weights = np.where(y_ne_train[:, :, NO_NE_TAG_IDX] == 1, 0.1, 1.0)

In [7]:
from tensorflow.keras import models, layers, optimizers

# One shared encoder (embedding + LSTM) feeds two token-level classifiers.
l_input = layers.Input(shape = (max_len,))
# The sequence length is already fixed by the Input layer, so the deprecated
# `input_length` argument of Embedding is not needed.
l_embedding = layers.Embedding(input_dim = num_words, output_dim = 50)(l_input)
# return_sequences = True keeps one output vector per token (sequence labelling)
l_lstm = layers.LSTM(units = 5, return_sequences = True)(l_embedding)
# two softmax heads on top of the same LSTM states: named entities and POS tags
l_output_ne = layers.Dense(num_ne_tags, name="ne", activation = 'softmax')(l_lstm)
l_output_pos = layers.Dense(num_pos_tags, name="pos", activation = 'softmax')(l_lstm)

model = models.Model(inputs = l_input, outputs=[l_output_ne, l_output_pos])

model.summary()

# We use a different optimizer this time
# (the single loss / metric specification is applied to both outputs)
model.compile(optimizer='Adam', 
  loss = 'categorical_crossentropy', metrics = ['accuracy'])




Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 104)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 104, 50)      1758950     input_1[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 104, 5)       1120        embedding[0][0]                  
__________________________________________________________________________________________________
ne (Dense)                      (None, 104, 17)      102         lstm[0][0]                       
______________________________________________________________________________________________

In [8]:
# Train both heads jointly; the targets are listed in the same order as the
# model outputs (named entities first, then POS tags).
history = model.fit(
    x = x_train,
    y = [np.array(y_ne_train), np.array(y_pos_train)],
    epochs = 2,
    batch_size = 64,
    verbose = 1,
)

Epoch 1/2
Epoch 2/2


In [9]:
# Evaluate on the held-out data (targets still one-hot encoded here).
# For this two-output model the returned list reads:
# [total loss, ne loss, pos loss, ne accuracy, pos accuracy]
model.evaluate(x = x_test, y = [y_ne_test, y_pos_test])



[0.7328184843063354,
 0.13522876799106598,
 0.5975896120071411,
 0.9677796363830566,
 0.8492814302444458]

In [10]:
# Reverse one-hot-encoding for test data: (sentence, token, class) -> (sentence, token).
# The arrays are overwritten in place, so the conversion is guarded by ndim:
# re-running this cell after the conversion is a no-op instead of an axis error.
if y_ne_test.ndim == 3:
  y_ne_test = np.argmax(y_ne_test, axis=2)
if y_pos_test.ndim == 3:
  y_pos_test = np.argmax(y_pos_test, axis=2)


In [11]:
from sklearn.metrics import classification_report

y_ne_pred, y_pos_pred = model.predict(x_test)

# pick the most probable class for every token
y_ne_pred = np.argmax(y_ne_pred, axis=2)
y_pos_pred = np.argmax(y_pos_pred, axis=2)

# y_ne_test / y_pos_test must hold class indices (not one-hot vectors) here.
# `labels` is passed explicitly so that every row of target_names is matched to
# its class index even if a class occurs neither in the test data nor in the
# predictions; without it such a gap makes the number of labels and names differ.
print(classification_report(y_ne_test.flatten(), y_ne_pred.flatten(),
  labels=np.arange(len(unique_ne_tags)), target_names=unique_ne_tags, zero_division=0))
print(classification_report(y_pos_test.flatten(), y_pos_pred.flatten(),
  labels=np.arange(len(unique_pos_tags)), target_names=unique_pos_tags, zero_division=0))


              precision    recall  f1-score   support

       B-art       0.00      0.00      0.00        48
       B-eve       0.00      0.00      0.00        34
       B-geo       0.00      0.00      0.00      3767
       B-gpe       0.00      0.00      0.00      1607
       B-nat       0.00      0.00      0.00        16
       B-org       0.18      0.00      0.00      1948
       B-per       0.00      0.00      0.00      1653
       B-tim       0.00      0.00      0.00      2118
       I-art       0.00      0.00      0.00        49
       I-eve       0.00      0.00      0.00        30
       I-geo       0.00      0.00      0.00       761
       I-gpe       0.00      0.00      0.00        25
       I-nat       0.00      0.00      0.00         6
       I-org       0.00      0.00      0.00      1629
       I-per       0.00      0.00      0.00      1695
       I-tim       0.00      0.00      0.00       688
           O       0.97      1.00      0.98    482710

    accuracy              