In [1]:
import copy
import json
import pandas as pd
import re
from tqdm import tqdm
import numpy as np

In [2]:
def read_data(file_name):
    json_data = []
    infile = open(file_name, "r")
    for line in infile:
        json_data.append(json.loads(line))
    infile.close()
    return json_data

In [3]:
def combine_data(json_data):
    combined_data = {}
    for data in json_data:
        key = f"{data['source_id']} {data['speech_id']} {data['paragraph_id']}"
        if key not in combined_data:
            combined_data[key] = copy.deepcopy(data)
        else:
            if len(data["data"]) != len(combined_data[key]["data"]):
                print("cannot happen")
            for label_data in data["label"]:
                if label_data not in combined_data[key]["label"]:
                    combined_data[key]["label"].append(label_data)
                    if combined_data[key]["label"][-1][1] > len(combined_data[key]["data"]):
                        combined_data[key]["label"][-1][1] = len(combined_data[key]["data"])
    for key in combined_data:
        for label_data in combined_data[key]["label"]:
            label_data.append(combined_data[key]["data"][label_data[0]:label_data[1]])
    return combined_data

In [4]:
json_data = read_data("../data/femke.jsonl")
combined_data = combine_data(json_data)

In [5]:
len(combined_data)

526

## Retrieve features with stanza
Stanza gives us al kind of tags. We also use it for tokenizing

In [2]:
import stanza

In [7]:
stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2021-10-15 10:58:23 INFO: Downloading default packages for language: en (English)...
2021-10-15 10:58:24 INFO: File exists: /home/dafne/stanza_resources/en/default.zip.
2021-10-15 10:58:29 INFO: Finished downloading models and saved to /home/dafne/stanza_resources.


In [10]:
nlp = stanza.Pipeline('en', processors='tokenize,pos,lemma,depparse,ner') 

2021-10-15 10:59:42 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| pos       | combined  |
| lemma     | combined  |
| depparse  | combined  |
| ner       | ontonotes |

2021-10-15 10:59:42 INFO: Use device: cpu
2021-10-15 10:59:42 INFO: Loading: tokenize
2021-10-15 10:59:42 INFO: Loading: pos
2021-10-15 10:59:43 INFO: Loading: lemma
2021-10-15 10:59:44 INFO: Loading: depparse
2021-10-15 10:59:45 INFO: Loading: ner
2021-10-15 10:59:47 INFO: Done loading processors!


In [11]:
for key in tqdm(combined_data.keys()):
    combined_data[key]['stanza_tokens'] = nlp(combined_data[key]['data'])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 526/526 [18:34<00:00,  2.12s/it]


### Create the BIO tags
We need to map the labels, that consist of character spans, to spans of tokens. Then we can tag the tokens with B-I-O labels.We do this per sentence, so we get a list of (token, tag) tuples per sentence.

In [46]:
def map_labels(stanza_doc, labels, doc_key=''):
    tokens = list(stanza_doc.iter_tokens())
    char_to_tokenidx = np.repeat(None, len(stanza_doc.text))
    for token_id, token in enumerate(tokens):
        char_to_tokenidx[token.start_char:token.end_char] = token_id

    new_label_list = {}
    for i, (s,e,rel,text) in enumerate(labels):   
        s_idx = char_to_tokenidx[s]
        e_idx = char_to_tokenidx[e-1]
        if s_idx is None:
            # start index is a space so not inside token
            print('{} - Warning: start index is space'.format(doc_key))
            s_idx = char_to_tokenidx[s+1]
        if e_idx is None:
            # End index is space so not inside token
            print('{} - Warning: end index is space'.format(doc_key))
            e_idx = char_to_tokenidx[e-2]
        subtext = stanza_doc.text[tokens[s_idx].start_char : tokens[e_idx].end_char]
        if text.strip() != subtext:
            print('{} - Warning: label "{}" does not match tokens "{}". Skipping this label'.format(
                doc_key, text, subtext
            ))
        else:
            new_label_list[i] = (s_idx, e_idx)
    return tokens, new_label_list

In [43]:
from collections import defaultdict

def get_BIO_tags(tokens, label_dict):
    tags = np.repeat('O', len(tokens))
    for i, (s,e) in label_dict.items():
        tags[s] = 'B'
        tags[s+1:e] = 'I'
        
    sent_dict = defaultdict(list)
    for token_id, token in enumerate(tokens):
        sent_dict[token.sent.id].append((token.text, tags[token_id]))
    return [sent_dict[k] for k in sorted(sent_dict.keys())]

In [47]:
for key in combined_data:
    doc = combined_data[key]['stanza_tokens']
    rel_labels = [l for l in combined_data[key]['label'] if l[2]=='Content_Relation_Explanation']
    tokens, label_dict = map_labels(doc, rel_labels, doc_key=key)
    combined_data[key]['tags'] = get_BIO_tags(tokens, label_dict)



Note that some annotations seem to be incorrect, they do not match the tokenization. We skip those for now.

## Save data
We convert all of the stanza output to a dictionary and write that out to a file, so we don't have to run stanza again next time.
We exclude the stanza document object because it's not json serializable.

In [48]:
for doc_key in combined_data:
    doc = combined_data[doc_key]['stanza_tokens']
    combined_data[doc_key]['stanza_output'] = doc.to_dict()

In [49]:
reduced_data = {doc_key: {k: v for k,v in data.items() if k!='stanza_tokens'}
               for doc_key, data in combined_data.items()}

In [3]:
output_file = '../data/femke-parsed.json'

In [50]:
with open(output_file, 'w') as fout:
    json.dump(reduced_data, fout)

In [4]:
with open(output_file, 'r') as fin:
    reduced_data = json.load(fin)

Let's see what the stanza output looks like for an example doc:

In [5]:
example_key = list(reduced_data.keys())[0]
stanza_doc = reduced_data[example_key]['stanza_output']
stanza_doc[0][:5]

[{'id': 1,
  'text': 'Today',
  'lemma': 'today',
  'upos': 'NOUN',
  'xpos': 'NN',
  'feats': 'Number=Sing',
  'head': 3,
  'deprel': 'obl:tmod',
  'start_char': 0,
  'end_char': 5,
  'ner': 'S-DATE'},
 {'id': 2,
  'text': 'I',
  'lemma': 'I',
  'upos': 'PRON',
  'xpos': 'PRP',
  'feats': 'Case=Nom|Number=Sing|Person=1|PronType=Prs',
  'head': 3,
  'deprel': 'nsubj',
  'start_char': 6,
  'end_char': 7,
  'ner': 'O'},
 {'id': 3,
  'text': 'want',
  'lemma': 'want',
  'upos': 'VERB',
  'xpos': 'VBP',
  'feats': 'Mood=Ind|Tense=Pres|VerbForm=Fin',
  'head': 0,
  'deprel': 'root',
  'start_char': 8,
  'end_char': 12,
  'ner': 'O'},
 {'id': 4,
  'text': 'to',
  'lemma': 'to',
  'upos': 'PART',
  'xpos': 'TO',
  'head': 5,
  'deprel': 'mark',
  'start_char': 13,
  'end_char': 15,
  'ner': 'O'},
 {'id': 5,
  'text': 'send',
  'lemma': 'send',
  'upos': 'VERB',
  'xpos': 'VB',
  'feats': 'VerbForm=Inf',
  'head': 3,
  'deprel': 'xcomp',
  'start_char': 16,
  'end_char': 20,
  'ner': 'O'}]

## Put together train and test data
We split up the paragraphs in sentences, and divide the collection of sentences in train and test. We select a limited number of features from the stanza output, and also include the features of surrounding tokens.

In [6]:
def expand_features(sentence, context_window=1, features=['text', 'lemma', 'upos', 'xpos', 'deprel', 'ner']):
    # Copy the exisiting features to the output list
    output = [{feat: token[feat] for feat in features} for token in sentence]
    
    # Now include the context
    for i in range(context_window, len(sentence)-context_window):
        for feat in features:
            for j in range(1, context_window+1):
                name = 'min{}_{}'.format(j, feat)
                output[i][name] = sentence[i-j][feat]
                name = 'plus{}_{}'.format(j, feat)
                output[i][name] = sentence[i+j][feat]
                
    # Also include special features for beginning and ending tokens
    for i in range(context_window):
        output[i]['BOS'] = True
        output[len(output)-i-1]['EOS'] = True
    return output


In [7]:

all_features = [ expand_features(sent)
    for k,data in reduced_data.items()
    for sent in data['stanza_output']
]


all_tags = [[tag for tok,tag in sent]
    for k,data in reduced_data.items()
    for sent in data['tags']
]

print(len(all_features), len(all_tags))

2605 2605


In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    all_features, all_tags, test_size=0.20, random_state=0)

In [9]:
print(len(X_train), len(X_test))

2084 521


To get an idea how balanced our dataset is, let's check how many of the sentences contain at least one B and/or I tag.

In [10]:
contains_rel = [np.any([tag!='O' for tag in sent]) for sent in y_train]
np.mean(contains_rel)

0.345489443378119

We can visually inspect the features of one sentence with a pandas DataFrame:

In [11]:
pd.DataFrame.from_records(X_train[0])

Unnamed: 0,text,lemma,upos,xpos,deprel,ner,BOS,min1_text,plus1_text,min1_lemma,plus1_lemma,min1_upos,plus1_upos,min1_xpos,plus1_xpos,min1_deprel,plus1_deprel,min1_ner,plus1_ner,EOS
0,They,they,PRON,PRP,nsubj,O,True,,,,,,,,,,,,,
1,took,take,VERB,VBD,root,O,,They,the,they,the,PRON,DET,PRP,DT,nsubj,det,O,O,
2,the,the,DET,DT,det,O,,took,form,take,form,VERB,NOUN,VBD,NN,root,obj,O,O,
3,form,form,NOUN,NN,obj,O,,the,of,the,of,DET,ADP,DT,IN,det,case,O,O,
4,of,of,ADP,IN,case,O,,form,a,form,a,NOUN,DET,NN,DT,obj,det,O,O,
5,a,a,DET,DT,det,O,,of,prohibition,of,prohibition,ADP,NOUN,IN,NN,case,nmod,O,O,
6,prohibition,prohibition,NOUN,NN,nmod,O,,a,of,a,of,DET,ADP,DT,IN,det,case,O,O,
7,of,of,ADP,IN,case,O,,prohibition,monetary,prohibition,monetary,NOUN,ADJ,NN,JJ,nmod,amod,O,O,
8,monetary,monetary,ADJ,JJ,amod,O,,of,financing,of,financing,ADP,NOUN,IN,NN,case,nmod,O,O,
9,financing,financing,NOUN,NN,nmod,O,,monetary,of,monetary,of,ADJ,ADP,JJ,IN,amod,case,O,O,


## Create CRF model
We use the sklearn-crfsuite package, which can take arbitrary categorical features as input. See [this tutorial](https://github.com/TeamHG-Memex/sklearn-crfsuite/blob/master/docs/CoNLL2002.ipynb)

In [51]:
!pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting tabulate
  Downloading tabulate-0.8.9-py3-none-any.whl (25 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743 kB)
[K     |████████████████████████████████| 743 kB 3.9 MB/s eta 0:00:01
[?25hInstalling collected packages: tabulate, python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6 tabulate-0.8.9


Unfortunatly, sklearn-crfsuite does not work with recent scikit-learn versions (https://github.com/TeamHG-Memex/sklearn-crfsuite/issues/60) so we have to downgrade the version

In [105]:
!pip install -U 'scikit-learn<0.24'

Collecting scikit-learn<0.24
  Downloading scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 1.8 MB/s eta 0:00:01     |████▉                           | 1.0 MB 3.8 MB/s eta 0:00:02
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.24.2
    Uninstalling scikit-learn-0.24.2:
      Successfully uninstalled scikit-learn-0.24.2
Successfully installed scikit-learn-0.23.2


In [12]:
import sklearn_crfsuite
import sklearn

In [14]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

## Evaluation
We can calculate metrics for all labels, but since there are many O's, we can also calculate only for B and I tags

In [15]:
import sklearn_crfsuite.metrics

y_pred = crf.predict(X_test)

In [18]:
print(sklearn_crfsuite.metrics.flat_classification_report(
    y_test, y_pred, digits=3
))

              precision    recall  f1-score   support

           B      0.338     0.121     0.178       215
           I      0.320     0.121     0.175       199
           O      0.970     0.992     0.981     11763

    accuracy                          0.962     12177
   macro avg      0.543     0.411     0.445     12177
weighted avg      0.948     0.962     0.953     12177





In [26]:
labels = ['B', 'I']
sklearn_crfsuite.metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

0.17668836981036198

In [27]:
# Compare to F1 on training set, to see wheter we are overfitting
y_train_pred = crf.predict(X_train)
sklearn_crfsuite.metrics.flat_f1_score(y_train, y_train_pred, 
                      average='weighted', labels=labels)

0.7958975583021689

In [28]:
# Print out some example sentences where the CRF did find relations
num_example = 10
i = 0
for sent, pred_tags, true_tags in zip(X_test, y_pred, y_test):
    if 'B' in pred_tags or 'I' in pred_tags:
        if i < num_example:
            print(' '.join([tok['text'] for tok in sent]))
            print('Predicted:', ' '.join(tok['text'] for tok,tag in zip(sent, pred_tags) if tag!='O'))
            print('True:', ' '.join(tok['text'] for tok,tag in zip(sent, true_tags) if tag!='O'))
            print()
            i += 1

To the extent that higher market interest rates in the euro area lead to pressure on the krone , Danmarks Nationalbank will follow its usual practice , including raising interest rates if necessary .
Predicted: lead
True: lead

It should be fairer so that everyone contributes to the welfare system according to their means .
Predicted: contributes
True: so

I believe that the key to this mystery lies in the fact that the euro represents a commitment device , a policy straitjacket which , if accompanied by the behaviour to which the country implicitly and explicitly commits when it joins the currency union , will lead to an improved economic performance relative to what can otherwise be achieved .
Predicted: will lead
True: will lead

In combination with high current account deficits , resulting from lack of competitiveness , this leaves these countries especially vulnerable to changes of sentiment in the international capital markets ( slide 4 - current account ) .
Predicted: resulting
