### Homework on PropBank and semantic roles tagging
#### Tatiana Shavrina

In [1]:
from conllu.parser import parse, parse_tree
import requests
import re
import nltk
import pycrfsuite
from sklearn.metrics import *

#### Parsing the data

In [2]:
url = 'https://raw.githubusercontent.com/TurkuNLP/Finnish_PropBank/data/fipb-ud-train.conllu'
response = requests.get(url)
data = response.text

In [3]:

data = re.sub(r" +", r"\t", data)

data_dic = parse(data)

In [4]:
len(data_dic)

12217

In [5]:
data_dic[2] #data_dic is a list of dicts, where for each sentence there is an OrderedDict with word and its given features - 
# Training features: id, form, lemma, upostag, xpostag, feats, head, misc
# Testing - deprel and deps

[OrderedDict([('id', 1),
              ('form', 'Vähän'),
              ('lemma', 'vähän'),
              ('upostag', 'ADV'),
              ('xpostag', 'Adv'),
              ('feats', None),
              ('head', 2),
              ('deprel', 'advmod'),
              ('deps', None),
              ('misc', None)]),
 OrderedDict([('id', 2),
              ('form', 'samanlainen'),
              ('lemma', 'samanlainen'),
              ('upostag', 'ADJ'),
              ('xpostag', 'A'),
              ('feats',
               OrderedDict([('Case', 'Nom'),
                            ('Degree', 'Pos'),
                            ('Number', 'Sing')])),
              ('head', 3),
              ('deprel', 'amod'),
              ('deps', None),
              ('misc', None)]),
 OrderedDict([('id', 3),
              ('form', 'tunne'),
              ('lemma', 'tunne'),
              ('upostag', 'NOUN'),
              ('xpostag', 'N'),
              ('feats', OrderedDict([('Case', 'Nom'), ('Number', 

In [6]:
# if I want to get a feature of a word in a sentence, i will use:
data_dic[2][1]['lemma']

'samanlainen'

### Procedure
Here we have a task on sequence labelling, where we can use, for example, HMM with a bunch of chains of every feature sequence (I will be useing CRF, actually, as we have long-distance dependencies in the data). It seems easy at the first glance, nevertheless test data has only words.
As our home task included training on features, we cannot predict features, and we will take the golden standard provided, and then use the semantic role columns from it as an exact test set, andall other info - as given data for prediction.

In [7]:
url_gold = 'https://raw.githubusercontent.com/TurkuNLP/Finnish_PropBank/data/fipb-ud-test.conllu'
data_test = requests.get(url_gold).text
data_test = re.sub(r" +", r"\t", data_test)
data_test_dic = parse(data_test)

In [8]:
data_test_dic[5]

[OrderedDict([('id', 1),
              ('form', 'Missään'),
              ('lemma', 'mikään'),
              ('upostag', 'PRON'),
              ('xpostag', 'Pron'),
              ('feats',
               OrderedDict([('Case', 'Ine'),
                            ('Number', 'Sing'),
                            ('PronType', 'Ind')])),
              ('head', 3),
              ('deprel', 'nmod'),
              ('deps', '3:PBArg_2'),
              ('misc', None)]),
 OrderedDict([('id', 2),
              ('form', 'ei'),
              ('lemma', 'ei'),
              ('upostag', 'VERB'),
              ('xpostag', 'V'),
              ('feats',
               OrderedDict([('Negative', 'Neg'),
                            ('Number', 'Sing'),
                            ('Person', '3'),
                            ('VerbForm', 'Fin'),
                            ('Voice', 'Act')])),
              ('head', 3),
              ('deprel', 'neg'),
              ('deps', '3:PBArgM_neg'),
              ('mis

### Data preparation and additional feature extraction

In [9]:
# returning training features
# Training features: id, form, lemma, upostag, xpostag, feats, head, misc
def word2features(sent, i):
    id = sent[i]['id']
    word = sent[i]['form']
    lemma = sent[i]['lemma']
    upostag = sent[i]['upostag']
    xpostag = sent[i]['xpostag']
    #feats = sent[i]['feats'] - I will not include feats as they have dictionary format
    head = sent[i]['head']
    misc = sent[i]['misc']

    # Common features for all words
    features = [
        'word.id=' + str(id),
        'word.lower=' + word.lower(),
        'word.lemma=%s' % lemma,
        'word.upostag=%s' % upostag,
        'word.xpostag=%s' % xpostag,
        'word.head=%s' % head,
        'word.misc=%s' % misc
    ]


    return features

In [13]:
# returning training features
# Testing - deprel and deps
def word2label1(sent, i):

    # labels for all words
    feature = 'word.deprel=%s' % sent[i]['deprel']


    return feature

def word2label2(sent, i):
    
    # labels for all words
    feature = 'word.deps=%s' % sent[i]['deps']
    return feature

In [14]:
# A function for extracting features in documents
def extract_features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

# A function fo generating the list of labels for each document
def get_label1(sent):
    return [word2label1(sent, i) for i in range(len(sent))]
def get_label2(sent):
    return [word2label2(sent, i) for i in range(len(sent))]

In [15]:
X_train = [extract_features(doc) for doc in data_dic]
y_train = [get_label1(doc) for doc in data_dic]

X_test = [extract_features(doc) for doc in data_test_dic]
y_test = [get_label1(doc) for doc in data_test_dic]

In [16]:
y_train[3]

['word.deprel=advmod',
 'word.deprel=xcomp',
 'word.deprel=nsubj',
 'word.deprel=advmod',
 'word.deprel=root',
 'word.deprel=punct']

In [17]:
X_train[3]

[['word.id=1',
  'word.lower=kuinka',
  'word.lemma=kuinka',
  'word.upostag=ADV',
  'word.xpostag=Adv',
  'word.head=2',
  'word.misc=None'],
 ['word.id=2',
  'word.lower=erilaiselta',
  'word.lemma=erilainen',
  'word.upostag=ADJ',
  'word.xpostag=A',
  'word.head=5',
  'word.misc=None'],
 ['word.id=3',
  'word.lower=maailma',
  'word.lemma=maailma',
  'word.upostag=NOUN',
  'word.xpostag=N',
  'word.head=5',
  'word.misc=None'],
 ['word.id=4',
  'word.lower=sieltä',
  'word.lemma=sieltä',
  'word.upostag=ADV',
  'word.xpostag=Adv',
  'word.head=5',
  'word.misc=None'],
 ['word.id=5',
  'word.lower=näyttikään',
  'word.lemma=näyttää',
  'word.upostag=VERB',
  'word.xpostag=V',
  'word.head=0',
  "word.misc=OrderedDict([('PBSENSE', 'näyttää.3'), ('SpaceAfter', 'No')])"],
 ['word.id=6',
  'word.lower=.',
  'word.lemma=.',
  'word.upostag=PUNCT',
  'word.xpostag=Punct',
  'word.head=5',
  'word.misc=None']]

### Training

#### First of all, we will try to guess only deprel labels

In [18]:

trainer = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

# Set the parameters of the model
trainer.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,  

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer.train('crf_deprel.model')



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 116800
Seconds required: 0.353

L-BFGS optimization
c1: 0.100000
c2: 0.010000
num_memories: 6
max_iterations: 200
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20



#### Then depr labels

In [19]:
X_train2 = [extract_features(doc) for doc in data_dic]
y_train2 = [get_label2(doc) for doc in data_dic]

X_test2 = [extract_features(doc) for doc in data_test_dic]
y_test2 = [get_label2(doc) for doc in data_test_dic]

In [20]:

trainer2 = pycrfsuite.Trainer(verbose=True)

# Submit training data to the trainer
for xseq, yseq in zip(X_train, y_train):
    trainer2.append(xseq, yseq)

# Set the parameters of the model
trainer2.set_params({
    # coefficient for L1 penalty
    'c1': 0.1,

    # coefficient for L2 penalty
    'c2': 0.01,  

    # maximum number of iterations
    'max_iterations': 200,

    # whether to include transitions that
    # are possible, but not observed
    'feature.possible_transitions': True
})

# Provide a file name as a parameter to the train function, such that
# the model will be saved to the file when training is finished
trainer2.train('crf_deps.model')



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10



***** Iteration #46 *****



### Testing

#### deprel labelling quality

In [21]:
tagger = pycrfsuite.Tagger()
tagger.open('crf_deprel.model')
y_pred = [tagger.tag(xseq) for xseq in X_test]


In [22]:
predictions =  [item for sublist in y_pred for item in sublist]# a flat list of all predicions
y_test = [item for sublist in y_test for item in sublist] # a list of all gold labels

In [23]:

print("Quality on deprel")
print("Precision: {0:6.2f}".format(precision_score(y_test, predictions, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test, predictions, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test, predictions, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test, predictions)))
print(classification_report(y_test, predictions))

Quality on deprel
Precision:   0.65
Recall:   0.64
F1-measure:   0.63
Accuracy:   0.81
                          precision    recall  f1-score   support

         word.deprel=acl       0.76      0.84      0.79       159
   word.deprel=acl:relcl       0.32      0.32      0.32        95
       word.deprel=advcl       0.44      0.36      0.40       163
      word.deprel=advmod       0.97      0.96      0.96       646
        word.deprel=amod       0.93      0.97      0.95       429
       word.deprel=appos       0.63      0.39      0.48        69
         word.deprel=aux       0.98      0.89      0.93       147
     word.deprel=auxpass       0.52      0.85      0.64        20
        word.deprel=case       0.99      0.99      0.99       140
          word.deprel=cc       0.98      0.99      0.98       383
  word.deprel=cc:preconj       0.60      0.50      0.55         6
       word.deprel=ccomp       0.38      0.28      0.32        95
    word.deprel=compound       0.88      0.68      0.7

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### So we got 81% accuracy

#### deps labelling quality

In [24]:
tagger2 = pycrfsuite.Tagger()
tagger2.open('crf_deps.model')
y_pred = [tagger2.tag(xseq) for xseq in X_test2]
predictions =  [item for sublist in y_pred for item in sublist]# a flat list of all predicions
y_test2 = [item for sublist in y_test2 for item in sublist] # a list of all gold labels

In [25]:

print("Quality on deps")
print("Precision: {0:6.2f}".format(precision_score(y_test2, predictions, average='macro')))
print("Recall: {0:6.2f}".format(recall_score(y_test2, predictions, average='macro')))
print("F1-measure: {0:6.2f}".format(f1_score(y_test2, predictions, average='macro')))
print("Accuracy: {0:6.2f}".format(accuracy_score(y_test2, predictions)))
print(classification_report(y_test, predictions))

Quality on deps
Precision:   0.65
Recall:   0.64
F1-measure:   0.63
Accuracy:   0.81
                          precision    recall  f1-score   support

         word.deprel=acl       0.76      0.84      0.79       159
   word.deprel=acl:relcl       0.32      0.32      0.32        95
       word.deprel=advcl       0.44      0.36      0.40       163
      word.deprel=advmod       0.97      0.96      0.96       646
        word.deprel=amod       0.93      0.97      0.95       429
       word.deprel=appos       0.63      0.39      0.48        69
         word.deprel=aux       0.98      0.89      0.93       147
     word.deprel=auxpass       0.52      0.85      0.64        20
        word.deprel=case       0.99      0.99      0.99       140
          word.deprel=cc       0.98      0.99      0.98       383
  word.deprel=cc:preconj       0.60      0.50      0.55         6
       word.deprel=ccomp       0.38      0.28      0.32        95
    word.deprel=compound       0.88      0.68      0.76 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
