In [1]:
# ! pip install sklearn-crfsuite
# https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html
import os
import time
import numpy as np
import pandas as pd
from sklearn_crfsuite import CRF, metrics
from IPython.display import display, clear_output
os.chdir(os.getcwd())

In [2]:
# Load the training feature table and preview its first rows.
data = pd.read_csv('train_data_features_alligned.csv')
data.head()

Unnamed: 0,document,sentence,token,word,lemma,pos,tag,dependency,shape_,is_alpha,is_stop,trigram,negation
0,baskervilles01,0,0,Chapter,chapter,NOUN,NN,ROOT,Xxxxx,1,0,Chapter 1 Mr.,O
1,baskervilles01,0,1,1.,1,NUM,CD,nummod,d,0,0,Chapter 1 Mr.,O
2,baskervilles01,0,2,Mr.,Mr.,PROPN,NNP,compound,Xx.,0,0,1 Mr. Sherlock,O
3,baskervilles01,0,3,Sherlock,Sherlock,PROPN,NNP,compound,Xxxxx,1,0,Mr. Sherlock Holmes,O
4,baskervilles01,0,4,Holmes,Holmes,PROPN,NNP,ROOT,Xxxxx,1,0,Sherlock Holmes Mr.,O


In [4]:
# Inspect every token row of sentence 16 in document 'baskervilles01'.
sentence_mask = (data.document == 'baskervilles01') & (data.sentence == 16)
data[sentence_mask]

Unnamed: 0,document,sentence,token,word,lemma,pos,tag,dependency,shape_,is_alpha,is_stop,trigram,negation
328,baskervilles01,16,0,``,``,PUNCT,``,punct,`,0,0,'' `` Good,O
329,baskervilles01,16,1,Good,good,ADJ,JJ,ROOT,Xxxx,1,0,`` Good !,O
330,baskervilles01,16,2,!,!,PUNCT,.,punct,!,0,0,Good ! '',O
331,baskervilles01,16,3,'','',PUNCT,'',punct,'',0,0,! '' said,O
332,baskervilles01,16,4,said,say,VERB,VBD,ROOT,xxxx,1,0,'' said Holmes,O
333,baskervilles01,16,5,Holmes,Holmes,PROPN,NNP,nsubj,Xxxxx,1,0,said Holmes .,O
334,baskervilles01,16,6,.,.,PUNCT,.,punct,.,0,0,Holmes . ``,O


# Final Pre-Processing

In [2]:
def generate_input_data(data):
    """Turn a token-level feature table into per-sentence CRF inputs.

    Three context columns are derived from ``lemma`` (``prev_word``,
    ``next_word`` and a recomputed ``trigram``), after which the rows are
    grouped by document and then by sentence. A ``Document: i/n`` progress
    line is printed while the documents are processed.

    The context columns are computed over the whole table in row order, so
    they run across sentence and document boundaries. ``shift`` leaves
    ``prev_word`` of the very first row and ``next_word`` of the very last
    row as NaN; the first and last ``trigram`` are copied from their
    neighbouring rows instead.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per token. Must contain the columns ``document``,
        ``sentence``, ``token``, ``word``, ``lemma``, ``pos``, ``tag``,
        ``dependency``, ``shape_``, ``is_alpha``, ``is_stop`` and
        ``negation``. Any index is accepted and the frame is left untouched.

    Returns
    -------
    input_x : list of list of dict
        One inner list per (document, sentence) pair, in order of first
        appearance of each document and, inside it, of each sentence. Every
        dict maps a feature name to its value for one token; the features are
        all selected columns except ``document``, ``sentence`` and
        ``negation``.
    input_y : list of numpy.ndarray
        The ``negation`` values of the same sentences, in the same order.
    """
    # reset_index returns a new frame: the caller's DataFrame is never
    # modified, and the positional edits below work for any incoming index.
    frame = data.reset_index(drop=True)

    lemma = frame['lemma']
    prev_lemma = lemma.shift()
    next_lemma = lemma.shift(-1)

    trigram = prev_lemma + ' ' + lemma + ' ' + next_lemma
    if len(trigram) > 1:
        # The first and last rows lack one neighbour, so their trigram would
        # be NaN; reuse the trigram of the adjacent row.
        trigram.iloc[0] = trigram.iloc[1]
        trigram.iloc[-1] = trigram.iloc[-2]

    columns = ['document', 'sentence', 'token', 'word', 'lemma', 'prev_word', 'next_word', 'trigram',
               'pos', 'tag', 'dependency', 'shape_', 'is_alpha', 'is_stop', 'negation']
    frame = frame.assign(prev_word=prev_lemma, next_word=next_lemma, trigram=trigram)[columns]

    # Everything between the two grouping keys and the label is a feature.
    features = columns[2:-1]
    response = columns[-1]

    input_x = []
    input_y = []

    # sort=False keeps documents and sentences in order of first appearance.
    documents = frame.groupby('document', sort=False)
    doc_len = documents.ngroups
    for i, (_, document_rows) in enumerate(documents):
        print(f'\rDocument: {i+1}/{doc_len}', end='')

        for _, sentence_rows in document_rows.groupby('sentence', sort=False):
            input_x.append(sentence_rows[features].to_dict(orient='records'))
            input_y.append(sentence_rows[response].values)

    return input_x, input_y

In [3]:
# Read the training table and convert it into per-sentence feature dicts
# (train_x) and label sequences (train_y).
train_data = pd.read_csv('train_data_features_alligned.csv')
train_x, train_y = generate_input_data(train_data)

Document: 14/14

In [4]:
# Read the validation table and convert it into per-sentence feature dicts
# (val_x) and label sequences (val_y).
val_data = pd.read_csv('val_data_features_alligned.csv')
val_x, val_y = generate_input_data(val_data)

Document: 2/2

# Conditional Random Fields (CRF)

In [5]:
%%time
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=1000,
          all_possible_transitions=True)

try:
    crf.fit(train_x, train_y)
except AttributeError:
    pass

Wall time: 20.3 s


## Validation

In [6]:
# Predict a label sequence for every validation sentence.
prediction = crf.predict(val_x)

In [7]:
# Weighted F1 on the validation set, computed over every class the model
# knows except 'O'.
labels = [*crf.classes_]
labels.remove('O')
metrics.flat_f1_score(val_y, prediction, labels=labels, average='weighted')

0.8813297468790952

## Test

In [11]:
# Read the test table and convert it into per-sentence feature dicts
# (test_x) and label sequences (test_y).
test_data = pd.read_csv('test_data_features_alligned.csv')
test_x, test_y = generate_input_data(test_data)

Document: 3/3

In [12]:
# Predict the test sentences, then report the weighted F1 over every class
# the model knows except 'O'.
prediction = crf.predict(test_x)

labels = [*crf.classes_]
labels.remove('O')
metrics.flat_f1_score(test_y, prediction, labels=labels, average='weighted')

0.8807171710013464