In [1]:
import pickle
import copy
import numpy as np
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
# from sklearn.cross_validation import cross_val_score
# from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [2]:
def get_sent_labels(token_list):
    """Group the per-token labels of a two-column file into sentences.

    Parameters
    ----------
    token_list : iterable of str
        Lines of the form ``"<token> <label>"``. A blank (or whitespace-only)
        line marks the end of a sentence.

    Returns
    -------
    list of list of str
        One inner list of labels per sentence, in input order.

    Raises
    ------
    ValueError
        If a non-blank line does not split into exactly two
        whitespace-separated fields.
    """
    sentences, sent_labels = [], []
    for line in token_list:
        if not line.strip():
            # Sentence boundary. Consecutive blank lines still yield an empty
            # sentence each, exactly as before.
            sentences.append(sent_labels)
            sent_labels = []
        else:
            _token, label = line.split()
            sent_labels.append(label)

    # Flush the last sentence when the input has no trailing blank line;
    # previously those labels were silently dropped.
    if sent_labels:
        sentences.append(sent_labels)
    return sentences

def sent2features(sent_emb):
    """Convert the word embeddings of one sentence into feature dicts.

    Parameters
    ----------
    sent_emb : iterable
        One entry per word; each entry must expose ``.shape`` and be either
        indexable along its first axis or convertible with ``float()``.

    Returns
    -------
    list of dict
        One dict per word, keyed ``'bert_features_<i>'`` with float values.
        An entry with an empty shape yields a single ``'bert_features_0'`` key.
    """
    def _word_to_dict(vec):
        # Entries with an empty shape carry a single value.
        if len(vec.shape) == 0:
            return {'bert_features_0': float(vec)}
        return {
            'bert_features_{}'.format(dim): float(vec[dim])
            for dim in range(vec.shape[0])
        }

    # Each call builds a fresh dict, so no copying is needed.
    return [_word_to_dict(vec) for vec in sent_emb]

def merge_features(bert_features, other_features):
    """Merge per-word embedding features into ``other_features`` in place.

    Sentences are paired positionally, and so are the words inside each pair.
    Every paired word dict in ``other_features`` is updated with the matching
    dict from ``bert_features``. Trailing words of ``other_features`` that
    have no embedding counterpart are removed from the sentence.

    Parameters
    ----------
    bert_features : list of list of dict
        Embedding feature dicts per word, per sentence. Not modified.
    other_features : list of list of dict
        Feature dicts to extend. Modified in place.

    Returns
    -------
    None
    """
    for emb_sent, other_sent in zip(bert_features, other_features):
        n_emb = len(emb_sent)

        # zip stops at the shorter sentence, so surplus embeddings are ignored.
        for emb_word, other_word in zip(emb_sent, other_sent):
            other_word.update(emb_word)

        # Drop words that received no embedding (no-op when none are surplus).
        del other_sent[n_emb:]

In [6]:
# Load each corpus: the raw token/label lines, the per-sentence labels derived
# from them, and the pickled per-sentence features. Files are opened in `with`
# blocks so the handles are closed deterministically.
# NOTE: pickle.load executes arbitrary code; only load trusted feature files.

with open('../../data_wm/arg_clean_45_1/test.txt', 'r') as fh:
    wm1 = fh.readlines()
wm1_labels = get_sent_labels(wm1)
with open('../features/wm1_emb.p', 'rb') as fh:
    wm1_features = pickle.load(fh)
# The two counts must match for the features to align with the labels.
print(len(wm1_features), len(wm1_labels))

with open('../../data_wm/arg_clean_45_2/train.txt', 'r') as fh:
    wm2 = fh.readlines()
wm2_labels = get_sent_labels(wm2)
with open('../features/wm2_emb.p', 'rb') as fh:
    wm2_features = pickle.load(fh)
print(len(wm2_features), len(wm2_labels))

with open('../../data_wm/wm_narrative/test.txt', 'r') as fh:
    wm_nr = fh.readlines()
wm_nr_labels = get_sent_labels(wm_nr)

with open('../features/narr_emb.p', 'rb') as fh:
    wm_nr_features = pickle.load(fh)
print(len(wm_nr_features), len(wm_nr_labels))

1266 1266
1862 1862
1332 1332


In [7]:
# BERT embedding features only: fit on wm2, then report on wm1 and on the
# narrative set.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(wm2_features, wm2_labels)

for eval_features, eval_labels in (
    (wm1_features, wm1_labels),
    (wm_nr_features, wm_nr_labels),
):
    y_pred = crf.predict(eval_features)
    # classification_report expects flat label lists, so flatten the
    # per-sentence sequences.
    y_test_flat = [label for seq in eval_labels for label in seq]
    y_pred_flat = [label for seq in y_pred for label in seq]
    print(classification_report(y_test_flat, y_pred_flat, digits=3))

In [4]:
# Load the pickled lexsyn features for each corpus. `with` blocks close the
# file handles that the bare open() calls used to leak.
# NOTE: pickle.load executes arbitrary code; only load trusted feature files.
# NOTE(review): 'wm.p' and 'wm_nr_emb.p' do not follow the '*_lexsyn.p' naming
# used for wm1 -- confirm these are the intended lexsyn files.
with open('../features/wm1_lexsyn.p', 'rb') as fh:
    wm1_lexsyn = pickle.load(fh)
with open('../features/wm.p', 'rb') as fh:
    wm2_lexsyn = pickle.load(fh)
with open('../features/wm_nr_emb.p', 'rb') as fh:
    wm_nr_lexsyn = pickle.load(fh)

(1862, 1862)

In [None]:
# Lexsyn features only: fit on wm2_lexsyn, then report on wm1 and on the
# narrative set. (The previous header said "bert embeddings only", which was
# a copy-paste leftover; this cell never touches the *_features variables.)
crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100, all_possible_transitions=True)
crf.fit(wm2_lexsyn, wm2_labels)

for eval_features, eval_labels in (
    (wm1_lexsyn, wm1_labels),
    (wm_nr_lexsyn, wm_nr_labels),
):
    y_pred = crf.predict(eval_features)
    # classification_report expects flat label lists, so flatten the
    # per-sentence sequences.
    y_test_flat = [y for y_seq in eval_labels for y in y_seq]
    y_pred_flat = [y for y_seq in y_pred for y in y_seq]
    print(classification_report(y_test_flat, y_pred_flat, digits=3))

In [None]:
# After these calls each *_lexsyn list holds both the lexsyn and the BERT
# features: merge_features updates the word dicts of its second argument in
# place and returns nothing.
# Each corpus must be merged with its OWN embeddings. wm1 was previously
# merged with the narrative embeddings (wm_nr_features), which pairs
# sentences from two different corpora.
merge_features(wm1_features, wm1_lexsyn)
merge_features(wm2_features, wm2_lexsyn)
merge_features(wm_nr_features, wm_nr_lexsyn)