In [None]:
from itertools import chain
from nltk.corpus.reader import ConllCorpusReader

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')


Let's use CoNLL 2003 data to build a NER system

We use English data.

Passo 1 - Treina o modelo

In [None]:
#conll2003
with open('/home/82068895153/POS/skweak/data/conll2003_dataset/train.txt', 'r') as file:
  sentences = list(file.readlines())

In [None]:
 print (sentences[5])
 len(sentences)

In [None]:
def preprocess(sentences):
    l_sentences = []
    l1_ = []
    for token in sentences[5:]: #a partir da quinta posicao
    #for token in sentences:
        #print('token==>', token)
        cls = token.split()    
        #print('token.split==>', cls)
        if len(cls) != 0:
            l1_.append(cls)
            #print('apos o append==>', l1_)
        else:
            l_sentences.append(l1_)
            l1_ = []
    return l_sentences

In [None]:
#Quebra a sentença em lista
sentences_1=preprocess(sentences)


In [None]:
print(sentences_1[5])

In [None]:
def word2features(sent, i):
    
    word = sent[i][0]
    #print ('word', word)
    postag = sent[i][1]
    #print ('postag', postag)

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],        
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]        
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
                
    return features


In [None]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, __, label in sent]

def sent2tokens(sent):
    return [token for token, postag, __, label in sent]


In [None]:
X_train = [sent2features(s) for s in sentences_1]

y_train = [sent2labels(s) for s in sentences_1]


In [None]:
X_train [0]

In [None]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

Passo 2 - Prepara o Y_test a partir do dataset do Ontonotes 

In [None]:
#2.1 - Abre o Ontonotes para aplicar o tratamento 
with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train.txt', 'r') as file:
   sentences = list(file.readlines())

In [None]:
len(sentences)

In [None]:
#2.2 - Retira os espaços em branco e as words maiores que duas posições
for i in range(len(sentences) - 1):
    #print(sentences[116:122])
    atual = sentences[i].split()
    proximo = sentences[i+1].split()
    if len(atual) == 0:
        continue
    while len(proximo) > 2:
        #print(f'Convertendo ({atual}) e ({proximo}) para ', end = '')
        atual[0] += proximo[0]
        sentences[i] = '\t'.join(atual)+'\n'
        proximo = proximo[1:]
        sentences[i+1] = '\t'.join(proximo)+'\n'
        #print(f'({atual}) e ({proximo})')

In [None]:
print(sentences[116:150])

In [None]:
#2.3 - Verifica as linhas com mais de duas words e concatena
for i in range(len(sentences) - 1):
        atual = sentences[i].split()
        if ((len(atual)>2) and (len(atual)<=3)):
            #print('atual', atual)
            sentences[i]=(''.join(atual[0]+atual[1]))+' '+atual[2]+'\n' 
            #print('sentences', sentences[i])
            #print(i)
        elif ((len(atual)>3) and (len(atual)<=4)):
            #print('atual', atual)
            sentences[i]=(''.join(atual[0]+atual[1]+atual[2]))+' '+atual[3] +'\n'  
            #print('sentences', sentences[i])
            #print(i)
        elif (len(atual)>4):
            sentences[i]=(''.join(atual[0]+atual[1]+atual[2]+atual[3]))+' '+atual[4] +'\n' 
            #print(i)

In [None]:
print(sentences[116:150])

In [None]:
len(sentences)

In [None]:
#2.4 - Faz a troca dos labels
def processarLinha(sentences):
    #print(sentences)
    #print(sentences[1])
    #count = 0
    # Write the file out again
    with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_label.txt', 'wt') as fileout:
        for linha in sentences:
            #count = count +1
            #print (linha)
            lista = linha.split("\t")
            
            #print (len(lista))
            #print (lista)     
            if len(lista)==2 and lista[1] == 'PERSON\n':
                lista[1]= 'B-PER\n'
                #print (lista)
                fileout.write(lista[0]+'\t'+lista[1])
            elif len(lista)==2 and lista[1] == 'GPE\n': 
                lista[1]= 'B-LOC\n'
                #print (lista)
                fileout.write(lista[0]+'\t'+lista[1])         
            elif len(lista)==2:
                fileout.write(lista[0]+'\t'+lista[1])
            #if count == 15:
              #  break



In [None]:
processarLinha(sentences)

In [None]:
#2.5 - Abre o Ontonotes após aplicar ajuste do label
with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_label.txt', 'r') as file:
   sentences_trat = list(file.readlines())

In [None]:
len(sentences_trat)

In [None]:
#2.6 - Insere linha entre as sentenças
arq1 = ''
for linha in sentences_trat:
    p=linha.find('.')
    #print('linha', linha)
    #print ('p == ',p)
    if p==0:
        arq1=arq1+linha+'\n'
        #print ('arq de p0 == ',arq)
    else:
        arq1=arq1+linha
        #print ('arq de p = . == ',arq)

In [None]:
len(arq1)

In [None]:
#2.7 - Grava o arquivo após o tratamento
#with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_arq1.txt', 'wt') as fileout:
#    fileout.write(arq1)

In [None]:
#2.7 - Abre o Ontonotes após concluir o tratamento para carregar o y_test
with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_arq1.txt', 'r') as file:
   arq_sentences = list(file.readlines())

In [None]:
len(arq_sentences)

In [None]:
arq_sentences[0:16]

In [None]:
#2.8 - cria a função para inserir tokens dentro das sentencas -- quebra a setenca em uma lista
def preprocess_b(arq_sentences):
    l_sentences = []
    l1_ = []
    for token in arq_sentences[0:]: #a partir da quinta posicao
    #for token in sentences:
        #print('token==>', token)
        cls = token.split()    
        #print('token.split==>', cls)
        if len(cls) != 0:
            l1_.append(cls)
            #print('apos o append==>', l1_)
        else:
            l_sentences.append(l1_)
            l1_ = []
    return l_sentences

In [None]:
#2.9  Aplica a função de tokens ao arquivo aberto
arq_sentences_1 = preprocess_b(arq_sentences)

In [None]:
print(arq_sentences_1[0:2])

In [None]:
len(arq_sentences_1)

In [None]:
#3.0 - Função para extrair os rotulos reais
def sent2labelsO(sent):
    return [label for token, label in sent]


In [None]:
#3.1 - Verifica se o dataset tem mais elementos que a chamado do métoddo sent2labels
for sentences in arq_sentences_1:
    try: 
        _ = sent2labelsO(sentences)
    except ValueError:
        for word in sentences:
            if len(word)!= 2:
                print(f' {word} possui {len(word)} elementos.')
                 
        break

In [None]:
#3.2 Extrai os rótulos reais y_test
y_test = [sent2labelsO(sentences) for sentences in arq_sentences_1]   

In [None]:
len(y_test)

Passo 3 - Carrega o X_test a partir do dataset do Ontonotes 

In [None]:
#2.1 - Abre o Ontonotes para aplicar o tratamento 
with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train.txt', 'r') as file:
   sentences_x = list(file.readlines())

In [None]:
len(sentences_x)

In [None]:
print(sentences_x[0]) 

In [None]:
#2.2 - Retira os espaços em branco e as words maiores que duas posições
for i in range(len(sentences_x) - 1):
    #print(sentences[116:122])
    atual = sentences_x[i].split()
    proximo = sentences_x[i+1].split()
    if len(atual) == 0:
        continue
    while len(proximo) > 2:
        #print(f'Convertendo ({atual}) e ({proximo}) para ', end = '')
        atual[0] += proximo[0]
        sentences_x[i] = '\t'.join(atual)+'\n'
        proximo = proximo[1:]
        sentences_x[i+1] = '\t'.join(proximo)+'\n'
        #print(f'({atual}) e ({proximo})')

In [None]:
#2.3 - Verifica as linhas com mais de duas words e concatena
for i in range(len(sentences_x) - 1):
        atual = sentences_x[i].split()
        if ((len(atual)>2) and (len(atual)<=3)):
            #print('atual', atual)
            sentences_x[i]=(''.join(atual[0]+atual[1]))+' '+atual[2]+'\n' 
            #print('sentences', sentences_x[i])
            #print(i)
        elif ((len(atual)>3) and (len(atual)<=4)):
            #print('atual', atual)
            sentences_x[i]=(''.join(atual[0]+atual[1]+atual[2]))+' '+atual[3] +'\n'  
            #print('sentences', sentences[i])
            #print(i)
        elif (len(atual)>4):
            sentences_x[i]=(''.join(atual[0]+atual[1]+atual[2]+atual[3]))+' '+atual[4] +'\n' 
            #print(i)

In [None]:
len(sentences_x)


In [None]:
#2.4 - Remove o label original
texto=''
#count=0
for linha in sentences_x:
    if len(linha) != 1:
        x=linha.split()[0]              
        #print('x-->', x)
        texto=texto+x+'\n'
        continue
                
                #verifica se a linha tem mais de 2 palavras
                # if len(x) > 2:
                #     #print('split', x)
                #     print(f' {x} possui {len(x)} elementos.') 
                # if len(x) == 1:
                #     #print('split', x)
                #     print(f' {x} possui {len(x)} elementos.')    

In [None]:
print(texto[0:120])

In [None]:
len(texto)

In [None]:
#2.6 - Grava o arquivo após o tratamento
with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_arq_2.txt', 'wt') as fileout:
    fileout.write(texto)

In [None]:
#2.7 - Abre o Ontonotes após concluir o tratamento para carregar o X_text
with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_arq_2.txt', 'r') as file:
   arq_sentences_2 = list(file.readlines())

In [None]:
len(arq_sentences_2)

In [None]:
#2.5 - Insere linha entre as sentenças
arq_2 = ''
for linha in arq_sentences_2:
    p=linha.find('.')
    #print('linha', linha)
    #print ('p == ',p)
    if p==0:
        arq_2=arq_2+linha+'\n'
        #print ('arq de p0 == ',arq)
    else:
        arq_2=arq_2+linha
        #print ('arq de p = . == ',arq)

In [None]:
#2.6 - Grava o arquivo após o tratamento
#with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_arq_3.txt', 'wt') as fileout:
#    fileout.write(arq_2)

In [None]:
#2.7 - Abre o Ontonotes após concluir o tratamento para carregar o X_text
with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_arq_3.txt', 'r') as file:
   arq_sentences_3 = list(file.readlines())

In [None]:
arq_sentences_3

In [21]:
#2.8 - Cria os tokens dentro das sentencas -- quebra a setenca em uma lista
def preprocess_O(arq_sentences_3):
    l_sentences = []
    l1_ = []
    for token in arq_sentences_3[0:]: #a partir da quinta posicao
    #for token in sentences:
        #print('token==>', token)
        cls = token.split()    
        #print('token.split==>', cls)
        if len(cls) != 0:
            l1_.append(cls)
            #print('apos o append==>', l1_)
        else:
            l_sentences.append(l1_)
            l1_ = []
    return l_sentences

In [22]:
#2.9 - Transforma a sentença em lista
test_sentences_2=preprocess_O(arq_sentences_3)

In [None]:
print(test_sentences_2)

In [None]:
len(test_sentences_2)

In [23]:
#3.0 - Função para extrair as features do texto a ser rotulado

def word2featuresO(sent, i):
    #word = sent[i][0]
    word = sent[i]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit()  
    }
    if i > 0:
        word1 = sent[i-1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper()
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1]       
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper()
        })
    else:
        features['EOS'] = True
                
    return features

In [24]:
#3.1 - Função para chamada da Função para extrair as features do texto a ser rotulado
def sent2featuresO(sent):
    return [word2featuresO(sent, i) for i in range(len(sent))]

In [25]:
#3.2 - Extrai as fetatures de X_test 
X_test = [[sent2featuresO(s) for s in text] for text in test_sentences_2]

In [None]:
len(X_test)

In [26]:
X_test[0]

[[{'bias': 1.0,
   'word.lower()': 'the',
   'word[-3:]': 'The',
   'word[-2:]': 'he',
   'word.isupper()': False,
   'word.istitle()': True,
   'word.isdigit()': False,
   'BOS': True,
   'EOS': True}],
 [{'bias': 1.0,
   'word.lower()': 'school',
   'word[-3:]': 'ool',
   'word[-2:]': 'ol',
   'word.isupper()': False,
   'word.istitle()': False,
   'word.isdigit()': False,
   'BOS': True,
   'EOS': True}],
 [{'bias': 1.0,
   'word.lower()': 'is',
   'word[-3:]': 'is',
   'word[-2:]': 'is',
   'word.isupper()': False,
   'word.istitle()': False,
   'word.isdigit()': False,
   'BOS': True,
   'EOS': True}],
 [{'bias': 1.0,
   'word.lower()': 'going',
   'word[-3:]': 'ing',
   'word[-2:]': 'ng',
   'word.isupper()': False,
   'word.istitle()': False,
   'word.isdigit()': False,
   'BOS': True,
   'EOS': True}],
 [{'bias': 1.0,
   'word.lower()': 'to',
   'word[-3:]': 'to',
   'word[-2:]': 'to',
   'word.isupper()': False,
   'word.istitle()': False,
   'word.isdigit()': False,
   'BOS':

In [27]:
#3.2 - Retira o X_teste de dentro da lista aninhada para execução do y_pred
for i in range(len(X_test)):
        X_test[i] = [i[0] for i in X_test[i]]

4 - Passo: Gerar o y_pred = rotulos preditos para o texto não rotulado

In [28]:
#4.1 - Aplica o modelo treinado no dataset sem rotulos
y_pred = crf.predict(X_test)


In [29]:
len(y_pred)

98807

In [30]:
#Checa se as word do Y_test com o Y_pred estão batendo
 #len(sentences_trat) , len(arq_sentences_2) 
 
for i in range(len(arq_sentences_2)):
    predicao = arq_sentences_2[i].split()
    real =  sentences_trat[i].split()
    #print('predicao',predicao[0] )
    #print('real', real[0])
    if predicao[0] != real[0]:
        print(i)
        print('real', real, count)
        print('predicao', predicao, count)

NameError: name 'arq_sentences_2' is not defined

In [None]:
print ('predicao', arq_sentences_2[141346:141349].split())
print ('real' ,  sentences_trat[141346:141349].split()


In [31]:
print('word do y_test   ==>', arq_sentences_1[140],'\n')
print('word do y_pred   ==>', test_sentences_2[140],'\n')
print('label do y_test  ==>',y_test[140],'\n')
print('label do y_pred  ==>',y_pred[140],'\n')

word do y_test   ==> [['angstrom', 'B-PER'], ['Larry', 'B-PER'], ['Drury', 'B-PER'], [',', 'O'], ['attorney', 'O'], ['for', 'O'], ['the', 'O'], ['plaintiffs', 'O'], [',', 'O'], ['valued', 'O'], ['the', 'O'], ['settlement', 'O'], ['at', 'O'], ['between', 'MONEY'], ['$', 'MONEY'], ['6', 'MONEY'], ['million', 'MONEY'], ['and', 'MONEY'], ['$', 'MONEY'], ['8', 'MONEY'], ['million', 'MONEY'], ['.', 'O']] 

word do y_pred   ==> [['angstrom'], ['Larry'], ['Drury'], [','], ['attorney'], ['for'], ['the'], ['plaintiffs'], [','], ['valued'], ['the'], ['settlement'], ['at'], ['between'], ['$'], ['6'], ['million'], ['and'], ['$'], ['8'], ['million'], ['.']] 

label do y_test  ==> ['B-PER', 'B-PER', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'MONEY', 'MONEY', 'MONEY', 'MONEY', 'MONEY', 'MONEY', 'MONEY', 'MONEY', 'O'] 

label do y_pred  ==> ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] 



In [32]:
count = 0
for i,x in enumerate(y_pred):
        ss=set(x)
        if len(ss) > 1:
            count+=1
print("Qtde labels preditos", count)
print("tamanho y_pred", len(y_pred))

Qtde labels preditos 4238
tamanho y_pred 98807


Avaliação

There is much more O entities in data set, but we’re more interested in other entities. To account for this we’ll use averaged F1 score computed for all labels except for O. sklearn-crfsuite.metrics package provides some useful metrics for sequence classification task, including this one.

In [33]:
#PERSON == 'B-PER' 'I-PER' (CONLL), ORG == 'B-ORG', GPE == 'B-LOC' 'I-LOC' (CONLL), MISC == TUDO QUE NAO AS OUTRAS 3 NO CONLL
labels = list(crf.classes_)
labels.remove('O')
labels


['B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'B-ORG', 'I-ORG', 'I-MISC', 'I-LOC']

In [34]:
#Imprime o tamanho do y_test e y_pred e qtde de label predita
count = 0
for i,x in enumerate(y_pred):
        ss=set(x)
        if len(ss) > 1:
            count+=1
print("Qtde labels preditos", count)
print("tamanho em sentencas y_test", len(y_test))
print("tamanho em sentencas y_pred", len(y_pred))
print('tamanho em linhas y_test  ==>', len(arq_sentences))
print('tamanho em linhas y_pred  ==>', len(arq_sentences_3))

Qtde labels preditos 4238
tamanho em sentencas y_test 98807
tamanho em sentencas y_pred 98807
tamanho em linhas y_test  ==> 3103448
tamanho em linhas y_pred  ==> 3103448


In [35]:
metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

0.06900161764121217

Inspect per-class results in more detail:

In [36]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-LOC      0.817     0.071     0.130     46165
       I-LOC      0.000     0.000     0.000         0
      B-MISC      0.000     0.000     0.000         0
      I-MISC      0.000     0.000     0.000         0
       B-ORG      0.000     0.000     0.000         0
       I-ORG      0.000     0.000     0.000         0
       B-PER      0.517     0.006     0.012     49382
       I-PER      0.000     0.000     0.000         0

   micro avg      0.754     0.037     0.071     95547
   macro avg      0.167     0.010     0.018     95547
weighted avg      0.662     0.037     0.069     95547



Hyperparameter Optimization

To improve quality try to select regularization parameters using randomized search and 3-fold cross-validation.

I takes quite a lot of CPU time and RAM (we’re fitting a model 50 * 3 = 150 times), so grab a tea and be patient, or reduce n_iter in RandomizedSearchCV, or fit model only on a subset of training data.



In [None]:
%%time
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Best result:

In [None]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [None]:
Check parameter space

A chart which shows which c1 and c2 values have RandomizedSearchCV checked. Red color means better results, blue means worse.

In [None]:
_x = [s.parameters['c1'] for s in rs.grid_scores_]
_y = [s.parameters['c2'] for s in rs.grid_scores_]
_c = [s.mean_validation_score for s in rs.grid_scores_]

fig = plt.figure()
fig.set_size_inches(12, 12)
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
    min(_c), max(_c)))

ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])

print("Dark blue => {:0.4}, dark red => {:0.4}".format(min(_c), max(_c)))

Check best estimator on our test data

As you can see, quality is improved.

In [None]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))


Let’s check what classifier learned

In [None]:
#Let’s check what classifier learned
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

We can see that, for example, it is very likely that the beginning of an organization name (B-ORG) will be followed by a token inside organization name (I-ORG), but transitions to I-ORG from tokens with other labels are penalized.

Check the state features:



In [None]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])