In [1]:
from itertools import chain
from nltk.corpus.reader import ConllCorpusReader

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics


In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')


Let's use CoNLL 2003 data to build a NER system

We use English data.

In [3]:
#conll2003
with open('/home/82068895153/POS/skweak/data/conll2003_dataset/train.txt', 'r') as file:
  sentences = list(file.readlines())

In [None]:
 print (sentences[5])

In [4]:
#cria os tokens dentro das sentencas -- quebra a setenca em uma lista
def preprocess(sentences):
    l_sentences = []
    l1_ = []
    for token in sentences[5:]: #a partir da quinta posicao
    #for token in sentences:
        cls = token.split()    
        if len(cls) != 0:
            l1_.append(cls)
        else:
            l_sentences.append(l1_)
            l1_ = []
    return l_sentences
        

In [5]:
#Quebra a sentença em lista
sentences=preprocess(sentences)

# print (sentences[5])

In [6]:
def word2features(sent, i):
    
    word = sent[i][0]
    #print ('word', word)
    postag = sent[i][1]
    #print ('postag', postag)

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],        
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]        
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
                
    return features


In [7]:
def word2featuresO(sent, i):
    #word = sent[i][0]
    word = sent[i]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit()  
    }
    if i > 0:
        word1 = sent[i-1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper()
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1]       
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper()
        })
    else:
        features['EOS'] = True
                
    return features


In [8]:
def sent2featuresO(sent):
    #print('sent ==', sent)
    #teste = [word2featuresO(sent, i) for i in range(len(sent))]
    #print('teste ==', teste)
    return [word2featuresO(sent, i) for i in range(len(sent))]
    #return teste


def sent2labelsO(sent):
    return [label for token, label in sent]

In [9]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, __, label in sent]

def sent2tokens(sent):
    return [token for token, postag, __, label in sent]


In [10]:
X_train = [sent2features(s) for s in sentences]

y_train = [sent2labels(s) for s in sentences]


In [None]:
X_train [0]

In [11]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

Carrega o X_test --> dataset do Ontonotes sem rótulos

In [12]:
#Abre o Ontonotes sem rotulos
#with open('/home/82068895153/POS/skweak/data/wiki/wikigold.conll.txt', 'r') as file:
with open('/home/82068895153/POS/skweak/data/Ontonotes/train_out_1.txt', 'r') as file:
    test_sentences = list(file.readlines())

In [None]:
print(test_sentences[0])

In [None]:
#Transforma a sentença em lista
test_sentences_1=preprocess(test_sentences)

In [None]:
print(test_sentences_1[0])

X_test = features so texto a ser rotulado

In [None]:
# Extrai as fetatures de X_test 
X_test = [[sent2featuresO(s) for s in text] for text in test_sentences_1]

#https://stackoverflow.com/questions/41829323/attributeerror-list-object-has-no-attribute-lower-gensim
#data = [line.strip() for line in open("C:\corpus\TermList.txt", 'r')]
#texts = [[word.lower() for word in text.split()] for text in data]
#X_test=test_sentences_1

In [None]:
print(len(X_test[0]))
print(X_test[0:2])

In [None]:
#X_text = [i for i in X_test_1] 
#import copy
#X_text = [copy.copy(X_test_1)]
#nova_lista is lista


In [None]:
#Retira de dentro da lista aninhada (o dataset está com uma lista a mais)
for i in range(len(X_test)):
        X_test[i] = [i[0] for i in X_test[i]]

In [None]:
X_test[0]

y_pred = rotulos preditos para o texto não rotulado

In [None]:
#Aplica o modelo treinado no dataset sem rotulos
y_pred = crf.predict(X_test)


In [None]:
y_pred 

Dataset Ontonotes com rótulos reais para gerar o y_test


In [None]:
#Abre o Ontonotes para carregar o y_test = DATASET com rótulos reais
#with open('/home/82068895153/POS/skweak/data/BERT/train_out_trat.txt', 'r') as file:
with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_trat_1.txt', 'r') as file:
    test_sentences_label = list(file.readlines())
    print (test_sentences_label[0])



In [None]:
#Retira os espaços em branco e as words maiores que duas posições
for i in range(len(test_sentences_label) - 1):
    atual = test_sentences_label[i].split()
    proximo = test_sentences_label[i+1].split()
    if len(atual) == 0:
        continue
    while len(proximo) > 2:
        print(f'Convertendo ({atual}) e ({proximo}) para ', end = '')
        atual[0] += proximo[0]
        test_sentences_label[i] = '\t'.join(atual)
        proximo = proximo[1:]
        test_sentences_label[i+1] = '\t'.join(proximo)
        print(f'({atual}) e ({proximo})')

In [None]:
print (test_sentences_label[0:3])

test_sentences_label_1=preprocess(test_sentences_label)


In [None]:
print(test_sentences_label_1[0])

In [None]:
#Verifica se o dataset tem mais elementos que a chamado do métoddo sent2labels
for sentences in test_sentences_label_1:
    try: 
        _ = sent2labelsO(sentences)
    except ValueError:
        for word in sentences:
            if len(word)!= 2:
                print(f' {word} possui {len(word)} elementos.')  
        break

In [None]:
#Retira elementos de uma lista aninhada
#for i in range(len(test_sentences_label_1)):
#    test_sentences_label_1[i] = [i[0] for i in test_sentences_label_1[i]]

In [None]:
#Retira elementos de uma lista aninhada
#matrix = test_sentences_label_1
#sentencesOL = [] 
#for sublist in matrix: 
#    for val in sublist: 
#        sentencesOL.append(val) 

In [None]:
#print(sentencesOL[0]) 

Extrai os rótulos reais y_test

In [None]:
#y_test = DATASET com rótulos reais 
y_test = [sent2labelsO(s) for s in test_sentences_label_1]

    

In [None]:
y_test

In [287]:
y_test[0], y_pred[0]

(['O', 'O', 'O', 'DATE', 'DATE', 'DATE', 'DATE', 'DATE', 'B-PER', 'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])

In [288]:
count = 0
for i,x in enumerate(y_pred):
        ss=set(x)
        if len(ss) > 1:
            count+=1
print("Qtde labels preditos", count)
print("tamanho y_pred", len(y_pred))

Qtde labels preditos 155
tamanho y_pred 9891


In [None]:
##está dentro de duas uma lista = test_sentences_1 = texto sem rotulo
test_sentences_1[0]

In [None]:
##está dentro de apenas uma lista = y_pred = rotulos preditos de test_sentences_1
y_pred[0]

In [None]:
#Muda o label na lista para o formato do Ontonotes  
lista = []
for p in y_pred:
  for x in t:
    #print(x)
    if (x=='B-LOC'):
        print(x)
        lista.append('LOC')
    if (x=='B-PER'):
        print(x)
        lista.append('PER')
        print(lista) 
               

In [None]:
#Retira os espaços em branco da lista y_pred
lista_y=[]
for t in lista:
    #print('t==>',t)
    #if len(t)!=0:
     #   lista_y.append([i[0] for i in t])
        #print(lista_y)
    for x in t:
            #if (x == 'B-LOC'):
        if (x=='LOC'):
            print(x)
        elif (x == 'B-PER'):
            print(x)
                #lista_y[x] = [x[0] for x in lista_y[x]]   
                #print('x==>',x, 'lista_y',lista_y)
            #if x!=('O'): 
            #    print('x==>',x, 'lista_y',lista_y)
            
            
#for i in range(len(X_test)):
#        X_test[i] = [i[0] for i in X_test[i]]

In [None]:
#Retira os espaços em branco da lista test_sentences_1
lista=[]
for t in test_sentences_1:
    if len(t)!=0:
        lista.append([i[0] for i in t])
    #else:
    #    lista.append('\n')
#lista[0:5]

In [None]:
#setenca=(zip(t[0],t[1])) for t in zip(test_sentences_1, y_pred)
#zip(test_sentences_1[],y_pred[])
#substantivo = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
#print(substantivo)
#adjetivo = ['be', 'closed', 'for', 'the', 'rest', 'of', 'the', 'week', 'Anderson', '.']
#print(adjetivo)
#['%s %s' % (s, a) for s in substantivo for a in adjetivo ]
#['pão pequeno', 'pé pequeno', 'carro caro', 'bolo bonito', 'bolo bom']



In [None]:
 #t=list('%s %s' % (s, a) for s in lista[0] for a in y_pred[0])
 #t

In [None]:
#Concatena a palavra e o rotulo predito
l = [list(zip(x, y)) for x, y in zip(lista, lista_y)]
#print(l[0:20])

In [None]:
for t in l:
    for x in t:
        if x !=('O',):
            print(x[1:5])
            if x[1]==('B-'): 
                print(Lu)
        #    print('rotulo ==>',x[1:3], 'lista ==>',l[1:3])
            

In [None]:
#encontra o índice da lista
indices_to_replace = [i for i,x in enumerate(y_pred) if x=='B-LOC']
#print(x)
print(indices_to_replace)
#indices_to_replace
for s in indices_to_replace:
    y_pred[s] = 'LOC'
    print(y_pred[s])



In [None]:
for p in y_pred:
    for r in p:
#        #print(r)
        if (r=='LOC' or r=='PER'):
            #n=[i for i,x in enumerate(r)]
            a=[list(zip(lista, lista_y))]
            print(a)
            #Concatena a palavra e o rotulo predito

#print(l[0:20]
#transforma a lista em sentenca
texto_1 = ''
for linha in a:
    for z in linha:
        texto_1='sentenca'+ (texto_1) +(z) + '\n'
# Write the file out again
with open('/home/82068895153/POS/skweak/data/Ontonotes/train_labeled_comp.txt', 'wt') as fileout:
  fileout.write(texto_1)

In [None]:
#transforma a lista em sentenca
texto = ''
for linha in l:
    for z,t in linha:
        texto=texto + str(z)+' '+ t + '\n'

In [None]:
# Write the file out again
with open('/home/82068895153/POS/skweak/data/Ontonotes/train_labeled.txt', 'wt') as fileout:
  fileout.write(texto)

In [None]:
texto = texto.replace('I-','').replace('O-','')
texto

In [None]:
Texto = 'who was tagged for a pair of homers by Mike Devereaux and Brady Anderson and three runs in the ninth'

In [None]:
tokenized = [Texto.split()]
tokenized

In [None]:
#percorre a letra dentro da word dentro da sentenca
X_t1 = [[sent2featuresO(s) for s in text] for text in tokenized]
X_t1

In [None]:
#percorre a word dentro da sentença, mas sem criar uma nova lista
X_t1 =  [sent2featuresO(s) for s in tokenized]
X_t1

In [None]:
y_t = crf.predict(X_t1)
y_t

In [None]:
#Pega apenas a word da
for i in range(len(X_t1)):
        X_t1[i] = [i[0] for i in X_t1[i]]

Avaliação

There is much more O entities in data set, but we’re more interested in other entities. To account for this we’ll use averaged F1 score computed for all labels except for O. sklearn-crfsuite.metrics package provides some useful metrics for sequence classification task, including this one.

In [289]:
#PERSON == 'B-PER' 'I-PER' (CONLL), ORG == 'B-ORG', GPE == 'B-LOC' 'I-LOC' (CONLL), MISC == TUDO QUE NAO AS OUTRAS 3 NO CONLL
labels = list(crf.classes_)
labels.remove('O')
labels


['B-MISC', 'B-PER', 'I-PER', 'B-LOC', 'B-ORG', 'I-ORG', 'I-MISC', 'I-LOC']

In [290]:
print ('predicao', l[0] )
print ('rotulos reais', y_test[0], test_sentences_label[0])

      
    

IndexError: list index out of range

In [None]:
#Imprime os Labels do Y_pred diferentes de 'O'

#for i in range(len(y_pred)):
#    if y_pred[i]!= ('O'): 
#        print(y_pred[i])    

for linha in y_pred:        
    #print(list(linha))
    for palavra in linha:
          if palavra!= ('O'): 
                print(palavra)   

In [291]:
#Imprime o tamanho do y_test e y_pred e qtde de label predita
count = 0
for i,x in enumerate(y_pred):
        ss=set(x)
        if len(ss) > 1:
            count+=1
print("Qtde labels preditos", count)
print("tamanho y_pred", len(y_pred))
print("tamanho y_test", len(y_test))


Qtde labels preditos 155
tamanho y_pred 9891
tamanho y_test 3120


In [292]:
metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

0.07118034822383563

Inspect per-class results in more detail:

In [293]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

       B-LOC      0.787     0.074     0.135      1502
       I-LOC      0.000     0.000     0.000         0
      B-MISC      0.000     0.000     0.000         0
      I-MISC      0.000     0.000     0.000         0
       B-ORG      0.000     0.000     0.000         0
       I-ORG      0.000     0.000     0.000         0
       B-PER      0.381     0.005     0.010      1571
       I-PER      0.000     0.000     0.000         0

   micro avg      0.696     0.039     0.073      3073
   macro avg      0.146     0.010     0.018      3073
weighted avg      0.580     0.039     0.071      3073



Hyperparameter Optimization

To improve quality try to select regularization parameters using randomized search and 3-fold cross-validation.

I takes quite a lot of CPU time and RAM (we’re fitting a model 50 * 3 = 150 times), so grab a tea and be patient, or reduce n_iter in RandomizedSearchCV, or fit model only on a subset of training data.



In [294]:
%%time
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt



Best result:

In [None]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [None]:
Check parameter space

A chart which shows which c1 and c2 values have RandomizedSearchCV checked. Red color means better results, blue means worse.

In [None]:
_x = [s.parameters['c1'] for s in rs.grid_scores_]
_y = [s.parameters['c2'] for s in rs.grid_scores_]
_c = [s.mean_validation_score for s in rs.grid_scores_]

fig = plt.figure()
fig.set_size_inches(12, 12)
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
    min(_c), max(_c)
))

ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])

print("Dark blue => {:0.4}, dark red => {:0.4}".format(min(_c), max(_c)))

Check best estimator on our test data

As you can see, quality is improved.

In [None]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))


Let’s check what classifier learned

In [None]:
#Let’s check what classifier learned
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

We can see that, for example, it is very likely that the beginning of an organization name (B-ORG) will be followed by a token inside organization name (I-ORG), but transitions to I-ORG from tokens with other labels are penalized.

Check the state features:



In [None]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])

In [None]:
## carregar a base ontonotes - ok
## fazer mapeamento das labels - ok
## fazer a predição - ok
## fazer a comparação entre y_predito e y_real - ok

In [None]:

#Manipulando lista
l = [1,2,3,4,5]

#criando listas
l = [[w] for w in "We dont like it".split()]
l

def sum(i):
        return i+20
[sum(x) for x in l]

In [None]:
#str(test_sentences_label_1[0]).strip('[]')
#t=','.join(test_sentences_label_1)
cars = (['rav4'], ['td5'], ['yaris'], ['land rover tdi']) 

print("I like the "+cars[0][0]+" ...")