In [None]:
from itertools import chain
from nltk.corpus.reader import ConllCorpusReader

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')


Let's use CoNLL 2003 data to build a NER system

We use English data.

Passo 1 - Treina o modelo

In [None]:
#conll2003
with open('/home/82068895153/POS/skweak/data/conll2003_dataset/train.txt', 'r') as file:
  sentences = list(file.readlines())

In [None]:
print (sentences[5])
len(sentences)

In [None]:
def preprocess(sentences):
    l_sentences = []
    l1_ = []
    for token in sentences[5:]: #a partir da quinta posicao
    #for token in sentences:
        #print('token==>', token)
        cls = token.split()    
        #print('token.split==>', cls)
        if len(cls) != 0:
            l1_.append(cls)
            #print('apos o append==>', l1_)
        else:
            l_sentences.append(l1_)
            l1_ = []
    return l_sentences

In [None]:
#Quebra a sentença em lista
sentences_1=preprocess(sentences)


In [None]:
print(sentences_1[5])

In [None]:
def word2features(sent, i):
    
    word = sent[i][0]
    #print ('word', word)
    postag = sent[i][1]
    #print ('postag', postag)

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],        
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]        
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
                
    return features


In [None]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, __, label in sent]

def sent2tokens(sent):
    return [token for token, postag, __, label in sent]


In [None]:
X_train = [sent2features(s) for s in sentences_1]

y_train = [sent2labels(s) for s in sentences_1]


In [None]:
X_train [0]

In [None]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

Passo 2 - Prepara o Y_test a partir do dataset do Ontonotes 

In [116]:
#2.1 - Abre o Ontonotes para aplicar o tratamento 
#with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train.txt', 'r') as file:
with open('/Users/lucelia/POS/experimentos/Dataset/Ontonotes/ner_train.txt', 'r') as file:
#dataset com 1000 linhas para teste
#with open('/Users/lucelia/POS/experimentos/Dataset/Ontonotes/ner_train_lu.txt', 'r') as file:
   sentences = list(file.readlines())

In [117]:
len(sentences) 

3160787

In [118]:
#2.2 - Retira os espaços em branco e as words maiores que duas posições
for i in range(len(sentences) - 1):
    #print(sentences[116:122])
    atual = sentences[i].split()
    proximo = sentences[i+1].split()
    if len(atual) == 0:
        continue
    while len(proximo) > 2:
        #print(f'Convertendo ({atual}) e ({proximo}) para ', end = '')
        atual[0] += proximo[0]
        sentences[i] = '\t'.join(atual)+'\n'
        proximo = proximo[1:]
        sentences[i+1] = '\t'.join(proximo)+'\n'
        #print(f'({atual}) e ({proximo})')

In [119]:
len(sentences)

3160787

In [120]:
#2.3 - Verifica as linhas com mais de duas words e concatena
for i in range(len(sentences) - 1):
        atual = sentences[i].split()
        if ((len(atual)>2) and (len(atual)<=3)):
            #print('atual', atual)
            sentences[i]=(''.join(atual[0]+atual[1]))+' '+atual[2]+'\n' 
            #print('sentences', sentences[i])
            #print(i)
        elif ((len(atual)>3) and (len(atual)<=4)):
            #print('atual', atual)
            sentences[i]=(''.join(atual[0]+atual[1]+atual[2]))+' '+atual[3] +'\n'  
            #print('sentences', sentences[i])
            #print(i)
        elif (len(atual)>4):
            sentences[i]=(''.join(atual[0]+atual[1]+atual[2]+atual[3]))+' '+atual[4] +'\n' 
            #print(i)

In [53]:
print(sentences[121:150])

[]


In [121]:
len(sentences)

3160787

In [6]:
#2.4 - carrega a função para detecção da língua
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector
#na primeira execução descomentar essa linha
@Language.factory("language_detector")
def get_lang_detector(nlp, name):
   return LanguageDetector()

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('language_detector', last=True)
#print(nlp('场')._.language)

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x7fe4510365e0>

In [9]:
#2.5 - cria a função que detecta linha a linha a língua e remove tudo que não for Inglês
def limpaOntonotes(linhas):
    #count=0
    lista = []
    for line in linhas:
        line1=line.split('\t')
        if len(line1)>1:
            result = nlp(line1[0])
            #print(line1[0])
            #print(line1[1])
            result1 = nlp(line1[1])
        
            #print(result)
            if((result._.language) or (result1._.language)):
                #print(result._.language)
                result=(result._.language['language'])
                result1=(result1._.language['language'])
                #print('line', line)
                #print('result', result)
                if (result or result1) not in ['ko','zh-cn','ar','fa','zh-tw','fa', 'es','sw','id']:
                #if result not in lang:
                    #print('lista', lista) 
                    lista.append(line)
                    #print(lista)
        if (len(line1)==1):
            #print('entrou no if')
            lista.append('\n')
        #count+=1
        #print(count)    
    return lista  


In [None]:
#lang = ['af' , 'ar' , 'bg' , 'bn' , 'ca' , 'cs' , 'cy' , 'da' , 'de' , 'el' , 'es' , 'et' , 'fa' , 'fi' , 'fr' , 'gu' , 'he' , 'hi' , 'hr' , 'hu' , 'id' , 'it' , 'ja' , 'kn' , 'ko' , 'lt' , 'lv' , 'mk' , 'ml' , 'mr' , 'ne' , 'nl' , 'no' , 'pa' , 'pl' , 'pt' , 'ro' , 'ru' , 'sk' , 'sl' , ' sq' , 'sv' , 'sw' , 'ta' , 'te' , 'th' , 'tl' , 'tr' , 'uk' , 'ur' , 'vi' , 'zh-cn' , 'zh-tw']
#lang = ['ar', 'fa', 'es' ,'id',  'ko' , 'sw' , 'zh-cn', 'zh-tw']


['af'  , 'bg' , 'bn' , 'ca' , 'cs' , 'cy' , 'da' , 'de' , 'el', 'et' , 'fi' , 'fr' , 'gu' , 'he' , 'hi' , 'hr' , 'hu' , 'it' , 'ja' , 'kn' , 'lt' , 'lv' , 'mk' , 'ml' , 'mr' , 'ne' , 'nl' , 'no' , 'pa' , 'pl' , 'pt' , 'ro' , 'ru' , 'sk' , 'sl' , ' sq' , 'sv' , 'ta' , 'te' , 'th' , 'tl' , 'tr' , 'uk' , 'ur' , 'vi']

In [101]:
#lista = limpaOntonotes(['مسيرت','并不','高子平','告诉','记者','这些','收入','对','مريم','العذراء','预报','uh-huh','Good', 'school'])
#lista_1 = limpaOntonotes(['他','还','强调','现在'])
#lista_1 = limpaOntonotes(['他','还','强调','，','这些','并不','意味','着','将','有','一','场','战争'])

#2.6 Chama a função de detecção da língua
lista_1=limpaOntonotes(sentences)

KeyboardInterrupt: 

In [122]:
txt = sentences

In [125]:
len (txt)

3160787

In [104]:
print (txt[1:10])

['还有\tO\n', '那些\tO\n', '鼓噪\tO\n', '回汉冲突\tGPE\n', '的\tO\n', '人\tO\n', '是片面的\tO\n', '，\tO\n', '他们\tO\n']


In [124]:
#encoding: utf-8
#txt = ['A\tO\n', 'House\tO\n', '.\tO\n', '\n', 'Pedro\tPER\n', '.\tO\n', '\n', 'Doing\tVERB\n', 'something\tO\n', '\n']

def list_of_lists_to_list_of_strings(ls):
    return [' '.join(l) for l in ls]

def lines_to_sentences(lines):
    """
    Só funciona se último elemento de lines for '\n',
    se não for assim no seu caso adicione um no final
    caso contrário vai pular a última sentença
    """
    sentences = []
    labels = []
    sentence = []
    label = []
    for line in lines:
        if line == '\n':
            sentences.append(sentence)
            labels.append(label)
            sentence = []
            label = []
            print('entrou no for --> sentences, labels', sentences, labels)
        else:
            word, word_label = line.strip('\n').split('\t')
            sentence.append(word)
            label.append(word_label)
    
    sentences = list_of_lists_to_list_of_strings(sentences)
    labels = list_of_lists_to_list_of_strings(labels)
    return sentences, labels

#print('Sentenças e labels separados')
sentences, labels = lines_to_sentences(txt)
#print(sentences)
#print(labels)

# class LanguageDetector:
#     """Alterna a linguagem detectada só pra teste"""
#     def __init__(self):
#         self.l = True

#     def lang(self, sentence):
#         if self.l:
#             language = 'en'
            
#         else:
#             language = 'ko'
#         self.l = not self.l
#         return language

def filter_sentences(sentences, labels):
    """
    filtra pela linguagem
    """
    filtered_sentences = []
    filtered_labels = []
    #ld = LanguageDetector()

    for sentence, label in zip(sentences, labels):
        #if ld.lang(sentence) == 'en':
        result = nlp(sentence)
        
        if result._.language['language'] == 'en':
            print('sentenca', sentence, 'label',label, 'result',  result._.language['language'])
            filtered_sentences.append(sentence)
            filtered_labels.append(label)
        
    return filtered_sentences, filtered_labels


def sentences_to_lines(sentences, sentence_labels):
    """
    Contrário de lines_to_sentences
    """
    lines = []
    for sentence, labels in zip(sentences, sentence_labels):
        words = sentence.split(' ')
        word_labels = labels.split(' ')
        for word, label in zip(words, word_labels):
            # line = f'{word}\t{label}\n'
            line = word + '\t' + label + '\n'
            lines.append(line)
        lines.append('\n')
    return lines

#print('Sentenças e labels filtrados pela linguagem')
filtered_sentences, filtered_labels = filter_sentences(sentences, labels)
#print(filtered_sentences)
#print(filtered_labels)

#print('Linhas após filtragem')
filtered_lines = sentences_to_lines(filtered_sentences, filtered_labels)
#print(filtered_lines)

ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
# Outra fução para detecçõ da lingua, usei a anterior. 

#detect("War doesn't show who's right, just who's left.")

def limpaOntonotes_1(linhas):
    from langdetect import detect
    lista = []
    for line in linhas[0:500]:
# the try except blook because there is some tweets contain links
        if not line: #verifica se a linha está vazia
            #print('line', line)
            result = detect(line)
            #print(result)
            if result not in lang:
                #print('lista', lista)
                lista.append(line)
            #print(lista)
        else:
            lista.append(line)
    return lista  


In [None]:
#limpaOntonotes(['他','还','强调','现在','Good Morning'])
#lista_1=limpaOntonotes_1(sentences)

In [106]:
#gravação de teste
#with open('/Users/lucelia/POS/experimentos/Dataset/Ontonotes/ner_train_lista.txt', 'wt') as fileout:
#    fileout.writelines(lista_1)

#with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_setences.txt', 'wt') as fileout:

with open('/Users/lucelia/POS/experimentos/Dataset/Ontonotes/ner_train_setences_test.txt', 'wt') as fileout:
    fileout.writelines(filtered_lines)

In [None]:
#with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_setences.txt', 'r') as file:
with open('/Users/lucelia/POS/experimentos/Dataset/Ontonotes/ner_train_setences.txt', 'r') as file:
    sentences_trat = file.readlines()

In [None]:
def troca(linhas, termo, termo_cabeca, termo_corpo):
    newlines = []
    is_in_body = False
    for line in linhas:
        if termo in line:
            if not is_in_body:
                #retira \n e insere termo depois
                word = line.split()
                newLine = word[0] + ' ' + termo_cabeca + '\n' 
                #print('PRIMEIRA OCORRENCIA')
                #print('line',line)
                #print('newLine',newLine)
            else:
                word = line.split()
                newLine = word[0] + ' ' + termo_corpo + '\n'
                #print('SEGUNDA OCORRENCIA')
                #print('line',line)
                #print('newLine',newLine)
            is_in_body = True
        else:
            is_in_body = False
            newLine = line
        newlines.append(newLine)
    return newlines


In [None]:
lines = troca(sentences_trat, 'GPE', 'B-LOC', 'I-LOC')
lines1 = troca(lines, 'PERSON', 'B-PER', 'I-PER')
lines2 = troca(lines1, 'ORG', 'B-ORG', 'I-ORG')
lines3 = troca(lines2, 'LOC', 'B-LOC', 'I-LOC')
lines4 = troca(lines3, 'PERCENT', 'MISC','MISC')
lines5 = troca(lines4, 'FAC', 'MISC','MISC')
lines6 = troca(lines5, 'CARDINAL', 'MISC','MISC') 
lines7 = troca(lines6, 'QUANTITY', 'MISC','MISC') 
lines8 = troca(lines7, 'DATE', 'MISC','MISC') 
lines9 = troca(lines8, 'EVENT', 'MISC','MISC') 
lines10 = troca(lines9, 'MONEY', 'MISC','MISC') 
lines11 = troca(lines10, 'NORP', 'MISC','MISC') 
lines12 = troca(lines11, 'PRODUCT', 'MISC','MISC') 
lines13 = troca(lines12, 'TIME', 'MISC','MISC') 
lines14 = troca(lines13, 'FAC', 'MISC','MISC') 
lines15 = troca(lines14, 'LAW', 'MISC','MISC') 


In [None]:
with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_label.txt', 'wt') as fileout:
    fileout.writelines(lines15)

In [None]:
print(lines1)

In [None]:
#2.5 - Abre o Ontonotes após aplicar ajuste do label
with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_label.txt', 'r') as file:
   sentences_trat_label = list(file.readlines())

In [None]:
len(sentences_trat_label)

In [None]:
print(sentences_trat_label[0:70])

In [None]:
#2.6 - Insere linha entre as sentenças
arq1 = ''
for linha in lines15:

#for linha in sentences_trat:
    p=linha.find('.')
    #print('linha', linha)
    #print ('p == ',p)
    if p==0:
        arq1=arq1+linha+'\n'
        #print ('arq de p0 == ',arq)
    else:
        arq1=arq1+linha
        #print ('arq de p = . == ',arq)

In [None]:
len(arq1)

In [None]:
#2.7 - Grava o arquivo após o tratamento
with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_arq1.txt', 'wt') as fileout:
    fileout.writelines(sentences_trat_label)

In [None]:
#2.7 - Abre o Ontonotes após concluir o tratamento para carregar o y_test
with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_arq1.txt', 'r') as file:
   arq_sentences = list(file.readlines())

In [None]:
len(arq_sentences)

In [None]:
arq_sentences[0:16]

In [None]:
#2.8 - cria a função para inserir tokens dentro das sentencas -- quebra a setenca em uma lista
def preprocess_b(arq_sentences):
    l_sentences = []
    l1_ = []
    for token in arq_sentences[0:]: #a partir da quinta posicao
    #for token in sentences:
        #print('token==>', token)
        cls = token.split()    
        #print('token.split==>', cls)
        if len(cls) != 0:
            l1_.append(cls)
            #print('apos o append==>', l1_)
        else:
            l_sentences.append(l1_)
            l1_ = []
    return l_sentences

In [None]:
#2.9  Aplica a função de tokens ao arquivo aberto
arq_sentences_1 = preprocess_b(sentences_trat_label)

In [None]:
print(arq_sentences_1[0:16])

In [None]:
len(arq_sentences_1)

In [None]:
#3.0 - Função para extrair os rotulos reais
def sent2labelsO(sent):
    return [label for token, label in sent]


In [None]:
#3.1 - Verifica se o dataset tem mais elementos que a chamado do métoddo sent2labels
for sentences in arq_sentences_1:
    try: 
        _ = sent2labelsO(sentences)
    except ValueError:
        for word in sentences:
            if len(word)!= 2:
                print(f' {word} possui {len(word)} elementos.')
                 
        break

In [None]:
#3.2 Extrai os rótulos reais y_test
y_test = [sent2labelsO(sentences) for sentences in arq_sentences_1]   

In [None]:
len(y_test)

Passo 3 - Carrega o X_test a partir do dataset do Ontonotes 

In [None]:
#2.1 - Abre o Ontonotes para aplicar o tratamento 
#with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train.txt', 'r') as file:
with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_arq1.txt', 'r') as file:
   sentences_x = list(file.readlines())

In [None]:
len(sentences_x)

In [None]:
print(sentences_x[0]) 

In [None]:
#2.2 - Retira os espaços em branco e as words maiores que duas posições
for i in range(len(sentences_x) - 1):
    #print(sentences[116:122])
    atual = sentences_x[i].split()
    proximo = sentences_x[i+1].split()
    if len(atual) == 0:
        continue
    while len(proximo) > 2:
        #print(f'Convertendo ({atual}) e ({proximo}) para ', end = '')
        atual[0] += proximo[0]
        sentences_x[i] = '\t'.join(atual)+'\n'
        proximo = proximo[1:]
        sentences_x[i+1] = '\t'.join(proximo)+'\n'
        #print(f'({atual}) e ({proximo})')

In [None]:
#2.3 - Verifica as linhas com mais de duas words e concatena
for i in range(len(sentences_x) - 1):
        atual = sentences_x[i].split()
        if ((len(atual)>2) and (len(atual)<=3)):
            #print('atual', atual)
            sentences_x[i]=(''.join(atual[0]+atual[1]))+' '+atual[2]+'\n' 
            #print('sentences', sentences_x[i])
            #print(i)
        elif ((len(atual)>3) and (len(atual)<=4)):
            #print('atual', atual)
            sentences_x[i]=(''.join(atual[0]+atual[1]+atual[2]))+' '+atual[3] +'\n'  
            #print('sentences', sentences[i])
            #print(i)
        elif (len(atual)>4):
            sentences_x[i]=(''.join(atual[0]+atual[1]+atual[2]+atual[3]))+' '+atual[4] +'\n' 
            #print(i)

In [None]:
len(sentences_x)


In [None]:
#2.4 - Remove o label original
def removeLabel(linhas):
    #novoTexto = []
    newLines=[]
    for line in linhas: 
        if len(line)!=1:
            word = line.split()
            newLine = word[0]+ '\n'
            #print('entrou no if')
            #print(newLine)
            
        else:
            newLine = line 
            #print('entrou no else')
        newLines.append(newLine)
        #print(newLines)
    return newLines
     
                
           

In [None]:
texto_sem_label = removeLabel(sentences_x)

In [None]:
print(texto_sem_label)

In [None]:
len(texto)

In [None]:
#2.6 - Grava o arquivo após o tratamento
with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_arq_2.txt', 'wt') as fileout:
    fileout.writelines(texto_sem_label)

In [None]:
#2.7 - Abre o Ontonotes após concluir o tratamento para carregar o X_text
with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_arq_2.txt', 'r') as file:
   arq_sentences_2 = list(file.readlines())

In [None]:
len(arq_sentences_2)

In [None]:
#2.6 - Grava o arquivo após o tratamento
with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_arq_3.txt', 'wt') as fileout:
    fileout.write(arq_2)

In [None]:
#2.7 - Abre o Ontonotes após concluir o tratamento para carregar o X_text
with open('/home/82068895153/POS/skweak/data/Ontonotes/ner_train_arq_3.txt', 'r') as file:
   arq_sentences_3 = list(file.readlines())

In [None]:
arq_sentences_3

In [None]:
len(arq_sentences_3)

In [None]:
#2.8 - Cria os tokens dentro das sentencas -- quebra a setenca em uma lista
def preprocess_O(arq_sentences):
    l_sentences = []
    l1_ = []
    for token in arq_sentences[0:]: #a partir da quinta posicao
    #for token in sentences:
        #print('token==>', token)
        cls = token.split()    
        #print('token.split==>', cls)
        if len(cls) != 0:
            l1_.append(cls)
            #print('apos o append==>', l1_)
        else:
            l_sentences.append(l1_)
            l1_ = []
    return l_sentences

In [None]:
#2.9 - Transforma a sentença em lista
test_sentences_2=preprocess_O(arq_sentences_2)

In [None]:
print(test_sentences_2)

In [None]:
len(test_sentences_2)

In [None]:
#3.0 - Função para extrair as features do texto a ser rotulado

def word2featuresO(sent, i):
    #word = sent[i][0]
    word = sent[i]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit()  
    }
    if i > 0:
        word1 = sent[i-1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper()
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1]       
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper()
        })
    else:
        features['EOS'] = True
                
    return features

In [None]:
#3.1 - Função para chamada da Função para extrair as features do texto a ser rotulado
def sent2featuresO(sent):
    return [word2featuresO(sent, i) for i in range(len(sent))]

In [None]:
#3.2 - Extrai as fetatures de X_test 
X_test = [[sent2featuresO(s) for s in text] for text in test_sentences_2]

In [None]:
len(X_test)

In [None]:
X_test[0]

In [None]:
#3.2 - Retira o X_teste de dentro da lista aninhada para execução do y_pred
for i in range(len(X_test)):
        X_test[i] = [i[0] for i in X_test[i]]

4 - Passo: Gerar o y_pred = rotulos preditos para o texto não rotulado

In [None]:
#4.1 - Aplica o modelo treinado no dataset sem rotulos
y_pred = crf.predict(X_test)


In [None]:
len(y_pred)

In [None]:
#Checa se as word do Y_test com o Y_pred estão batendo
 #len(sentences_trat) , len(arq_sentences_2) 
 
for i in range(len(arq_sentences_2)):
    predicao = arq_sentences_2[i].split()
    real =  sentences_trat[i].split()
    #print('predicao',predicao[0] )
    #print('real', real[0])
    if predicao[0] != real[0]:
        print(i)
        print('real', real, i)
        print('predicao', predicao, i)

In [None]:
print ('predicao', arq_sentences_2[141346:141349].split())
print ('real' ,  sentences_trat[141346:141349].split()


In [None]:
print('word do y_test   ==>', arq_sentences_1[2000],'\n')
print('word do y_pred   ==>', test_sentences_2[2000],'\n')
print('label do y_test  ==>',y_test[2000],'\n')
print('label do y_pred  ==>',y_pred[2000],'\n')

In [None]:
count = 0
for i,x in enumerate(y_pred):
        ss=set(x)
        if len(ss) > 1:
            count+=1
print("Qtde labels preditos", count)
print("tamanho y_pred", len(y_pred))

Avaliação

There is much more O entities in data set, but we’re more interested in other entities. To account for this we’ll use averaged F1 score computed for all labels except for O. sklearn-crfsuite.metrics package provides some useful metrics for sequence classification task, including this one.

In [None]:
#PERSON == 'B-PER' 'I-PER' (CONLL), ORG == 'B-ORG', GPE == 'B-LOC' 'I-LOC' (CONLL), MISC == TUDO QUE NAO AS OUTRAS 3 NO CONLL
labels = list(crf.classes_)
labels.remove('O')
labels


In [None]:
#Imprime o tamanho do y_test e y_pred e qtde de label predita
count = 0
for i,x in enumerate(y_pred):
        ss=set(x)
        if len(ss) > 1:
            count+=1
print("Qtde labels preditos", count)
print("tamanho em sentencas y_test", len(y_test))
print("tamanho em sentencas y_pred", len(y_pred))
print('tamanho em linhas y_test  ==>', len(arq_sentences))
print('tamanho em linhas y_pred  ==>', len(arq_sentences_2))

In [None]:
metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

Inspect per-class results in more detail:

In [None]:
# group B and I results
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

Hyperparameter Optimization

To improve quality try to select regularization parameters using randomized search and 3-fold cross-validation.

I takes quite a lot of CPU time and RAM (we’re fitting a model 50 * 3 = 150 times), so grab a tea and be patient, or reduce n_iter in RandomizedSearchCV, or fit model only on a subset of training data.



In [None]:
%%time
# define fixed parameters and parameters to search
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Best result:

In [None]:
# crf = rs.best_estimator_
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [None]:
Check parameter space

A chart which shows which c1 and c2 values have RandomizedSearchCV checked. Red color means better results, blue means worse.

In [None]:
_x = [s.parameters['c1'] for s in rs.grid_scores_]
_y = [s.parameters['c2'] for s in rs.grid_scores_]
_c = [s.mean_validation_score for s in rs.grid_scores_]

fig = plt.figure()
fig.set_size_inches(12, 12)
ax = plt.gca()
ax.set_yscale('log')
ax.set_xscale('log')
ax.set_xlabel('C1')
ax.set_ylabel('C2')
ax.set_title("Randomized Hyperparameter Search CV Results (min={:0.3}, max={:0.3})".format(
    min(_c), max(_c)))

ax.scatter(_x, _y, c=_c, s=60, alpha=0.9, edgecolors=[0,0,0])

print("Dark blue => {:0.4}, dark red => {:0.4}".format(min(_c), max(_c)))

Check best estimator on our test data

As you can see, quality is improved.

In [None]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))


Let’s check what classifier learned

In [None]:
#Let’s check what classifier learned
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(20))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-20:])

We can see that, for example, it is very likely that the beginning of an organization name (B-ORG) will be followed by a token inside organization name (I-ORG), but transitions to I-ORG from tokens with other labels are penalized.

Check the state features:



In [None]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-8s %s" % (weight, label, attr))

print("Top positive:")
print_state_features(Counter(crf.state_features_).most_common(30))

print("\nTop negative:")
print_state_features(Counter(crf.state_features_).most_common()[-30:])