In [263]:
import pandas as pd
import numpy as np
import xgboost as xgb
import pickle
import json
import nltk
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, balanced_accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer

import unidecode as uni
import seaborn as sns

In [247]:
%%time
df_origin = pd.read_csv(
    '../aposentadoria-ouro-comparacao/blocos_com_entides.csv',
    index_col=False
)
tokenizer = pickle.load(open('tokenizer.pkl', 'rb'))
df.head(2)

CPU times: user 1.17 s, sys: 120 ms, total: 1.29 s
Wall time: 1.31 s


Unnamed: 0,x0,x,y0,y1,text,ents,qtd_ents
0,144.846329,190.436508,659.465149,208.861298,"ANO XLVII EDICAO No- 248\nBRASILIA - DF, TERCA...",[],0
1,56.702831,353.227753,344.850311,370.024231,"Secretaria de Estado de Trabalho, Desenvolvime...",[],0


In [257]:
%%time
df_work = df_origin.copy()
df_work.drop(df_origin.columns[:4], axis=1, inplace=True)
# df_work['ents'] = df_work.ents.map(eval)
df_work['sentencas'] = df_work.text.map(tokenizer.sentences_from_text) 
df_work = df_work.explode('sentencas')
df_work['y'] = df_work.qtd_ents > 0
df_work.head(3)

CPU times: user 30.1 s, sys: 124 ms, total: 30.2 s
Wall time: 30.2 s


Unnamed: 0,text,ents,qtd_ents,sentencas,y
0,"ANO XLVII EDICAO No- 248\nBRASILIA - DF, TERCA...",[],0,"ANO XLVII EDICAO No- 248\nBRASILIA - DF, TERCA...",False
1,"Secretaria de Estado de Trabalho, Desenvolvime...",[],0,"Secretaria de Estado de Trabalho, Desenvolvime...",False
1,"Secretaria de Estado de Trabalho, Desenvolvime...",[],0,4 16,False


In [258]:
%%time
stop_words = stopwords = nltk.corpus.stopwords.words('portuguese')
tfidf_param = {
    'lowercase': False,
    'preprocessor': uni.unidecode_expect_ascii,
    'stop_words': stop_words, 
}
xgb_param = {
    'objective': 'multi:softprob',
    'num_class': 2,
    'n_jobs': -1,
}
skf = StratifiedKFold(n_splits=5)

CPU times: user 1.02 ms, sys: 20 µs, total: 1.04 ms
Wall time: 678 µs


In [264]:
%%time
scores = {
    'acc':[],
    'acc_bal': [],
    'f1': [],
    'prfs': [],
    'test_index': [],
}
pipes = []
df = df_work[df_work.sentencas.str.len() > 40].reset_index()

for train_index, test_index in skf.split(df.text, df.y):
    pipe = Pipeline([
        ('vectorizer', TfidfVectorizer(**tfidf_param)),
        ('clf', xgb.XGBClassifier(**xgb_param))
    ])
    x_train, x_test = df.sentencas[train_index], df.sentencas[test_index]
    y_train, y_test = df.y[train_index], df.y[test_index]
    
    pipe.fit(x_train, y_train)
    y_pred = pipe.predict(x_test)

    scores['acc'].append(accuracy_score(y_test, y_pred))
    scores['f1'].append(f1_score(y_test, y_pred))
    scores['prfs'].append(precision_recall_fscore_support(y_test, y_pred))
    scores['acc_bal'].append(balanced_accuracy_score(y_test, y_pred))
    scores['test_index'].append(test_index)
del df



CPU times: user 37min 10s, sys: 24.8 s, total: 37min 34s
Wall time: 37min 35s


In [265]:
# Display the `scores` dictionary of per-fold metrics.
scores

{'acc': [0.9983176117047815,
  0.9970645224917911,
  0.9974125728076486,
  0.9977142459332142,
  0.9979579049961711],
 'acc_bal': [0.9653508271642778,
  0.949088844132919,
  0.948501233008125,
  0.9512000617813099,
  0.9528131141254352],
 'f1': [0.9456317960254969,
  0.9059129788025287,
  0.9159442140972484,
  0.9254070427868232,
  0.9330798479087453],
 'prfs': [(array([0.99890426, 0.96039604]),
   array([0.99938703, 0.93131462]),
   array([0.99914559, 0.9456318 ]),
   array([84833,  1354])),
  (array([0.99839721, 0.91235955]),
   array([0.99862082, 0.89955687]),
   array([0.998509  , 0.90591298]),
   array([84833,  1354])),
  (array([0.99837429, 0.93461538]),
   array([0.99899803, 0.89800443]),
   array([0.99868606, 0.91594421]),
   array([84833,  1353])),
  (array([0.99845697, 0.94875776]),
   array([0.999222  , 0.90317812]),
   array([0.99883934, 0.92540704]),
   array([84833,  1353])),
  (array([0.9985043 , 0.96159875]),
   array([0.99942239, 0.90620384]),
   array([0.99896313, 0.9