In [None]:
!pip install sklearn_crfsuite
!pip install -U 'scikit-learn<0.24'
!pip install nltk

In [None]:
# Silence every warning category for the rest of the session to keep cell
# output short.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings
# from the libraries used below are hidden as well - confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

In [None]:
import sklearn_crfsuite
import pandas as pd
import numpy as np
import scipy.stats
import sklearn
import joblib
import nltk

from sklearn_crfsuite.metrics import flat_classification_report, flat_f1_score
from sklearn.metrics import classification_report, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn_crfsuite import scorers

# Download the 'punkt' tokenizer data; word_tokenize (imported above) needs it
# to be present locally before it can split text into tokens.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load the annotated dataset straight from GitHub (requires network access).
# The class below reads two columns from this frame: 'treated_text' (the text
# to tokenize) and 'IOB' (whitespace-separated tags, one per token).
df_licitacao = pd.read_csv('https://raw.githubusercontent.com/brunoedcf/data_crf_training/main/licitacao.csv')

In [None]:
class CRF_CrossValidation_Licitacao():
  """Hyper-parameter search for a CRF sequence tagger.

  Typical use: call ``load`` with a DataFrame that has a 'treated_text' column
  and an 'IOB' column, then call ``optimize`` to run a randomized search over
  the CRF regularisation coefficients ``c1`` and ``c2``.
  """

  def __init__(self):
    """Create an empty instance; no data is loaded and no model is trained."""

    # Per-document lists of token-feature dicts / tag sequences (set by load).
    self.x = None
    self.y = None
    # Labels passed to the F1 scorer used during the search.
    self.labels = [
        'B-NUM_LICITACAO',
        'B-CODIGO_SISTEMA_COMPRAS',
        'B-ORGAO_LICITANTE',
        'I-ORGAO_LICITANTE',
        'B-OBJ_LICITACAO',
        'I-OBJ_LICITACAO',
        'B-PROCESSO',
        'B-VALOR_ESTIMADO',
        'B-DATA_ABERTURA',
        'B-SISTEMA_COMPRAS',
        'B-NOME_RESPONSAVEL',
        'I-NOME_RESPONSAVEL',
        'B-MODALIDADE_LICITACAO',
        'I-MODALIDADE_LICITACAO',
        'I-SISTEMA_COMPRAS',
        'I-DATA_ABERTURA',
        'I-PROCESSO',
        'B-TIPO_OBJ',
        'I-NUM_LICITACAO',
        'I-TIPO_OBJ',
        'I-VALOR_ESTIMADO',
        'I-CODIGO_SISTEMA_COMPRAS'
    ]

    # Not assigned anywhere in this class; kept for callers that may use it.
    self.metrics = None
    # Best estimator found by optimize(); None until optimize() has run.
    self.crf = None

  def get_features(self, sentence):
    """Build one feature dict per token of a tokenized sentence.

    Args:
      sentence: sequence of token strings.

    Returns:
      A list with one dict per token, in order. Each dict describes the token
      itself, its left and right neighbours, and whether it is the first
      ('BOS') or last ('EOS') token. Neighbour features are the empty string
      when the neighbour does not exist.
    """
    sent_features = []
    last = len(sentence) - 1
    for i, token in enumerate(sentence):
      prev_token = sentence[i - 1] if i > 0 else None
      next_token = sentence[i + 1] if i < last else None
      word_feat = {
          # Current word
          'word': token.lower(),
          # Slice instead of token[0] so an empty token yields False rather
          # than raising IndexError.
          'capital_letter': token[:1].isupper(),
          'all_capital': token.isupper(),
          'isdigit': token.isdigit(),
          # One word before
          'word_before': '' if prev_token is None else prev_token.lower(),
          'word_before_isdigit': '' if prev_token is None else prev_token.isdigit(),
          'word_before_isupper': '' if prev_token is None else prev_token.isupper(),
          'word_before_istitle': '' if prev_token is None else prev_token.istitle(),
          # One word after
          'word_after': '' if next_token is None else next_token.lower(),
          'word_after_isdigit': '' if next_token is None else next_token.isdigit(),
          'word_after_isupper': '' if next_token is None else next_token.isupper(),
          'word_after_istitle': '' if next_token is None else next_token.istitle(),

          'BOS': i == 0,
          'EOS': i == last
      }
      sent_features.append(word_feat)
    return sent_features

  def load(self, data_frame):
    """Tokenize the texts, split the tags and store features in self.x / self.y.

    Args:
      data_frame: DataFrame with a 'treated_text' column (text to tokenize)
        and an 'IOB' column (whitespace-separated tags).

    Raises:
      ValueError: if a row yields a different number of tokens and tags.
    """
    features = []
    tags = []

    # Iterate over the column values themselves rather than indexing the
    # Series with a positional counter: Series[i] is a label lookup and breaks
    # (or picks the wrong row) when the index is not 0..n-1.
    rows = zip(data_frame['treated_text'], data_frame['IOB'])
    for position, (text, iob) in enumerate(rows):
      tokens = word_tokenize(text)
      row_tags = iob.split()
      # Fail early with a precise message; each token needs exactly one tag.
      if len(tokens) != len(row_tags):
        raise ValueError(
            'Row at position %d has %d tokens but %d IOB tags.'
            % (position, len(tokens), len(row_tags))
        )
      features.append(self.get_features(tokens))
      tags.append(row_tags)

    # Assign only after every row was processed successfully.
    self.x = features
    self.y = tags

  def optimize(self, n_iter=50, cv=10, n_jobs=-1, random_state=None,
               results_path='results.csv'):
    """Run a randomized search over c1/c2 and write the CV results to disk.

    Prints the best parameters and best cross-validated score, stores the
    refit best estimator in ``self.crf`` and writes the full ``cv_results_``
    table to ``results_path``.

    Args:
      n_iter: number of parameter settings sampled.
      cv: number of cross-validation folds.
      n_jobs: number of parallel workers (-1 uses all available).
      random_state: seed for the parameter sampling; pass an int to make the
        search reproducible. None keeps the original unseeded behaviour.
      results_path: CSV file the cross-validation results are written to.

    Raises:
      ValueError: if load() has not been called or produced no data.
    """
    if not self.x or not self.y:
      raise ValueError('No training data loaded; call load() before optimize().')

    crf_optimizer = sklearn_crfsuite.CRF(
      algorithm='lbfgs',
      max_iterations=100,
      all_possible_transitions=True
    )

    # Both regularisation coefficients are drawn from an exponential
    # distribution with scale 0.1.
    params_space = {
      'c1': scipy.stats.expon(scale=0.1),
      'c2': scipy.stats.expon(scale=0.1),
    }
    f1_scorer = make_scorer(
        flat_f1_score,
        average='weighted',
        labels=self.labels
    )

    rs = RandomizedSearchCV(
        crf_optimizer,
        params_space,
        cv=cv,
        verbose=1,
        n_jobs=n_jobs,
        n_iter=n_iter,
        scoring=f1_scorer,
        random_state=random_state
    )

    rs.fit(self.x, self.y)

    # Keep the refit best model so the search result is usable afterwards.
    self.crf = rs.best_estimator_

    print('best params:', rs.best_params_)
    print('best CV score:', rs.best_score_)

    res = pd.DataFrame(rs.cv_results_)
    res.to_csv(results_path)

In [None]:
# Instantiate the search helper and turn the dataset into per-token feature
# dicts (model.x) and tag sequences (model.y).
model = CRF_CrossValidation_Licitacao()
model.load(df_licitacao)

In [7]:
# Run the randomized hyper-parameter search (500 fits in total). The recorded
# run below took about 314 minutes with 2 workers, so re-run deliberately.
model.optimize()

Fitting 10 folds for each of 50 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 29.5min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 123.4min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed: 280.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 313.6min finished


best params: {'c1': 0.09966202629801069, 'c2': 0.0899064010119305}
best CV score: 0.9405374208229647
