In [1]:
!pip install sklearn_crfsuite
!pip install -U 'scikit-learn<0.24'
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.8-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (965 kB)
[K     |████████████████████████████████| 965 kB 9.2 MB/s 
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.8 sklearn-crfsuite-0.3.6
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn<0.24
  Downloading scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 8.9 MB/s 
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.2
    Uninstalling scikit-learn-1.0.2:
      Successfully uninstalled scikit-learn

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import sklearn_crfsuite
import pandas as pd
import numpy as np
import scipy.stats
import sklearn
import joblib
import nltk

from sklearn_crfsuite.metrics import flat_classification_report, flat_f1_score
from sklearn.metrics import classification_report, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn_crfsuite import scorers

# Download NLTK's 'punkt' package (tokenizer data). Presumably required by
# `word_tokenize`, which is used when the corpus is loaded — TODO confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Read the training CSV directly from its GitHub raw URL (requires network access).
# The `load` method defined below consumes its `treated_text` and `IOB` columns.
df_contrato_convenio = pd.read_csv(
    'https://raw.githubusercontent.com/brunoedcf/data_crf_training/main/contrato_convenio.csv'
)

In [5]:
class CRF_CrossValidation_Contrato_Convenio():
    """Randomized hyper-parameter search for a CRF tagger on the contrato/convenio data.

    Typical use: call ``load`` with a data frame to build per-token feature
    dicts and label sequences, then call ``optimize`` to run a cross-validated
    randomized search over the CRF coefficients ``c1`` and ``c2``.

    Attributes:
        x: List of sentences, each a list of per-token feature dicts
            (``None`` until ``load`` succeeds).
        y: List of label sequences aligned with ``x`` (``None`` until ``load``
            succeeds).
        labels: Labels handed to the F1 scorer. The list contains only
            ``B-``/``I-`` tags.
        metrics: ``pandas.DataFrame`` built from the search's ``cv_results_``
            (``None`` until ``optimize`` has run).
        crf: Best estimator found by the search (``None`` until ``optimize``
            has run).
    """

    def __init__(self):
        """Initialise empty data/model slots and the list of scored labels."""
        self.x = None
        self.y = None
        self.labels = [
            'B-NUM_AJUSTE',
            'B-PROCESSO',
            'B-CONTRATANTE_ou_CONCEDENTE',
            'B-CONTRATADA_ou_CONVENENTE',
            'I-CONTRATADA_ou_CONVENENTE',
            'B-CNPJ_CONTRATADA_ou_CONVENENTE',
            'B-OBJ_AJUSTE',
            'I-OBJ_AJUSTE',
            'B-CODIGO_UO',
            'B-NATUREZA_DESPESA',
            'B-FONTE_RECURSO',
            'B-NOTA_EMPENHO',
            'B-VALOR',
            'B-DATA_ASSINATURA',
            'B-VIGENCIA',
            'I-VIGENCIA',
            'B-PROGRAMA_TRABALHO',
            'B-NOME_RESPONSAVEL',
            'I-NOME_RESPONSAVEL',
            'I-CONTRATANTE_ou_CONCEDENTE',
            'B-CNPJ_CONTRATANTE_ou_CONCEDENTE',
            'I-PROCESSO',
            'B-CODIGO_SIGGO',
            'I-DATA_ASSINATURA',
            'I-FONTE_RECURSO',
            'I-PROGRAMA_TRABALHO',
            'I-CNPJ_CONTRATADA_ou_CONVENENTE',
            'I-CODIGO_UO',
            'I-NOTA_EMPENHO',
            'I-NUM_AJUSTE',
            'I-NATUREZA_DESPESA',
            'I-CODIGO_SIGGO',
            'I-VALOR',
        ]

        self.metrics = None
        self.crf = None

    def get_features(self, sentence):
        """Build one feature dict per token of ``sentence``.

        Args:
            sentence: Sequence of token strings.

        Returns:
            A list with one dict per token describing the token itself, its
            left and right neighbours, and whether it starts/ends the sentence.
            Neighbour features use ``''`` when the neighbour does not exist;
            that placeholder is kept (even for the boolean-valued features) so
            the produced feature values stay exactly as before.
        """
        last = len(sentence) - 1
        sent_features = []
        for i, token in enumerate(sentence):
            prev_tok = sentence[i - 1] if i > 0 else None
            next_tok = sentence[i + 1] if i < last else None
            sent_features.append({
                # Current word
                'word': token.lower(),
                # Slicing instead of indexing so an empty token yields False
                # rather than raising IndexError.
                'capital_letter': token[:1].isupper(),
                'all_capital': token.isupper(),
                'isdigit': token.isdigit(),
                # One word before
                'word_before': '' if prev_tok is None else prev_tok.lower(),
                'word_before_isdigit': '' if prev_tok is None else prev_tok.isdigit(),
                'word_before_isupper': '' if prev_tok is None else prev_tok.isupper(),
                'word_before_istitle': '' if prev_tok is None else prev_tok.istitle(),
                # One word after
                'word_after': '' if next_tok is None else next_tok.lower(),
                'word_after_isdigit': '' if next_tok is None else next_tok.isdigit(),
                'word_after_isupper': '' if next_tok is None else next_tok.isupper(),
                'word_after_istitle': '' if next_tok is None else next_tok.istitle(),

                'BOS': i == 0,
                'EOS': i == last,
            })
        return sent_features

    def load(self, data_frame):
        """Tokenise the texts and populate ``self.x`` / ``self.y``.

        Args:
            data_frame: Frame with a ``treated_text`` column (raw text, passed
                to ``word_tokenize``) and an ``IOB`` column (whitespace-separated
                labels, one per token).

        Raises:
            ValueError: If a row's token count differs from its label count.
                ``self.x`` and ``self.y`` are left untouched in that case.
        """
        features = []
        label_sequences = []

        # Iterate over the column values themselves instead of indexing with a
        # positional counter: `series[i]` is label-based and breaks (or picks
        # the wrong row) when the frame does not carry a default 0..n-1 index.
        rows = zip(data_frame['treated_text'], data_frame['IOB'])
        for position, (text, iob) in enumerate(rows):
            tokens = word_tokenize(text)
            tags = iob.split()
            if len(tokens) != len(tags):
                raise ValueError(
                    'row at position %d has %d tokens but %d IOB labels'
                    % (position, len(tokens), len(tags))
                )
            features.append(self.get_features(tokens))
            label_sequences.append(tags)

        # Publish only after every row was processed successfully.
        self.x = features
        self.y = label_sequences

    def optimize(self, n_iter=10, cv=10, random_state=None, results_path='results.csv'):
        """Run a randomized search over the CRF ``c1``/``c2`` coefficients.

        Prints the best parameters and best CV score, writes the full
        ``cv_results_`` table to ``results_path``, and stores the table in
        ``self.metrics`` and the refitted best estimator in ``self.crf``.

        Args:
            n_iter: Number of parameter settings sampled.
            cv: Cross-validation setting forwarded to ``RandomizedSearchCV``.
            random_state: Seed forwarded to ``RandomizedSearchCV``; pass an int
                to make the sampled candidates repeatable. ``None`` (default)
                keeps the previous, unseeded behaviour.
            results_path: Destination of the CSV with the search results.

        Raises:
            RuntimeError: If no data has been loaded yet.
        """
        if not self.x or not self.y:
            raise RuntimeError('no training data available: call load() before optimize()')

        crf_optimizer = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            max_iterations=100,
            all_possible_transitions=True
        )

        params_space = {
            'c1': scipy.stats.expon(scale=0.1),
            'c2': scipy.stats.expon(scale=0.1),
        }
        # Weighted F1 restricted to self.labels.
        f1_scorer = make_scorer(
            flat_f1_score,
            average='weighted',
            labels=self.labels
        )

        rs = RandomizedSearchCV(
            crf_optimizer,
            params_space,
            cv=cv,
            verbose=1,
            n_jobs=-1,
            n_iter=n_iter,
            scoring=f1_scorer,
            random_state=random_state
        )

        rs.fit(self.x, self.y)

        print('best params:', rs.best_params_)
        print('best CV score:', rs.best_score_)

        res = pd.DataFrame(rs.cv_results_)
        res.to_csv(results_path)

        # Keep the outcome on the instance so later cells can reuse it without
        # repeating the (expensive) search.
        self.metrics = res
        self.crf = rs.best_estimator_

In [6]:
# Create the search helper, then tokenise the CSV rows and build the
# per-token feature dicts and label sequences it will train on.
model = CRF_CrossValidation_Contrato_Convenio()
model.load(df_contrato_convenio)

In [7]:
# Run the randomized hyper-parameter search (10 candidates x 10 folds).
# This is slow: the run recorded below took about 328 minutes on 2 workers.
model.optimize()

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 151.7min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 328.3min finished


best params: {'c1': 0.07181505845709163, 'c2': 0.17060997325916882}
best CV score: 0.8836245561276016
