In [None]:
# Install the third-party packages imported by the cells below.
!pip install sklearn_crfsuite
# NOTE(review): scikit-learn is held below 0.24, presumably for compatibility
# with sklearn_crfsuite — confirm before lifting the pin.
!pip install -U 'scikit-learn<0.24'
!pip install nltk

In [3]:
import warnings

# Install a blanket "ignore" filter: from here on no warning of any category
# is shown in the cell outputs.
warnings.filterwarnings(action='ignore')

In [10]:
import sklearn_crfsuite
import pandas as pd
import numpy as np
import scipy.stats
import sklearn
import joblib
import nltk

from sklearn_crfsuite.metrics import flat_classification_report, flat_f1_score
from sklearn.metrics import classification_report, make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn_crfsuite import scorers

# word_tokenize needs the Punkt sentence-tokenizer data. Newer NLTK releases
# load it from the 'punkt_tab' resource while older ones use 'punkt'; nltk is
# installed without a version pin, so fetch both to keep a fresh kernel working.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

***Data From https://github.com/UnB-KnEDLe/experiments/tree/master/members/ian/atos_licitacao_contratos/Corpus3***

***Aditamento***

In [None]:
# df_aditamento = pd.read_csv('https://raw.githubusercontent.com/brunoedcf/hyperparameter_test/main/aditamento.csv')

***Anulação***

In [6]:
# Source CSV for the 'anulacao' model; CRF_Flow.load reads its 'treated_text',
# 'IOB' and 'DATA_ESCRITO' columns.
ANULACAO_CSV_URL = 'https://raw.githubusercontent.com/brunoedcf/data_crf_training/main/anulacao_revogacao.csv'
df_anulacao = pd.read_csv(ANULACAO_CSV_URL)

***Contrato***

In [None]:
# df_contrato = pd.read_csv('https://raw.githubusercontent.com/brunoedcf/hyperparameter_test/main/contrato.csv')

***Licitação***

In [None]:
# df_licitacao = pd.read_csv('https://raw.githubusercontent.com/brunoedcf/hyperparameter_test/main/licitacao.csv')

***Suspensão***

In [None]:
# df_suspensao = pd.read_csv('https://raw.githubusercontent.com/brunoedcf/hyperparameter_test/main/suspensao.csv')

***Classe CRF***



In [14]:
class CRF_Flow():
  """Load, train, tune and evaluate a CRF sequence tagger for one act type.

  The methods are meant to be called in this order:

  * ``load`` turns a data frame into per-token feature dicts and IOB label
    sequences and splits them into train/test (and, for the random split,
    validation) partitions.
  * ``train`` fits ``self.crf`` and stores every learned class except 'O'
    in ``self.labels``.
  * ``optimize`` runs a cross-validated search, replaces ``self.crf`` with
    the best estimator and pickles it to ``"<tipo>.pkl"``.
  * ``validate`` stores and prints the per-label report for the test split.
  * ``save`` writes that report to a text file.
  """

  # Act types that use the entity-aware split, mapped to the data-frame column
  # whose non-missing rows must be spread over both the train and test sets.
  _ENTITY_SPLIT_COLUMN = {
    'anulacao': 'DATA_ESCRITO',
    'suspensao': 'DECISAO_TCDF',
  }

  def __init__(self, tipo):
    """Store the act type and create the default, untrained CRF.

    Args:
      tipo: Act type name. 'anulacao' and 'suspensao' select the entity-aware
        split in ``load``; every other value selects the random split.
    """
    self.tipo = tipo
    self.x = None
    self.y = None
    self.x_train = None
    self.y_train = None
    self.x_test = None
    self.y_test = None
    self.x_validation = None
    self.y_validation = None
    self.labels = None
    self.metrics = None

    self.crf = sklearn_crfsuite.CRF(
      algorithm = 'lbfgs',
      c1=0.17,
      c2=0.17,
      max_iterations=70,
      all_possible_transitions=True
    )

  def get_features(self, sentence):
    """Build one feature dict per token of ``sentence``.

    Args:
      sentence: Sequence of token strings.

    Returns:
      A list with one dict per token describing the token itself, its left
      and right neighbours and whether it starts/ends the sentence. Neighbour
      features are the empty string when there is no neighbour on that side.
    """
    last = len(sentence) - 1
    sent_features = []
    for i, token in enumerate(sentence):
      has_prev = i > 0
      has_next = i < last
      prev_token = sentence[i - 1] if has_prev else ''
      next_token = sentence[i + 1] if has_next else ''
      sent_features.append({
        # Current token. token[:1] instead of token[0] so an empty token
        # yields False rather than raising IndexError.
        'word': token.lower(),
        'capital_letter': token[:1].isupper(),
        'all_capital': token.isupper(),
        'isdigit': token.isdigit(),
        # One token before
        'word_before': prev_token.lower(),
        'word_before_isdigit': prev_token.isdigit() if has_prev else '',
        'word_before_isupper': prev_token.isupper() if has_prev else '',
        'word_before_istitle': prev_token.istitle() if has_prev else '',
        # One token after
        'word_after': next_token.lower(),
        'word_after_isdigit': next_token.isdigit() if has_next else '',
        'word_after_isupper': next_token.isupper() if has_next else '',
        'word_after_istitle': next_token.istitle() if has_next else '',

        'BOS': i == 0,
        'EOS': i == last
      })
    return sent_features

  @staticmethod
  def _entity_aware_positions(entity_values):
    """Split row positions so a sparse entity appears in both partitions.

    Rows whose entity value is present are cut 70/30 in their original order.
    The train side is then filled with rows lacking the entity until it holds
    70% of all rows; the remaining rows lacking the entity go to the test side.

    Args:
      entity_values: One value per row; missing values mark rows without the
        entity.

    Returns:
      ``(train_positions, test_positions)`` as lists of 0-based row positions.
      Train lists entity rows first; test lists entity rows last.
    """
    is_missing = pd.isna(list(entity_values)).tolist()
    with_entity = [pos for pos, missing in enumerate(is_missing) if not missing]
    without_entity = [pos for pos, missing in enumerate(is_missing) if missing]

    entity_cut = (len(with_entity) * 70) // 100
    entity_train = with_entity[:entity_cut]
    entity_test = with_entity[entity_cut:]

    # Never negative, so the slices below cannot wrap around from the end.
    n_fill = max(0, int(len(is_missing) * 0.7) - len(entity_train))

    train_positions = entity_train + without_entity[:n_fill]
    test_positions = without_entity[n_fill:] + entity_test
    return train_positions, test_positions

  def load(self, data_frame):
    """Tokenize, featurize and split ``data_frame``.

    Reads the 'treated_text' column (tokenized with ``word_tokenize``) and the
    'IOB' column (split on whitespace). Rows are paired by position, so the
    frame's index labels do not matter.

    For 'anulacao' and 'suspensao' the split is entity-aware (see
    ``_entity_aware_positions``) and no validation set is produced. For any
    other type 40% of the rows are held out at random, and 30% of that
    held-out part becomes the validation set while the rest is the test set.

    Args:
      data_frame: Frame with 'treated_text' and 'IOB' columns, plus
        'DATA_ESCRITO' for 'anulacao' or 'DECISAO_TCDF' for 'suspensao'.
    """
    self.x = [self.get_features(word_tokenize(text)) for text in data_frame['treated_text']]
    self.y = [tags.split() for tags in data_frame['IOB']]

    entity_column = self._ENTITY_SPLIT_COLUMN.get(self.tipo)

    if entity_column is None:
      self.x_train, held_x, self.y_train, held_y = train_test_split(self.x, self.y, test_size=0.4, random_state=42)
      self.x_test, self.x_validation, self.y_test, self.y_validation = train_test_split(held_x, held_y, test_size=0.3, random_state=42)
      return

    train_positions, test_positions = self._entity_aware_positions(data_frame[entity_column])

    # Reuse the features computed above instead of tokenizing every row again.
    self.x_train = [self.x[pos] for pos in train_positions]
    self.y_train = [self.y[pos] for pos in train_positions]
    self.x_test = [self.x[pos] for pos in test_positions]
    self.y_test = [self.y[pos] for pos in test_positions]

  def train(self):
    """Fit ``self.crf`` on the training split and derive ``self.labels``.

    The validation split is passed to ``fit`` only for the types that have
    one (everything except 'anulacao' and 'suspensao'). ``self.labels`` is
    set to the learned classes without the outside tag 'O'.
    """
    if self.tipo not in self._ENTITY_SPLIT_COLUMN:
      self.crf.fit(self.x_train, self.y_train, X_dev=self.x_validation, y_dev=self.y_validation)
    else:
      self.crf.fit(self.x_train, self.y_train)

    # Filtering (rather than list.remove) also works when 'O' was never seen.
    self.labels = [label for label in self.crf.classes_ if label != 'O']

  def optimize(self):
    """Search c1/c2 with 10-fold cross-validation and keep the best model.

    Scores candidates with the weighted flat F1 over ``self.labels``, prints
    the best parameters and score, replaces ``self.crf`` with the refitted
    best estimator and pickles it to ``"<tipo>.pkl"``.
    """
    crf_optimizer = sklearn_crfsuite.CRF(
      algorithm='lbfgs',
      max_iterations=70,
      all_possible_transitions=True
    )

    params_space = {
      'c1': [0.231363],
      'c2': [0.001591],
      # 'c1': scipy.stats.expon(scale=0.1),
      # 'c2': scipy.stats.expon(scale=0.1),
    }
    f1_scorer = make_scorer(
        flat_f1_score,
        average='weighted', 
        labels=self.labels
    )

    rs = RandomizedSearchCV(
        crf_optimizer,
        params_space,
        cv=10,
        verbose=1,
        n_jobs=-1,
        n_iter=50,
        scoring=f1_scorer,
        # Keeps the sampling reproducible if the distributions above are re-enabled.
        random_state=42
    )

    rs.fit(self.x_train, self.y_train)
    
    print('best params:', rs.best_params_)
    print('best CV score:', rs.best_score_)

    self.crf = rs.best_estimator_
    joblib.dump(self.crf, f"{self.tipo}.pkl")

  def validate(self):
    """Predict the test split, then store and print the per-label report."""
    y_pred = self.crf.predict(self.x_test)

    self.metrics = flat_classification_report(self.y_test, y_pred, labels=self.labels, digits=3)
    print(self.metrics)

  def save(self, path='metrics.txt'):
    """Write the report produced by ``validate`` to a text file.

    Args:
      path: Destination file. Defaults to 'metrics.txt'; pass a different
        name per act type to avoid overwriting earlier reports.

    Raises:
      ValueError: If ``validate`` has not produced a report yet.
    """
    if self.metrics is None:
      # Checked before opening so an existing report is not truncated.
      raise ValueError("No metrics to save; call validate() first.")
    with open(path, 'w') as file:
      file.write(self.metrics)


***Métricas***

In [15]:
# Full pipeline for the 'anulacao' act type.
model_anulacao = CRF_Flow('anulacao')
model_anulacao.load(df_anulacao)
# train() must run before optimize(): it sets model_anulacao.labels, which
# optimize() passes to its F1 scorer.
model_anulacao.train()
# model_anulacao.validate()
# optimize() replaces model_anulacao.crf with the best cross-validated
# estimator and pickles it to 'anulacao.pkl'.
model_anulacao.optimize()
# Report per-label metrics of the tuned model on the test split.
model_anulacao.validate()
# model_anulacao.save()

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    8.8s finished


best params: {'c2': 0.001591, 'c1': 0.231363}
best CV score: 0.5863195742544856
                            precision    recall  f1-score   support

    B-MODALIDADE_LICITACAO      0.667     0.769     0.714        13
           B-NUM_LICITACAO      1.000     0.444     0.615        18
           I-NUM_LICITACAO      0.000     0.000     0.000         0
B-IDENTIFICACAO_OCORRENCIA      0.929     0.684     0.788        19
                B-PROCESSO      0.750     0.545     0.632        11
            B-DATA_ESCRITO      1.000     0.800     0.889         5
            I-DATA_ESCRITO      1.000     1.000     1.000        16
        B-NOME_RESPONSAVEL      1.000     1.000     1.000        13
        I-NOME_RESPONSAVEL      0.925     1.000     0.961        37
    I-MODALIDADE_LICITACAO      0.643     0.818     0.720        11
         B-ORGAO_LICITANTE      1.000     0.429     0.600        14
         I-ORGAO_LICITANTE      0.654     0.425     0.515        40
                I-PROCESSO      0.0

In [None]:
# model_aditamento = CRF_Flow('aditamento')
# model_aditamento.load(df_aditamento)
# model_aditamento.train()
# model_aditamento.validate()
# model_aditamento.optimize()
# model_aditamento.validate()
# model_aditamento.save()

In [None]:
# model_contrato = CRF_Flow('contrato')
# model_contrato.load(df_contrato)
# model_contrato.train()
# model_contrato.validate()
# model_contrato.optimize()
# model_contrato.validate()
# model_contrato.save()

In [None]:
# model_licitacao = CRF_Flow('licitacao')
# model_licitacao.load(df_licitacao)
# model_licitacao.train()
# model_licitacao.validate()
# model_licitacao.optimize()
# model_licitacao.validate()
# model_licitacao.save()

In [None]:
# model_suspensao = CRF_Flow('suspensao')
# model_suspensao.load(df_suspensao)
# model_suspensao.train()
# model_suspensao.validate()
# model_suspensao.optimize()
# model_suspensao.validate()
# model_suspensao.save()