In [None]:
# %pip installs into the environment of the running kernel, whereas `!pip`
# uses whichever pip executable happens to be first on PATH.
%pip install -q sklearn_crfsuite nltk

In [2]:
import sklearn_crfsuite
import pandas as pd
import nltk

from sklearn.model_selection import train_test_split
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from nltk.tokenize import word_tokenize

# Recent NLTK releases make word_tokenize load the 'punkt_tab' resource, while
# older ones use 'punkt'. Fetch both so the notebook runs with whichever
# version the unpinned install above resolves to; on a release that does not
# know 'punkt_tab' the first call simply returns False.
nltk.download('punkt_tab', quiet=True)
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

***Data From https://github.com/UnB-KnEDLe/experiments/tree/master/members/ian/atos_licitacao_contratos/Corpus3***

In [3]:
# Mount Google Drive at /content/drive; the training CSVs are copied from
# the MyDrive folder in the next cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Copy the *contents* of the Drive folder (note the trailing '/.') into CRF/.
# With the plain `cp -r src/ CRF` form, re-running the cell when CRF/ already
# exists nests the data as CRF/Treinamento_CRF/ and leaves the old files stale.
!mkdir -p 'CRF' && cp -r '/content/drive/MyDrive/Treinamento_CRF/.' 'CRF/'

In [5]:
# Local directory holding the copied training CSVs. The trailing slash is
# required because the file paths below are built by string concatenation.
folder_path = 'CRF/'

***Aditamento***

In [6]:
# Data frame for the 'aditamento' acts; CRF_Flow.load reads its
# 'treated_text' and 'IOB' columns.
df_aditamento = pd.read_csv(f'{folder_path}aditamento.csv')

***Anulação***

In [7]:
# Data frame for the 'anulacao' acts; CRF_Flow.load reads its
# 'treated_text' and 'IOB' columns.
df_anulacao = pd.read_csv(f'{folder_path}anulacao.csv')

***Contrato***

In [8]:
# Data frame for the 'contrato' acts; CRF_Flow.load reads its
# 'treated_text' and 'IOB' columns.
df_contrato = pd.read_csv(f'{folder_path}contrato.csv')

***Licitação***

In [9]:
# Data frame for the 'licitacao' acts; CRF_Flow.load reads its
# 'treated_text' and 'IOB' columns.
df_licitacao = pd.read_csv(f'{folder_path}licitacao.csv')

***Suspensão***

In [10]:
# Data frame for the 'suspensao' acts; CRF_Flow.load reads its
# 'treated_text' and 'IOB' columns.
df_suspensao = pd.read_csv(f'{folder_path}suspensao.csv')

***Classe CRF***



In [11]:
class CRF_Flow():
  """CRF sequence-labelling workflow for a single act type.

  Wraps a ``sklearn_crfsuite.CRF`` model and drives it through the usual
  steps: ``load`` (tokenize, featurize, split), ``train``, ``validate``
  (per-label F1), ``pred`` (label a raw text) and ``save`` (metrics to CSV).
  """

  def __init__(self, tipo):
    """Create an untrained workflow.

    Args:
        tipo (str): Name of the act type. It is stored in the metrics and
            used as the prefix of the CSV file written by ``save``.
    """
    self.tipo = tipo
    self.x = None
    self.y = None
    self.x_train = None
    self.y_train = None
    self.x_test = None
    self.y_test = None
    self.metrics = {
        'tipo': []
    }

    self.crf = sklearn_crfsuite.CRF(
      algorithm = 'lbfgs',
      c1=0.17,
      c2=0.17,
      max_iterations=50,
      all_possible_transitions=True
    )

  def get_features(self, sentence):
    """Create the feature dict of every token in a sentence.

    Args:
        sentence (list): List of word tokens (str).
    Returns:
        A list with one dictionary of features per token, in token order.
        Neighbour features are the empty string at the sentence boundaries.
    """
    sent_features = []
    n_tokens = len(sentence)
    for i, word in enumerate(sentence):
      is_first = i == 0
      is_last = i + 1 >= n_tokens
      prev_word = None if is_first else sentence[i-1]
      next_word = None if is_last else sentence[i+1]
      word_feat = {
          # Current word
          'word': word.lower(),
          # Slicing instead of indexing so an empty token yields False
          # rather than raising IndexError.
          'capital_letter': word[:1].isupper(),
          'all_capital': word.isupper(),
          'isdigit': word.isdigit(),
          # One word before
          'word_before': '' if is_first else prev_word.lower(),
          'word_before_isdigit': '' if is_first else prev_word.isdigit(),
          'word_before_isupper': '' if is_first else prev_word.isupper(),
          'word_before_istitle': '' if is_first else prev_word.istitle(),

          # One word after
          'word_after': '' if is_last else next_word.lower(),
          'word_after_isdigit': '' if is_last else next_word.isdigit(),
          'word_after_isupper': '' if is_last else next_word.isupper(),
          'word_after_istitle': '' if is_last else next_word.istitle(),

          'BOS': is_first,
          'EOS': is_last
      }
      sent_features.append(word_feat)
    return sent_features

  def load(self, data_frame):
    """Featurize a data frame and split it into train and test sets.

    Args:
        data_frame (pandas.DataFrame): Must provide a 'treated_text' column
            (raw text, tokenized with ``word_tokenize``) and an 'IOB' column
            (whitespace-separated labels).

    NOTE(review): nothing here checks that each text yields as many tokens
    as it has labels -- confirm the corpus guarantees this.
    """
    # Iterate over the column values rather than indexing with a positional
    # counter: label-based ``column[i]`` breaks on any non-default index
    # (e.g. a filtered or shuffled frame).
    texts = data_frame['treated_text']
    labels = data_frame['IOB']

    self.x = [self.get_features(word_tokenize(text)) for text in texts]
    self.y = [iob.split() for iob in labels]

    self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x, self.y, test_size=0.2, random_state=42)

  def train(self):
    """Fit the CRF on the training split produced by ``load``."""
    self.crf.fit(self.x_train, self.y_train)

  def validate(self):
    """Compute the F1 score of every label except 'O' on the test split.

    The scores are stored in ``self.metrics`` under the label name, next to
    the 'tipo' entry. The dict is rebuilt on every call so validating twice
    does not duplicate the 'tipo' row.
    """
    # Filter instead of list.remove so a model that never saw an 'O' label
    # does not raise ValueError.
    classes = [c for c in self.crf.classes_ if c != 'O']

    y_pred = self.crf.predict(self.x_test)

    self.metrics = {'tipo': [self.tipo]}

    for c in classes:
      # ``labels`` must be a list of labels; a bare string is not a valid
      # array-like for scikit-learn.
      self.metrics[c] = metrics.flat_f1_score(self.y_test, y_pred, average='weighted', labels=[c])

  def pred(self, ato):
    """Label a raw text with the trained model.

    Args:
        ato (str): Raw text of an act.
    Returns:
        The value returned by ``self.crf.predict`` for this single text; it
        is also printed, as before.
    """
    tokens = word_tokenize(ato)
    features = self.get_features(tokens)
    prediction = self.crf.predict([features])

    print(prediction)
    return prediction

  def save(self):
    """Write the collected metrics to '<tipo>_metricas.csv'."""
    save_df = pd.DataFrame(self.metrics)
    save_df.to_csv(self.tipo + '_metricas.csv')


***Métricas***

In [None]:
# Full pipeline for the 'aditamento' acts. The order matters: each step uses
# state set by the previous one. save() writes 'aditamento_metricas.csv'.
model_aditamento = CRF_Flow('aditamento')
model_aditamento.load(df_aditamento)
model_aditamento.train()
model_aditamento.validate()
model_aditamento.save()

In [13]:
# Full pipeline for the 'anulacao' acts. The order matters: each step uses
# state set by the previous one. save() writes 'anulacao_metricas.csv'.
model_anulacao = CRF_Flow('anulacao')
model_anulacao.load(df_anulacao)
model_anulacao.train()
model_anulacao.validate()
model_anulacao.save()

In [None]:
# Full pipeline for the 'contrato' acts. The order matters: each step uses
# state set by the previous one. save() writes 'contrato_metricas.csv'.
model_contrato = CRF_Flow('contrato')
model_contrato.load(df_contrato)
model_contrato.train()
model_contrato.validate()
model_contrato.save()

In [15]:
# Full pipeline for the 'licitacao' acts. The order matters: each step uses
# state set by the previous one. save() writes 'licitacao_metricas.csv'.
model_licitacao = CRF_Flow('licitacao')
model_licitacao.load(df_licitacao)
model_licitacao.train()
model_licitacao.validate()
model_licitacao.save()

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [16]:
# Full pipeline for the 'suspensao' acts. The order matters: each step uses
# state set by the previous one. save() writes 'suspensao_metricas.csv'.
model_suspensao = CRF_Flow('suspensao')
model_suspensao.load(df_suspensao)
model_suspensao.train()
model_suspensao.validate()
model_suspensao.save()

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
