In [10]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.tokenize import word_tokenize
from sklearn.base import BaseEstimator, TransformerMixin
import glob
import re

# Dataframe

# Preprocess

In [13]:
class Tokenizer(TransformerMixin, BaseEstimator):
    """Scikit-learn style transformer that tokenizes each element of a Series.

    Parameters
    ----------
    tokenizer : callable, optional
        Function applied to every element. When the stored value is not
        callable (this includes the default ``''``), NLTK's ``word_tokenize``
        is used instead of failing with "'str' object is not callable".
    """

    def __init__(self, tokenizer=''):
        # Stored untouched so that get_params()/clone() see the original value.
        self.tokenizer = tokenizer

    def __call__(self, X, **kw_params):
        """Tokenize the single value ``X``, forwarding ``kw_params``."""
        tokenize = self.tokenizer if callable(self.tokenizer) else word_tokenize
        return tokenize(X, **kw_params)

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; the transformer is stateless."""
        return self

    def transform(self, X, **kw_params):
        """Return a ``pd.Series`` with the tokenizer applied to each element.

        Input that is not already a ``pd.Series`` is reported on stdout and
        then wrapped in one. ``kw_params`` are forwarded to every tokenizer
        call.
        """
        if not isinstance(X, pd.Series):
            print("[preprocess.Tokenizer.transform] TYPE:", type(X))
            print('X:::: ', X)
            X = pd.Series(X)
        if not kw_params:
            return X.map(self)
        # Series.map only passes the element, so bind the extra keyword
        # arguments here instead of silently dropping them.
        return X.map(lambda value: self(value, **kw_params))


from sklearn.base import BaseEstimator, TransformerMixin

class IOBifyer(TransformerMixin, BaseEstimator):
    """Build IOB label sequences for one text column of a DataFrame.

    For every row, the text column is tokenized and each token is compared
    with the tokenized string values of the row's other columns. A token that
    equals the first token of such a value is labelled ``B-<column>``, the
    following tokens that keep matching the value are labelled
    ``I-<column>``, and every other token is labelled ``O``.

    Parameters
    ----------
    column : str or int
        Name (or integer position) of the text column.
    tokenizer : callable, optional
        Tokenizer applied to the text and to the entity values. When it is
        not callable (this includes the default ``''``), NLTK's
        ``word_tokenize`` is used.
    """

    @staticmethod
    def _resolve_tokenizer(tokenizer):
        """Return ``tokenizer`` when it is callable, else ``word_tokenize``."""
        return tokenizer if callable(tokenizer) else word_tokenize

    @staticmethod
    def find_entity(row, token, ignore_idx=0, tokenizer=''):
        """Return the first column whose string value starts with ``token``.

        Parameters
        ----------
        row : pd.Series
            Row being labelled.
        token : str
            Token taken from the text column.
        ignore_idx : int
            Position of the column to skip (the text column itself).
        tokenizer : callable, optional
            Tokenizer for the column values; see the class docstring.

        Returns
        -------
        The matching column label, or ``None`` when no column matches.
        """
        # TODO: accept an offset option, so as to avoid any kind of problem
        tokenize = IOBifyer._resolve_tokenizer(tokenizer)
        for idx, column in enumerate(row.keys()):
            if idx == ignore_idx:
                continue
            value = row[column]
            if not isinstance(value, str):
                continue
            entity_tokens = tokenize(value)
            # An empty/whitespace value yields no tokens; skip it instead of
            # raising IndexError on ``[0]``.
            if entity_tokens and token == entity_tokens[0]:
                return column

        return None

    @staticmethod
    def generate_IOB_labels(row, idx, tokenizer, dbg=None):
        """Return the list of IOB labels for the text at position ``idx``.

        Parameters
        ----------
        row : pd.Series
            Row holding the text and the entity values.
        idx : int
            Position of the text column inside ``row``.
        tokenizer : callable or other
            Tokenizer to use; non-callables fall back to ``word_tokenize``.
        dbg : dict, optional
            When given, ``(row, idx)`` is appended to ``dbg['l']`` for every
            row whose first label is not ``'O'``.

        Returns
        -------
        list of str
            One label per token of the text.
        """
        tokenize = IOBifyer._resolve_tokenizer(tokenizer)
        labels = []
        entity = None        # column currently being matched, if any
        entity_tokens = []   # tokens of that column's value (tokenized once)
        token_index = 0      # next position expected inside entity_tokens

        for token in tokenize(row.iloc[idx]):
            if entity is not None:
                # Invariant: token_index < len(entity_tokens) while open.
                if token == entity_tokens[token_index]:
                    labels.append('I-' + entity)
                    token_index += 1
                    if token_index >= len(entity_tokens):
                        entity = None
                    continue
                # Mismatch closes the entity; the token is re-examined below
                # because it may itself start another entity.
                entity = None

            entity = IOBifyer.find_entity(row, token, idx, tokenizer)
            if entity is None:
                labels.append('O')
                continue

            labels.append('B-' + entity)
            entity_tokens = tokenize(row[entity])
            token_index = 1
            if token_index >= len(entity_tokens):
                # Single-token entity is already complete, so the next token
                # is free to start a new entity.
                entity = None

        if dbg is not None and labels and labels[0] != 'O':
            dbg.setdefault('l', []).append((row, idx))

        return labels

    @staticmethod
    def dump_iob(tokens_mat, labels_mat, path='dump.txt',
                 sep=' X X ', sent_sep='\n',):
        """Write token/label pairs to ``path``, one pair per line.

        Parameters
        ----------
        tokens_mat, labels_mat : iterable of iterables
            Parallel sequences of sentences (tokens and their labels).
        path : str or os.PathLike
            Output file; missing parent directories are created.
        sep : str
            Text written between a token and its label.
        sent_sep : str
            Text written after each sentence.

        Returns
        -------
        list of list of tuple
            The ``(token, label)`` pairs that were written, per sentence.
        """
        # os.fspath accepts both str and pathlib.Path without needing the
        # ``Path`` name, which this module never imports.
        path = os.fspath(path)
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        dbg_mat = []
        with open(path, 'w') as fp:
            for tokens_lis, labels_lis in zip(tokens_mat, labels_mat):
                pairs = list(zip(tokens_lis, labels_lis))
                dbg_mat.append(pairs)
                fp.writelines(
                    f"{token}{sep}{label}\n" for token, label in pairs
                )
                fp.write(sent_sep)
        return dbg_mat

    def __init__(self, column='act_column',
                 tokenizer=''):
        self.column = column
        self.tokenizer = tokenizer
        # Collects rows whose first label is not 'O' (see generate_IOB_labels).
        self.dbg = {}

    def fit(self, X=None, y=None, **fit_params):
        """Do nothing and return ``self``; the transformer is stateless."""
        return self

    def transform(self, df):
        """Return a ``pd.Series`` holding one list of IOB labels per row.

        Raises
        ------
        TypeError
            If ``df`` is not a ``pd.DataFrame``.
        """
        if not isinstance(df, pd.DataFrame):
            raise TypeError(f"`df` expected to be a pd.DataFrame. Got {type(df)}")
        if df.empty:
            print("[core.preprocess]Warning: empty DataFrame. There won't be ioblabels.")
            # Explicit dtype: matches the object dtype of the non-empty
            # result and avoids the dtype-less empty Series warning.
            return pd.Series(dtype=object)

        if isinstance(self.column, int):
            idx = self.column
        else:
            idx = df.columns.get_loc(self.column)

        labels_row = []
        for _, row in df.iterrows():
            try:
                labels_row.append(
                    IOBifyer.generate_IOB_labels(
                        row, idx, self.tokenizer, self.dbg
                    )
                )
            except Exception:
                print("problem iobifyin row:", row)
                raise
        return pd.Series(labels_row)


In [27]:
# Build an IOBifyer for the column named '' and label every row of `df`.
# NOTE(review): column='' looks like a placeholder, and `df` is not defined in
# the visible cells — confirm the real text column and where `df` is loaded.
iob = IOBifyer(column='')
r = iob.transform(df)

In [1]:
# Notebook display: show the Series of per-row IOB label lists.
r

In [2]:
# Notebook display: compare the DataFrame's shape with the label Series' shape.
df.shape, r.shape

In [3]:
# Create the "IOB" column, filled with NaN placeholders.
df["IOB"] = np.nan

In [4]:
# Store each row's labels as one space-separated string. Assigning a plain
# list is positional, so it works for any `df` index (the previous
# `df.loc[i, ...]` loop appended new rows unless the index was exactly
# 0..n-1) and replaces the whole column in a single step.
df["IOB"] = [' '.join(labels) for labels in r]