In [1]:
import os

# Location of the pre-processed parquet dataset. Set the
# PRODUCT_DOCUMENTS_PATH environment variable to point the notebook at a
# different copy of the data; when it is unset (or empty) the original
# location is used.
base_path = os.environ.get("PRODUCT_DOCUMENTS_PATH") or (
    "/media/ohtar10/Adder-Storage/datasets/pre-processed/product-documents-tiny"
)

# Artifact directories, all resolved relative to the current working
# directory. The trailing slashes are part of the stored values.
artifacts_path = os.path.join(os.path.curdir, 'artifacts/')
models_path = os.path.join(artifacts_path, 'models/')
sklearn_models = os.path.join(models_path, 'sklearn/')
data_path = os.path.join(artifacts_path, 'data/')

In [2]:
import pickle

# Load the pickled category encoder from the sklearn models directory.
# NOTE: unpickling runs arbitrary code embedded in the file, so only load
# artifacts that come from a trusted source.
categories_encoder_path = os.path.join(sklearn_models, 'category_encoder.pkl')
categories_encoder = None  # remains None if opening the file fails
with open(categories_encoder_path, mode='rb') as file:
    categories_encoder = pickle.load(file)

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

class DocumentTokenizer(BaseEstimator, TransformerMixin):
    """Encode a text column as left-padded rows of integer word ids.

    Documents are split on every character that is neither a word character
    nor ``+``. ``fit`` builds a ``word -> id`` mapping from token frequency;
    ``transform`` looks each token up in that mapping, silently dropping
    tokens that are not in it, and pads/truncates every row to ``maxlen``.

    Parameters
    ----------
    corpus_column : str
        Name of the column of ``X`` that holds the raw text.
    lowercase : bool, default True
        Lowercase the text before splitting (applied in both ``fit`` and
        ``transform``).
    vocab_size : int, optional
        Keep only this many of the most frequent tokens when fitting.
        ``None`` (or 0) keeps every token.
    maxlen : int, optional
        Width of the rows returned by ``transform``. ``None`` pads every row
        to the longest encoded document in the batch.
    word_index : str or dict, optional
        Either a path to a pickled ``word -> id`` mapping, or the mapping
        itself. Accepting the mapping keeps ``get_params()`` usable as
        constructor arguments again.

    Notes
    -----
    NOTE(review): ``fit`` numbers tokens from 0 and ``transform`` also pads
    with 0, so the most frequent token cannot be told apart from padding.
    The numbering is left unchanged here so that it stays compatible with
    word indexes that were persisted earlier -- confirm whether it should
    start at 1 instead.
    """

    def __init__(self, corpus_column: str, 
                lowercase: bool = True,
                vocab_size=None, 
                maxlen=None,
                word_index=None):
        self.corpus_column = corpus_column
        self.lowercase = lowercase
        self.vocab_size = vocab_size
        self.maxlen = maxlen
        if isinstance(word_index, dict):
            # Already-loaded mapping: store the very same object.
            self.word_index = word_index
        elif word_index:
            # Path to a pickled mapping. Unpickling executes arbitrary code
            # from the file, so only pass paths to trusted artifacts.
            with open(word_index, 'rb') as file:
                self.word_index = pickle.load(file)
        else:
            self.word_index = None

    def _split_documents(self, X):
        """Return a Series holding the list of raw tokens of each document."""
        documents = X[self.corpus_column]
        if self.lowercase:
            documents = documents.str.lower()
        # Multi-character pattern, so pandas treats it as a regular
        # expression: split on anything that is not a word character or '+'.
        return documents.str.split(r'[^\w+]')

    def fit(self, X, y=None):
        """Build ``self.word_index`` from the token frequencies in ``X``.

        Tokens are ranked by descending frequency and numbered from 0;
        when ``vocab_size`` is set only that many top-ranked tokens are kept.

        Returns
        -------
        self
        """
        # value_counts() skips missing values and is already sorted by
        # descending count.
        word_count = self._split_documents(X).explode().value_counts()
        # Adjacent separators produce empty strings; they are not words.
        word_count = word_count.drop('', errors='ignore')

        if self.vocab_size:
            word_count = word_count.iloc[0:self.vocab_size]
        # Read the tokens from the index directly rather than through
        # reset_index(), whose column naming differs between pandas versions.
        self.word_index = {word: position for position, word in enumerate(word_count.index)}
        return self

    def transform(self, X, y=None):
        """Encode every document of ``X`` as a row of word ids.

        Rows shorter than ``maxlen`` are left-padded with 0; longer rows keep
        their first ``maxlen`` ids.

        Returns
        -------
        2-D int64 array with one row per document.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If no word index has been loaded or fitted yet.
        """
        if self.word_index is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "DocumentTokenizer has no word index: pass `word_index` or call `fit` first."
            )

        vocabulary = self.word_index
        sequences = [
            # A missing document is not split into a list; encode it as empty.
            [vocabulary[token] for token in tokens if token in vocabulary]
            if isinstance(tokens, list) else []
            for tokens in self._split_documents(X)
        ]
        # Zeros in front, truncation at the end, int64 output.
        return pad_sequences(sequences, maxlen=self.maxlen, dtype='int64',
                             padding='pre', truncating='post', value=0)

In [4]:
# Tokenizer for the "document" column, backed by the pickled word index
# stored in the data artifacts directory.
word_index_path = os.path.join(data_path, 'word_index.pkl')
doc_tokenizer = DocumentTokenizer(
    corpus_column="document",
    maxlen=300,
    word_index=word_index_path,
)


In [5]:
import tensorflow.keras as keras
import glob
from tensorflow.keras.utils import Sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np

class DocumentDataGenerator(Sequence):
    """Keras ``Sequence`` that serves one parquet file per batch.

    Parameters
    ----------
    path : str
        Directory searched for ``*.parquet`` files.
    doc_column : str
        Stored on the instance but not read by this class; the text column
        actually used is whatever ``tokenizer`` was configured with.
    cat_column : str
        Column holding the categories of each row as one ``;``-separated
        string.
    category_encoder
        Object whose ``transform`` receives the list of per-row category
        lists and returns the label batch.
    tokenizer
        Object whose ``transform`` receives the batch DataFrame and returns
        the feature batch.
    shuffle : bool, default True
        Stored on the instance but not read by this class.
    to_fit : bool, default True
        When True batches are ``(X, y)`` pairs; when False only ``X`` is
        returned and ``cat_column`` is never accessed.
    """

    def __init__(self, path, 
        doc_column, 
        cat_column,
        category_encoder,
        tokenizer,
        shuffle=True, 
        to_fit=True):
        # Let the Keras base class set up its own state.
        super().__init__()
        self.path = path
        self.doc_column = doc_column
        self.cat_column = cat_column
        self.category_encoder = category_encoder
        self.tokenizer = tokenizer
        self.shuffle = shuffle
        self.to_fit = to_fit
        # glob returns files in arbitrary order; sorting makes the mapping
        # from batch index to file the same on every run.
        self.fileset = sorted(glob.glob(f"{path}/*.parquet", recursive=True))

    def __len__(self):
        """Denotes the number of batches per epoch"""
        return len(self.fileset)

    def __getitem__(self, index):
        """Generate one batch of data from the ``index``-th parquet file."""
        df = pd.read_parquet(self.fileset[index])
        X = self.tokenizer.transform(df)
        if not self.to_fit:
            # Inference batches: do not touch the category column at all, so
            # unlabeled data can be served.
            return X

        y = df[self.cat_column].apply(lambda cat: cat.split(";")).values.tolist()
        y = self.category_encoder.transform(y)
        return X, y


In [6]:
# NOTE(review): both generators read the same `base_path`, so the validation
# batches are the very files the model is trained on -- confirm whether a
# separate held-out directory was intended.
training_generator = DocumentDataGenerator(
    path=base_path,
    doc_column='document',
    cat_column='categories',
    category_encoder=categories_encoder,
    tokenizer=doc_tokenizer,
)
testing_generator = DocumentDataGenerator(
    path=base_path,
    doc_column='document',
    cat_column='categories',
    category_encoder=categories_encoder,
    tokenizer=doc_tokenizer,
)

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

# Two fully connected layers: a 256-unit ReLU layer followed by a 10-unit
# softmax output.
# NOTE(review): the output width of 10 is hard-coded; presumably it matches
# the number of classes produced by the category encoder -- confirm.
model = Sequential([
    layers.Dense(256, activation='relu'),
    layers.Dense(10, activation='softmax'),
])

model.compile(
    optimizer="adam",
    loss='kullback_leibler_divergence',
    metrics=['accuracy'],
)

In [8]:
%%time
# Train for 5 epochs, using testing_generator as the validation data.
history = model.fit(
    training_generator,
    validation_data=testing_generator,
    epochs=5,
)

  ...
    to  
  ['...']
Train for 10 steps, validate for 10 steps
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: user 32min 22s, sys: 32.7 s, total: 32min 55s
Wall time: 32min 25s
