In [1]:
import os
import csv
import fasttext
from tqdm.auto import tqdm
from typing import Tuple
import pandas as pd
import ml_collections
from sklearn.model_selection import train_test_split
from tokenizers import ByteLevelBPETokenizer
os.environ["TOKENIZERS_PARALLELISM"]="True"
tqdm.pandas()

# Setting up Model Hyper-Parameters
A dictionary of hyperparameters and a few helper functions are set up.

In [2]:
def model_config():
    cfg_dictionary = {
        "data_path": "dataset/train.csv",

        "corpus_file":"word-corpus.txt",
        "model_file":"fasttext-model.ftz",
        "tokenizer":"tokenizer.json",

        "wordNgrams":2,
        "test_size": 0.1,
        "validation_size":0.2,
        "loss":'hs',
        "epochs": 30,
    }
    config = ml_collections.FrozenConfigDict(cfg_dictionary)

    return config
cfg = model_config()

In [3]:
def txt_saver_util(file_path:str,df:pd.DataFrame):
    """
    Saves Dataframe to text (.txt) file
    Args:
        file_path(str) : path to where the generated file is to be saved
        df(pd.DataFrame) : dataframe to be saved.

    """
    df.to_csv(file_path,
                    index=False,
                    sep=' ',
                    header=False,
                    quoting=csv.QUOTE_NONE,
                    quotechar="",
                    escapechar=" ")

# Preprocessing
The dataset consists of multiple missing data and rows with imcomplete features. Those all missing rows are dropped. All fields `[TITLE, DESCRIPTION, BULLET_POINTS, BRAND]` are concatenated into a single field called `[TEXT]`. We conjecture that all the information about the product is necessary to make the correct browse ID classification.
The dataset is then split into training, validation and testing sets using sci-kit learn's `train_test_split` API. all labels are prepended with `__label__` as this is the format that fasttext consumes.

In [4]:
def preprocess(csv_path:str) -> pd.DataFrame:
    """
    Reads the csv, drops missing rows and concatenates columns and returns a dataframe
    Args:
        csv_path(str) : path the dataset file

    Returns:
        df(pd.DataFrame) : Preprocessed dataframe
    """
    df = pd.read_csv(csv_path,escapechar="\\", quoting=3)
    df = df.dropna()
    df['TEXT'] = df['TITLE']+" "+df['DESCRIPTION']+" "+df["BULLET_POINTS"]+" "+df["BRAND"]
    df = df.drop(['TITLE','DESCRIPTION','BULLET_POINTS','BRAND'],axis=1)
    return df

In [5]:
def create_splits(dataset_file:str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Creates train, validation, test split from the dataset.
    Converts dataframe to a format that can be consumed by fasttext, by prepending __label__ to the label.
    Args:
        dataset_file(str) : path to dataset stored in CSV format

    Returns:
        train_df (pd.DataFrame) : training data
        valid_df (pd.DataFrame) : validation data

    each row in the dataframe would now look like
    ```
    |__label__234 |Cat toys, small, medium, large for all cats...|
    ```
    `|` denotes column separator
    """
    print("Beginning Preprocessing ...")
    dataframe = preprocess(dataset_file)
    print("Preprocessing Done!")
    training_df, test_df = train_test_split(dataframe, test_size=cfg.test_size)
    train_df, valid_df = train_test_split(training_df, test_size=cfg.test_size)

    del training_df

    train_df.iloc[:, 0] = train_df.iloc[:, 0].progress_apply(lambda x: '__label__' + str(x))
    valid_df.iloc[:, 0] = valid_df.iloc[:, 0].progress_apply(lambda x: '__label__' + str(x))
    test_df.iloc[:, 0] = test_df.iloc[:, 0].progress_apply(lambda x: '__label__' + str(x))

    txt_saver_util('test.txt', test_df)

    return train_df, valid_df

# Tokenization
A custom BytePair Encoder is trained on the dataset corpus. Training a fasttext model on a custom tokenized dataset showed better performance than use the default fasttext encoding.

In [6]:
def tokenize_data():
    """ Tokenizes Training and Validation data with a BPE (BytePair Encoding) Tokenizer """
    train_df, valid_df = create_splits(cfg.data_path)

    def create_word_corpus(t_df:pd.DataFrame, v_df:pd.DataFrame):
        """
        Crates a word corpus to train the custom tokenizer on
        Args:
            t_df(pd.DataFrame) : training dataframe
            v_df(pd.DataFrame) : validation dataframe
        """
        corpus_df = pd.concat([t_df[['TEXT']],v_df[['TEXT']]])
        txt_saver_util(cfg.corpus_file,corpus_df)

    print("Creating corpus for tokenization ....")
    create_word_corpus(train_df, valid_df)
    print("Done creating word corpus !")

    custom_tokenizer = ByteLevelBPETokenizer(lowercase=True)
    print("Training Tokenizer ...")
    custom_tokenizer.train(cfg.corpus_file) # trains the tokenizer
    custom_tokenizer.save(cfg.tokenizer)
    print("Done creating tokenizer. Tokenizer saved as",cfg.tokenizer)

    def tokenize(training_df:pd.DataFrame, validation_df:pd.DataFrame, tokenizer):
        """
        Tokenizes the "TEXT" column of the dataframe
        Args:
            training_df(pd.DataFrame) : training Data
            validation_df(pd.DataFrame) : validation Data
            tokenizer(tokenizer.ByteLevelBPETokenizer) : ByteLevelBPETokenizer
        """
        training_df['TOKENIZED_TEXT'] = training_df['TEXT'].progress_apply(
            lambda text: " ".join(tokenizer.encode(text).tokens)
        )
        txt_saver_util('tokenized-train.txt',
            training_df[['TOKENIZED_TEXT', 'BROWSE_NODE_ID']])

        validation_df['TOKENIZED_TEXT'] = validation_df['TEXT'].progress_apply(
            lambda text: " ".join(tokenizer.encode(text).tokens)
        )
        txt_saver_util('tokenized-valid.txt',
            validation_df[['TOKENIZED_TEXT', 'BROWSE_NODE_ID']])
    print("Tokenizing data ...")
    tokenize(train_df, valid_df, custom_tokenizer)
    print("Done tokenizing data!")

# Training and Inference
The model in trained and its hyperparameters are autotuned for 30 minutes. The model size is limited to 100MiB as this is a restriction enforced by GitHub on the size of the trained model. A better performing model can be obtained by just removing the `autotuneModelSize` parameter. A hierarchical softmax is used as the loss function over the traditional softmax as it is much faster to train (8x in this case).

In [7]:
def train():
    """
    Trains the FastText model with tokenized data. Model hyper-parameters are tuned and
    the model size is limited to 99MB (GitHub Restriction)
    """
    tokenize_data()

    model = fasttext.train_supervised(
    input='tokenized-train.txt',
    wordNgrams = cfg.wordNgrams,
    loss=cfg.loss,
    epoch=cfg.epochs,
    autotuneValidationFile='tokenized-valid.txt',
    autotuneDuration= 1800,
    autotuneModelSize="99M")

    return model

In [8]:
trained_model = train()
trained_model.save_model(cfg.model_file)

Beginning Preprocessing ...
Preprocessing Done!


  0%|          | 0/1709574 [00:00<?, ?it/s]

  0%|          | 0/189953 [00:00<?, ?it/s]

  0%|          | 0/211059 [00:00<?, ?it/s]

Creating corpus for tokenization ....
Done creating word corpus !
Training Tokenizer ...



Done creating tokenizer. Tokenizer saved as tokenizer.json
Tokenizing data ...


  0%|          | 0/1709574 [00:00<?, ?it/s]

  0%|          | 0/189953 [00:00<?, ?it/s]

Done tokenizing data!


Progress: 100.0% Trials:    2 Best score:  0.684714 ETA:   0h 0m 0s
Training again with best arguments
Read 418M words
Number of words:  29862
Number of labels: 9509
Progress: 100.0% words/sec/thread: 3421756 lr:  0.000000 avg.loss:  0.950186 ETA:   0h 0m 0s
Progress: 100.0% words/sec/thread: 2778666 lr:  0.000000 avg.loss:  0.299240 ETA:   0h 0m 0s


In [9]:
trained_model.test('tokenized-valid.txt')

(189869, 0.6915346897071138, 0.6915346897071138)

In [10]:
from tokenizers import Tokenizer
loaded_tokenizer = Tokenizer.from_file(cfg.tokenizer)
loaded_model = fasttext.load_model(cfg.model_file)

def predict(text, tokenizer, model):
    tokenized_text = ' '.join(tokenizer.encode(text).tokens)
    return model.predict(tokenized_text)

text = 'Pujyadeep  Puja  Agarbatti  (A  Box  Containing  12  Packets)  (Rooh  A  Gulab)  ' \
       'One  box  contains  twelve  packets  of  Incense  Sticks.  Very  Good  Fragrance. ' \
       ' Alight  this  incense  to  offer  your  deepest  devotion  and  open  your  heart  ' \
       'to  the  divine.  Made  In  India'

print(predict(text, loaded_tokenizer, loaded_model))

(('__label__525',), array([0.43831539]))


