In [1]:
import csv
import fasttext
from tqdm.auto import tqdm
import pandas as pd
import ml_collections
from sklearn.model_selection import train_test_split
from tokenizers import ByteLevelBPETokenizer
tqdm.pandas()

In [2]:
def model_config():
    """ Builds the read-only configuration object used by the rest of the notebook. """
    return ml_collections.FrozenConfigDict({
        # Input dataset
        "data_path": "dataset/train.csv",

        # Artefacts written while running the notebook
        "corpus_file": "word-corpus.txt",
        "model_file": "model.fasttext",

        # Training / split settings.
        # NOTE(review): `test_size` and `validation_size` are not referenced by the
        # cells shown here -- confirm whether they are meant to drive the splits.
        "wordNgrams": 2,
        "test_size": 0.1,
        "validation_size": 0.2,
        "loss": "hs",
        "epochs": 30,
    })


cfg = model_config()

In [3]:
def preprocess(csv_path:str):
    """ Loads the csv and merges its four text columns into a single `TEXT` column.

    Rows with a missing value in any column are discarded. `TEXT` is the
    space-joined TITLE, DESCRIPTION, BULLET_POINTS and BRAND (in that order);
    those four source columns are removed from the returned dataframe.
    """
    text_columns = ['TITLE', 'DESCRIPTION', 'BULLET_POINTS', 'BRAND']

    # Quoting is switched off and a backslash is read as the escape character.
    complete_rows = pd.read_csv(
        csv_path,
        quoting=csv.QUOTE_NONE,
        escapechar="\\",
    ).dropna()

    # Concatenate left to right with a single space between the columns.
    merged_text = complete_rows[text_columns[0]]
    for column in text_columns[1:]:
        merged_text = merged_text + " " + complete_rows[column]

    return complete_rows.drop(columns=text_columns).assign(TEXT=merged_text)

In [4]:
def create_splits(dataset_file:str, test_size:float=0.1, validation_size:float=0.1, random_state=None):
    """
    Creates the train and validation splits from the dataset.
    Converts each split to a format that can be consumed by fasttext, by prepending __label__ to the label.
    each row in the dataframe would now look like
    ```
    |__label__234 |Cat toys, small, medium, large for all cats...|
    ```
    `|` denotes column separator

    Args:
        dataset_file: path of the csv handed to `preprocess`.
        test_size: fraction of the rows set aside as a test portion. That portion
            is discarded here; only train and validation are returned.
        validation_size: fraction of the remaining rows used for validation.
        random_state: forwarded to both `train_test_split` calls; leave as None
            for a different shuffle on every call, or pass an int for a repeatable split.

    Returns:
        (train_df, valid_df) with the first column turned into `__label__<value>` strings.
    """
    print("Beginning Preprocessing ...")
    dataframe = preprocess(dataset_file)
    print("Preprocessing Done!")

    # The test portion is still carved off first so the train/validation
    # proportions are the same as when the fractions were hard-coded.
    training_df, _ = train_test_split(dataframe, test_size=test_size, random_state=random_state)
    train_df, valid_df = train_test_split(training_df, test_size=validation_size, random_state=random_state)

    del training_df

    def with_label_prefix(split_df):
        """ Returns a copy of `split_df` whose first column carries the __label__ prefix. """
        # Work on a copy and replace the whole column by name: writing strings
        # through `.iloc` into a frame derived from another frame triggers
        # chained-assignment / dtype warnings in pandas.
        labelled = split_df.copy()
        label_column = labelled.columns[0]
        labelled[label_column] = '__label__' + labelled[label_column].astype(str)
        return labelled

    return with_label_prefix(train_df), with_label_prefix(valid_df)

In [9]:
def tokenize_data():
    """
    Builds the splits, trains a byte-level BPE tokenizer on their text and writes
    the tokenized train/validation files.

    Files written:
      - `cfg.corpus_file`: the TEXT column of the train and validation splits
      - `tokenizer.json`: the trained tokenizer
      - `tokenized-train.txt` / `tokenized-valid.txt`: the space-joined tokens of
        each row followed by its BROWSE_NODE_ID value

    Returns nothing.
    """
    train_df, valid_df = create_splits(cfg.data_path)

    def write_space_separated(frame, path):
        """ Writes `frame` to `path` as unquoted, header-less, space-separated lines. """
        # With quoting disabled, separators inside a field have to be escaped;
        # using a space as the escape character means every embedded space is
        # written as two spaces rather than raising an error.
        frame.to_csv(
            path,
            index=False,
            sep=' ',
            header=False,
            quoting=csv.QUOTE_NONE,
            quotechar="",
            escapechar=" ")

    print("Creating corpus for tokenization ....")
    write_space_separated(pd.concat([train_df[['TEXT']], valid_df[['TEXT']]]), cfg.corpus_file)
    print("Done creating word corpus !")

    custom_tokenizer = ByteLevelBPETokenizer(lowercase=True)
    print("Training Tokenizer ...")
    custom_tokenizer.train(cfg.corpus_file)
    custom_tokenizer.save('tokenizer.json')
    print("Done creating tokenizer. Tokenizer saved as `tokenizer.json`")

    def tokenize_split(split_df, out_path):
        """ Tokenizes the TEXT of one split and writes tokens + label to `out_path`. """
        tokenized_text = split_df['TEXT'].progress_apply(
            lambda text: " ".join(custom_tokenizer.encode(text).tokens)
        )
        # Assemble a fresh frame instead of adding a column to the split itself,
        # which would be a chained assignment on a frame derived from another frame.
        output_df = pd.DataFrame({
            'TOKENIZED_TEXT': tokenized_text,
            'BROWSE_NODE_ID': split_df['BROWSE_NODE_ID'],
        })
        write_space_separated(output_df, out_path)

    print("Tokenizing data ...")
    tokenize_split(train_df, 'tokenized-train.txt')
    tokenize_split(valid_df, 'tokenized-valid.txt')
    print("Done tokenizing data!")

In [10]:
# Run the whole data pipeline: build the splits, train the tokenizer and
# write `tokenized-train.txt` / `tokenized-valid.txt` for the training cells below.
tokenize_data()




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1709574 [00:00<?, ?it/s]

  0%|          | 0/189953 [00:00<?, ?it/s]

ValueError: train.txt cannot be opened for training!

In [11]:
# Baseline classifier trained on the tokenized training file with the settings from `cfg`.
model = fasttext.train_supervised(
    input='tokenized-train.txt',
    wordNgrams=cfg.wordNgrams,
    loss=cfg.loss,
    epoch=cfg.epochs,
)
# Persist the trained model to the path configured in `cfg.model_file`.
model.save_model(cfg.model_file)

Read 418M words
Number of words:  29856
Number of labels: 9521
Progress: 100.0% words/sec/thread: 3445313 lr:  0.000000 avg.loss:  0.937019 ETA:   0h 0m 0s


In [12]:
# Evaluate the baseline model on the tokenized validation file.
# NOTE(review): per the fastText docs the returned tuple should be
# (number of samples, precision@1, recall@1) -- confirm against the installed version.
model.test('tokenized-valid.txt')

(189880, 0.7365388666526227, 0.7365388666526227)

In [14]:
# Let fastText search hyperparameters against the validation file, then retrain
# with the best setting it found.
# NOTE(review): `autotuneDuration` is presumably in seconds (1800 s = 30 min) -- confirm in the fastText autotune docs.
tuned_model = fasttext.train_supervised(
    input='tokenized-train.txt',
    loss=cfg.loss,
    autotuneValidationFile='tokenized-valid.txt',
    autotuneDuration=1800,
)

Progress: 100.0% Trials:    6 Best score:  0.659617 ETA:   0h 0m 0s
Training again with best arguments
Read 418M words
Number of words:  29856
Number of labels: 9521
Progress: 100.0% words/sec/thread: 2907664 lr:  0.000000 avg.loss:  1.624883 ETA:   0h 0m 0s
