In [2]:
import csv
import fasttext
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('dataset/train.csv', escapechar="\\", quoting=3)
df.head()

Unnamed: 0,TITLE,DESCRIPTION,BULLET_POINTS,BRAND,BROWSE_NODE_ID
0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch","Pete the Cat is the coolest, most popular cat ...","[Pete the Cat Bedtime Blues plush doll,Based o...",MerryMakers,0
1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ...",The New Yorker Handsome Cello Wrapped Hard Mag...,[Cat In A Tea Cup by New Yorker cover artist G...,The New Yorker,1
2,The Ultimate Self-Sufficiency Handbook: A Comp...,,Skyhorse Publishing,imusti,2
3,Amway Nutrilite Kids Chewable Iron Tablets (100),,"[Nutrilite Kids,Chewable Iron Tablets,Quantity...",Amway,3
4,Teacher Planner Company A4 6 Lesson Academic T...,,,,4


In [3]:
df = df.dropna()
df['TEXT'] = df['TITLE']+" "+df['DESCRIPTION']+" "+df["BULLET_POINTS"]+" "+df["BRAND"]
df = df.drop(['TITLE','DESCRIPTION','BULLET_POINTS','BRAND'],axis=1)
df.head()

Unnamed: 0,BROWSE_NODE_ID,TEXT
0,0,"Pete The Cat Bedtime Blues Doll, 14.5 Inch Pet..."
1,1,"The New Yorker NYHM014 Refrigerator Magnet, 2 ..."
5,5,Men'S Full Sleeve Raglan T-Shirts Denim T-Shir...
6,6,Glance Women's Wallet (Black) (LW-21) This Bla...
7,7,Wild Animals Hungry Brain Educational Flash Ca...


In [4]:
training_df, test_df = train_test_split(df, test_size=0.1)
train_df, valid_df = train_test_split(training_df, test_size=0.1)

train_df.iloc[:, 1] = train_df.iloc[:, 1].apply(lambda x: '__label__' + str(x))
valid_df.iloc[:, 1] = valid_df.iloc[:, 1].apply(lambda x: '__label__' + str(x))

In [5]:
train_df[['BROWSE_NODE_ID','TEXT']].to_csv('train.txt',
                                            index=False,
                                            sep=' ',
                                            header=None,
                                            quoting=csv.QUOTE_NONE,
                                            quotechar="",
                                            escapechar=" ")

valid_df[['BROWSE_NODE_ID','TEXT']].to_csv('valid.txt',
                                            index=False,
                                            sep=' ',
                                            header=None,
                                            quoting=csv.QUOTE_NONE,
                                            quotechar="",
                                            escapechar=" ")

In [6]:
FASTTEXT_LABEL = '__label__'

def create_text_file(input_path: str, output_path: str, encoding:str ='utf-8'):
    with open(input_path, encoding=encoding) as f_in, open(output_path, 'w', encoding=encoding) as f_out:
        for line in f_in:
            try:
                tokens = []
                for token in line.split(" "):
                    if FASTTEXT_LABEL not in token:
                        tokens.append(token)
                text = " ".join(tokens)
            except ValueError as e:
                continue
            f_out.write(text)

create_text_file('train.txt','text-only-train.txt')
create_text_file('valid.txt','text-only-valid.txt')

In [7]:
!cat text-only-train.txt text-only-valid.txt > raw-text.txt

In [None]:
from tokenizers import ByteLevelBPETokenizer
tokenizer = ByteLevelBPETokenizer(lowercase=True)
tokenizer.train(
    'raw-text.txt',
    vocab_size=10000,
)

In [9]:
from tokenizers.implementations import BaseTokenizer


def tokenize_text(tokenizer: BaseTokenizer, text: str) -> str:
    """
    Given the raw text, tokenize it using the trained tokenizer and
    outputs the tokenized text.
    """
    return ' '.join(tokenizer.encode(text).tokens)

In [10]:
def create_tokenized_file(input_path: str, output_path: str,
                          tokenizer: BaseTokenizer, encoding: str='utf-8'):
    with open(input_path, encoding=encoding) as f_in, open(output_path, 'w', encoding=encoding) as f_out:

        for line in f_in:
            try:
                # the labels remains untouched during the preprocessing step as its
                # already in a format that fasttext can consume
                tokens = []
                labels = []
                for token in line.split(' '):
                    if FASTTEXT_LABEL in token:
                        labels.append(token)
                    else:
                        tokens.append(token)

                text = ' '.join(tokens)
                label = ' '.join(labels)
            except ValueError as e:
                continue

            tokenized_text = tokenize_text(tokenizer, text)
            new_line = label + ' ' + tokenized_text
            f_out.write(new_line)
            f_out.write('\n')

In [19]:
create_tokenized_file('train.txt', 'tokenized-text-train.txt', tokenizer)
create_tokenized_file('valid.txt', 'tokenized-text-valid.txt', tokenizer)

KeyboardInterrupt: 

In [3]:
model = fasttext.train_supervised('tokenized-text-train.txt', wordNgrams = 2,loss='hs', epoch=25)

Read 120M words
Number of words:  9915
Number of labels: 62694
Progress: 100.0% words/sec/thread: 2217244 lr:  0.000000 avg.loss:  7.254069 ETA:   0h 0m 0s avg.loss:  9.242062 ETA:   0h 2m20s8.973626 ETA:   0h 2m 9s 17.9% words/sec/thread: 2221267 lr:  0.082125 avg.loss:  8.829647 ETA:   0h 2m 4s 27.9% words/sec/thread: 2220682 lr:  0.072093 avg.loss:  8.516129 ETA:   0h 1m48s 28.0% words/sec/thread: 2220738 lr:  0.071953 avg.loss:  8.513479 ETA:   0h 1m48s 50.9% words/sec/thread: 2215544 lr:  0.049096 avg.loss:  7.971142 ETA:   0h 1m14s


In [10]:
model.save_model('fasttext-custom-tokenizer-amazon.bin')