In [None]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

### TODO:
- Use the `FLD` data-field tag (`xfld`) to separate the text of each field
- Replace long digit sequences with the `xdig` token

In [None]:
from fastai.text import *
import html
import pandas as pd
import json
from requests import Session
MATIFY_API_EP = 'http://matify.net:8000/'
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag
NUM = 'xdig'  # Number

In [None]:
# Directory layout: every artefact of this notebook lives under data/matify/.
PATH=Path('data/matify/')
# parents=True so the notebook also works when the top-level data/ directory
# does not exist yet (plain mkdir would raise FileNotFoundError).
PATH.mkdir(parents=True, exist_ok=True)
PROD_FILE = PATH/'text'/'matify_product.csv'  # input file, read with pd.read_csv
CLAS_PATH = PATH/'classifier'
CLAS_PATH.mkdir(exist_ok=True)
LM_PATH=PATH/'language_model'
LM_PATH.mkdir(exist_ok=True)
W2V_PATH=PATH/'w2v'
W2V_PATH.mkdir(exist_ok=True)
W2V_FILE=W2V_PATH/'fasttext.no.pickle'  # pickled word-vector model loaded via Word2VecModel.from_pickle

In [None]:
from niklib.w2v_model import Word2VecModel
w2v = Word2VecModel.from_pickle(W2V_FILE)

In [None]:
def get_SubCategories ():
    """Yield ``(id, name)`` for every sub-category returned by the Matify API.

    Sends a GET request to ``MATIFY_API_EP + 'listCategories/'`` asking for JSON,
    parses the response body, and walks the ``'sub_categories'`` list of each
    returned category.

    Yields:
        Tuples ``(subCategory['id'], subCategory['name'])``.

    Raises:
        AssertionError: if the HTTP status code is not 200.
    """
    # Use the session as a context manager so its connections are released as
    # soon as the response has been read, instead of being left open.
    with Session() as session:
        categoryResponse = session.get(MATIFY_API_EP + 'listCategories/',
                                       headers={'Accept':'application/json'})
        status_code = int(categoryResponse.status_code)
        response_text = categoryResponse.text
    # Explicit raise (same exception type and message as the former `assert`)
    # so the check is not stripped when Python runs with -O.
    if status_code != 200:
        raise AssertionError("Error when requesting all categories. Response text: " + response_text)
    categories = json.loads(response_text)
    for category in categories:
        for subCategory in category['sub_categories']:
            yield (subCategory['id'], subCategory['name'])

# Sub-category id -> name, built from the (id, name) pairs yielded by get_SubCategories().
id2subcat = dict(get_SubCategories())
id2subcat[142] = 'NotFood'
# id2subcat[-1] = 'NoLabel'

In [None]:
df_full = pd.read_csv(PROD_FILE)

In [None]:
# Keep the three columns used downstream, in the order labels, name, description.
# The order matters: the CSVs are written without a header and read back by position.
df = df_full[["category_id", "name", "description"]].rename(columns={"category_id": "labels"})
df = df[df["name"].notna()] # remove no text
# df = df[~pd.isna(df["description"])] # remove no text
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selected from a filtered frame: that chained in-place form is not
# guaranteed to update `df` (and never does under pandas Copy-on-Write).
df = df.assign(labels=df["labels"].fillna(-1)) # -1 for no label
df = df[df['labels'].isin(list(id2subcat.keys()) + [-1])] # only keep valid label, including no label
df = df.reset_index(drop=True)

In [None]:
# shuffle
# Seed NumPy's global RNG so the permutation below is repeatable.
np.random.seed(42)
idx = np.random.permutation(len(df))
# Reorder rows by the random permutation (positional indexing).
df = df.iloc[idx]
# Rows that carry a real label; -1 is the "no label" marker set earlier.
df_w_labels = df[df["labels"]!=-1]

In [None]:
# Split label data
# 80/20 split of the labelled rows, stratified on the "labels" column.
# No random_state is passed to train_test_split here.
df_trn, df_val = sklearn.model_selection.train_test_split(df_w_labels, test_size=0.2, stratify=df_w_labels["labels"])
# Written without header or index; these files are read back later with header=None.
df_trn.to_csv(CLAS_PATH/'train.csv', header=False, index=False)
df_val.to_csv(CLAS_PATH/'test.csv', header=False, index=False)

In [None]:
classes = pd.DataFrame(list(id2subcat.items()), columns=['id', 'subcat'])
classes.to_csv(CLAS_PATH/'classes.csv', header=False, index=False)

In [None]:
df_trn, df_val = sklearn.model_selection.train_test_split(df)

In [None]:
len(df_trn), len(df_val)

In [None]:
df_trn.to_csv(LM_PATH/'train.csv', header=False, index=False)
df_val.to_csv(LM_PATH/'test.csv', header=False, index=False)

### Tokenize

In [None]:
chunksize=24000

In [None]:
def split_by_popularity(word, w2v):
    """Look up a token in ``w2v`` or split it into known sub-words.

    ``w2v.word2idx(w)`` is used as a popularity score: a negative value means
    the word is unknown, and among candidate splits the lower total is preferred.

    Args:
        word: the token to look up or split.
        w2v: object exposing ``n_vocabs`` and ``word2idx(word)``.

    Returns:
        A ``(score, tokens)`` tuple:

        * known word -> ``(its index, [word])``
        * digits with at most one ``.`` and one ``,`` -> ``(w2v.n_vocabs, [NUM])``
        * unknown word of length <= 5, or no acceptable split found
          -> ``(w2v.n_vocabs, [word])``
        * otherwise -> ``(sum of the parts' scores, parts)`` with at most 3 parts.
    """
    unknown_pop_score = w2v.n_vocabs
    word_idx = w2v.word2idx(word)
    # `>= 0` rather than `> 0`: index 0 is a valid vocabulary entry. Only a
    # negative index means "unknown", as in the core/prefix checks below.
    if word_idx >= 0:
        return word_idx, [word]

    if (word.replace('.','',1).replace(',','',1).isdigit())  : # unknown numbers are replaced by xdig
        # return unknown_pop_score, [word] # Don't care about digit
        return unknown_pop_score, [f'{NUM}']

    # Short unknown words are kept whole.
    if len(word)<=5:
        return unknown_pop_score, [word]

    best_pop_score = unknown_pop_score
    best_split = None
    best_nsplit = 3 # Max number of parts allowed in a split
    # Try every cut that leaves at least 2 characters on each side, longest prefix first.
    for i_cut in range(len(word)-2, 1, -1):
        prefix, core = word[:i_cut], word[i_cut:]

        # The right-hand part must itself be a known word.
        core_score = w2v.word2idx(core)
        if core_score < 0:
            continue

        # The left-hand part is either known, or recursively split.
        prefix_score = w2v.word2idx(prefix)
        if prefix_score < 0:
            prefix_score, words = split_by_popularity(prefix, w2v)
        else:
            words = [prefix]
        if (prefix_score >= unknown_pop_score): # Prefix could not be resolved to known words
            continue
        pop_score = prefix_score + core_score
        words = words + [core]

        if len(words) > best_nsplit: continue # Don't split more than best_nsplit
        # Prefer fewer parts; among equal part counts prefer the lower score.
        if (len(words) < best_nsplit) or (pop_score < best_pop_score):
            best_split = words
            best_nsplit= len(words)
            best_pop_score = pop_score

    if best_split is None:
        return (unknown_pop_score, [word])
    else:
        return (best_pop_score, best_split)

In [None]:
# Matches any run of two or more spaces.
re1 = re.compile(r'  +')

def fixup(x):
    """Clean one raw text string.

    Applies a fixed sequence of literal substitutions (entity remnants, escaped
    newlines/quotes, ``<br />``, ``<unk>``, `` @.@ `` / `` @-@ `` markers,
    backslashes), then HTML-unescapes the result and collapses runs of
    spaces into a single space.
    """
    # (old, new) pairs; they are applied strictly in this order.
    substitutions = (
        ('#39;', "'"),
        ('amp;', '&'),
        ('#146;', "'"),
        ('nbsp;', ' '),
        ('#36;', '$'),
        ('\\n', "\n"),
        ('quot;', "'"),
        ('<br />', "\n"),
        ('\\"', '"'),
        ('<unk>', 'u_n'),
        (' @.@ ', '.'),
        (' @-@ ', '-'),
        ('\\', ' \\ '),
    )
    for old, new in substitutions:
        x = x.replace(old, new)
    unescaped = html.unescape(x)
    return re1.sub(' ', unescaped)

In [None]:
def get_texts(df, w2v, n_lbls=1):
    """Turn a label+text DataFrame into token lists and label rows.

    The first ``n_lbls`` columns (by position) are labels, cast to int64.
    Columns labelled ``n_lbls`` onward are text fields; each row's text is
    built as a newline, ``BOS``, then every field preceded by ``FLD`` and its
    1-based field number. The text is cleaned with ``fixup``, tokenized with
    ``Tokenizer().proc_all_mp``, and every token is passed through
    ``split_by_popularity`` so it may expand into several tokens.

    Returns:
        ``(tokens, labels)``: a list of token lists (one per row) and a list
        of per-row label arrays.
    """
    label_values = df.iloc[:, range(n_lbls)].values.astype(np.int64)

    # Build the tagged text, one field at a time.
    texts = f'\n{BOS} {FLD}1 ' + df[n_lbls].astype(str)
    for col in range(n_lbls + 1, len(df.columns)):
        field_no = col + 1 - n_lbls
        texts += f' {FLD}{field_no} ' + df[col].astype(str)
    cleaned = texts.apply(fixup).values.astype(str)

    raw_docs = Tokenizer().proc_all_mp(partition_by_cores(cleaned))
    # Note: apply our split_by_popularity algorithm and flatten the pieces per document.
    split_docs = []
    for doc in raw_docs:
        pieces = []
        for tok in doc:
            pieces.extend(split_by_popularity(tok, w2v)[1])
        split_docs.append(pieces)
    return split_docs, list(label_values)

In [None]:
def get_all(df, w2v, n_lbls): # Used when df is chunked
    """Run ``get_texts`` over every chunk of ``df`` and concatenate the results.

    ``df`` is iterated, so it is expected to yield DataFrame chunks. The
    chunk counter is printed as a progress indicator.

    Returns:
        ``(tokens, labels)`` accumulated over all chunks, in chunk order.
    """
    all_toks, all_labels = [], []
    for chunk_no, chunk in enumerate(df):
        print(chunk_no)
        chunk_toks, chunk_labels = get_texts(chunk, w2v, n_lbls)
        all_toks.extend(chunk_toks)
        all_labels.extend(chunk_labels)
    return all_toks, all_labels

In [None]:
df_trn = pd.read_csv(LM_PATH/'train.csv', header=None, chunksize=chunksize)
df_val = pd.read_csv(LM_PATH/'test.csv', header=None, chunksize=chunksize)

In [None]:
# !python -m spacy download en

In [None]:
tok_trn, trn_labels = get_all(df_trn, w2v, 1)
tok_val, val_labels = get_all(df_val, w2v, 1)

In [None]:
tok_trn

In [None]:
(LM_PATH/'tmp').mkdir(exist_ok=True)

In [None]:
np.save(LM_PATH/'tmp'/'tok_trn.npy', tok_trn)
np.save(LM_PATH/'tmp'/'tok_val.npy', tok_val)

In [None]:
freq = Counter(p for o in tok_trn for p in o)
freq.most_common(25)

In [None]:
max_vocab = 150000
min_freq = 1

In [None]:
# Keep at most max_vocab of the most frequent tokens, and only those seen
# strictly more than min_freq times.
itos = [o for o,c in freq.most_common(max_vocab) if c>min_freq]
# After these two inserts index 0 is '_unk_' and index 1 is '_pad_'.
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')

In [None]:
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
len(itos)

In [None]:
trn_lm = np.array([[stoi[o] for o in p] for p in tok_trn])
val_lm = np.array([[stoi[o] for o in p] for p in tok_val])

In [None]:
np.save(LM_PATH/'tmp'/'trn_ids.npy', trn_lm)
np.save(LM_PATH/'tmp'/'val_ids.npy', val_lm)
# Context manager guarantees the pickle is flushed and the handle closed.
with open(LM_PATH/'tmp'/'itos.pkl', 'wb') as itos_file:
    pickle.dump(itos, itos_file)

In [None]:
# The id arrays were built from per-document lists of differing length, so
# they are stored as object arrays; NumPy >= 1.16.3 refuses to load those
# unless allow_pickle=True is given. Only load files this notebook wrote.
trn_lm = np.load(LM_PATH/'tmp'/'trn_ids.npy', allow_pickle=True)
val_lm = np.load(LM_PATH/'tmp'/'val_ids.npy', allow_pickle=True)
with open(LM_PATH/'tmp'/'itos.pkl', 'rb') as itos_file:
    itos = pickle.load(itos_file)
# Unknown tokens map to -1 here (the vocabulary merge below tests `stoi[w] < 0`).
stoi = collections.defaultdict(lambda:-1, {v:k for k,v in enumerate(itos)})

### Merge itos and stoi with Fasttext vocab and embedding


In [None]:
em_sz = 300
# Get itos2 and stoi2 of the W2V dict
itos2 = w2v.idx2word
stoi2 = collections.defaultdict(lambda:-1, {v:k for k,v in enumerate(itos2)})

# 1. Build vocab and emb for the first vocabs
# Tokens that are missing from the W2V vocabulary get the mean embedding row.
row_m = w2v.idx2emb.mean(0)
# Allocate enough rows even when itos is already longer than max_vocab (the
# two special tokens are added on top of up to max_vocab corpus tokens), so
# the fill loop below cannot index out of bounds.
new_w = np.zeros((max(max_vocab, len(itos)), em_sz), dtype=np.float32)
for i, w in enumerate(itos):
    r = stoi2[w]
    new_w[i] = w2v.idx2emb[r] if r>=0 else row_m

# 2. Get the rest from the second vocab
# Relies on stoi returning -1 for words it does not contain.
idx = len(itos)
for i, w in enumerate(itos2):
    # Check the budget before writing so that row `idx` always exists.
    if idx >= max_vocab: break
    if stoi[w] < 0:
        itos.append(w)
        stoi[w] = idx
        new_w[idx] = w2v.idx2emb[i]
        idx += 1

# Drop unused trailing rows so the matrix has exactly len(itos) rows; the
# model is later built with a vocabulary size of len(itos).
new_w = new_w[:len(itos)]

with open(LM_PATH/'tmp'/'itos_full.pkl', 'wb') as itos_file:
    pickle.dump(itos, itos_file)
with open(LM_PATH/'tmp'/'itos_full.pkl', 'rb') as itos_file:
    itos = pickle.load(itos_file)

### Build language model

In [None]:
em_sz, nh, nl = 300,600,1
vs=len(itos)

In [None]:
wd=1e-7  # passed as wds= to learner.fit below
bptt=50 # instead of 70 bptt in fasttext
bs=32
# Optimizer factory: Adam with betas=(0.8, 0.99).
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))
# Each split's per-document id arrays are concatenated into one sequence for the loader.
trn_dl = LanguageModelLoader(np.concatenate(trn_lm), bs, bptt)
val_dl = LanguageModelLoader(np.concatenate(val_lm), bs, bptt)
# vs is the vocabulary size (len(itos)).
md = LanguageModelData(PATH, 1, vs, trn_dl, val_dl, bs=bs, bptt=bptt)

In [None]:
drops = np.array([0.25, 0.1, 0.2, 0.02, 0.15])*0.7


In [None]:
learner= md.get_model(opt_fn, em_sz, nh, nl, 
    dropouti=drops[0], dropout=drops[1], wdrop=drops[2], dropoute=drops[3], dropouth=drops[4])
learner.metrics = [accuracy]
learner.freeze_to(-1)

In [None]:
# Overwrite three entries of the model's state dict with the merged embedding
# matrix new_w (independent copies for the last two), then load the modified
# state dict back into the model.
wgts = learner.model.state_dict()
wgts['0.encoder.weight'] = T(new_w)
wgts['0.encoder_with_dropout.embed.weight'] = T(np.copy(new_w))
wgts['1.decoder.weight'] = T(np.copy(new_w))
learner.model.load_state_dict(wgts)

In [None]:
lr=1e-3
lrs = lr

In [None]:
learner.fit(lrs/2, 2, wds=wd, use_clr=(32,2), cycle_len=1, cycle_mult=2)

In [None]:
learner.save('lm_last_ft')
learner.load('lm_last_ft')
learner.unfreeze()
learner.lr_find(start_lr=lrs/10, end_lr=lrs*10, linear=True)

In [None]:
learner.sched.plot()

In [None]:
learner.fit(0.01/2, 1, wds=wd, use_clr=(20,10), cycle_len=15)

In [None]:
learner.fit(0.01/4, 3, wds=wd, cycle_len=1)

In [None]:
learner.save('lm1')
learner.save_encoder('lm1_enc')

In [None]:
learner.sched.plot_loss()

### Train classifier

In [None]:
# Read the classifier CSVs in chunks, reusing the shared `chunksize` constant
# instead of repeating the literal 24000.
df_trn = pd.read_csv(CLAS_PATH/'train.csv', header=None, chunksize=chunksize)
df_val = pd.read_csv(CLAS_PATH/'test.csv', header=None, chunksize=chunksize)
# Column 0 is the label (n_lbls=1); the remaining columns are text fields.
tok_trn, trn_labels = get_all(df_trn, w2v, 1)
tok_val, val_labels = get_all(df_val, w2v, 1)

In [None]:
(CLAS_PATH/'tmp').mkdir(exist_ok=True)

np.save(CLAS_PATH/'tmp'/'tok_trn.npy', tok_trn)
np.save(CLAS_PATH/'tmp'/'tok_val.npy', tok_val)

np.save(CLAS_PATH/'tmp'/'trn_labels.npy', trn_labels)
np.save(CLAS_PATH/'tmp'/'val_labels.npy', val_labels)
# Sorted unique label values seen in training; a label's position in this list
# is the class index used by the classifier.
id2subcatid = list(np.unique(trn_labels))
# Context manager guarantees the pickle is flushed and the handle closed.
with open(CLAS_PATH/'id2subcatid.pkl', 'wb') as id_file:
    pickle.dump(id2subcatid, id_file)


In [None]:
# The token files hold per-document lists of differing length, i.e. object
# arrays; NumPy >= 1.16.3 needs allow_pickle=True to load them. Only load
# files this notebook wrote.
tok_trn = np.load(CLAS_PATH/'tmp'/'tok_trn.npy', allow_pickle=True)
tok_val = np.load(CLAS_PATH/'tmp'/'tok_val.npy', allow_pickle=True)

In [None]:
# Load the merged vocabulary; the context manager closes the file handle.
with (LM_PATH/'tmp'/'itos_full.pkl').open('rb') as itos_file:
    itos = pickle.load(itos_file)
# Unknown tokens map to index 0.
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
len(itos)

In [None]:
trn_clas = np.array([[stoi[o] for o in p] for p in tok_trn])
val_clas = np.array([[stoi[o] for o in p] for p in tok_val])

In [None]:
trn_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'trn_labels.npy'))
val_labels = np.squeeze(np.load(CLAS_PATH/'tmp'/'val_labels.npy'))
with open(CLAS_PATH/'id2subcatid.pkl', 'rb') as id_file:
    id2subcatid = pickle.load(id_file)
# Invert the list: sub-category id -> contiguous class index.
subcatid2id = {v:k for k, v in enumerate(id2subcatid)}
# Replace raw sub-category ids by their class index. A validation label that
# never occurs in training raises KeyError here.
trn_labels = np.array([ subcatid2id[label] for label in trn_labels ])
val_labels = np.array([ subcatid2id[label] for label in val_labels ])

In [None]:
bptt,em_sz,nh,nl = 50,300,600,1
c = len(id2subcatid)
vs = len(itos)
opt_fn = partial(optim.Adam, betas=(0.8, 0.99))
bs = 48

In [None]:
trn_ds = TextDataset(trn_clas, trn_labels)
val_ds = TextDataset(val_clas, val_labels)
trn_samp = SortishSampler(trn_clas, key=lambda x: len(trn_clas[x]), bs=bs//2)
val_samp = SortSampler(val_clas, key=lambda x: len(val_clas[x]))
trn_dl = DataLoader(trn_ds, bs//2, transpose=True, num_workers=1, pad_idx=1, sampler=trn_samp)
val_dl = DataLoader(val_ds, bs, transpose=True, num_workers=1, pad_idx=1, sampler=val_samp)
md = ModelData(PATH, trn_dl, val_dl)

In [None]:
# Dropout rates, consumed below as: dps[0]=dropouti, dps[1]=wdrop,
# dps[2]=dropoute, dps[3]=dropouth, dps[4]=first entry of `drops`.
dps = np.array([0.4,0.5,0.05,0.3,0.4])# * 0.5
# dps = np.array([0.2]*5)
# c is the number of classes and vs the vocabulary size; pad_token=1 matches
# the pad_idx=1 used by the DataLoaders.
m = get_rnn_classifer(bptt, 20*bptt, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,
          layers=[em_sz*3, 1000, c], drops=[dps[4], 0.1],
          dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])
# Rebinds opt_fn with betas=(0.7, 0.99); this is the factory the learner below
# receives, replacing the (0.8, 0.99) one defined in an earlier cell.
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))

learn = RNN_Learner(md, TextModel(to_gpu(m)), opt_fn=opt_fn)
learn.reg_fn = partial(seq2seq_reg, alpha=2, beta=1)
learn.clip=25.
learn.metrics = [accuracy]

In [None]:
lr=3e-3
lrm = 2.6
# Five learning rates, each lrm times smaller than the next, ending at lr.
lrs = np.array([lr/(lrm**4), lr/(lrm**3), lr/(lrm**2), lr/lrm, lr])
#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-2])
#lrs=np.array([1e-4,1e-4,1e-4,1e-3,1e-3])
# NOTE: this scalar replaces the array built above, so the later lr_find/fit
# calls use a single learning rate of 1e-3.
lrs = 1e-3
wd = 1e-7
# wd = 0
# Loads the encoder saved by learner.save_encoder('lm1_enc') in the language-model section.
learn.load_encoder('lm1_enc')

In [None]:
len(learn.models.get_layer_groups())

In [None]:
learn.freeze_to(-1)
learn.lr_find(lrs/1000)
learn.sched.plot()

In [None]:
learn.fit(lrs, 1, wds=wd, cycle_len=1, use_clr=(8,3))

In [None]:
learn.save('clas_0')
learn.load('clas_0')
learn.freeze_to(-2)

In [None]:
learn.fit(lrs, 10, wds=wd, cycle_len=1, use_clr=(8,3))

In [None]:
learn.save('clas_1')
learn.load('clas_1')
learn.unfreeze()

In [None]:
learn.lr_find(lrs/1000)

In [None]:
learn.sched.plot()

In [None]:
learn.fit(lrs, 2, wds=wd, cycle_len=25, use_clr=(32,10), cycle_mult=1)

In [None]:
learn.sched.plot_lr()

In [None]:
learn.save('clas_final')

### Reload model

In [None]:
# USE_GPU = False
# Rebuild everything needed for inference from the files saved during training.
with (LM_PATH/'tmp'/'itos_full.pkl').open('rb') as itos_file:
    itos = pickle.load(itos_file)
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})
classes = pd.read_csv(CLAS_PATH/'classes.csv', header=None)
with open(CLAS_PATH/'id2subcatid.pkl', 'rb') as id_file:
    id2subcatid = pickle.load(id_file)

# NOTE(review): bptt is 40 here but 50 in the training cell - confirm this is intended.
bptt,em_sz,nh,nl = 40,300,600,1
c = len(id2subcatid)
vs = len(itos)
bs = 48

# Define the dropout rates here too (same values as in the training cell), so
# this cell does not depend on `dps` surviving from the training section and
# can be run on a fresh kernel.
dps = np.array([0.4,0.5,0.05,0.3,0.4])
m = get_rnn_classifer(bptt, 20*bptt, c, vs, emb_sz=em_sz, n_hid=nh, n_layers=nl, pad_token=1,
          layers=[em_sz*3, 1000, c], drops=[dps[4], 0.1],
          dropouti=dps[0], wdrop=dps[1], dropoute=dps[2], dropouth=dps[3])
# Weights written by learn.save('clas_final').
load_model(m, PATH/'models'/'clas_final.h5')
m = to_gpu(m)

In [None]:
df_test = pd.DataFrame(data=[[0, 'Street food grillfakkel Grill perfekt. 200 g. Pr kg 249,50'],
                   [0, 'Hatting pølsebrød'],
                   [0, 'Street food baby back ribs Grill perfekt. 800 g. Pr kg 186,25'],
                   [0, 'Santa maria bbq sauce 3 varianter. 420-470 g. Pr kg fra 63,62'],
                   [0, 'Grill perfekt kyllingfilet Tex. BBQ. 400 g. Pr kg 174,75'],
                   [0, 'Street food kyllingspyd Grill perfekt. Thai eller BBQ. 180 g. Pr kg 221,67'],
                   [0, 'Grill perfekt flintstek Krydret. Pr kg'],
                   [0, 'Coop gresk landbrød Butikkstekt. 560 g. Pr kg 44,46'],
                   [0, 'Coop fiber & frøbrød 650 g. Pr kg 30,62']]) # always put labels first and data field later

In [None]:
tok_test, test_labels = get_texts(df_test, w2v, 1)
test_clas = np.array([[stoi[o] for o in p] for p in tok_test])

In [None]:
tok_test

In [None]:
test_ds = TextDataset(test_clas, test_labels)
test_dl = DataLoader(test_ds, batch_size=100, transpose=True, num_workers=1, pad_idx=1)

In [None]:
m.reset()

In [None]:
test_clas

In [None]:
m.eval()
# Collect the predictions of every batch; previously `preds` was overwritten
# on each iteration, so only the last batch survived when the test set
# spanned more than one batch.
batch_preds = []
for x, y in test_dl:
    output = to_np(m(V(x)))[0] # Only get the final prediction, no hidden states
    batch_preds.append(np.argmax(output, axis=1))
preds = np.concatenate(batch_preds) if batch_preds else np.array([], dtype=np.int64)

In [None]:
# Map predicted class indices back to sub-category ids. `preds` is rebound in
# place, so this cell must be run exactly once after the prediction loop.
preds = [id2subcatid[id] for id in preds]

In [None]:
# Sub-category id -> name, from the first two columns of `classes`.
subcatid2name = dict(zip(classes.iloc[:, 0], classes.iloc[:, 1]))

In [None]:
preds_subcat = [subcatid2name[id] for id in preds]

In [None]:
preds_subcat

In [None]:
preds