In [1]:
import tensorflow as tf
import numpy as np
import scipy as sp
import sklearn as skl
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import math
import os
from pathlib import Path
from tensorflow import keras
from keras import layers
from keras import models
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from datetime import datetime, date, timedelta

# This script performs product identification. It builds/trains an MHA + FCNN model and 
# performs clustering of the MHA output. At the beginning the code loads data from inidata.csv, 
# creates 2 training datasets and a dataset of labels. 
# The first training set contains features such as:
# client type, client, outbound location, supplier, year, month (1..12), day (1..31), weekday (1..7), 
# quantity (shipment's volume), stop (time threshold)
# The second contains descriptions. Description symbols are recoded into tokens. 
# The tokenizer created in the "character embeddings v1.0" script is used for encoding.
# The Embedding layer is initialized with table "embeddings.html" also prepared in "character embeddings v1.0."

# The model shown in Figure 4 (gray background) of the report is built and trained.
# Embedded descriptions are submitted to the MHA. MHA output is combined with the first training dataset and fed 
# to deep FCNN. Labels are DoS CDF percentiles.
# Training is performed over the entire history without a test dataset.

# After training, the MHA output is fed to clustering. The clustering results are compared with 
# the correct answer. Errors are stored in the file: check_short.html

# Every relative path used below (data, tokenizer, embeddings, reports, log)
# is resolved against this directory.
WORKING_DIRECTORY = 'C:/Pilot/test/'
os.chdir(WORKING_DIRECTORY)

fpLog = Path('log.txt')

# Mode 'w' truncates the log, so each run starts with a fresh file;
# later stages reopen it in append mode.
with fpLog.open('w') as flog:
    print('model p1 v1.1 starts at : ', datetime.now(), file=flog)

DESC_LENGTH = 36    # DESC_LENGTH was calculated as maximum length of description
EMBEDDING_LENGTH = 16

fpini = Path('inidata.csv')

# Column dtypes for inidata.csv, grouped by type instead of one long literal.
_float_columns = [
    'tid', 'cid', 'oid', 'sid',
    'yy', 'mm', 'dd', 'wd',
    'nsl', 'drn', 'qnt', 'stop', 'did',
]
_float_columns += ['T{:02d}'.format(i) for i in range(1, 21)]    # T01 .. T20
_int32_columns = ['is_i', 'tid_i', 'cid_i', 'oid_i', 'sid_i', 'did_i']

_column_dtypes = dict.fromkeys(_float_columns, 'float')
_column_dtypes.update(dict.fromkeys(_int32_columns, np.int32))
_column_dtypes['txt'] = 'str'
_column_dtypes['phash'] = np.int64

# Missing values are replaced with 0 after parsing.
dfdata = pd.read_csv(fpini, dtype=_column_dtypes).fillna(0)

# Shuffle all rows (unseeded, so the order differs from run to run) and
# renumber the index from 0.
dfdata = dfdata.sample(frac=1).reset_index(drop=True)

#fpwid = Path ('ref_words.html')  
#dfw = pd.read_html (fpwid, encoding = 'UTF-8')[0]
#dfw.set_index ('wid', inplace = True)
#dfw.loc[0] = ['']

#dfdata['txt'] = [
#    ' '.join ((dfw.word[w0],dfw.word[w1],dfw.word[w2],dfw.word[w3],dfw.word[w4],dfw.word[w5],dfw.word[w6])).ljust (DESC_LENGTH, ' ')
#    for w0, w1, w2, w3, w4, w5, w6 in zip (dfdata.w0,dfdata.w1,dfdata.w2,dfdata.w3,dfdata.w4,dfdata.w5,dfdata.w6)
#    ]

#dfdata.drop (['w0', 'w1', 'w2', 'w3', 'w4', 'w5', 'w6'], axis = 1, inplace = True)

# Shipment feature columns fed to the dense part of the model.
_lX_ = [
    'tid', 'cid', 'oid', 'sid',
    'yy', 'mm', 'dd', 'wd',
    'qnt', 'stop',
]
# Label columns T01 .. T20.
_lY_ = ['T{:02d}'.format(i) for i in range(1, 21)]

#dfX = dfdata.loc(axis = 1)[_lX_]
#dfX_stats = dfX.describe().transpose()
#dfdata.loc(axis = 1)[_lX_] = (dfX - dfX_stats['mean'])/dfX_stats['std']

#dstrain = dfdata

# Load the previously fitted tokenizer used to encode description text.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a tokenizer.pickle that comes from a trusted source.
fpT = Path('tokenizer.pickle')
with fpT.open('rb') as handle:
    tokenizer = pickle.load(handle)

# Normalise each description: collapse whitespace runs to single spaces,
# strip the ends, then right-pad with spaces to DESC_LENGTH characters.
ltxt = [
    " ".join(description.split()).ljust(DESC_LENGTH)
    for description in dfdata['txt'].tolist()
]

# Token ids per description; building a rectangular int32 array requires
# every encoded description to have the same length.
naTraintxt = np.array(tokenizer.texts_to_sequences(ltxt), dtype=np.int32)
naYtrain = np.array(dfdata[_lY_])    # labels
naXtrain = np.array(dfdata[_lX_])    # shipment features

# ---------------------------------------------------------------------------
# Model definition: frozen character embedding -> multi-head self-attention
# -> sum over the sequence axis -> concatenation with the shipment features
# -> 4 x (Dense + BatchNormalization) -> 20-unit output.
# ---------------------------------------------------------------------------

# Two named inputs; the names are the keys of the dicts passed to fit/predict.
trInputShipment = layers.Input(shape=(naXtrain.shape[1],), name="shipment")
trInputDesc = layers.Input(shape=(DESC_LENGTH,), name="desc")

# Loading embedding matrix: rows are ordered by 'eid', then the non-numeric /
# id columns are dropped so only the embedding components remain.
fpE = Path('embeddings.html')
dfe = pd.read_html(fpE, encoding='UTF-8')[0].sort_values(['eid'], ignore_index=True)
dfe.drop(['char', 'tkid', 'eid'], axis=1, inplace=True)

naEmbeddingMatrix = np.array(dfe)

# Embed each character in the description into a 16-dimensional vector.
# The table is pre-trained and kept fixed (trainable = False); token id 0 is
# treated as padding (mask_zero = True).
trEmbedding = layers.Embedding(
    len(tokenizer.word_index) + 1,
    EMBEDDING_LENGTH,
    embeddings_initializer=keras.initializers.Constant(naEmbeddingMatrix),
    trainable=False,
    mask_zero=True,
)(trInputDesc)

# Self-attention: the embedded description serves as both query and value.
ktrMHA = layers.MultiHeadAttention(num_heads=4, key_dim=4)(trEmbedding, trEmbedding)

# Collapse the attention output to one vector per description by summing over
# axis -2. The layer is named so it can be looked up later with get_layer().
trMHAreduced = layers.Lambda(
    lambda xin: tf.keras.backend.sum(xin, axis=-2),
    name='LAYER_LAMBDA',
)(ktrMHA)

ktr = layers.concatenate([trInputShipment, trMHAreduced], name='LAYER_CONCATENATE')

# Fully connected stack. `units` must be an integer: the previous string
# values ('256', '20') are only tolerated by Keras versions that coerce them
# and are rejected by versions that validate the argument type.
for _ in range(4):
    ktr = layers.Dense(units=256, activation='elu')(ktr)
    ktr = layers.BatchNormalization()(ktr)
lrout = layers.Dense(units=20, activation='relu', name='output')(ktr)

mdSimple = keras.Model(inputs=[trInputDesc, trInputShipment], outputs=lrout)

#mdSimple.summary()

# Training configuration: Adam with a fixed learning rate, MSE loss, and RMSE
# reported as an additional metric.
iBatchSize = 64

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# `metrics` is passed as a list: a bare metric object is accepted by some
# Keras versions only, while a list is accepted by all of them.
mdSimple.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

# Train on the full dataset (no validation split). Inputs are matched to the
# model's Input layers by name.
# NOTE(review): the run output saved with this notebook shows 40 epochs, not
# 100 - confirm which value is intended.
history = mdSimple.fit(
    {"shipment": naXtrain, "desc": naTraintxt},
    naYtrain,
    epochs=100,
    batch_size=iBatchSize,
)

#naTxt = np.array (tokenizer.texts_to_sequences (dfdata.txt), dtype = np.int32) 

# Sub-model that shares the trained layers but stops at the reduced attention
# output (layer 'LAYER_LAMBDA'); its predictions are the vectors to cluster.
mdSub = models.Model(mdSimple.inputs, mdSimple.get_layer('LAYER_LAMBDA').output)
naAttentionTxt = mdSub.predict(
    {"shipment": dfdata.loc[:, _lX_].to_numpy(), "desc": naTraintxt}
)

# n_init is set explicitly: its default changed between scikit-learn releases
# (10 -> 'auto'), and relying on it raises a FutureWarning on the versions in
# between. 10 matches the old default, keeping results comparable.
kmeans = KMeans(n_clusters=800, n_init=10).fit(naAttentionTxt)
with open(fpLog, 'a') as flog:
    print('kmeans.n_iter_: ', kmeans.n_iter_, file=flog)

# Assign every row of dfdata to its cluster.
naCluster_id = kmeans.predict(naAttentionTxt)
dfdata['cluster_id'] = naCluster_id

# Number of clusters that actually received at least one row.
iUniq = dfdata['cluster_id'].nunique()
with open(fpLog, 'a') as flog:
    print('unique values in cluster_id: ', iUniq, file=flog)

# Per cluster: the array of distinct descriptions ('unique') and how many
# there are ('cn').
df = dfdata.groupby('cluster_id')['txt'].agg(['unique'])
df['cn'] = [len(descriptions) for descriptions in df['unique']]

fplong = Path('check_long.html')
fpshort = Path('check_short.html')
fpshortad = Path('check_short_add.html')

# Full report: every cluster with its description count and descriptions.
df[['cn', 'unique']].to_html(fplong, index=True)

# Error report: clusters whose number of distinct descriptions is not 3.
# NOTE(review): presumably 3 is the expected number of description variants
# per product - confirm against the data-generation step.
dferr = df[df['cn'] != 3]
dferr.to_html(fpshort, index=True)

# Turn the cluster_id index into a column so it can be used as a join key.
# The rename only has an effect if the index was unnamed (column 'index').
dferr = dferr.reset_index()
dferr = dferr.rename(columns={'index': 'cluster_id'})

# Attach the shipment attributes of every row belonging to a faulty cluster
# and count rows per attribute combination.
_lErr_ = ['tid', 'cid', 'oid', 'sid', 'cluster_id', 'yy', 'mm', 'qnt']
dferr = dferr.merge(dfdata[_lErr_], how='inner', on='cluster_id')
dferr = dferr.groupby(_lErr_)[['tid']].count()
dferr.to_html(fpshortad, index=True)

#dfdata.drop (['txt'], axis = 1, inplace = True)

#fpcsv = Path ('intermediate.csv')  
#dfdata.to_csv (fpcsv, index = False)


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


  kmeans = KMeans (n_clusters = 800).fit (naAttentionTxt)
