In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,GlobalAveragePooling1D,Embedding

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import precision_score, recall_score, f1_score,accuracy_score
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt
import pickle



import tensorflow as tf
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("GPU is", "available" if tf.test.is_gpu_available() else "NOT AVAILABLE")


Version:  1.15.0
Eager mode:  False
GPU is available


In [3]:
from datetime import datetime
from nltk.stem import PorterStemmer
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import strip_punctuation
from gensim.parsing.preprocessing import strip_multiple_whitespaces
from gensim.parsing.preprocessing import strip_non_alphanum
from sklearn.model_selection import train_test_split
import pandas as pd
import nltk
import spacy
import re
from unidecode import unidecode
import multiprocessing as mp
from math import log
# basic
import os
import warnings
import json
warnings.filterwarnings("ignore")

def strip_custom(token):
    token = token.replace('&Reg;', " ")
    token = token.replace("&lt;", "<")
    token = token.replace("&times;", "")
    token = token.replace("&gt;", ">")
    token = token.replace("&quot;", "")
    token = token.replace('&nbsp', " ")
    token = token.replace('&copy;', " ")
    token = token.replace('&reg', " ")
    token = token.replace('%20', " ")
    # this has to be last:
    token = token.replace("&amp;", "&")
    token = token.replace("â\x80¢", " ")
    token = token.replace("Â®", " ")
    token = token.replace("Ã©", " ")
    token = token.replace("®", " ")
    token = token.replace("©", " ")
    token = token.replace("™", " ")
    token = token.replace("•", "")
    token = token.replace("width:99pt", "")
    token = token.replace('class="xl66">', '')
    #token = re.sub(r"\'", '', token)
    token = token.replace('&#160;', ' ')
    token = token.replace("�", "")
    return token


def string_processor(token):
#     str = str(token)
    str = unidecode(token)
    str = strip_custom(str)
    str = remove_stopwords(str)
    str = strip_punctuation(str)
    str = strip_non_alphanum(str)
#     tokens = sp(str)
#     tokens = [token.lemma_ for token in tokens]
#     tokens = [porter_stemmer.stem(token) for token in tokens]
#     str = " ".join(tokens)
    str = strip_multiple_whitespaces(str)
    str = str.strip(' ')
    return str


In [12]:
df = pd.read_csv('../beauty.csv')
feature_column = "name"
if feature_column == 'name':
    df = df[['bucket_name','product_name']]
    
if feature_column == 'description':
    df = df[['bucket_name','description']]

df.columns = ['bucket_name', 'feature'] 
print("before drop: ",len(df))
df = df.dropna()
print("after drop: ",len(df))
# to lowercase
df.bucket_name = df.bucket_name.apply(lambda x : x.lower())
df.feature = df.feature.apply(lambda x : string_processor(x))
df.head()


before drop:  87877
after drop:  87854


Unnamed: 0,bucket_name,feature
0,"vitamins, minerals, & dietary supplements",Neuro 1 Orange Cream 1 37 Pound Powder
1,"vitamins, minerals, & dietary supplements",Lean 1 Vanilla Raspberry 1 7 Pound Powder
2,"vitamins, minerals, & dietary supplements",Lean 1 Fat Burning Meal Replacement Chocolate ...
3,"vitamins, minerals, & dietary supplements",Vegan 1 VANILLA 1 5 Pound Powder
4,"vitamins, minerals, & dietary supplements",Vegan 1 BANANA 1 5 Pound Powder


In [4]:
print("categories:",len(df.bucket_name.unique()))
CLASSES = LabelEncoder()
df['label']=CLASSES.fit_transform(df['bucket_name'])

NUM_OF_CLASS=len(CLASSES.classes_)
label = df.label.tolist()
encoder_filename = 'model/beauty_saved_encoder.npy'
np.save(encoder_filename, CLASSES.classes_)

categories: 66


In [5]:
def processing_sequences(x_test,maxlen=16):
    print(len(x_test), 'test sequences')

    print('Average test sequence length: {}'.format(
        np.mean(list(map(len, x_test)))))
    print('Pad sequences (samples x time)')
    x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

    print('x_test shape:', x_test.shape)
    return x_test

def get_token_for_sentence(corpus,name):
#     corpus = df.description.tolist()


    vocabulary_size = 30000
    tokenizer = Tokenizer(num_words= vocabulary_size, 
                          filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', 
                          lower=True, 
                          split=' ')

    tokenizer.fit_on_texts(corpus)
    sequences = tokenizer.texts_to_sequences(corpus)

    top_word = len(tokenizer.index_word) +1
    print("vocab:",top_word)

    # saving
    with open(name+'tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
    return np.array(sequences)

In [6]:
name_token = get_token_for_sentence(df.product_name.tolist(),'name')
des_token = get_token_for_sentence(df.description.tolist(),'des')

X_name = processing_sequences(name_token,maxlen=16)
X_des = processing_sequences(des_token,maxlen=75)
y = np.array(label)

vocab: 17293
vocab: 80389
87210 test sequences
Average test sequence length: 7.140247678018576
Pad sequences (samples x time)
x_test shape: (87210, 16)
87210 test sequences
Average test sequence length: 69.83360853113174
Pad sequences (samples x time)
x_test shape: (87210, 75)


In [7]:
def eval_model(x_train, x_test, y_train, y_test,fold_number,feature):
    current_model_name = "model/"+feature+str(fold_number)+"_best.h5"


    max_features = 30000
    batch_size = 32
    embedding_dims = 100
    epochs = 35
    Patience = 3
    maxlen = x_train.shape[1]


    print('Build model...')
    model = Sequential()

    # we start off with an efficient embedding layer which maps
    # our vocab indices into embedding_dims dimensions
    model.add(Embedding(max_features,
                        embedding_dims,
                        input_length=maxlen))

    # we add a GlobalAveragePooling1D, which will average the embeddings
    # of all words in the document
    model.add(GlobalAveragePooling1D())

    # We project onto a single unit output layer, and squash it with a sigmoid:
    model.add(Dense(NUM_OF_CLASS, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    print(model.summary())
    checkpoint = ModelCheckpoint(current_model_name, monitor='val_loss', verbose=1, save_best_only=True)
    history = model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_test, y_test),
              callbacks=[EarlyStopping(monitor='val_loss', patience=Patience),checkpoint], 
              verbose=1 )
    
    model.load_weights(current_model_name)
#     yhat = np.argmax(model.predict(x_test),axis=1)
#     return f1_score(y_test,yhat,average='macro')
    yhat = model.predict(x_test)
    
    print('\n\n in validation')

    print(classification_report(y_test, np.argmax(yhat ,axis=1), target_names=CLASSES.classes_))
    model.save(feature+'my_model.h5')
    return yhat


In [8]:
X_name_train, X_name_test, X_des_train, X_des_test,y_train, y_test = train_test_split(X_name, X_des, y, test_size=0.01, stratify=y)

In [9]:
name_yhat = eval_model(X_name_train, X_name_test, y_train, y_test,fold_number='local',feature='name')
des_yhat = eval_model(X_des_train, X_des_test, y_train, y_test,fold_number='local',feature='description')

yhat = np.argmax(name_yhat+des_yhat,axis=1)


score = f1_score(y_test,yhat,average='macro')
print(classification_report(y_test, yhat, target_names=CLASSES.classes_,digits=4))

Build model...
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 16, 100)           3000000   
_________________________________________________________________
global_average_pooling1d (Gl (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 66)                6666      
Total params: 3,006,666
Trainable params: 3,006,666
Non-trainable params: 0
_________________________________________________________________
None
Train on 86337 samples, validate on 873 samples
Epoch 1/35
Epoch 00001: val_loss improved from inf to 0.74584, saving model to model/namelocal_best.h5
Epoch 