In [1]:
# coding: utf-8

# mainly forking from notebook
# https://www.kaggle.com/johnfarrell/simple-rnn-with-keras-script

# ADDED
# 5x scaled test set
# category name embedding
# some small changes like lr, decay, batch_size~

import os
import gc
import time
start_time = time.time()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.ERROR)
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

pd.set_option('display.max_colwidth', 500)

In [2]:
train = pd.read_csv('data/train.tsv', sep='\t')
test = pd.read_csv('data/test.tsv', sep='\t')

train['target'] = np.log1p(train['price'])


print(train.shape)
print('5 folds scaling the test_df')
print(test.shape)
test_len = test.shape[0]
def simulate_test(test):
    if test.shape[0] < 800000:
        indices = np.random.choice(test.index.values, 2800000)
        test_ = pd.concat([test, test.iloc[indices]], axis=0)
        return test_.copy()
    else:
        return test
test = simulate_test(test)
print('new shape ', test.shape)
print('[{}] Finished scaling test set...'.format(time.time() - start_time))

(1482535, 9)
5 folds scaling the test_df
(693359, 7)
new shape  (3493359, 7)
[12.549781084060669] Finished scaling test set...


In [3]:
#HANDLE MISSING VALUES
print("Handling missing values...")
def handle_missing(dataset):
    dataset.category_name.fillna(value="missing", inplace=True)
    dataset.brand_name.fillna(value="missing", inplace=True)
    dataset.item_description.fillna(value="missing", inplace=True)
    return (dataset)

train = handle_missing(train)
test = handle_missing(test)
print(train.shape)
print(test.shape)

print('[{}] Finished handling missing data...'.format(time.time() - start_time))

Handling missing values...
(1482535, 9)
(3493359, 7)
[13.26332139968872] Finished handling missing data...


In [4]:
#PROCESS CATEGORICAL DATA
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
print("Handling categorical variables...")
le = LabelEncoder()

le.fit(np.hstack([train.category_name, test.category_name]))
train['category'] = le.transform(train.category_name)
test['category'] = le.transform(test.category_name)

le.fit(np.hstack([train.brand_name, test.brand_name]))
train['brand'] = le.transform(train.brand_name)
test['brand'] = le.transform(test.brand_name)
del le, train['brand_name'], test['brand_name']

print('[{}] Finished PROCESSING CATEGORICAL DATA...'.format(time.time() - start_time))
train.head(3)

Handling categorical variables...
[30.39798402786255] Finished PROCESSING CATEGORICAL DATA...


Unnamed: 0,train_id,name,item_condition_id,category_name,price,shipping,item_description,target,category,brand
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,10.0,1,No description yet,2.397895,829,5265
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & Parts,52.0,0,This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC.,3.970292,86,3889
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,10.0,1,"Adorable top with a hint of lace and a key hole in the back! The pale pink is a 1X, and I also have a 3X available in white!",2.397895,1277,4588


In [5]:
train.head(100)

Unnamed: 0,train_id,name,item_condition_id,category_name,price,shipping,item_description,target,category,brand
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,10.0,1,No description yet,2.397895,829,5265
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & Parts,52.0,0,This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC.,3.970292,86,3889
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,10.0,1,"Adorable top with a hint of lace and a key hole in the back! The pale pink is a 1X, and I also have a 3X available in white!",2.397895,1277,4588
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,35.0,1,New with tags. Leather horses. Retail for [rm] each. Stand about a foot high. They are being sold as a pair. Any questions please ask. Free shipping. Just got out of storage,3.583519,503,5265
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,44.0,0,Complete with certificate of authenticity,3.806662,1204,5265
5,5,Bundled items requested for Ruie,3,Women/Other/Other,59.0,0,"Banana republic bottoms, Candies skirt with matching blazer,Amy Byers suit, Loft bottoms and cami top.",4.094345,1216,5265
6,6,Acacia pacific tides santorini top,3,Women/Swimwear/Two-Piece,64.0,0,"Size small but straps slightly shortened to fit xs, besides that, perfect condition",4.174387,1276,84
7,7,Girls cheer and tumbling bundle of 7,3,Sports & Outdoors/Apparel/Girls,6.0,1,You get three pairs of Sophie cheer shorts size small and medium girls and two sports bra/boy shorts spandex matching sets in small and medium girls. All items total retail for [rm] in store and you can take him today for less than the price of one item at the store!),1.945910,908,4341
8,8,Girls Nike Pro shorts,3,Sports & Outdoors/Apparel/Girls,19.0,0,Girls Size small Plus green. Three shorts total.,2.995732,908,3337
9,9,Porcelain clown doll checker pants VTG,3,Vintage & Collectibles/Collectibles/Doll,8.0,0,"I realized his pants are on backwards after the picture. They were very dirty so I hand washed them. He has a stuffed body and painted porcelain head, hands and feet. Back before clowns were too scary. 9"" tall. No chips or cracks but minor paint loss in a few places. Clown Circus Doll Collectible",2.197225,1045,5265


In [6]:
train = train[train.item_description != 'No description yet'].copy()

In [7]:
#PROCESS TEXT: RAW
print("Text to seq process...")
print("   Fitting tokenizer...")
from keras.preprocessing.text import Tokenizer

raw_text = np.hstack([train.category_name.str.lower(), 
                      train.item_description.str.lower(), 
                      train.name.str.lower()])

tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)
print("   Transforming text to seq...")
train["seq_category_name"] = tok_raw.texts_to_sequences(train.category_name.str.lower())
test["seq_category_name"] = tok_raw.texts_to_sequences(test.category_name.str.lower())
train["seq_item_description"] = tok_raw.texts_to_sequences(train.item_description.str.lower())
test["seq_item_description"] = tok_raw.texts_to_sequences(test.item_description.str.lower())
train["seq_name"] = tok_raw.texts_to_sequences(train.name.str.lower())
test["seq_name"] = tok_raw.texts_to_sequences(test.name.str.lower())
train.head(3)

print('[{}] Finished PROCESSING TEXT DATA...'.format(time.time() - start_time))

Text to seq process...
   Fitting tokenizer...


Using TensorFlow backend.


   Transforming text to seq...
[281.5643377304077] Finished PROCESSING TEXT DATA...


In [8]:
tok_raw.word_index

{'thymes': 68216,
 'evision': 195898,
 'shakillyia': 217715,
 'carrillo': 226950,
 'moomin': 57720,
 'chandon': 115122,
 'fixodent': 228979,
 'slubknit': 176586,
 'lvu3': 254140,
 'scalebound': 196545,
 'sqeezey': 214698,
 'ysing': 181225,
 'borderlands': 14281,
 'fairmoor': 142639,
 'greeklife': 161282,
 'terminators': 91359,
 'merdonl': 157350,
 '743a': 150591,
 'ninetendogs': 203486,
 'kharki': 222096,
 'bejeweled': 17311,
 'atomized': 166494,
 'zullala': 103554,
 'cassandra': 16693,
 'fledgling': 195911,
 '\xa08': 56441,
 'ntt10': 109003,
 'bolted': 89617,
 '30x33': 32706,
 'bellamia': 236825,
 "5''l": 88529,
 '“graphite”': 229013,
 '3349': 159978,
 'urbandecayvintage': 124022,
 'sturdiness': 37981,
 'ct5113': 146573,
 'letti': 194247,
 'iridescence': 24922,
 'strappie': 197443,
 'w1233': 212405,
 'miglin': 86908,
 'color\xa0led': 175480,
 'faberge': 53213,
 'manucure': 74369,
 '•locks': 19594,
 'kar': 48242,
 'britch': 181077,
 'hollywoodtlc': 214433,
 'wach': 54799,
 'legendofthe

In [9]:
train.head(3)

Unnamed: 0,train_id,name,item_condition_id,category_name,price,shipping,item_description,target,category,brand,seq_category_name,seq_item_description,seq_name
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & Parts,52.0,0,This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC.,3.970292,86,3889,"[64, 948, 850, 3361, 1384]","[32, 2755, 11, 8, 50, 17, 1, 253, 65, 21, 1207, 80, 12, 2, 72, 42, 12, 2, 3279, 27, 846, 1, 442, 808, 2, 1319, 27, 7799, 1375, 2, 10964, 56418, 1756, 14, 62, 1095]","[10964, 25487, 16578, 2755]"
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,10.0,1,"Adorable top with a hint of lace and a key hole in the back! The pale pink is a 1X, and I also have a 3X available in white!",2.397895,1277,4588,"[3, 48, 82, 289]","[693, 73, 10, 5, 5441, 12, 243, 1, 5, 993, 1393, 8, 2, 130, 2, 2057, 28, 11, 5, 797, 1, 15, 191, 54, 5, 1243, 218, 8, 76]","[7741, 10738, 289]"
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,35.0,1,New with tags. Leather horses. Retail for [rm] each. Stand about a foot high. They are being sold as a pair. Any questions please ask. Free shipping. Just got out of storage,3.583519,503,5265,"[38, 38, 208, 38, 208, 538]","[6, 10, 78, 224, 6637, 279, 4, 22, 205, 1190, 266, 5, 1689, 164, 131, 27, 920, 379, 69, 5, 274, 109, 210, 67, 171, 13, 19, 105, 551, 80, 12, 597]","[224, 2745, 657]"


In [10]:
#EXTRACT DEVELOPTMENT TEST
from sklearn.model_selection import train_test_split
dtrain, dvalid = train_test_split(train, random_state=666, test_size=0.0338)
print(dtrain.shape)
print(dvalid.shape)

(1352724, 13)
(47322, 13)


In [11]:
#EMBEDDINGS MAX VALUE
#Base on the histograms, we select the next lengths
MAX_NAME_SEQ = 20 #17
MAX_ITEM_DESC_SEQ = 60 #269
MAX_CATEGORY_NAME_SEQ = 20 #8
MAX_TEXT = np.max([np.max(train.seq_name.max())
                   , np.max(test.seq_name.max())
                   , np.max(train.seq_category_name.max())
                   , np.max(test.seq_category_name.max())
                   , np.max(train.seq_item_description.max())
                   , np.max(test.seq_item_description.max())])+2
MAX_CATEGORY = np.max([train.category.max(), test.category.max()])+1
MAX_BRAND = np.max([train.brand.max(), test.brand.max()])+1
MAX_CONDITION = np.max([train.item_condition_id.max(), 
                        test.item_condition_id.max()])+1

print('[{}] Finished EMBEDDINGS MAX VALUE...'.format(time.time() - start_time))

[283.5490651130676] Finished EMBEDDINGS MAX VALUE...


In [12]:
#KERAS DATA DEFINITION
from keras.preprocessing.sequence import pad_sequences

def get_keras_data(dataset):
    X = {
        'name': pad_sequences(dataset.seq_name, maxlen=MAX_NAME_SEQ)
        ,'item_desc': pad_sequences(dataset.seq_item_description
                                    , maxlen=MAX_ITEM_DESC_SEQ)
        ,'brand': np.array(dataset.brand)
        ,'category': np.array(dataset.category)
        ,'category_name': pad_sequences(dataset.seq_category_name
                                        , maxlen=MAX_CATEGORY_NAME_SEQ)
        ,'item_condition': np.array(dataset.item_condition_id)
        ,'num_vars': np.array(dataset[["shipping"]])
    }
    return X

X_train = get_keras_data(dtrain)
X_valid = get_keras_data(dvalid)
X_test = get_keras_data(test)

print('[{}] Finished DATA PREPARARTION...'.format(time.time() - start_time))

[331.73213505744934] Finished DATA PREPARARTION...


In [13]:
X_train

{'brand': array([ 519, 5265, 5265, ..., 3255, 5265, 5265]),
 'category': array([  35,  752,  711, ..., 1155,  863,  724]),
 'category_name': array([[   0,    0,    0, ...,  133,  137,  135],
        [   0,    0,    0, ...,  327,  235, 1065],
        [   0,    0,    0, ...,   37,  139, 2019],
        ..., 
        [   0,    0,    0, ..., 1122,  415,  154],
        [   0,    0,    0, ...,   87, 1423,  137],
        [   0,    0,    0, ...,   46,   57, 1586]], dtype=int32),
 'item_condition': array([1, 3, 2, ..., 2, 1, 1]),
 'item_desc': array([[11457,   172,    51, ...,    18,     4,   366],
        [    0,     0,     0, ...,   636,  1257,   822],
        [    0,     0,     0, ...,  4183,  2670,   610],
        ..., 
        [    0,     0,     0, ...,   375,   493,   459],
        [  968,    55,  3391, ...,  1710,  1201,  1634],
        [    0,     0,     0, ...,     5, 10464,  1030]], dtype=int32),
 'name': array([[   0,    0,    0, ...,  135,  137,   31],
        [   0,    0,    0, ...,

In [14]:
#KERAS MODEL DEFINITION
from keras.layers import Input, Dropout, Dense, BatchNormalization, \
    Activation, GRU, Embedding, Flatten, LSTM,concatenate
from keras.models import Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping#, TensorBoard
from keras import backend as K
from keras import optimizers
from keras import initializers
import tensorflow as tf

def rmsle(y, y_pred):
    import math
    assert len(y) == len(y_pred)
    to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 \
              for i, pred in enumerate(y_pred)]
    return (sum(to_sum) * (1.0/len(y))) ** 0.5

def rmsle_tf(y, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_pred - y)))

In [15]:
dr = 0.25

def get_model():
    #params
    dr_r = dr
    
    #Inputs
    name = Input(shape=[X_train["name"].shape[1]], name="name")
    item_desc = Input(shape=[X_train["item_desc"].shape[1]], name="item_desc")
    brand = Input(shape=[1], name="brand")
    category = Input(shape=[1], name="category")
    category_name = Input(shape=[X_train["category_name"].shape[1]], 
                          name="category_name")
    item_condition = Input(shape=[1], name="item_condition")
    num_vars = Input(shape=[X_train["num_vars"].shape[1]], name="num_vars")
    
    #Embeddings layers
    emb_size = 60
    
    emb_name = Embedding(MAX_TEXT, emb_size//3)(name)
    emb_item_desc = Embedding(MAX_TEXT, emb_size)(item_desc)
    emb_category_name = Embedding(MAX_TEXT, emb_size//3)(category_name)
    emb_brand = Embedding(MAX_BRAND, 10)(brand)
    emb_category = Embedding(MAX_CATEGORY, 10)(category)
    emb_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)
    
    rnn_layer1 = GRU(16) (emb_item_desc)
    rnn_layer2 = GRU(8) (emb_category_name)
    rnn_layer3 = GRU(4) (emb_name)
    
    #main layer
    main_l = concatenate([
       Flatten() (emb_brand)
        , Flatten() (emb_category)
        , Flatten() (emb_item_condition)
        , rnn_layer1
        , rnn_layer2
        , rnn_layer3
        , num_vars])
    

    
    main_l = Dropout(0.1)(Dense(512,activation='relu') (main_l))
    main_l = Dropout(0.1)(Dense(64,activation='relu') (main_l))
    
    #output
    output = Dense(1,activation="linear") (main_l)
    
    #model
    model = Model([name, item_desc, brand
                   , category, category_name
                   , item_condition, num_vars], output)
    #optimizer = optimizers.RMSprop()
    optimizer = optimizers.Adam()
    model.compile(loss=rmsle_tf,#"mean_squared_logarithmic_error", 
                  optimizer=optimizer)
    return model

def eval_model(model):
    val_preds = model.predict(X_valid)
    val_preds = np.expm1(val_preds)
    
    y_true = np.array(dvalid.price.values)
    y_pred = val_preds[:, 0]
    v_rmsle = rmsle(y_true, y_pred)
    print(" RMSLE error on dev test: "+str(v_rmsle))
    return v_rmsle
#fin_lr=init_lr * (1/(1+decay))**(steps-1)
exp_decay = lambda init, fin, steps: (init/fin)**(1/(steps-1)) - 1

print('[{}] Finished DEFINEING MODEL...'.format(time.time() - start_time))

[331.90818977355957] Finished DEFINEING MODEL...


In [16]:
gc.collect()
#FITTING THE MODEL
epochs = 30

BATCH_SIZE = 512 * 60

suffix = str(datetime.now().strftime("%Y-%m-%d-%H-%M"))
NAME = "GRU_" + suffix


steps = int(len(X_train['name'])/BATCH_SIZE) * epochs
lr_init, lr_fin = 0.009, 0.006
lr_decay = exp_decay(lr_init, lr_fin, steps)
log_subdir = '_'.join(['ep', str(epochs),
                    'bs', str(BATCH_SIZE),
                    'lrI', str(lr_init),
                    'lrF', str(lr_fin),
                    'dr', str(dr)])

model = get_model()
K.set_value(model.optimizer.lr, lr_init)
K.set_value(model.optimizer.decay, lr_decay)

In [17]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
brand (InputLayer)              (None, 1)            0                                            
__________________________________________________________________________________________________
category (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
item_condition (InputLayer)     (None, 1)            0                                            
__________________________________________________________________________________________________
item_desc (InputLayer)          (None, 60)           0                                            
__________________________________________________________________________________________________
category_n

In [21]:
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping

callback_1 = TensorBoard(log_dir='./logs/logs_{}'.format(NAME), histogram_freq=0,
                         write_graph=False, write_images=False)
callback_2 = EarlyStopping(monitor='val_loss', min_delta=0, patience=20, verbose=1, mode='auto')
callback_3 = ModelCheckpoint("models/model_{}.hdf5".format(NAME), monitor='val_loss',
                             save_best_only=True, verbose=0)

In [None]:
#BATCH_SIZE = 512 * 60

history = model.fit(X_train, dtrain.target
                    , epochs=epochs
                    , batch_size=BATCH_SIZE
                    #, validation_split=0.033
                    #, callbacks=[TensorBoard('./logs/'+log_subdir)]
                    , validation_data=(X_valid, dvalid.target)
                    , verbose=1
                    , callbacks=[callback_1,callback_2])
print('[{}] Finished FITTING MODEL...'.format(time.time() - start_time))

Train on 1352724 samples, validate on 47322 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
 153600/1352724 [==>...........................] - ETA: 1:25 - loss: 0.3123

Epoch 12/100
1432425/1432425 [==============================] - 104s 72us/step - loss: 0.3812 - val_loss: 0.4553   -  0.44745

In [None]:
#EVLUEATE THE MODEL ON DEV TEST
v_rmsle = eval_model(model)
print('[{}] Finished predicting valid set...'.format(time.time() - start_time))

v_rmsle

In [None]:
#CREATE PREDICTIONS
preds = model.predict(X_test, batch_size=BATCH_SIZE)
preds = np.expm1(preds)
print('[{}] Finished predicting test set...'.format(time.time() - start_time))
submission = test[["test_id"]][:test_len]
submission["price"] = preds[:test_len]
submission.to_csv("./myNN"+log_subdir+"_{:.6}.csv".format(v_rmsle), index=False)
print('[{}] Finished submission...'.format(time.time() - start_time))