In [1]:
import math
import pickle
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_log_error

from keras import backend as K
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten, BatchNormalization
from keras.models import Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping

Using TensorFlow backend.


In [2]:
train = pd.read_csv("/mnt/disks/~/clean.csv")
cloth = train[(train.cat1==554)|(train.cat1==934)]
cloth = cloth.reset_index()
print(cloth.shape)
cloth.head(3)

(758065, 14)


Unnamed: 0,index,name,item_condition_id,category_name,brand_name,price,shipping,item_description,cat1,cat2,cat3,cat4,cat5,target
0,0,mlb cincinnati reds t shirt size xl,3,Men/Tops/T-shirts,4786,10.0,1,no description yet,554,859,827,950,950,-0.369464
1,1,ava-viv blouse,1,Women/Tops & Blouses/Blouse,4180,10.0,1,adorable top with a hint of lace and a key hol...,934,860,104,950,950,-0.369464
2,2,24k gold plated rose,1,Women/Jewelry/Necklaces,4786,44.0,0,complete with certificate of authenticity,934,480,584,950,950,0.000978


# Bag of Words
Bag of Words is a model that takes the input text as a set of words regardless of the order, grammer, etc.
We stack the whole text and make a dictionary. 
Each word is indexed by a unique number. 
And text are transformed into frequency of words.

In [3]:
# Train
dic = np.hstack([cloth.item_description.str.lower(), cloth.name.str.lower()])
token = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~\t\n', lower=True, split=' ')
token.fit_on_texts(dic)

In [4]:
# Transform to sequence
cloth["seq_item_description"] = token.texts_to_sequences(cloth.item_description.str.lower())
cloth["seq_name"] = token.texts_to_sequences(cloth.name.str.lower())

print(cloth["seq_item_description"].shape)
print(cloth["seq_name"].shape)

(758065,)
(758065,)


In [5]:
# Sequence variables
max_name_seq = np.max([np.max(cloth.seq_name.apply(lambda x: len(x)))])
max_seq_item_description = np.max([np.max(cloth.seq_item_description.apply(lambda x: len(x)))])
print("Maximum length of 'name' is %d " % max_name_seq)
print("Maximum length of 'item description' is %d " % max_seq_item_description)

Maximum length of 'name' is 13 
Maximum length of 'item description' is 212 


In [6]:
#SCALE target variable
cloth["target"] = np.log(cloth.price+1)
target_scaler = MinMaxScaler(feature_range=(-1, 1))
cloth["target"] = target_scaler.fit_transform(cloth.target.reshape(-1,1))



In [7]:
# Maximum values
# Base on the histograms, we select the next lengths
MAX_NAME_SEQ = 10
MAX_ITEM_DESC_SEQ = 100
MAX_TEXT = np.max([np.max(cloth.seq_name.max()), \
                   np.max(cloth.seq_item_description.max())])+2
MAX_CATEGORY = np.max([np.max(cloth.cat1.max()), \
                       np.max(cloth.cat2.max()), \
                       np.max(cloth.cat3.max())])+3
MAX_BRAND = np.max([cloth.brand_name.max()])+1
MAX_CONDITION = np.max([cloth.item_condition_id.max()])+1

In [19]:
MAX_TEXT

122373

In [8]:
# EXTRACT DEVELOPTMENT TEST
dtrain, dvalid = train_test_split(cloth, random_state=123, train_size=0.99)
print(dtrain.shape)
print(dvalid.shape)

(750484, 16)
(7581, 16)


In [9]:
# Input
def get_keras_data(dataset):
    X = {
        'name': pad_sequences(dataset.seq_name, maxlen=MAX_NAME_SEQ)
        ,'item_desc': pad_sequences(dataset.seq_item_description, maxlen=MAX_ITEM_DESC_SEQ)
        ,'brand_name': np.array(dataset.brand_name)
        ,'cat1': np.array(dataset.cat1)
        ,'cat2': np.array(dataset.cat2)
        ,'cat3': np.array(dataset.cat3)
        ,'item_condition': np.array(dataset.item_condition_id)
        ,'num_vars': np.array(dataset[["shipping"]])
    }
    return X

X_train = get_keras_data(dtrain)
X_valid = get_keras_data(dvalid)

In [10]:
# Neural Network 
def get_callbacks(filepath, patience=2):
    es = EarlyStopping('val_loss', patience=patience, mode="min")
    msave = ModelCheckpoint(filepath, save_best_only=True)
    return [es, msave]

def rmsle_cust(y_true, y_pred):
    first_log = K.log(K.clip(y_pred, K.epsilon(), None) + 1.)
    second_log = K.log(K.clip(y_true, K.epsilon(), None) + 1.)
    return K.sqrt(K.mean(K.square(first_log - second_log), axis=-1))

def get_model():

    # hyper parameters
    dr_r = 0.1
    
    # Inputs
    name = Input(shape=[X_train["name"].shape[1]], name="name")
    item_desc = Input(shape=[X_train["item_desc"].shape[1]], name="item_desc")
    brand_name = Input(shape=[1], name="brand_name")
    cat1 = Input(shape=[1], name="cat1")
    cat2 = Input(shape=[1], name="cat2")
    cat3 = Input(shape=[1], name="cat3")
    item_condition = Input(shape=[1], name="item_condition")
    num_vars = Input(shape=[X_train["num_vars"].shape[1]], name="num_vars")
    
    # Embeddings layers
    emb_name = Embedding(MAX_TEXT, 50)(name)
    emb_item_desc = Embedding(MAX_TEXT, 50)(item_desc)
    emb_brand_name = Embedding(MAX_BRAND, 10)(brand_name)
    emb_cat1 = Embedding(MAX_CATEGORY, 10)(cat1)
    emb_cat2 = Embedding(MAX_CATEGORY, 10)(cat2)
    emb_cat3 = Embedding(MAX_CATEGORY, 10)(cat3)
    emb_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)
    
    # rnn layer
    rnn_layer1 = GRU(16) (emb_item_desc)
    rnn_layer2 = GRU(8) (emb_name)
    
    # main layer
    main_l = concatenate([
        Flatten() (emb_brand_name)
        , Flatten() (emb_cat1)
        , Flatten() (emb_cat2)
        , Flatten() (emb_cat3)
        , Flatten() (emb_item_condition)
        , rnn_layer1
        , rnn_layer2
        , num_vars
    ])
    main_l = Dropout(dr_r) (Dense(128) (main_l))
    main_l = Dropout(dr_r) (Dense(64) (main_l))
    
    # output
    output = Dense(1, activation="linear") (main_l)
    
    # model
    model = Model([name, item_desc, brand_name
                   , cat1, cat2, cat3, item_condition, num_vars], output)
    model.compile(loss="mse", optimizer="adam", metrics=["mae", rmsle_cust])
    
    return model

    
model = get_model()
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
brand_name (InputLayer)         (None, 1)            0                                            
__________________________________________________________________________________________________
cat1 (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
cat2 (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
cat3 (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
item_condi

In [12]:
BATCH_SIZE = 20000
epochs = 5

model = get_model()
model.fit(X_train, dtrain.target, epochs=epochs, batch_size=BATCH_SIZE
          , validation_data=(X_valid, dvalid.target)
          , verbose=1)

Train on 750484 samples, validate on 7581 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f363a89d6d8>

In [13]:
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
    return (sum(to_sum) * (1.0/len(y))) ** 0.5
# Source: https://www.kaggle.com/marknagelberg/rmsle-function

val_preds = model.predict(X_valid)
val_preds = target_scaler.inverse_transform(val_preds)
val_preds = np.exp(val_preds)+1

y_true = np.array(dvalid.price.values)
y_pred = val_preds[:,0]
v_rmsle = rmsle(y_true, y_pred)

# print("RMSLE of training: ", score_train)
print("RMSLE of testing: ", v_rmsle)

RMSLE of testing:  0.46271456418866197


In [15]:
BATCH_SIZE = 10000
epochs = 10

model2 = get_model()
model2.fit(X_train, dtrain.target, epochs=epochs, batch_size=BATCH_SIZE
          , validation_data=(X_valid, dvalid.target)
          , verbose=1)

Train on 750484 samples, validate on 7581 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f36280c1a58>

In [16]:
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
    return (sum(to_sum) * (1.0/len(y))) ** 0.5
# Source: https://www.kaggle.com/marknagelberg/rmsle-function

val_preds = model2.predict(X_valid)
val_preds = target_scaler.inverse_transform(val_preds)
val_preds = np.exp(val_preds)+1

y_true = np.array(dvalid.price.values)
y_pred = val_preds[:,0]
v_rmsle = rmsle(y_true, y_pred)

# print("RMSLE of training: ", score_train)
print("RMSLE of testing: ", v_rmsle)

RMSLE of testing:  0.4406392452418835


# Save the model 

In [21]:
model2.save('/mnt/disks/~/model2.h5')