In [1]:
import re
import math
import pickle
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_log_error

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

from keras import backend as K
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten, BatchNormalization
from keras.models import Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping

Using TensorFlow backend.


In [2]:
train = pd.read_csv("/mnt/disks/~/clean.csv")
cloth = train[(train.cat1==554)|(train.cat1==934)]
print(cloth.shape)
cloth.head(3)

(758065, 13)


Unnamed: 0,name,item_condition_id,category_name,brand_name,price,shipping,item_description,cat1,cat2,cat3,cat4,cat5,target
0,mlb cincinnati reds t shirt size xl,3,Men/Tops/T-shirts,4786,10.0,1,no description yet,554,859,827,950,950,-0.369464
1,ava-viv blouse,1,Women/Tops & Blouses/Blouse,4180,10.0,1,adorable top with a hint of lace and a key hol...,934,860,104,950,950,-0.369464
2,24k gold plated rose,1,Women/Jewelry/Necklaces,4786,44.0,0,complete with certificate of authenticity,934,480,584,950,950,0.000978


In [3]:
#SCALE target variable
cloth["target"] = np.log(cloth.price+1)
target_scaler = MinMaxScaler(feature_range=(-1, 1))
cloth["target"] = target_scaler.fit_transform(cloth.target.reshape(-1,1))



# TF-IDF
TF-IDF computes each word's contribution to a document

In [4]:
tfidf_item_des = TfidfVectorizer(max_df=0.95, min_df=2, max_features=100, stop_words='english')

tfidf_item_desc = tfidf_item_des.fit_transform(cloth['item_description'])

In [None]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=20, random_state=42)
svd_tfidf = svd.fit_transform(tfidf_item_desc)

In [None]:
from sklearn.manifold import TSNE
svd_tfidf = svd_tfidf[:100000]
tsne_model = TSNE(n_components=2, verbose=1, random_state=123)
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 100000 samples in 0.347s...


In [None]:
cloth = cloth[:100000]
t = pd.DataFrame(tsne_tfidf,columns=['t1','t2'])
cloth = pd.concat([cloth,t],axis=1)
cloth.head()

In [None]:
# Maximum values
# Base on the histograms, we select the next lengths
MAX_NAME_SEQ = 10
MAX_ITEM_DESC_SEQ = 100
MAX_TEXT = 250
MAX_CATEGORY = np.max([np.max(cloth.cat1.max()), \
                       np.max(cloth.cat2.max()), \
                       np.max(cloth.cat3.max())])+3
MAX_BRAND = np.max([cloth.brand_name.max()])+1
MAX_CONDITION = np.max([cloth.item_condition_id.max()])+1

In [None]:
# EXTRACT DEVELOPTMENT TEST
dtrain, dvalid = train_test_split(cloth, random_state=123, train_size=0.99)
print(dtrain.shape)
print(dvalid.shape)

In [None]:
# Input
def get_keras_data(dataset):
    X = {
        'brand_name': np.array(dataset.brand_name)
        ,'cat1': np.array(dataset.cat1)
        ,'cat2': np.array(dataset.cat2)
        ,'cat3': np.array(dataset.cat3)
        ,'item_condition': np.array(dataset.item_condition_id)
        ,'num_vars': np.array(dataset[["shipping"]])
        ,'t1': np.array(dataset[["t1"]])
        ,'t2': np.array(dataset[["t2"]])
    }
    return X

X_train = get_keras_data(dtrain)
X_valid = get_keras_data(dvalid)

In [None]:
# Neural Network 
def get_callbacks(filepath, patience=2):
    es = EarlyStopping('val_loss', patience=patience, mode="min")
    msave = ModelCheckpoint(filepath, save_best_only=True)
    return [es, msave]

def rmsle_cust(y_true, y_pred):
    first_log = K.log(K.clip(y_pred, K.epsilon(), None) + 1.)
    second_log = K.log(K.clip(y_true, K.epsilon(), None) + 1.)
    return K.sqrt(K.mean(K.square(first_log - second_log), axis=-1))

def get_model():

    # hyper parameters
    dr_r = 0.1
    
    # Inputs
    brand_name = Input(shape=[1], name="brand_name")
    cat1 = Input(shape=[1], name="cat1")
    cat2 = Input(shape=[1], name="cat2")
    cat3 = Input(shape=[1], name="cat3")
    item_condition = Input(shape=[1], name="item_condition")
    num_vars = Input(shape=[X_train["num_vars"].shape[1]], name="num_vars")
    t1 = Input(shape=[1], name="t1")
    t2 = Input(shape=[1], name="t2")
    
    # Embeddings layers
    emb_brand_name = Embedding(MAX_BRAND, 10)(brand_name)
    emb_cat1 = Embedding(MAX_CATEGORY, 10)(cat1)
    emb_cat2 = Embedding(MAX_CATEGORY, 10)(cat2)
    emb_cat3 = Embedding(MAX_CATEGORY, 10)(cat3)
    emb_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)
    emb_t1 = Embedding(MAX_CATEGORY, 10)(t1)
    emb_t2 = Embedding(MAX_CATEGORY, 10)(t2)
    
    # main layer
    main_l = concatenate([
        Flatten() (emb_brand_name)
        , Flatten() (emb_cat1)
        , Flatten() (emb_cat2)
        , Flatten() (emb_cat3)
        , Flatten() (emb_item_condition)
        , num_vars
    ])
    main_l = Dropout(dr_r) (Dense(128) (main_l))
    main_l = Dropout(dr_r) (Dense(64) (main_l))
    
    # output
    output = Dense(1, activation="linear") (main_l)
    
    # model
    model = Model([brand_name, cat1, cat2, cat3, item_condition, \
                   num_vars, t1, t2], output)
    model.compile(loss="mse", optimizer="adam", metrics=["mae", rmsle_cust])
    
    return model

    
model = get_model()
model.summary()

In [None]:
BATCH_SIZE = 20000
epochs = 5

model = get_model()
model.fit(X_train, dtrain.target, epochs=epochs, batch_size=BATCH_SIZE
          , validation_data=(X_valid, dvalid.target)
          , verbose=1)

In [21]:
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
    return (sum(to_sum) * (1.0/len(y))) ** 0.5
# Source: https://www.kaggle.com/marknagelberg/rmsle-function

val_preds = model.predict(X_valid)
val_preds = target_scaler.inverse_transform(val_preds)
val_preds = np.exp(val_preds)+1

y_true = np.array(dvalid.price.values)
y_pred = val_preds[:,0]
v_rmsle = rmsle(y_true, y_pred)

# print("RMSLE of training: ", score_train)
print("RMSLE of testing: ", v_rmsle)

RMSLE of testing:  0.9320140635954157
