In [1]:
import gc
import time
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix, hstack

from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split, cross_val_score
import lightgbm as lgb

import time

In [2]:
start_time = time.time()
tcurrent   = start_time

#np.random.seed(1313)  # v01
np.random.seed(27)  # v11

NUM_BRANDS = 4100
NUM_CATEGORIES = 1000
NAME_MIN_DF = 10
MAX_FEATURES_ITEM_DESCRIPTION = 51000

In [3]:
def handle_missing_inplace(dataset):
    dataset['category_name'].fillna(value='missing', inplace=True)
    dataset['brand_name'].fillna(value='missing', inplace=True)
    dataset['item_description'].fillna(value='missing', inplace=True)


def cutting(dataset):
    pop_brand = dataset['brand_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]
    dataset.loc[~dataset['brand_name'].isin(pop_brand), 'brand_name'] = 'missing'
    pop_category = dataset['category_name'].value_counts().loc[lambda x: x.index != 'missing'].index[:NUM_BRANDS]
    dataset.loc[~dataset['category_name'].isin(pop_category), 'category_name'] = 'missing'


def to_categorical(dataset):
    dataset['category_name'] = dataset['category_name'].astype('category')
    dataset['brand_name'] = dataset['brand_name'].astype('category')
    dataset['item_condition_id'] = dataset['item_condition_id'].astype('category')

In [4]:
start_time = time.time()

train = pd.read_table('data/train.tsv', engine='c')
test = pd.read_table('data/test.tsv', engine='c')
print('[{}] Finished to load data'.format(time.time() - start_time))
print('Train shape: ', train.shape)
print('Test shape: ', test.shape)

nrow_train = train.shape[0]
y = np.log1p(train["price"])
merge = pd.concat([train, test])
submission = test[['test_id']]

del train
del test
gc.collect()

[5.100717306137085] Finished to load data
Train shape:  (1482535, 8)
Test shape:  (693359, 7)


25

In [5]:
handle_missing_inplace(merge)
print('[{}] Finished to handle missing'.format(time.time() - start_time))

cutting(merge)
print('[{}] Finished to cut'.format(time.time() - start_time))

to_categorical(merge)
print('[{}] Finished to convert categorical'.format(time.time() - start_time))

cv = CountVectorizer(min_df=NAME_MIN_DF)
X_name = cv.fit_transform(merge['name'])
print('[{}] Finished count vectorize `name`'.format(time.time() - start_time))

cv = CountVectorizer()
X_category = cv.fit_transform(merge['category_name'])
print('[{}] Finished count vectorize `category_name`'.format(time.time() - start_time))

[5.613053321838379] Finished to handle missing
[6.359637975692749] Finished to cut
[6.7107744216918945] Finished to convert categorical
[16.646504878997803] Finished count vectorize `name`
[25.399235486984253] Finished count vectorize `category_name`


In [6]:
tv = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
                     ngram_range=(1, 3),
                     stop_words='english')
X_description = tv.fit_transform(merge['item_description'])
print('[{}] Finished TFIDF vectorize `item_description`'.format(time.time() - start_time))

lb = LabelBinarizer(sparse_output=True)
X_brand = lb.fit_transform(merge['brand_name'])
print('[{}] Finished label binarize `brand_name`'.format(time.time() - start_time))

X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                      sparse=True).values)
print('[{}] Finished to get dummies on `item_condition_id` and `shipping`'.format(time.time() - start_time))

sparse_merge = hstack((X_dummies, X_description, X_brand, X_category, X_name)).tocsr()
print('[{}] Finished to create sparse merge'.format(time.time() - start_time))

X = sparse_merge[:nrow_train]
X_test = sparse_merge[nrow_train:]

[216.61707043647766] Finished TFIDF vectorize `item_description`
[224.04442834854126] Finished label binarize `brand_name`
[226.82568788528442] Finished to get dummies on `item_condition_id` and `shipping`
[229.50633549690247] Finished to create sparse merge


In [10]:
X_brand

<2175894x4101 sparse matrix of type '<class 'numpy.int64'>'
	with 2175894 stored elements in Compressed Sparse Row format>

In [11]:
# def rmsle(y, y0):
#     assert len(y) == len(y0)
#     return np.sqrt(np.mean(np.power(np.log1p(y)-np.log1p(y0), 2)))

model = Ridge(solver="sag", fit_intercept=True, random_state=205, alpha=3)
model.fit(X, y)
print('[{}] Finished to train ridge sag'.format(time.time() - start_time))
predsR = model.predict(X=X_test)
print('[{}] Finished to predict ridge sag'.format(time.time() - start_time))

[514.9087607860565] Finished to train ridge sag
[514.9510307312012] Finished to predict ridge sag


In [12]:
model = Ridge(solver="lsqr", fit_intercept=True, random_state=145, alpha = 3)
model.fit(X, y)
print('[{}] Finished to train ridge lsqrt'.format(time.time() - start_time))
predsR2 = model.predict(X=X_test)
print('[{}] Finished to predict ridge lsqrt'.format(time.time() - start_time))



[586.71910572052] Finished to train ridge lsqrt
[586.7622191905975] Finished to predict ridge lsqrt


In [18]:
train_X, valid_X, train_y, valid_y = train_test_split(X, y, random_state=666, test_size=0.0338) 
d_train = lgb.Dataset(train_X, label=train_y)#, max_bin=8192)
d_valid = lgb.Dataset(valid_X, label=valid_y)#, max_bin=8192)
watchlist = [d_train, d_valid]

params = {
    'learning_rate': 0.78,
    'application': 'regression',
    'max_depth': 3,
    'num_leaves': 99,
    'verbosity': -1,
    'metric': 'RMSE',
    'nthread': 4
}

params2 = {
    'learning_rate': 0.88,
    'application': 'regression',
    'max_depth': 3,
    'num_leaves': 110,
    'verbosity': -1,
    'metric': 'RMSE',
    'nthread': 4
}

model = lgb.train(params, train_set=d_train, num_boost_round=7500, valid_sets=watchlist, \
early_stopping_rounds=50, verbose_eval=500) 
predsL = model.predict(X_test)

print('[{}] Finished to predict lgb 1'.format(time.time() - start_time))

Training until validation scores don't improve for 50 rounds.
[500]	training's rmse: 0.486392	valid_1's rmse: 0.493768
[1000]	training's rmse: 0.466157	valid_1's rmse: 0.47766
[1500]	training's rmse: 0.455084	valid_1's rmse: 0.470909
[2000]	training's rmse: 0.447432	valid_1's rmse: 0.46709
Early stopping, best iteration is:
[2235]	training's rmse: 0.443993	valid_1's rmse: 0.465443
[2507.033341407776] Finished to predict lgb 1


In [19]:
model = lgb.train(params2, train_set=d_train, num_boost_round=7500, valid_sets=watchlist, \
early_stopping_rounds=50, verbose_eval=500) 
predsL = model.predict(X_test)

Training until validation scores don't improve for 50 rounds.
[500]	training's rmse: 0.483399	valid_1's rmse: 0.490801
[1000]	training's rmse: 0.464132	valid_1's rmse: 0.476371
[1500]	training's rmse: 0.453475	valid_1's rmse: 0.470414
[2000]	training's rmse: 0.44529	valid_1's rmse: 0.466469
[2500]	training's rmse: 0.439088	valid_1's rmse: 0.464459
Early stopping, best iteration is:
[2677]	training's rmse: 0.437254	valid_1's rmse: 0.464101


In [20]:
train_X2, valid_X2, train_y2, valid_y2 = train_test_split(X, y, test_size = 0.1, random_state = 101) 
d_train2 = lgb.Dataset(train_X2, label=train_y2)#, max_bin=8192)
d_valid2 = lgb.Dataset(valid_X2, label=valid_y2)#, max_bin=8192)
watchlist2 = [d_train2, d_valid2]

model = lgb.train(params2, train_set=d_train2, num_boost_round=3000, valid_sets=watchlist2, \
early_stopping_rounds=50, verbose_eval=500) 
predsL2 = model.predict(X_test)

print('[{}] Finished to predict lgb 2'.format(time.time() - start_time))

Training until validation scores don't improve for 50 rounds.
[500]	training's rmse: 0.485637	valid_1's rmse: 0.491541
[1000]	training's rmse: 0.465196	valid_1's rmse: 0.476124
[1500]	training's rmse: 0.454091	valid_1's rmse: 0.469819
[2000]	training's rmse: 0.446163	valid_1's rmse: 0.466077
[2500]	training's rmse: 0.44003	valid_1's rmse: 0.464134
[3000]	training's rmse: 0.434971	valid_1's rmse: 0.463006
Did not meet early stopping. Best iteration is:
[3000]	training's rmse: 0.434971	valid_1's rmse: 0.463006
[12445.733229637146] Finished to predict lgb 2


In [None]:
preds = predsR2*0.20 + predsR*0.20 + predsL*0.40 + predsL2*0.20
submission['price'] = np.expm1(preds)
submission.to_csv("submission_lgbm_ridge_8 20 20 40 20 v11.csv", index=False)

In [None]:
preds = predsR2*0.19 + predsR*0.19 + predsL*0.44 + predsL2*0.18
submission['price'] = np.expm1(preds)
submission.to_csv("submission_lgbm_ridge_8 19 19 44 18 v1.csv", index=False)

In [None]:
preds = predsR2*0.18 + predsR*0.18 + predsL*0.45 + predsL2*0.19
submission['price'] = np.expm1(preds)
submission.to_csv("submission_lgbm_ridge_8 18 18 45 19 v11.csv", index=False)

In [None]:
preds = predsR2*0.17 + predsR*0.17 + predsL*0.46 + predsL2*0.20
submission['price'] = np.expm1(preds)
submission.to_csv("submission_lgbm_ridge_8 17 17 46 20 v11.csv", index=False)

In [None]:
preds = predsR2*0.16 + predsR*0.16 + predsL*0.48 + predsL2*0.20
submission['price'] = np.expm1(preds)
submission.to_csv("submission_lgbm_ridge_8 16 16 48 20 v11.csv", index=False)

In [None]:
nm=(time.time() - start_time)/60
print ("Total time %s min" % nm)