In [1]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb
import sklearn
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.externals import joblib
from keras import Sequential
from keras.layers import LSTM, Dropout, Dense
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing 
from xgboost import XGBRegressor, plot_importance
from matplotlib import pyplot
import time

Using TensorFlow backend.


In [2]:
from datetime import datetime
parser = lambda date: pd.to_datetime(date, format='%d.%m.%Y')
train = pd.read_csv('processed_train.csv',parse_dates=['date'], date_parser=parser)
test  = pd.read_csv('processed_test.csv',parse_dates=['date'], date_parser=parser)

In [3]:
#features
df = train
df['day']=df['date'].apply(lambda x: x.strftime('%d'))
df['day']=df['day'].astype('int64')
df['month']=df['date'].apply(lambda x: x.strftime('%m'))
df['month']=df['month'].astype('int64')
df['year']=df['date'].apply(lambda x: x.strftime('%Y'))
df['year']=df['year'].astype('int64')
df = df[['day','month','year','item_id', 'shop_id','item_price','item_cnt_day']]
#     df = df.pivot_table(index=['item_id', 'shop_id'], columns='date', values='item_cnt_day', fill_value=0).reset_index()
#     count=df.iloc[:,2:]
#     df['total']=count.sum(axis=1)
data=df
#data = pd.merge(val, df, on=['item_id', 'shop_id'], how='left').fillna(0)
data['item_id'] = np.log1p(data['item_id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [4]:
train_date_info = data

In [5]:
y_train_normal = train_date_info['item_cnt_day']
x_train_normal = train_date_info.drop(labels=['item_cnt_day'], axis=1)

In [6]:
x_train_val = x_train_normal[-100:]
y_train_val = y_train_normal[-100:]

In [7]:
def rmse(predictions, targets):
    return np.sqrt(np.mean((predictions - targets) ** 2))

In [8]:
def TreeRegressor_model(x_train, y_train):
    reg = ExtraTreesRegressor(n_estimators=512, max_depth=20,random_state=50)
    reg.fit(x_train, y_train)
    y_pre = reg.predict(x_train)
    score = np.sqrt(mean_squared_error(y_train, y_pre))
    print('RMSE cliped:', np.sqrt(mean_squared_error(y_train.clip(0., 20.), y_pre.clip(0., 20.))))
    return reg

In [9]:
def light_gbm_model(x_train, y_train):
    lgb_params = {
        'feature_fraction': 1,
        'metric': 'rmse',
        'min_data_in_leaf': 16,
        'bagging_fraction': 0.85,
        'learning_rate': 0.03,
        'objective': 'mse',
        'bagging_seed': 2 ** 7,
        'num_leaves': 32,
        'bagging_freq': 3,
        'verbose': 0
    }
    estimator = lgb.train(lgb_params, lgb.Dataset(x_train, label=y_train), 300)
    y_pre = estimator.predict(x_train)
    print('RMSE cliped:', np.sqrt(mean_squared_error(y_train.clip(0., 20.), y_pre.clip(0., 20.))))
    return estimator

In [10]:
def linear_model(x_train, y_train):
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    y_pre = lr.predict(x_train)
    print('RMSE cliped:', np.sqrt(mean_squared_error(y_train.clip(0., 20.), y_pre.clip(0., 20.))))
    return lr

In [11]:
def xgb_model(x_train, y_train, x_train_val, y_train_val):
    model = XGBRegressor(
        max_depth=8,
        n_estimators=1000,
        min_child_weight=300,
        colsample_bytree=0.9,
        subsample=0.9,
        eta=0.15,
        seed=42)
    model.fit(
        x_train,
        y_train,
        eval_metric="rmse",
        eval_set=[(x_train, y_train), (x_train_val, y_train_val)],
        verbose=True,
        early_stopping_rounds=10)
    y_pre = model.predict(x_train)
    print('RMSE cliped:', np.sqrt(mean_squared_error(y_train.clip(0., 20.), y_pre.clip(0., 20.))))
    plot_importance(model)
    pyplot.show()
    return model

In [12]:
def pre_data(data_type, reg, x_test):
    if reg is None:
        reg = joblib.load('%s/%s_model_weight.model' % (out_path, data_type))
    y_pre = reg.predict(x_test)
    return y_pre

In [None]:
ts = time.time()
xgb_model = xgb_model(x_train_normal[:-100], y_train_normal[:-100], x_train_val, y_train_val)
# linear_model = linear_model(x_train_normal, y_train_normal)
#light_gbm_model = light_gbm_model(x_train_normal, y_train_normal)
# tree_model = TreeRegressor_model(x_train_normal, y_train_normal)
time.time() - ts

[0]	validation_0-rmse:1.28143	validation_1-rmse:1.20297
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
[1]	validation_0-rmse:1.2261	validation_1-rmse:1.15366
[2]	validation_0-rmse:1.17884	validation_1-rmse:1.11231
[3]	validation_0-rmse:1.13869	validation_1-rmse:1.0778
[4]	validation_0-rmse:1.10398	validation_1-rmse:1.05168
[5]	validation_0-rmse:1.0757	validation_1-rmse:1.02322
[6]	validation_0-rmse:1.05188	validation_1-rmse:1.00397
[7]	validation_0-rmse:1.02999	validation_1-rmse:0.987273


In [None]:
test.head()

In [None]:
df = test
df['day']=df['date'].apply(lambda x:x.strftime('%d'))
df['day']=df['day'].astype('int64')
df['month']=df['date'].apply(lambda x: x.strftime('%m'))
df['month']=df['month'].astype('int64')
df['year']=df['date'].apply(lambda x: x.strftime('%Y'))
df['year']=df['year'].astype('int64')
df = df[['day','month','year','item_id', 'shop_id','item_price']]
data=df
#data = pd.merge(val, df, on=['item_id', 'shop_id'], how='left').fillna(0)
data['item_id'] = np.log1p(data['item_id'])
test_x = data
# test_x.columns = np.append(['shop_id', 'item_id'],np.arange(0, 36, 1))
# test_y_1 = pre_data('normal', tree_model, test_x)
# test_y_2 = pre_data('light_gbm', light_gbm_model, test_x)
# test_y_3 = pre_data('linear', linear_model, test_x)
test_y_4 = pre_data('xgb', xgb_model, test_x)
test_y = test_y_4
test.head()

In [None]:
test['item_cnt_day'] = test_y
test[['ID', 'item_cnt_day']].to_csv('submission_XGBoost.csv', index=False)