In [1]:
import numpy as np 
import pandas as pd 

In [2]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state = 42)

from pprint import pprint

# Look at parameters used by our current forest
print('Parameters currently in use:\n')
pprint(rf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


In [3]:
# try adjusting the following set of hyperparameters:

#     n_estimators = number of trees in the foreset
#     max_features = max number of features considered for splitting a node
#     max_depth = max number of levels in each decision tree
#     min_samples_split = min number of data points placed in a node before the node is split
#     min_samples_leaf = min number of data points allowed in a leaf node
#     bootstrap = method for sampling data points (with or without replacement)


In [4]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [5]:
from datetime import datetime
parser = lambda date: pd.to_datetime(date, format='%d.%m.%Y')
train = pd.read_csv('processed_train.csv',parse_dates=['date'], date_parser=parser)
test  = pd.read_csv('processed_test.csv',parse_dates=['date'], date_parser=parser)

In [6]:
#features
df = train
df['day']=df['date'].apply(lambda x: x.strftime('%d'))
df['day']=df['day'].astype('int64')
df['month']=df['date'].apply(lambda x: x.strftime('%m'))
df['month']=df['month'].astype('int64')
df['year']=df['date'].apply(lambda x: x.strftime('%Y'))
df['year']=df['year'].astype('int64')
df = df[['day','month','year','item_id', 'shop_id','item_price','item_cnt_day']]
#     df = df.pivot_table(index=['item_id', 'shop_id'], columns='date', values='item_cnt_day', fill_value=0).reset_index()
#     count=df.iloc[:,2:]
#     df['total']=count.sum(axis=1)
data=df
#data = pd.merge(val, df, on=['item_id', 'shop_id'], how='left').fillna(0)
data['item_id'] = np.log1p(data['item_id'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [7]:
train_date_info = data

y_train_normal = train_date_info['item_cnt_day']
x_train_normal = train_date_info.drop(labels=['item_cnt_day'], axis=1)

x_train_val = x_train_normal[-100:]
y_train_val = y_train_normal[-100:]

In [None]:
import time
ts = time.time()
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor(n_estimators=100,max_depth=20,random_state=50)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, cv = 3, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf.fit(x_train_normal, y_train_normal)
# reg = ExtraTreesRegressor(n_estimators=512, max_depth=20,random_state=50)
# reg.fit(x_train, y_train)
y_pre = rf.predict(x_train_normal)
score = np.sqrt(mean_squared_error(y_train_normal, y_pre))
print('RMSE cliped:', np.sqrt(mean_squared_error(y_train_normal.clip(0., 20.), y_pre.clip(0., 20.))))
time.time() - ts

In [None]:
def pre_data(data_type, reg, x_test):
    if reg is None:
        reg = joblib.load('%s/%s_model_weight.model' % (out_path, data_type))
    y_pre = reg.predict(x_test)
    return y_pre

In [None]:
from sklearn import ensemble

#reg = ensemble.ExtraTreesRegressor(n_estimators=25, n_jobs=-1, max_depth=15, random_state=42)
reg = ensemble.GradientBoostingRegressor(n_estimators=1000, learning_rate=0.05, max_depth=3, 
                                            max_features='sqrt', loss='huber', random_state=42)
#reg = xgb.XGBRegressor(n_estimators=1000, max_depth=4, learning_rate=0.05, subsample=0.6, colsample_bytree=0.6)
#reg = xgb.XGBRegressor(n_estimators=25, max_depth=12, learning_rate=0.1, subsample=1, colsample_bytree=0.9, random_state=42, eval_metric='rmse')

In [None]:
# x1 = x_train_normal[x_train_normal['date_block_num'] < 33]
# y1 = x1['item_cnt']
# x1 = x1.drop(['item_cnt'], axis=1)

# x2 = x_train_normal[x_train_normal['date_block_num'] == 33]
# y2 = x2['item_cnt']
# x2 = x2.drop(['item_cnt'], axis=1)

reg.fit(x_train_normal, y_train_normal)
pred_cnt = reg.predict(x_train_normal)
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_train_normal.clip(0.,20.), pred_cnt.clip(0.,20.))))

In [None]:
df = test
df['day']=df['date'].apply(lambda x:x.strftime('%d'))
df['day']=df['day'].astype('int64')
df['month']=df['date'].apply(lambda x: x.strftime('%m'))
df['month']=df['month'].astype('int64')
df['year']=df['date'].apply(lambda x: x.strftime('%Y'))
df['year']=df['year'].astype('int64')
df = df[['day','month','year','item_id', 'shop_id','item_price']]
data=df
#data = pd.merge(val, df, on=['item_id', 'shop_id'], how='left').fillna(0)
data['item_id'] = np.log1p(data['item_id'])
test_x = data
# test_x.columns = np.append(['shop_id', 'item_id'],np.arange(0, 36, 1))
test_y_1 = pre_data('Tree_Regressor',rf, test_x)
# test_y_2 = pre_data('light_gbm', light_gbm_model, test_x)
# test_y_3 = pre_data('linear', linear_model, test_x)
# test_y_4 = pre_data('xgb', xgb_model, test_x)
test_y = test_y_1
test.head()

In [None]:
test['item_cnt_day'] = test_y
test[['ID', 'item_cnt_day']].to_csv('submission_Tree.csv', index=False)