In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tqdm
from sklearn.model_selection import StratifiedShuffleSplit
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import xgbfir
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import explained_variance_score

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

#устраним ошибки со шрифтами
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['DejaVu Sans']

In [2]:
data = pd.read_csv('houses.csv')

In [3]:
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [4]:
X = data.drop(['SalePrice'], axis=1)

In [5]:
Y = data['SalePrice']

In [6]:
d_train, d_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 0)

In [7]:
cat_feat = list(X.dtypes[X.dtypes == object].index)

#закодируем пропущенные значений строкой, факт пропущенного значения тоже может нести в себе информацию
X[cat_feat] = X[cat_feat].fillna('nan')

#отфильтруем непрерывные признаки
num_feat = [f for f in X if f not in (cat_feat + ['ID', 'target'])]

cat_nunique = d_train[cat_feat].nunique()
print(cat_nunique)
cat_feat = list(cat_nunique[cat_nunique < 30].index)

MSZoning          5
Street            2
Alley             2
LotShape          4
LandContour       4
Utilities         2
LotConfig         5
LandSlope         3
Neighborhood     25
Condition1        9
Condition2        5
BldgType          5
HouseStyle        8
RoofStyle         6
RoofMatl          6
Exterior1st      15
Exterior2nd      16
MasVnrType        4
ExterQual         4
ExterCond         5
Foundation        6
BsmtQual          4
BsmtCond          4
BsmtExposure      4
BsmtFinType1      6
BsmtFinType2      6
Heating           6
HeatingQC         5
CentralAir        2
Electrical        5
KitchenQual       4
Functional        6
FireplaceQu       5
GarageType        6
GarageFinish      3
GarageQual        5
GarageCond        5
PavedDrive        3
PoolQC            2
Fence             4
MiscFeature       3
SaleType          9
SaleCondition     6
dtype: int64


In [8]:
print(d_train.shape)
print(d_test.shape)
print(y_train.shape)
print(y_test.shape)

(1022, 80)
(438, 80)
(1022,)
(438,)


In [9]:
dummy_train = pd.get_dummies(d_train[cat_feat], columns=cat_feat)
dummy_test = pd.get_dummies(d_test[cat_feat], columns=cat_feat)

In [10]:
print(dummy_train.shape)
print(dummy_test.shape)
print(y_train.shape)
print(y_test.shape)

(1022, 244)
(438, 228)
(1022,)
(438,)


In [11]:
dummy_cols = list(set(dummy_train) & set(dummy_test))

In [12]:
dummy_train = dummy_train[dummy_cols]
dummy_test = dummy_test[dummy_cols]

In [13]:
X_train = pd.concat([d_train[num_feat].fillna(-999),
                     dummy_train], axis=1)

X_test = pd.concat([d_test[num_feat].fillna(-999),
                     dummy_test], axis=1)

In [14]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1022, 257)
(438, 257)
(1022,)
(438,)


In [15]:
def calc_auc(y, y_pred, plot_label='', prin=True):
    fpr, tpr, _ = roc_curve(y, y_pred)
    auc_val = auc(fpr, tpr)
    if prin:
        print('ROC AUC: {0:.4f}'.format(auc_val))
    if plot_label:
        plt.plot(fpr, tpr, label=plot_label)
        plt.xlabel('FPR')
        plt.ylabel('TPR')
    return auc_val

In [16]:
X_train.tail(20).sort_index()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,MSZoning_RM,GarageFinish_RFn,GarageType_Basment,SaleCondition_AdjLand,Foundation_Stone,Electrical_FuseP,Exterior1st_AsbShng,Exterior1st_CemntBd,Exterior2nd_BrkFace,PoolQC_Gd
87,88,160,40.0,3951,6,5,2009,2009,76.0,0,...,0,1,0,0,0,0,0,0,0,0
174,175,20,47.0,12416,6,5,1986,1986,132.0,1398,...,0,0,0,0,0,0,0,0,0,0
277,278,20,140.0,19138,4,5,1951,1951,0.0,120,...,0,0,0,0,0,0,0,0,0,0
314,315,70,60.0,9600,7,7,1925,1990,0.0,16,...,1,0,0,0,0,0,0,0,0,0
537,538,20,-999.0,12735,4,5,1972,1972,0.0,600,...,0,0,0,0,0,0,0,0,0,0
551,552,20,50.0,6000,5,6,1957,1957,0.0,308,...,1,0,0,0,0,0,0,0,1,0
559,560,120,-999.0,3196,7,5,2003,2004,18.0,0,...,0,0,0,0,0,0,0,0,0,0
599,600,160,24.0,1950,6,6,1980,1980,0.0,81,...,1,0,0,0,0,0,0,0,0,0
600,601,60,74.0,10927,8,5,2005,2005,280.0,546,...,0,0,0,0,0,0,0,0,0,0
684,685,60,58.0,16770,7,5,1998,1998,30.0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
y_train.tail(20).sort_index()

87      164500
174     184000
277     141000
314     178000
537     111250
551     112500
559     234000
599     151000
600     275000
684     221000
705      55000
763     337000
835     128000
845     171000
849     187000
1033    230000
1094    129000
1216    112000
1383    112000
1420    179900
Name: SalePrice, dtype: int64

In [18]:
X_test.tail(20).sort_index()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,MSZoning_RM,GarageFinish_RFn,GarageType_Basment,SaleCondition_AdjLand,Foundation_Stone,Electrical_FuseP,Exterior1st_AsbShng,Exterior1st_CemntBd,Exterior2nd_BrkFace,PoolQC_Gd
10,11,20,70.0,11200,5,5,1965,1965,0.0,906,...,0,0,0,0,0,0,0,0,0,0
70,71,20,95.0,13651,7,6,1973,1973,1115.0,1880,...,0,0,0,0,0,0,0,0,0,0
195,196,160,24.0,2280,6,6,1976,1976,0.0,566,...,0,0,0,0,0,0,0,0,0,0
271,272,20,73.0,39104,7,7,1954,2005,0.0,226,...,0,0,0,0,0,0,0,0,0,0
320,321,60,111.0,16259,9,5,2006,2006,370.0,0,...,0,1,0,0,0,0,0,0,0,0
427,428,20,77.0,8593,4,6,1957,1957,0.0,288,...,0,0,0,0,0,0,0,0,0,0
445,446,20,73.0,9855,6,5,1956,1956,0.0,0,...,0,0,0,0,0,0,0,0,0,0
654,655,20,91.0,10437,8,6,1995,1995,660.0,1696,...,0,0,0,0,0,0,0,0,0,0
686,687,60,84.0,10207,7,6,2007,2007,0.0,0,...,0,0,0,0,0,0,0,0,0,0
692,693,60,42.0,26178,7,5,1989,1990,293.0,965,...,0,1,0,0,0,0,0,0,0,0


In [19]:
X_test.shape

(438, 257)

In [20]:
y_test.shape

(438,)

In [21]:
X_train.shape

(1022, 257)

In [22]:
y_train.shape

(1022,)

In [144]:
params = {'n_estimators': 100,
          'learning_rate': 0.1,
          'max_depth': 3,
          'min_child_weight': 1,
          'subsample': 1,
          'colsample_bytree': 1,
          'n_jobs': 4, 'objective': 'reg:linear'}
clf_xgb = xgb.XGBClassifier(**params)

clf_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=4, nthread=None, objective='multi:softprob', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [150]:
y_pred_xgb_test = clf_xgb.predict(X_test)

In [151]:
y_pred_xgb_test

array([268000, 152000, 140000, 149500, 112000, 138000, 153500, 178000,
       310000, 140000, 179900, 175500, 173000, 125000, 113000, 153000,
       260000,  93500, 142000, 149000, 140000, 144000, 129000, 189000,
       193000, 122000, 172500, 151000, 325000, 135000, 145000, 187000,
       144000, 226000, 395000, 180000, 226000, 129000, 236000, 244000,
       235000, 163000, 181134, 466500, 315000, 190000, 134900, 130000,
       139000, 108000, 214000, 140000, 175500,  84500, 260000, 110000,
       116000, 275000, 155000, 100000, 140000, 135000, 148000, 143000,
       173000, 159000, 135000, 212000, 117000, 222000, 176000, 136500,
       139000, 225000, 108000, 318000, 136500, 178000, 340000, 130000,
       147000, 130000, 129900, 156000, 135000, 224900, 117000, 202500,
       140000, 140000, 188000, 280000, 185000, 231500, 164500, 143000,
       213500, 180500, 120500, 100000, 194500, 219500,  89500, 148500,
       110000, 225000, 180000, 137000, 180000,  96500,  83000, 157900,
      

In [168]:
explained_variance_score(y_test, y_pred_xgb_test)

0.54739486037785645

In [23]:
def score(params):
    params['max_depth'] = int(params['max_depth'])
    params['n_jobs'] = 4
    print("Training with params : ", params)
    clf = xgb.XGBClassifier(**params)
    clf.fit(X_train, y_train)
    y_pred_xgb_test = clf.predict(X_test)
    score = explained_variance_score(y_test, y_pred_xgb_test)
    result = {'loss': 1-score, 'status': STATUS_OK}
    print('TEST EXPLAINED VARIANCE SCORE: {0:.4f}'.format(score))
    return result



space = {'max_depth' : hp.quniform('max_depth', 1, 10, 1),
         'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
         'subsample' : hp.quniform('subsample', 0.5, 1, 0.05),
         'gamma' : hp.quniform('gamma', 0.5, 1, 0.05),
         'colsample_bytree' : hp.quniform('colsample_bytree', 0.5, 1, 0.05),
         'silent' : 1,
         'n_estimators': 100,
         'learning_rate': 0.01,
         'objective': 'reg:linear'
         }
trials = Trials()

best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=100)

Training with params :  {'colsample_bytree': 0.9, 'gamma': 0.9500000000000001, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 6.0, 'n_estimators': 100, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.8500000000000001, 'n_jobs': 4}
TEST EXPLAINED VARIANCE SCORE: 0.0000
Training with params :  {'colsample_bytree': 0.7000000000000001, 'gamma': 0.8, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 8.0, 'n_estimators': 100, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.5, 'n_jobs': 4}
TEST EXPLAINED VARIANCE SCORE: 0.0000
Training with params :  {'colsample_bytree': 0.9500000000000001, 'gamma': 0.9500000000000001, 'learning_rate': 0.01, 'max_depth': 4, 'min_child_weight': 8.0, 'n_estimators': 100, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.8, 'n_jobs': 4}
TEST EXPLAINED VARIANCE SCORE: 0.0000
Training with params :  {'colsample_bytree': 0.6000000000000001, 'gamma': 0.9, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 8.0, 'n_estim

TEST EXPLAINED VARIANCE SCORE: 0.0000
Training with params :  {'colsample_bytree': 0.9500000000000001, 'gamma': 0.6000000000000001, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 3.0, 'n_estimators': 100, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.9500000000000001, 'n_jobs': 4}
TEST EXPLAINED VARIANCE SCORE: 0.3025
Training with params :  {'colsample_bytree': 0.8, 'gamma': 0.8, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 10.0, 'n_estimators': 100, 'objective': 'reg:linear', 'silent': 1, 'subsample': 1.0, 'n_jobs': 4}
TEST EXPLAINED VARIANCE SCORE: 0.0000
Training with params :  {'colsample_bytree': 0.7000000000000001, 'gamma': 0.5, 'learning_rate': 0.01, 'max_depth': 4, 'min_child_weight': 2.0, 'n_estimators': 100, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.9, 'n_jobs': 4}
TEST EXPLAINED VARIANCE SCORE: 0.4161
Training with params :  {'colsample_bytree': 0.9, 'gamma': 0.65, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 7.0

TEST EXPLAINED VARIANCE SCORE: 0.0000
Training with params :  {'colsample_bytree': 0.8, 'gamma': 0.5, 'learning_rate': 0.01, 'max_depth': 2, 'min_child_weight': 3.0, 'n_estimators': 100, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.65, 'n_jobs': 4}
TEST EXPLAINED VARIANCE SCORE: 0.0000
Training with params :  {'colsample_bytree': 0.65, 'gamma': 0.7000000000000001, 'learning_rate': 0.01, 'max_depth': 4, 'min_child_weight': 2.0, 'n_estimators': 100, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.75, 'n_jobs': 4}
TEST EXPLAINED VARIANCE SCORE: 0.3867
Training with params :  {'colsample_bytree': 0.8500000000000001, 'gamma': 0.75, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 8.0, 'n_estimators': 100, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.5, 'n_jobs': 4}
TEST EXPLAINED VARIANCE SCORE: 0.0000
Training with params :  {'colsample_bytree': 0.9500000000000001, 'gamma': 0.8, 'learning_rate': 0.01, 'max_depth': 6, 'min_child_weight': 1.0, 'n_estimato

TEST EXPLAINED VARIANCE SCORE: 0.4360
Training with params :  {'colsample_bytree': 0.9, 'gamma': 0.8500000000000001, 'learning_rate': 0.01, 'max_depth': 1, 'min_child_weight': 2.0, 'n_estimators': 100, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.9500000000000001, 'n_jobs': 4}
TEST EXPLAINED VARIANCE SCORE: 0.4024
Training with params :  {'colsample_bytree': 0.8, 'gamma': 0.7000000000000001, 'learning_rate': 0.01, 'max_depth': 7, 'min_child_weight': 7.0, 'n_estimators': 100, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.8500000000000001, 'n_jobs': 4}
TEST EXPLAINED VARIANCE SCORE: 0.0000
Training with params :  {'colsample_bytree': 0.7000000000000001, 'gamma': 0.75, 'learning_rate': 0.01, 'max_depth': 5, 'min_child_weight': 4.0, 'n_estimators': 100, 'objective': 'reg:linear', 'silent': 1, 'subsample': 1.0, 'n_jobs': 4}
TEST EXPLAINED VARIANCE SCORE: 0.0000
Training with params :  {'colsample_bytree': 0.75, 'gamma': 0.6000000000000001, 'learning_rate': 0.01, 'max_dept

In [24]:
best

{'colsample_bytree': 0.8,
 'gamma': 0.65,
 'max_depth': 7.0,
 'min_child_weight': 1.0,
 'subsample': 0.9500000000000001}

In [40]:
params = {'n_estimators': 100,
          'learning_rate': 0.1,
          'max_depth': 7,
          'min_child_weight': 1.0,
          'subsample': 0.9500000000000001,
          'colsample_bytree': 0.8,
          'n_jobs': 4,
          'gamma': 0.65,
          'objective': 'reg:linear'}
clf_xgb = xgb.XGBClassifier(**params)

clf_xgb.fit(X_train, y_train, eval_metric='rmse')

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.65, learning_rate=0.1,
       max_delta_step=0, max_depth=7, min_child_weight=1.0, missing=None,
       n_estimators=100, n_jobs=4, nthread=None,
       objective='multi:softprob', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.9500000000000001)

In [34]:
from sklearn.model_selection import KFold

In [36]:
kf = KFold(n_splits=10)

In [38]:
for train_index, test_index in kf.split(X):
      print("Train:", train_index, "Test:", test_index)
      X_train, X_test = X[train_index], X[test_index] 
      y_train, y_test = y[train_index], y[test_index]

Train: [ 146  147  148 ..., 1457 1458 1459] Test: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145]


KeyError: '[ 146  147  148 ..., 1457 1458 1459] not in index'

In [41]:
import xgbfir
xgbfir.saveXgbFI(clf_xgb, OutputXlsxFile='xgbfi_report.xlsx')

In [42]:
pd.read_excel('xgbfi_report.xlsx', sheet_name=0)

Unnamed: 0,Interaction,Gain,FScore,wFScore,Average wFScore,Average Gain,Expected Gain,Gain Rank,FScore Rank,wFScore Rank,Avg wFScore Rank,Avg Gain Rank,Expected Gain Rank,Average Rank,Average Tree Index,Average Tree Depth
0,GrLivArea,30.89848,8,8,1,3.86231,30.89848,1,1,1,1,6,1,1.833333,51.25,0
1,YearBuilt,21.82065,5,5,1,4.36413,21.82065,2,2,2,2,5,2,2.5,55.2,0
2,GarageArea,18.30627,4,4,1,4.576568,18.30627,3,3,3,3,4,3,3.166667,82.25,0
3,1stFlrSF,16.08063,2,2,1,8.040315,16.08063,4,7,7,4,3,4,4.833333,50.5,0
4,OpenPorchSF,11.5249,1,1,1,11.5249,11.5249,5,9,9,5,1,5,5.666667,56.0,0
5,TotalBsmtSF,10.041,3,3,1,3.347,10.041,6,6,6,6,7,6,6.166667,56.333333,0
6,FullBath,9.24341,1,1,1,9.24341,9.24341,7,10,10,7,2,7,7.166667,84.0,0
7,LotArea,8.04916,4,4,1,2.01229,8.04916,8,4,4,8,9,8,6.833333,49.0,0
8,Id,6.37231,4,4,1,1.593077,6.37231,9,5,5,9,12,9,8.166667,51.25,0
9,YearRemodAdd,4.01861,2,2,1,2.009305,4.01861,10,8,8,10,10,10,9.333333,78.0,0


In [43]:
pd.read_excel('xgbfi_report.xlsx', sheet_name=1)

Unnamed: 0,Interaction,Sum Leaf Values Left,Sum Leaf Values Right,Sum Leaf Covers Left,Sum Leaf Covers Right
0,GrLivArea,1.029002,-0.290038,8.14632,20.96053
1,YearBuilt,0.715596,-0.181372,5.09192,13.12565
2,GarageArea,0.598736,-0.14545,4.01124,10.59284
3,1stFlrSF,0.381388,-0.072342,2.10506,5.19698
4,OpenPorchSF,0.268767,-0.03352,1.66604,2.02251
5,TotalBsmtSF,0.364688,-0.108623,3.09567,7.82361
6,FullBath,0.236913,-0.033562,1.6135,2.03001
7,LotArea,0.349297,-0.144923,4.00748,10.454
8,Id,0.188212,-0.03409,5.63599,8.90429
9,YearRemodAdd,0.173715,-0.071281,2.35272,4.94182
