In [40]:
import numpy as np                   # array, vector, matrix calculations
import pandas as pd                  # DataFrame handling
import shap                          # for consistent, signed variable importance measurements
import xgboost as xgb                # gradient boosting machines (GBMs)
from xgboost import XGBRegressor

import matplotlib.pyplot as plt      # plotting
from sklearn.metrics import confusion_matrix
from termcolor import colored
pd.options.display.max_columns = 999 # enable display of all columns in notebook

# enables display of plots in notebook
%matplotlib inline

np.random.seed(12345)   

In [43]:
data = pd.read_csv('pracdata.csv') #import data

In [44]:
data.head()

Unnamed: 0,value_weighted_return_includesdistr,value_weighted_return_excludingdivi,equal_weighted_return_includesdistr,equal_weighted_return_excludingdivi,return_on_sp_composite_index,bm,pe_exi,ps,pcf,npm,opmad,gpm,cfm,roa,roe,aftret_eq,aftret_equity,gprof,debt_invcap,capital_ratio,invt_act,debt_ebitda,profit_lct,ocf_lct,quick_ratio,curr_ratio,at_turn,sale_invcap,rd_sale,ptb,price,sharevolume,numberofsharesoutstanding
0,-0.020873,-0.021981,-0.033876,-0.035031,-0.017741,0.405,-370.0,10.323,213.94099,-0.014,-0.235,0.705,0.037,-0.071,-0.006,-0.006,-0.006,0.273,0.001,0.001,0.032,-0.016,-0.751,0.123,7.305,7.543,0.388,0.43,0.448,4.207,0.568202,36930,30104
1,0.040322,0.03807,0.034319,0.032775,0.035186,0.277,74.0,7.25,67.921997,0.089,-0.114,0.717,0.124,-0.035,0.044,0.044,0.044,0.312,0.001,0.001,0.03,-0.033,-0.357,0.478,6.883,7.096,0.435,0.484,0.398,3.422,0.471292,32458,30106
2,0.040041,0.038935,0.076354,0.07515,0.025467,0.277,118.75,11.637,109.026,0.089,-0.114,0.717,0.124,-0.035,0.044,0.044,0.044,0.312,0.001,0.001,0.03,-0.033,-0.357,0.478,6.883,7.096,0.435,0.484,0.398,5.494,0.676694,121642,30114
3,-0.001639,-0.003553,0.004836,0.003476,0.000453,0.215,111.25,10.294,207.375,0.084,-0.058,0.745,0.112,-0.014,0.043,0.043,0.043,0.338,0.001,0.001,0.038,-0.081,-0.138,0.137,6.661,6.924,0.453,0.507,0.39,5.183,0.64836,63316,30120
4,0.019042,0.017575,0.036978,0.035232,0.011065,0.215,92.5,8.633,269.362,0.084,-0.058,0.745,0.112,-0.014,0.043,0.043,0.043,0.338,0.001,0.001,0.038,-0.081,-0.138,0.137,6.661,6.924,0.453,0.507,0.39,4.346,0.568202,43188,30378


In [46]:
data.columns = data.columns.str.strip() 
x =data.drop('price',axis=1) #define variables and y
X = list(x.columns)
y = 'price'
X

In [47]:
data[X + [y]].describe()

Unnamed: 0,value_weighted_return_includesdistr,value_weighted_return_excludingdivi,equal_weighted_return_includesdistr,equal_weighted_return_excludingdivi,return_on_sp_composite_index,bm,pe_exi,ps,pcf,npm,opmad,gpm,cfm,roa,roe,aftret_eq,aftret_equity,gprof,debt_invcap,capital_ratio,invt_act,debt_ebitda,profit_lct,ocf_lct,quick_ratio,curr_ratio,at_turn,sale_invcap,rd_sale,ptb,sharevolume,numberofsharesoutstanding,price
count,16348.0,16348.0,16348.0,16348.0,16348.0,15716.0,16288.0,16192.0,16328.0,16192.0,16192.0,16130.0,16192.0,16333.0,15758.0,16333.0,16333.0,16341.0,16195.0,16313.0,16329.0,16289.0,16341.0,16328.0,16341.0,16341.0,16184.0,16092.0,16341.0,15716.0,16348.0,16348.0,16348.0
mean,0.007724,0.005917,0.006888,0.005118,0.006202,0.417793,4.645026,18.419953,8.296831,-47.191414,-46.138202,-33.93929,-45.265887,-0.000123,-0.209422,0.417405,0.170386,0.259457,0.246413,0.260622,0.148103,1.013668,0.047561,-0.00746,3.680214,4.145731,0.892621,1.50758,29.644052,5.033,339607.5,220285.0,1.237128
std,0.042575,0.042537,0.050351,0.050308,0.04026,0.395574,99.78553,62.418722,44.174697,843.468187,858.842323,743.03418,814.841909,0.330285,1.967843,27.082047,14.786971,0.467488,1.932785,1.805058,0.157086,39.012155,2.435771,2.031517,3.736306,3.76857,0.864345,3.182188,665.215088,6.327933,870628.9,688669.2,0.635216
min,-0.184648,-0.186136,-0.205222,-0.206835,-0.169425,0.0,-1467.4,0.03,-294.39999,-29319.0,-30176.0,-29326.0,-28422.0,-2.475,-72.163002,-624.03998,-385.64001,-3.315,-129.03,-60.715,0.0,-679.78003,-22.486,-17.74,0.034,0.034,0.0,-128.17,0.0,0.155,81.0,1269.0,-1.100179
25%,-0.013988,-0.015553,-0.020645,-0.02248,-0.01508,0.188,-8.37525,1.237,-4.70175,-0.281,-0.218,0.116,-0.217,-0.112,-0.255,-0.243,-0.241,0.131,0.0,0.0,0.002,0.0,-0.623,-0.434,1.37,1.768,0.395,0.516,0.015,1.96775,29675.5,27470.75,0.768453
50%,0.00947,0.007586,0.008205,0.006786,0.008521,0.311,12.5825,3.265,10.2775,0.03,0.073,0.447,0.073,0.103,0.067,0.077,0.076,0.336,0.141,0.14,0.099,0.226,0.513,0.4,2.291,2.861,0.678,0.8795,0.08,3.262,98347.5,51724.0,1.32025
75%,0.031603,0.030466,0.031907,0.03049,0.030263,0.505,24.630249,8.27925,19.13975,0.116,0.182,0.676,0.173,0.178,0.164,0.179,0.178,0.459,0.344,0.344,0.24,1.742,1.12,0.876,4.538,5.085,1.049,1.47325,0.4,5.51425,281606.0,121439.5,1.727948
max,0.11403,0.112619,0.192762,0.190929,0.107723,6.065,774.66699,854.78302,340.75201,68.515999,0.948,0.993,71.889,1.853,30.809,1575.0699,832.75,3.45,42.476002,85.888,0.676,2328.1001,33.416,20.07,47.34,47.34,5.912,52.202999,25684.4,85.028999,18288410.0,8069536.0,3.002023


In [48]:
pd.DataFrame(data[X + [y]].corr()[y]).iloc[:-1] 

Unnamed: 0,price
value_weighted_return_includesdistr,-0.003887
value_weighted_return_excludingdivi,-0.00352
equal_weighted_return_includesdistr,-0.02052
equal_weighted_return_excludingdivi,-0.02002
return_on_sp_composite_index,0.001725
bm,-0.323862
pe_exi,0.206772
ps,-0.161753
pcf,0.188583
npm,0.045427


In [49]:
mono_constraints = tuple([int(i) for i in np.sign(data[X + [y]].corr()[y].values[:-1])])
mono_constraints

(-1,
 -1,
 -1,
 -1,
 1,
 -1,
 1,
 -1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 -1,
 -1,
 1,
 -1,
 -1,
 1,
 1,
 1,
 1,
 -1,
 -1,
 1,
 1,
 -1,
 1,
 1,
 1)

In [50]:
np.random.seed(666) # set random seed for reproducibility
split_ratio = 0.8     # 80%/20% train/test split

# execute split
split = np.random.rand(len(data)) < split_ratio
train = data[split]
test = data[~split]

# summarize split
print('Train data rows = %d, columns = %d' % (train.shape[0], train.shape[1]))
print('Test data rows = %d, columns = %d' % (test.shape[0], test.shape[1]))

Train data rows = 13065, columns = 33
Test data rows = 3283, columns = 33


In [51]:
dtrain = xgb.DMatrix(train[X], train[y])
dtest = xgb.DMatrix(test[X], test[y])

In [52]:
base_y = train[y].mean()

# tuning parameters
params = {
    'objective': 'reg:linear', 
    'n_estimators': 500, 
    'learning_rate': 0.05, 
    'min_child_weight' : 4,
    'max_depth': 15,                            # allow decision trees to grow to depth of 15
    'monotone_constraints':mono_constraints,    # 1 = increasing relationship, -1 = decreasing relationship
    'base_score': base_y,                       # calibrate predictions to mean of y 
    'seed': 12345                               # set random seed for reproducibility
}

# watchlist is used for early stopping
watchlist = [(dtrain, 'train'), (dtest, 'eval')]

# train model
xgb_model = xgb.train(params,                   # set tuning parameters from above                   
                      dtrain,                   # training data
                      1000,                     # maximum of 1000 iterations (trees)
                      evals=watchlist,          # use watchlist for early stopping 
                      early_stopping_rounds=50, # stop after 50 iterations (trees) without increase in rmse
                      verbose_eval=True)  

[0]	train-rmse:0.617992	eval-rmse:0.610361
Multiple eval metrics have been passed: 'eval-rmse' will be used for early stopping.

Will train until eval-rmse hasn't improved in 50 rounds.
[1]	train-rmse:0.60091	eval-rmse:0.593808
[2]	train-rmse:0.584513	eval-rmse:0.578144
[3]	train-rmse:0.568951	eval-rmse:0.56276
[4]	train-rmse:0.554386	eval-rmse:0.548936
[5]	train-rmse:0.541104	eval-rmse:0.536078
[6]	train-rmse:0.528004	eval-rmse:0.523651
[7]	train-rmse:0.516088	eval-rmse:0.512095
[8]	train-rmse:0.504519	eval-rmse:0.501095
[9]	train-rmse:0.493738	eval-rmse:0.490928
[10]	train-rmse:0.483524	eval-rmse:0.481042
[11]	train-rmse:0.474224	eval-rmse:0.472235
[12]	train-rmse:0.465092	eval-rmse:0.463558
[13]	train-rmse:0.457148	eval-rmse:0.456143
[14]	train-rmse:0.449411	eval-rmse:0.448875
[15]	train-rmse:0.442313	eval-rmse:0.442203
[16]	train-rmse:0.435404	eval-rmse:0.435616
[17]	train-rmse:0.42917	eval-rmse:0.429657
[18]	train-rmse:0.423221	eval-rmse:0.424196
[19]	train-rmse:0.417201	eval-rmse

[180]	train-rmse:0.272556	eval-rmse:0.293066
[181]	train-rmse:0.272405	eval-rmse:0.292923
[182]	train-rmse:0.272321	eval-rmse:0.292851
[183]	train-rmse:0.272232	eval-rmse:0.292778
[184]	train-rmse:0.272101	eval-rmse:0.292642
[185]	train-rmse:0.271998	eval-rmse:0.292544
[186]	train-rmse:0.27191	eval-rmse:0.292474
[187]	train-rmse:0.271706	eval-rmse:0.292312
[188]	train-rmse:0.271522	eval-rmse:0.292174
[189]	train-rmse:0.271443	eval-rmse:0.292149
[190]	train-rmse:0.271315	eval-rmse:0.29204
[191]	train-rmse:0.271196	eval-rmse:0.291918
[192]	train-rmse:0.271065	eval-rmse:0.291777
[193]	train-rmse:0.270726	eval-rmse:0.2915
[194]	train-rmse:0.270446	eval-rmse:0.291272
[195]	train-rmse:0.270346	eval-rmse:0.291203
[196]	train-rmse:0.270279	eval-rmse:0.291174
[197]	train-rmse:0.270182	eval-rmse:0.29112
[198]	train-rmse:0.270095	eval-rmse:0.291042
[199]	train-rmse:0.269985	eval-rmse:0.290941
[200]	train-rmse:0.269821	eval-rmse:0.290822
[201]	train-rmse:0.269725	eval-rmse:0.290705
[202]	train-rms

[364]	train-rmse:0.25708	eval-rmse:0.280939
[365]	train-rmse:0.257058	eval-rmse:0.280934
[366]	train-rmse:0.256951	eval-rmse:0.28089
[367]	train-rmse:0.256886	eval-rmse:0.280907
[368]	train-rmse:0.256848	eval-rmse:0.280882
[369]	train-rmse:0.256838	eval-rmse:0.280875
[370]	train-rmse:0.256808	eval-rmse:0.280865
[371]	train-rmse:0.256785	eval-rmse:0.280843
[372]	train-rmse:0.256725	eval-rmse:0.280806
[373]	train-rmse:0.25665	eval-rmse:0.280796
[374]	train-rmse:0.256589	eval-rmse:0.280779
[375]	train-rmse:0.256522	eval-rmse:0.280713
[376]	train-rmse:0.256505	eval-rmse:0.280705
[377]	train-rmse:0.256488	eval-rmse:0.280694
[378]	train-rmse:0.256471	eval-rmse:0.280686
[379]	train-rmse:0.25637	eval-rmse:0.28059
[380]	train-rmse:0.256338	eval-rmse:0.280567
[381]	train-rmse:0.256312	eval-rmse:0.280542
[382]	train-rmse:0.256299	eval-rmse:0.280536
[383]	train-rmse:0.256263	eval-rmse:0.280509
[384]	train-rmse:0.256232	eval-rmse:0.280556
[385]	train-rmse:0.256201	eval-rmse:0.280557
[386]	train-rms

[547]	train-rmse:0.250473	eval-rmse:0.277324
[548]	train-rmse:0.250462	eval-rmse:0.277318
[549]	train-rmse:0.250432	eval-rmse:0.277301
[550]	train-rmse:0.250411	eval-rmse:0.277289
[551]	train-rmse:0.250398	eval-rmse:0.277284
[552]	train-rmse:0.250327	eval-rmse:0.277218
[553]	train-rmse:0.250284	eval-rmse:0.277224
[554]	train-rmse:0.25027	eval-rmse:0.277215
[555]	train-rmse:0.250177	eval-rmse:0.277119
[556]	train-rmse:0.250164	eval-rmse:0.277117
[557]	train-rmse:0.25015	eval-rmse:0.277099
[558]	train-rmse:0.250105	eval-rmse:0.277082
[559]	train-rmse:0.250072	eval-rmse:0.277076
[560]	train-rmse:0.250021	eval-rmse:0.277055
[561]	train-rmse:0.249978	eval-rmse:0.27706
[562]	train-rmse:0.249966	eval-rmse:0.277052
[563]	train-rmse:0.249949	eval-rmse:0.277042
[564]	train-rmse:0.249936	eval-rmse:0.277032
[565]	train-rmse:0.249926	eval-rmse:0.277026
[566]	train-rmse:0.249913	eval-rmse:0.277015
[567]	train-rmse:0.249892	eval-rmse:0.277
[568]	train-rmse:0.249853	eval-rmse:0.276994
[569]	train-rmse

[731]	train-rmse:0.246049	eval-rmse:0.275219
[732]	train-rmse:0.246043	eval-rmse:0.275212
[733]	train-rmse:0.246019	eval-rmse:0.275194
[734]	train-rmse:0.246016	eval-rmse:0.275194
[735]	train-rmse:0.245932	eval-rmse:0.275122
[736]	train-rmse:0.245912	eval-rmse:0.275117
[737]	train-rmse:0.245872	eval-rmse:0.275102
[738]	train-rmse:0.245866	eval-rmse:0.275097
[739]	train-rmse:0.245842	eval-rmse:0.275087
[740]	train-rmse:0.245802	eval-rmse:0.275076
[741]	train-rmse:0.245776	eval-rmse:0.275074
[742]	train-rmse:0.245702	eval-rmse:0.275028
[743]	train-rmse:0.245687	eval-rmse:0.275017
[744]	train-rmse:0.245684	eval-rmse:0.275016
[745]	train-rmse:0.245677	eval-rmse:0.275011
[746]	train-rmse:0.24567	eval-rmse:0.275011
[747]	train-rmse:0.245665	eval-rmse:0.275008
[748]	train-rmse:0.245656	eval-rmse:0.27501
[749]	train-rmse:0.245636	eval-rmse:0.274987
[750]	train-rmse:0.245566	eval-rmse:0.274906
[751]	train-rmse:0.245217	eval-rmse:0.274656
[752]	train-rmse:0.245167	eval-rmse:0.274648
[753]	train-

[914]	train-rmse:0.24169	eval-rmse:0.272849
[915]	train-rmse:0.241686	eval-rmse:0.272846
[916]	train-rmse:0.241674	eval-rmse:0.27283
[917]	train-rmse:0.241644	eval-rmse:0.272828
[918]	train-rmse:0.241598	eval-rmse:0.272804
[919]	train-rmse:0.241577	eval-rmse:0.272799
[920]	train-rmse:0.241569	eval-rmse:0.27279
[921]	train-rmse:0.241552	eval-rmse:0.272763
[922]	train-rmse:0.241485	eval-rmse:0.272706
[923]	train-rmse:0.241472	eval-rmse:0.272718
[924]	train-rmse:0.24147	eval-rmse:0.272717
[925]	train-rmse:0.241463	eval-rmse:0.27272
[926]	train-rmse:0.241443	eval-rmse:0.272699
[927]	train-rmse:0.241434	eval-rmse:0.272698
[928]	train-rmse:0.241427	eval-rmse:0.272701
[929]	train-rmse:0.241406	eval-rmse:0.272694
[930]	train-rmse:0.241383	eval-rmse:0.272668
[931]	train-rmse:0.241364	eval-rmse:0.272675
[932]	train-rmse:0.24135	eval-rmse:0.272654
[933]	train-rmse:0.241335	eval-rmse:0.272652
[934]	train-rmse:0.241324	eval-rmse:0.272643
[935]	train-rmse:0.241312	eval-rmse:0.272625
[936]	train-rmse

In [53]:
predictions = xgb_model.predict(dtest)

In [54]:
from sklearn.metrics import r2_score
r2_score(test[y],predictions)

0.8127371426439756