In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

  from pandas import MultiIndex, Int64Index


In [2]:
# Read the insurance data set and report the number of missing values per column.
df = pd.read_csv(filepath_or_buffer='insurance.csv')
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [3]:
# Preview the first five rows to see the raw column values.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [65]:
# NOTE(review): this cell carries execution count 65, i.e. it was run out of
# order. The displayed row count presumably reflects `df` after the outlier
# filters further down, not the freshly loaded frame - re-check after
# Restart & Run All.
df.shape

(1327, 7)

In [4]:
# Smallest value of the regression target.
df['charges'].min()

1121.8739

In [5]:
# Largest value of the regression target.
df['charges'].max()

63770.42801

In [6]:
# Upper outlier cutoff for charges: mean plus three standard deviations.
df.charges.mean() + 3 * df.charges.std()

49600.45597522324

In [7]:
# Lower outlier cutoff for charges: mean minus three standard deviations.
# It comes out negative, so no row can fall below it.
df.charges.mean() - 3 * df.charges.std()

-23059.611444940725

In [8]:
# Rows whose charges exceed the mean + 3*std cutoff.
df.loc[df.charges > df.charges.mean() + 3 * df.charges.std()]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
34,28,male,36.4,1,yes,southwest,51194.55914
543,54,female,47.41,0,yes,southeast,63770.42801
577,31,female,38.095,1,yes,northeast,58571.07448
819,33,female,35.53,0,yes,northwest,55135.40209
1146,60,male,32.8,0,yes,southwest,52590.82939
1230,52,male,34.485,3,yes,northwest,60021.39897
1300,45,male,30.36,0,yes,southeast,62592.87309


In [9]:
# Keep only rows strictly below the charges cutoff (mean + 3*std).
# Caution: the cutoff is computed from the current `df`, so re-running this
# cell recomputes it on the already-filtered frame and may drop more rows.
df = df.loc[df.charges < df.charges.mean() + 3 * df.charges.std()].copy()

In [10]:
# Smallest BMI value in the (charges-filtered) frame.
df['bmi'].min()

15.96

In [11]:
# Largest BMI value in the (charges-filtered) frame.
df['bmi'].max()

53.13

In [12]:
# Upper outlier cutoff for BMI: mean plus three standard deviations.
df.bmi.mean() + 3 * df.bmi.std()

48.899361817599484

In [13]:
# Rows whose BMI exceeds the mean + 3*std cutoff.
df.loc[df.bmi > df.bmi.mean() + 3 * df.bmi.std()]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
116,58,male,49.06,0,no,southeast,11381.3254
847,23,male,50.38,1,no,southeast,2438.0552
1047,22,male,52.58,1,yes,southeast,44501.3982
1317,18,male,53.13,0,no,southeast,1163.4627


In [14]:
# Keep only rows strictly below the BMI cutoff (mean + 3*std).
# Caution: like the charges filter, this is not idempotent - a second run
# recomputes the cutoff on the already-filtered frame.
df = df.loc[df.bmi < df.bmi.mean() + 3 * df.bmi.std()].copy()

In [15]:
# Inspect column dtypes to find the text (object) columns that still need
# encoding before modelling.
df.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [16]:
# Number of distinct values in each text column.
df.loc[:, ['sex', 'smoker', 'region']].nunique()

sex       2
smoker    2
region    4
dtype: int64

In [17]:
# Encode the two binary text columns as integers (female/yes -> 1, male/no -> 0).
# The result is assigned back to the column instead of calling
# `Series.replace(..., inplace=True)` on `df.sex`: in-place mutation through
# attribute access is deprecated chained assignment and silently does nothing
# under pandas Copy-on-Write.
# `.astype('int64')` makes the cell fail loudly if an unexpected label (or a
# second run on already-encoded data) produces NaN.
df['sex'] = df['sex'].map({'female': 1, 'male': 0}).astype('int64')
df['smoker'] = df['smoker'].map({'yes': 1, 'no': 0}).astype('int64')

In [18]:
# Re-check dtypes after encoding: sex and smoker should now be integer columns,
# while region is still text (it is one-hot encoded later by DictVectorizer).
df.dtypes

age           int64
sex           int64
bmi         float64
children      int64
smoker        int64
region       object
charges     float64
dtype: object

In [19]:
# Split off the target and turn the remaining columns into a dense feature matrix.
# DictVectorizer passes numeric values through and one-hot encodes string
# values (here: region).
y = df['charges'].values
dicts = df.drop(columns=['charges']).to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X = dv.fit_transform(dicts)

In [20]:
# 60/20/20 train/validation/test split: hold out 20% for test first, then
# take 25% of the remaining 80% as validation.
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_full_train, y_full_train, test_size=0.25, random_state=42
)

In [21]:
# Baseline: ordinary least squares, scored by RMSE on the validation split.
model_1 = LinearRegression().fit(X_train, y_train)
y_pred = model_1.predict(X_val)
np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))

5690.67111845036

In [22]:
%%capture --no-display
# Grid search over forest size and tree depth, recording train/validation RMSE.
# Results are collected in a plain list and turned into a DataFrame once:
# `DataFrame.append` was removed in pandas 2.0 and copied the whole frame on
# every iteration. As a side effect `depth` and `estimators` are now stored
# as integers instead of floats.
estimators = list(range(10, 251, 10))  # 10, 20, ..., 250
depth = [2, 3, 4, 5, 10, 15, 20, 25]
records = []
for d in depth:
    for n in estimators:
        rf = RandomForestRegressor(n_estimators=n, max_depth=d, random_state=42)
        rf.fit(X_train, y_train)
        y_pred_train = rf.predict(X_train)
        y_pred_val = rf.predict(X_val)
        RMSE_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
        RMSE_val = np.sqrt(mean_squared_error(y_val, y_pred_val))
        records.append({
            'depth': d,
            'estimators': n,
            'RMSE_train': round(RMSE_train, 5),
            'RMSE_val': round(RMSE_val, 5),
        })
# Row order (and therefore the 0..N-1 index) matches the loop order above.
best = pd.DataFrame(records, columns=["depth", "estimators", "RMSE_train", "RMSE_val"])

In [23]:
# Rank the grid results, lowest validation RMSE first.
best.sort_values(by='RMSE_val', ascending=True)

Unnamed: 0,depth,estimators,RMSE_train,RMSE_val
52,4.0,30.0,4102.05223,4097.88626
50,4.0,10.0,4132.82292,4098.07851
58,4.0,90.0,4104.10742,4100.92284
73,4.0,240.0,4107.16825,4101.51869
72,4.0,230.0,4107.88344,4101.64279
...,...,...,...,...
17,2.0,180.0,4929.40469,4574.69440
100,10.0,10.0,2201.39248,4689.28490
125,15.0,10.0,2098.38824,4785.28064
175,25.0,10.0,2096.31432,4788.24653


In [24]:
# Refit the forest with the best grid setting (depth 4, 30 trees).
model_2 = RandomForestRegressor(
    n_estimators=30,
    max_depth=4,
    random_state=42,
)
model_2.fit(X=X_train, y=y_train)

In [25]:
# Feature importances of the tuned forest, least important first.
# `get_feature_names_out()` replaces `get_feature_names()`, which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
pd.Series(model_2.feature_importances_, index=dv.get_feature_names_out()).sort_values()



region=southeast    0.000000
region=southwest    0.000000
region=northwest    0.000061
sex                 0.000142
region=northeast    0.001923
children            0.011885
age                 0.136407
bmi                 0.166050
smoker              0.683531
dtype: float64

In [326]:
# Wrap the train/validation splits in DMatrix objects for xgboost.
# `get_feature_names_out()` replaces the removed `get_feature_names()`
# (gone since scikit-learn 1.2); it returns an ndarray, so convert it to the
# plain list of strings that DMatrix expects.
feature_names = list(dv.get_feature_names_out())
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=feature_names)
# Evaluation sets that xgb.train reports after every boosting round.
watchlist = [(dtrain, 'train'), (dval, 'val')]
# Column layout of the frames produced from the captured training log.
columns = ['eta', 'depth', 'iter_num', 'train_rmse', 'val_rmse']
total = pd.DataFrame(columns=columns)
def parse_xgb_output(output, e, d):
    """Turn a captured xgboost evaluation log into a DataFrame.

    Each metric line is expected to look like
    ``[12]<TAB>train-rmse:4104.7<TAB>val-rmse:4067.6``. Lines that do not
    match this shape (blank lines, warnings, an empty capture) are skipped
    instead of raising.

    Parameters
    ----------
    output : object with a ``stdout`` string attribute
        Typically the result of an IPython ``%%capture output`` cell.
    e : float
        Learning rate (eta) used for the run; copied into every row.
    d : int
        ``max_depth`` used for the run; copied into every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``eta, depth, iter_num, train_rmse, val_rmse``, one row per
        parsed boosting round (empty if nothing could be parsed).
    """
    # Defined locally so the function does not depend on a notebook global.
    result_columns = ['eta', 'depth', 'iter_num', 'train_rmse', 'val_rmse']
    results = []

    for line in output.stdout.splitlines():
        parts = line.strip().split('\t')
        # A metric line has exactly three tab-separated fields: "[iter]", train, val.
        if len(parts) != 3 or not parts[0].startswith('['):
            continue
        it_line, train_line, val_line = parts

        try:
            it = int(it_line.strip('[]'))
            train = float(train_line.rpartition(':')[2])
            val = float(val_line.rpartition(':')[2])
        except ValueError:
            # Three fields but not numeric: not an evaluation line, ignore it.
            continue

        results.append((e, d, it, train, val))

    return pd.DataFrame(results, columns=result_columns)



In [327]:
%%capture output

# Depth sweep, max_depth = 1, learning rate fixed at 0.3.
# The original cell built a list of candidate eta values that was never used
# (the params hard-coded 0.3) and carried a commented-out DataFrame.append
# call; both are removed and the learning rate is named explicitly.
eta = 0.3
depth = 1
xgb_params = {
    'eta': eta,
    'max_depth': depth,
    'min_child_weight': 1,

    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'rmse',
    'seed': 42,
    'verbosity': 1,
}
# The per-round train/val RMSE log is printed to stdout and picked up by the
# %%capture magic at the top of the cell.
xgb.train(xgb_params, dtrain, evals=watchlist, num_boost_round=100)

In [329]:
# Five boosting rounds with the lowest validation RMSE for max_depth = 1.
(
    parse_xgb_output(output, 0.3, depth)
    .sort_values('val_rmse')
    .head(5)
)

Unnamed: 0,eta,depth,iter_num,train_rmse,val_rmse
42,0.3,1,42,5561.85791,5576.93213
59,0.3,1,59,5526.39502,5578.02148
46,0.3,1,46,5550.63916,5578.15869
41,0.3,1,41,5564.9043,5578.32275
47,0.3,1,47,5547.85352,5578.3457


In [331]:
%%capture output

# Depth sweep, max_depth = 2 (eta fixed at 0.3, all other settings unchanged).
depth = 2
xgb_params = dict(
    eta=0.3,
    max_depth=depth,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)
xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100, evals=watchlist)

In [332]:
# Five boosting rounds with the lowest validation RMSE for max_depth = 2.
(
    parse_xgb_output(output, 0.3, depth)
    .sort_values('val_rmse')
    .head(5)
)

Unnamed: 0,eta,depth,iter_num,train_rmse,val_rmse
17,0.3,2,17,4154.67676,4062.11743
15,0.3,2,15,4178.65088,4063.70239
16,0.3,2,16,4164.64746,4065.93164
18,0.3,2,18,4137.96924,4067.87866
19,0.3,2,19,4123.83252,4071.80029


In [333]:
%%capture output

# Depth sweep, max_depth = 3 (eta fixed at 0.3, all other settings unchanged).
depth = 3
xgb_params = dict(
    eta=0.3,
    max_depth=depth,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)
xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100, evals=watchlist)

In [334]:
# Five boosting rounds with the lowest validation RMSE for max_depth = 3.
(
    parse_xgb_output(output, 0.3, depth)
    .sort_values('val_rmse')
    .head(5)
)

Unnamed: 0,eta,depth,iter_num,train_rmse,val_rmse
13,0.3,3,13,4025.43018,4100.24072
14,0.3,3,14,4008.03271,4103.56152
15,0.3,3,15,3990.25098,4110.43799
10,0.3,3,10,4109.35938,4117.20166
11,0.3,3,11,4083.11792,4119.26807


In [335]:
%%capture output

# Depth sweep, max_depth = 4 (eta fixed at 0.3, all other settings unchanged).
depth = 4
xgb_params = dict(
    eta=0.3,
    max_depth=depth,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)
xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100, evals=watchlist)

In [336]:
# Five boosting rounds with the lowest validation RMSE for max_depth = 4.
(
    parse_xgb_output(output, 0.3, depth)
    .sort_values('val_rmse')
    .head(5)
)

Unnamed: 0,eta,depth,iter_num,train_rmse,val_rmse
12,0.3,4,12,3758.92163,4159.74072
11,0.3,4,11,3803.05005,4168.61719
13,0.3,4,13,3719.99512,4168.97656
9,0.3,4,9,3948.22021,4170.62695
10,0.3,4,10,3873.76147,4176.27051


In [337]:
%%capture output

# Depth sweep, max_depth = 5 (eta fixed at 0.3, all other settings unchanged).
depth = 5
xgb_params = dict(
    eta=0.3,
    max_depth=depth,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)
xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100, evals=watchlist)

In [338]:
# Five boosting rounds with the lowest validation RMSE for max_depth = 5.
(
    parse_xgb_output(output, 0.3, depth)
    .sort_values('val_rmse')
    .head(5)
)

Unnamed: 0,eta,depth,iter_num,train_rmse,val_rmse
10,0.3,5,10,3547.25879,4188.44678
9,0.3,5,9,3619.03491,4207.75732
12,0.3,5,12,3382.25171,4217.08154
11,0.3,5,11,3460.66309,4220.9873
8,0.3,5,8,3710.88135,4222.85645


In [339]:
%%capture output

# Depth sweep, max_depth = 6 (eta fixed at 0.3, all other settings unchanged).
depth = 6
xgb_params = dict(
    eta=0.3,
    max_depth=depth,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)
xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100, evals=watchlist)

In [340]:
# Five boosting rounds with the lowest validation RMSE for max_depth = 6.
(
    parse_xgb_output(output, 0.3, depth)
    .sort_values('val_rmse')
    .head(5)
)

Unnamed: 0,eta,depth,iter_num,train_rmse,val_rmse
10,0.3,6,10,2986.36108,4297.46387
9,0.3,6,9,3116.78052,4301.73828
11,0.3,6,11,2940.16846,4310.84619
12,0.3,6,12,2857.03687,4327.67822
8,0.3,6,8,3260.45679,4352.2627


In [341]:
%%capture output

# Depth sweep, max_depth = 7 (eta fixed at 0.3, all other settings unchanged).
depth = 7
xgb_params = dict(
    eta=0.3,
    max_depth=depth,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)
xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100, evals=watchlist)

In [342]:
# Five boosting rounds with the lowest validation RMSE for max_depth = 7.
(
    parse_xgb_output(output, 0.3, depth)
    .sort_values('val_rmse')
    .head(5)
)

Unnamed: 0,eta,depth,iter_num,train_rmse,val_rmse
11,0.3,7,11,2340.68115,4488.66016
10,0.3,7,10,2416.40796,4492.34228
12,0.3,7,12,2244.89429,4495.0459
9,0.3,7,9,2643.61938,4513.22803
13,0.3,7,13,2167.08667,4516.96143


In [343]:
%%capture output

# Depth sweep, max_depth = 8 (eta fixed at 0.3, all other settings unchanged).
depth = 8
xgb_params = dict(
    eta=0.3,
    max_depth=depth,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)
xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100, evals=watchlist)

In [344]:
# Five boosting rounds with the lowest validation RMSE for max_depth = 8.
(
    parse_xgb_output(output, 0.3, depth)
    .sort_values('val_rmse')
    .head(5)
)

Unnamed: 0,eta,depth,iter_num,train_rmse,val_rmse
8,0.3,8,8,2389.61231,4617.20068
9,0.3,8,9,2161.20703,4632.82666
7,0.3,8,7,2653.9436,4642.00537
10,0.3,8,10,1959.61926,4645.19434
11,0.3,8,11,1820.22034,4645.63281


In [345]:
%%capture output

# Depth sweep, max_depth = 9 (eta fixed at 0.3, all other settings unchanged).
depth = 9
xgb_params = dict(
    eta=0.3,
    max_depth=depth,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)
xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100, evals=watchlist)

In [346]:
# Five boosting rounds with the lowest validation RMSE for max_depth = 9.
(
    parse_xgb_output(output, 0.3, depth)
    .sort_values('val_rmse')
    .head(5)
)

Unnamed: 0,eta,depth,iter_num,train_rmse,val_rmse
10,0.3,9,10,1581.48828,4763.48242
11,0.3,9,11,1426.54285,4778.06641
9,0.3,9,9,1761.04236,4788.875
8,0.3,9,8,2044.08826,4796.21045
12,0.3,9,12,1309.18298,4808.29053


In [347]:
%%capture output

# Depth sweep, max_depth = 10 (eta fixed at 0.3, all other settings unchanged).
depth = 10
xgb_params = dict(
    eta=0.3,
    max_depth=depth,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)
xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100, evals=watchlist)

In [348]:
# Five boosting rounds with the lowest validation RMSE for max_depth = 10.
(
    parse_xgb_output(output, 0.3, depth)
    .sort_values('val_rmse')
    .head(5)
)

Unnamed: 0,eta,depth,iter_num,train_rmse,val_rmse
8,0.3,10,8,1908.07361,4839.16357
7,0.3,10,7,2258.62061,4856.93603
9,0.3,10,9,1658.07202,4858.146
10,0.3,10,10,1437.83569,4865.09863
11,0.3,10,11,1231.18811,4874.62695


In [349]:
%%capture output

# Learning-rate sweep, eta = 0.01, with max_depth reset to 2 (the best depth
# from the sweep above).
eta = 0.01
depth = 2
xgb_params = dict(
    eta=eta,
    max_depth=depth,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)
xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100, evals=watchlist)

In [350]:
# Five boosting rounds with the lowest validation RMSE for eta = 0.01.
(
    parse_xgb_output(output, eta, depth)
    .sort_values('val_rmse')
    .head(5)
)

Unnamed: 0,eta,depth,iter_num,train_rmse,val_rmse
99,0.01,2,99,7890.63916,7601.27002
98,0.01,2,98,7940.30713,7651.41699
97,0.01,2,97,7990.63525,7700.6084
96,0.01,2,96,8041.61475,7751.31494
95,0.01,2,95,8093.2666,7801.16846


In [351]:
%%capture output

# Learning-rate sweep, eta = 0.05 (`depth` is carried over from the earlier cell).
eta = 0.05
xgb_params = dict(
    eta=eta,
    max_depth=depth,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)
xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100, evals=watchlist)

In [352]:
# Five boosting rounds with the lowest validation RMSE for eta = 0.05.
(
    parse_xgb_output(output, eta, depth)
    .sort_values('val_rmse')
    .head(5)
)

Unnamed: 0,eta,depth,iter_num,train_rmse,val_rmse
98,0.05,2,98,4223.85352,4123.12158
99,0.05,2,99,4221.2749,4123.51514
97,0.05,2,97,4230.86084,4131.13281
95,0.05,2,95,4238.69043,4131.44531
94,0.05,2,94,4242.08057,4131.64404


In [353]:
%%capture output

# Learning-rate sweep, eta = 0.1 (`depth` is carried over from the earlier cell).
eta = 0.1
xgb_params = dict(
    eta=eta,
    max_depth=depth,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)
xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100, evals=watchlist)

In [354]:
# Five boosting rounds with the lowest validation RMSE for eta = 0.1.
(
    parse_xgb_output(output, eta, depth)
    .sort_values('val_rmse')
    .head(5)
)

Unnamed: 0,eta,depth,iter_num,train_rmse,val_rmse
70,0.1,2,70,4104.75928,4067.69165
71,0.1,2,71,4102.85791,4068.67481
69,0.1,2,69,4106.3418,4069.46558
68,0.1,2,68,4109.64941,4069.75952
64,0.1,2,64,4124.81152,4069.99829


In [355]:
%%capture output

# Learning-rate sweep, eta = 0.15 (`depth` is carried over from the earlier cell).
eta = 0.15
xgb_params = dict(
    eta=eta,
    max_depth=depth,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)
xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100, evals=watchlist)

In [356]:
# Five boosting rounds with the lowest validation RMSE for eta = 0.15.
(
    parse_xgb_output(output, eta, depth)
    .sort_values('val_rmse')
    .head(5)
)

Unnamed: 0,eta,depth,iter_num,train_rmse,val_rmse
43,0.15,2,43,4106.56006,4058.47461
48,0.15,2,48,4082.49707,4059.97339
46,0.15,2,46,4090.16235,4060.0542
44,0.15,2,44,4100.78955,4061.57959
50,0.15,2,50,4075.68066,4061.66309


In [357]:
%%capture output

# Learning-rate sweep, eta = 0.2 (`depth` is carried over from the earlier cell).
eta = 0.2
xgb_params = dict(
    eta=eta,
    max_depth=depth,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)
xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100, evals=watchlist)

In [358]:
# Five boosting rounds with the lowest validation RMSE for eta = 0.2.
(
    parse_xgb_output(output, eta, depth)
    .sort_values('val_rmse')
    .head(5)
)

Unnamed: 0,eta,depth,iter_num,train_rmse,val_rmse
39,0.2,2,39,4089.74805,4076.13379
40,0.2,2,40,4084.39307,4077.40698
31,0.2,2,31,4135.69922,4078.84668
35,0.2,2,35,4109.5791,4078.91626
27,0.2,2,27,4180.20947,4079.53271


In [359]:
%%capture output

# Learning-rate sweep, eta = 0.25 (`depth` is carried over from the earlier cell).
eta = 0.25
xgb_params = dict(
    eta=eta,
    max_depth=depth,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)
xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100, evals=watchlist)

In [360]:
# Five boosting rounds with the lowest validation RMSE for eta = 0.25.
(
    parse_xgb_output(output, eta, depth)
    .sort_values('val_rmse')
    .head(5)
)

Unnamed: 0,eta,depth,iter_num,train_rmse,val_rmse
32,0.25,2,32,4044.39282,4048.00464
30,0.25,2,30,4056.70386,4048.33325
31,0.25,2,31,4049.80322,4051.82764
25,0.25,2,25,4109.74316,4051.87793
24,0.25,2,24,4119.70166,4054.35327


In [361]:
%%capture output

# Learning-rate sweep, eta = 0.35 (`depth` is carried over from the earlier cell).
eta = 0.35
xgb_params = dict(
    eta=eta,
    max_depth=depth,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)
xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100, evals=watchlist)

In [362]:
# Five boosting rounds with the lowest validation RMSE for eta = 0.35.
(
    parse_xgb_output(output, eta, depth)
    .sort_values('val_rmse')
    .head(5)
)

Unnamed: 0,eta,depth,iter_num,train_rmse,val_rmse
17,0.35,2,17,4120.29785,4066.99976
16,0.35,2,16,4139.69629,4073.18286
19,0.35,2,19,4100.26367,4074.95044
18,0.35,2,18,4112.38135,4075.25293
20,0.35,2,20,4087.20557,4082.02588


In [364]:
%%capture

# Final booster: eta = 0.25 with max_depth = 2, the combination with the
# lowest validation RMSE in the sweeps above.
eta = 0.25
depth = 2
xgb_params = dict(
    eta=eta,
    max_depth=depth,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    eval_metric='rmse',
    seed=42,
    verbosity=1,
)
model_3 = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100, evals=watchlist)

In [374]:
# Validation RMSE using only boosting rounds 0..32: the range end is
# exclusive, and round 32 was the best one in the eta = 0.25 run.
y_pred = model_3.predict(dval, iteration_range=(0, 33))
np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))

4048.0045426801703