In [11]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.datasets import load_diabetes
from sklearn.metrics import r2_score, mean_squared_error
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the diabetes dataset as a single DataFrame (feature columns plus `target`).
df = load_diabetes(as_frame=True).frame

In [3]:
# Preview the first rows to sanity-check the columns and value ranges.
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019907,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.068332,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.00567,-0.045599,-0.034194,-0.032356,-0.002592,0.002861,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022688,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031988,-0.046641,135.0


In [81]:
# (rows, columns) of the loaded frame, including the `target` column.
df.shape

(442, 11)

In [4]:
# Separate the predictors from the response; `df` itself is left untouched.
y = df['target']
X = df.drop(columns='target')

In [15]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Baseline: a pre-pruned tree (depth and leaf-size limits).
# random_state is set because scikit-learn permutes the features at every split,
# so ties between equally good splits could otherwise resolve differently from
# run to run and change the reported metrics.
dtr = DecisionTreeRegressor(max_depth=7, min_samples_leaf=20, random_state=42)
dtr.fit(X_train, y_train)
y_pred = dtr.predict(X_test)

print(f'R^2 Score : {r2_score(y_test, y_pred)}')
print(f'Mean Squared Error : {mean_squared_error(y_test, y_pred)}')

R^2 Score : 0.41135906558189406
Mean Squared Error : 3177.664925233041


In [51]:
# Grow a tree with default (unrestricted) hyperparameters on the training split;
# it is the starting point for the cost-complexity pruning path computed next.
full_tree = DecisionTreeRegressor(random_state=42)
full_tree.fit(X_train, y_train)

In [52]:
# Compute the cost-complexity pruning path on the training data and keep the
# candidate alpha values it reports; each one is later tried as `ccp_alpha`.
path = full_tree.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas

# NOTE(review): this prints every candidate (many repeated values); consider a summary.
print(ccp_alphas)

[0.00000000e+00 1.61812298e-03 1.61812298e-03 1.61812298e-03
 1.61812298e-03 1.61812298e-03 1.61812298e-03 1.61812298e-03
 1.61812298e-03 1.61812298e-03 1.61812298e-03 1.61812298e-03
 1.61812298e-03 1.61812298e-03 2.15749730e-03 4.31499461e-03
 4.85436893e-03 4.85436893e-03 6.47249191e-03 6.47249191e-03
 6.47249191e-03 6.47249191e-03 6.47249191e-03 6.47249191e-03
 6.47249191e-03 6.47249191e-03 8.62998921e-03 1.30528587e-02
 1.34843581e-02 1.34843581e-02 1.45631068e-02 1.45631068e-02
 1.45631068e-02 1.45631068e-02 1.45631068e-02 1.45631068e-02
 2.18446602e-02 2.33009709e-02 2.58899676e-02 2.58899676e-02
 2.58899676e-02 2.58899676e-02 2.58899676e-02 2.58899676e-02
 2.58899676e-02 2.58899676e-02 2.58899676e-02 2.58899676e-02
 2.58899676e-02 2.58899676e-02 2.58899676e-02 2.64293420e-02
 2.64293420e-02 3.45199569e-02 3.45199569e-02 3.49514563e-02
 4.04530744e-02 4.04530744e-02 4.04530744e-02 4.04530744e-02
 4.04530744e-02 4.04530744e-02 5.82524272e-02 5.82524272e-02
 5.82524272e-02 5.825242

In [54]:
# Fit one tree per candidate alpha on the training split, keeping every fitted
# model paired with the alpha that produced it. `fit` returns the estimator
# itself, so the pair can be built in a single expression.
trees = [
    (DecisionTreeRegressor(random_state=42, ccp_alpha=alpha).fit(X_train, y_train), alpha)
    for alpha in ccp_alphas
]

In [55]:
# Select the alpha whose tree scores highest. For a regressor `score` is R^2
# (not an accuracy), and R^2 is negative whenever a model does worse than
# predicting the mean. Starting the running best at 0 would therefore ignore
# every candidate if all scores were negative and report a made-up score of 0,
# so the search starts from -inf instead.
# NOTE(review): the candidates are scored on X_test, which is also the set used
# for the final metrics, so the reported test score is optimistically biased.
# Consider selecting alpha with cross-validation on the training split.
best_acc = float('-inf')
best_alpha = 0

for model, alpha in trees:
    curr_acc = model.score(X_test, y_test)
    # Strict '>' keeps the first (smallest) alpha among equally scoring trees.
    if curr_acc > best_acc:
        best_acc = curr_acc
        best_alpha = alpha

In [56]:
# Highest `score` value found on X_test across all candidate alphas.
best_acc

0.39516206956807265

In [78]:
# Refit the winning configuration. The alpha was selected from trees built with
# random_state=42 and no depth cap, so the final model uses exactly those
# settings; a different seed or an extra max_depth would make this a different
# tree from the one whose score was selected.
best_model = DecisionTreeRegressor(random_state=42, ccp_alpha=best_alpha)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
print(f'R^2 score : {r2_score(y_test,y_pred)}')
print(f'Mean Squared Error : {mean_squared_error(y_test, y_pred)}')

R^2 score : 0.39516206956807265
Mean Squared Error : 3265.1012945336884


In [80]:
# Compare the pruned model on seen vs. unseen data; a large gap between the
# train and test numbers would indicate overfitting.
y_pred_train = best_model.predict(X_train)
y_pred_test = best_model.predict(X_test)

mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

print("MSE train: ", mse_train)
print("MSE test: ", mse_test)

print("r^2 train: ", r2_train)
print("r^2 test: ", r2_test)

MSE train:  3235.1232221005976
MSE test:  3265.1012945336884
r^2 train:  0.47381670061626757
r^2 test:  0.39516206956807265
