### 1. Import Libraries & Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw dataset from CSV into a pandas DataFrame.
# NOTE(review): the file name mentions "titanic", but the preview output of this
# cell shows vehicle fuel-consumption / CO2 columns — confirm the path points to
# the intended file.
df = pd.read_csv("../../C_Datasets/titanic_data_train.csv")
# Preview the first rows to sanity-check which columns were loaded.
df.head()

Unnamed: 0,MODELYEAR,MAKE,MODEL,VEHICLECLASS,ENGINESIZE,CYLINDERS,TRANSMISSION,FUELTYPE,FUELCONSUMPTION_CITY,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,FUELCONSUMPTION_COMB_MPG,CO2EMISSIONS
0,2014,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,2014,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,2014,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,2014,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,2014,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


### 2. Standardization

In [3]:
from sklearn.preprocessing import StandardScaler
# Scaler instance (default settings) used later to standardize the feature columns.
scaler = StandardScaler()

In [4]:
# Keep only the three feature columns plus the target so the example stays readable.
# .copy() makes df_regression an independent frame: it is modified in place by
# later cells, and writing into a bare selection of `df` triggers pandas'
# SettingWithCopyWarning.
df_regression = df[["ENGINESIZE", "FUELCONSUMPTION_HWY", "FUELCONSUMPTION_COMB", "CO2EMISSIONS"]].copy()
# Show 7 random rows of the reduced frame.
df_regression.sample(7)

Unnamed: 0,ENGINESIZE,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,CO2EMISSIONS
998,4.0,11.3,13.1,301
639,2.0,7.9,9.8,225
519,2.4,8.3,10.0,230
714,2.5,8.3,9.8,225
215,5.3,13.9,16.5,380
521,2.4,9.3,10.6,244
543,2.5,8.2,8.5,196


In [5]:
# Standardize the feature columns (zero mean, unit variance); the target
# CO2EMISSIONS is left untouched.
# The scaler is fitted exactly once on the raw features. Re-fitting it for every
# column (as before) repeated the same work three times and left `scaler` fitted
# on a mix of already-scaled and raw columns, so it could not be reused for
# transform()/inverse_transform().
# NOTE(review): the scaler is fitted on the full data set before the train/test
# split, so test-set statistics leak into the features. Tree models are not
# sensitive to feature scaling, but fit on the training split only if this
# pipeline is reused with a scale-sensitive model.
feature_columns = ["ENGINESIZE", "FUELCONSUMPTION_HWY", "FUELCONSUMPTION_COMB"]
df_regression.loc[:, feature_columns] = scaler.fit_transform(df_regression[feature_columns])

In [6]:
# Display 7 randomly sampled rows to inspect the frame after the scaling step.
df_regression.sample(7)

Unnamed: 0,ENGINESIZE,FUELCONSUMPTION_HWY,FUELCONSUMPTION_COMB,CO2EMISSIONS
138,-0.244694,-0.169913,-0.16673,253
801,-1.233931,-0.706932,-0.884304,196
34,0.461905,0.080695,0.263814,288
428,-1.233931,-0.527925,-0.597274,218
953,-0.597993,0.009093,-0.109324,258
696,-0.597993,-0.742733,-0.855601,198
795,-1.233931,-0.993341,-1.027819,184


### 3. Split Data Frame into Train & Test Sets

In [7]:
from sklearn.model_selection import train_test_split

# Target vector: the CO2 emissions column as a NumPy array.
y = df_regression.loc[:, "CO2EMISSIONS"].values
# Feature matrix: every remaining column as a NumPy array.
X = df_regression.loc[:, df_regression.columns != "CO2EMISSIONS"].values

In [8]:
# Hold out 25% of the rows as the test set; random_state=42 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### 4. Create Model

In [10]:
from sklearn.tree import DecisionTreeRegressor

# Baseline decision tree with default hyper-parameters, fitted on the training split.
# random_state is fixed (same seed as the train/test split) because the tree
# builder permutes features at each split and breaks ties between equally good
# splits at random; without a seed the metrics can differ from run to run.
tree_reg_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

In [11]:
# Predict the target for the held-out test features with the fitted tree.
y_pred = tree_reg_model.predict(X_test)

### 5. Evaluation

In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the test-set predictions against the true target values.
r2 = r2_score(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
MAE = mean_absolute_error(y_test, y_pred)
# RMSE is the square root of the MSE, which puts the error back in the target's units.
RMSE = np.sqrt(MSE)

In [13]:
# One-row summary table of the metrics computed for the decision tree.
pd.DataFrame(
    data=dict(
        Model="Decision Tree Regression",
        MAE=MAE,
        MSE=MSE,
        RMSE=RMSE,
        R2=r2,
    ),
    index=[0],
)

Unnamed: 0,Model,MAE,MSE,RMSE,R2
0,Decision Tree Regression,5.071161,350.179775,18.713091,0.915578


### 6. Optimization

In [25]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the exhaustive search. Every combination of the
# values below is evaluated with cross-validation.
parameters = {
    "max_depth": [3, 4, 5, 6, 7, 8, 9, 10, 12, None],
    "max_leaf_nodes": [5, 10, 15, 20, 25, 30, 35, None],
    "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
    # The value 100 was listed twice, which made the search fit 320 redundant
    # candidates; the duplicate is removed and the remaining order is unchanged.
    "ccp_alpha": [0.001, 0.01, 0.1, 0.0, 1, 10, 100],
}

# Seed the base estimator so the search result is reproducible, and build the
# search object directly instead of first binding the name to a bare estimator.
tree_reg_model_grid = GridSearchCV(DecisionTreeRegressor(random_state=42), parameters)

In [26]:
# Run the grid search on the training split only; the test split stays untouched for the final evaluation.
tree_reg_model_grid.fit(X_train, y_train)

In [27]:
# Show the hyper-parameter combination the search selected as best.
tree_reg_model_grid.best_params_

{'ccp_alpha': 0.001,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_leaf_nodes': None}

In [28]:
# Evaluate the tuned tree on the held-out test set.
# Take the estimator chosen by the grid search instead of hard-coding its
# parameters, so this cell stays correct whenever the search result changes.
# GridSearchCV (refit=True by default) has already refitted the best candidate
# on the full training split, so no extra fit is needed.
tree_reg_model = tree_reg_model_grid.best_estimator_
y_pred = tree_reg_model.predict(X_test)

MAE = mean_absolute_error(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
# Root Mean Squared Error (RMSE)
RMSE = np.sqrt(MSE)
r2 = r2_score(y_test, y_pred)

# One-row summary of the tuned model's test metrics.
pd.DataFrame({"Model": "Decision Tree Regression",
              "MAE": MAE,
              "MSE": MSE,
              "RMSE": RMSE,
              "R2": r2},
            index = [0])

Unnamed: 0,Model,MAE,MSE,RMSE,R2
0,Decision Tree Regression,4.952434,349.819663,18.703467,0.915665
