# 2. Training the model 

## 2.1 Loading dataset 

In [1]:
import pandas as pd 
import numpy as np 
import sklearn 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler 
from sklearn.svm import SVR

import matplotlib.pyplot as plt

In [2]:
# ambil dataset dari direktori 
# get data from csv file 
pd_data = pd.read_csv('independent_dataset_baru.csv', sep=';')

In [3]:
pd_data

Unnamed: 0,OSA,OSB,EA,EC,CNA,IRA,IRC,DC,BANDGAP
0,0.25,0.6,0.064935,1.000000,1.000000,0.762195,0.671642,0.000000,2.14
1,0.25,0.6,0.064935,1.000000,1.000000,0.762195,0.671642,0.000000,3.04
2,0.25,0.6,0.064935,0.358209,1.000000,0.762195,1.000000,0.331578,1.02
3,0.00,0.8,0.720779,0.358209,0.333333,0.341463,1.000000,0.331578,0.98
4,0.00,0.8,0.000000,0.335821,1.000000,1.000000,0.000000,0.767575,1.47
...,...,...,...,...,...,...,...,...,...
113,0.00,0.6,0.720779,0.358209,0.555556,0.597561,1.000000,0.331578,2.30
114,0.00,0.8,0.720779,0.298507,0.555556,0.597561,0.000000,0.796428,1.40
115,0.00,0.8,0.720779,0.000000,0.555556,0.597561,0.350746,1.000000,1.00
116,0.00,0.4,0.720779,0.000000,0.555556,0.597561,0.350746,1.000000,1.70


In [4]:
# separating data train : X and label : y 
X = pd_data.iloc[:,0:8]
y = pd_data['BANDGAP']

In [5]:
X

Unnamed: 0,OSA,OSB,EA,EC,CNA,IRA,IRC,DC
0,0.25,0.6,0.064935,1.000000,1.000000,0.762195,0.671642,0.000000
1,0.25,0.6,0.064935,1.000000,1.000000,0.762195,0.671642,0.000000
2,0.25,0.6,0.064935,0.358209,1.000000,0.762195,1.000000,0.331578
3,0.00,0.8,0.720779,0.358209,0.333333,0.341463,1.000000,0.331578
4,0.00,0.8,0.000000,0.335821,1.000000,1.000000,0.000000,0.767575
...,...,...,...,...,...,...,...,...
113,0.00,0.6,0.720779,0.358209,0.555556,0.597561,1.000000,0.331578
114,0.00,0.8,0.720779,0.298507,0.555556,0.597561,0.000000,0.796428
115,0.00,0.8,0.720779,0.000000,0.555556,0.597561,0.350746,1.000000
116,0.00,0.4,0.720779,0.000000,0.555556,0.597561,0.350746,1.000000


In [6]:
y

0      2.14
1      3.04
2      1.02
3      0.98
4      1.47
       ... 
113    2.30
114    1.40
115    1.00
116    1.70
117    1.32
Name: BANDGAP, Length: 118, dtype: float64

In [7]:
# preparing the data 
from sklearn.model_selection import train_test_split

# X = features, y = labels/target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# convert all to numpy array 
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)
y_train_np = np.array(y_train)
y_test_np = np.array(y_test)

In [9]:
X_train_np.shape 

(94, 8)

In [10]:
X_test_np.shape

(24, 8)

In [11]:
y_train_np.shape

(94,)

In [12]:
y_test_np.shape

(24,)

In [13]:
y_train_np.reshape(94,1)
y_test_np.reshape(24,1)

array([[1.3 ],
       [1.56],
       [1.47],
       [2.27],
       [0.78],
       [1.7 ],
       [1.99],
       [0.57],
       [1.  ],
       [1.98],
       [0.91],
       [1.  ],
       [0.9 ],
       [2.1 ],
       [1.02],
       [0.77],
       [1.53],
       [2.14],
       [2.5 ],
       [0.99],
       [1.25],
       [0.93],
       [1.59],
       [1.73]])

In [14]:
# crossfold validation 
kf = KFold(n_splits=5, shuffle=True, random_state=42)
total_X_train = []
total_y_train = []
total_X_val = []
total_y_val = []

for train_index, val_index in kf.split(X_train_np):
    print("Train:", train_index, "Validation:", val_index)
    X_train, X_val = X_train_np[train_index], X_train_np[val_index]
    y_train, y_val = y_train_np[train_index], y_train_np[val_index]
    total_X_train.append(X_train)
    total_X_val.append(X_val)
    total_y_train.append(y_train)
    total_y_val.append(y_val)
    

Train: [ 1  2  3  5  6  7  8  9 11 13 14 15 16 17 19 20 21 23 24 25 27 29 30 31
 32 33 34 36 37 38 41 42 43 45 46 47 48 50 51 52 53 54 56 57 58 59 60 61
 63 64 65 66 68 69 70 71 73 74 75 76 77 78 79 80 81 82 84 85 86 87 88 89
 91 92 93] Validation: [ 0  4 10 12 18 22 26 28 35 39 40 44 49 55 62 67 72 83 90]
Train: [ 0  1  2  3  4  6  7  8 10 12 13 14 17 18 19 20 21 22 23 24 25 26 27 28
 29 32 35 36 37 38 39 40 41 43 44 46 48 49 50 51 52 53 54 55 56 57 58 59
 60 61 62 63 64 67 71 72 73 74 75 76 79 80 81 82 83 84 85 86 87 88 89 90
 91 92 93] Validation: [ 5  9 11 15 16 30 31 33 34 42 45 47 65 66 68 69 70 77 78]
Train: [ 0  1  2  4  5  9 10 11 12 14 15 16 18 20 21 22 23 26 28 29 30 31 32 33
 34 35 37 39 40 41 42 43 44 45 46 47 48 49 50 51 52 54 55 57 58 59 60 61
 62 63 64 65 66 67 68 69 70 71 72 74 75 77 78 79 81 82 83 84 85 86 88 90
 91 92 93] Validation: [ 3  6  7  8 13 17 19 24 25 27 36 38 53 56 73 76 80 87 89]
Train: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22

In [15]:
print(len(total_X_train[0]))
print(len(total_X_train[1]))
print(len(total_X_train[2]))
print(len(total_X_train[3]))
print(len(total_X_train[4]))


75
75
75
75
76


## 2.2 Preparing the model 

In [16]:
from sklearn.metrics import r2_score
import optuna
from sklearn.ensemble import BaggingRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
#import xgboost as xgb
from sklearn.svm import SVR
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
import math
import matplotlib.pyplot as plt
import shap

In [17]:
def CatBoostRegressorTraining():
#     params = {'iterations': 16937, 'learning_rate': 0.0802044556274633, 'depth': 7, 'random_seed': 397,
#               'metric_period': 407, 'od_wait': 148}
    
    params = {'iterations': 507, 'learning_rate': 0.0035589304597655382, 'depth': 6, 'l2_leaf_reg': 0.09153952172281106, 'random_strength': 0.013416159875383185, 'bagging_temperature': 0.13700779150148432, 'border_count': 51}
    
    xgb_reg = CatBoostRegressor(**params)
    xgb_reg.fit(X_train, y_train)

    return xgb_reg

In [18]:
# training the model with Kfold 
def training_model(model_reg, total_data_train, total_target_train):

    Kfold = 5 
    total_model = []
    for index in range(Kfold): 
        model_reg.fit(total_data_train[index], total_target_train[index])
        total_model.append(model_reg)
    return total_model 

In [19]:
def evaluate_model(trained_model, total_data_val, total_target_val):
    result = []
    Kfold = 5
    for index in range(Kfold):
        y_pred = trained_model[index].predict(total_data_val[index])
        result.append(round(r2_score(total_target_val[index],y_pred),4))
    return np.average(result)

In [20]:
def evaluate_model2(trained_model, total_data_val, total_target_val):
    result = []
    Kfold = 5
    for index in range(Kfold):
        y_pred = trained_model[index].predict(total_data_val[index])
        result.append(round(r2_score(total_target_val[index],y_pred),4))
        result.append(round(mean_absolute_error(total_target_val[index],y_pred),4))
        result.append(round(mean_squared_error(total_target_val[index],y_pred),4))
    return np.average(result)

In [21]:
def evaluate_model_test(trained_model, total_data_test, total_target_test):
    result = []
    Kfold = 5
    for index in range(Kfold):
        y_pred = trained_model[index].predict(total_data_test)
        result.append(round(r2_score(total_target_test,y_pred),4))
    return np.average(result)

In [22]:
def evaluate_model_test2(trained_model, total_data_test, total_target_test):
    result = []
    Kfold = 5
    for index in range(Kfold):
        y_pred = trained_model[index].predict(total_data_test)
        result.append(round(mean_absolute_error(total_target_test,y_pred),4))
    return np.average(result)

In [23]:
def evaluate_model_test3(trained_model, total_data_test, total_target_test):
    result = []
    Kfold = 5
    for index in range(Kfold):
        y_pred = trained_model[index].predict(total_data_test)
        result.append(round(mean_squared_error(total_target_test,y_pred),4))
    return np.average(result)

In [24]:
params = {'iterations': 507, 'depth': 6, 'learning_rate': 0.0035589304597655382, 'l2_leaf_reg': 0.09153952172281106, 'random_strength': 0.013416159875383185, 'bagging_temperature': 0.13700779150148432, 'border_count': 51}
model_reg = CatBoostRegressor(**params)
trained_model = training_model(model_reg, total_X_train, total_y_train)

0:	learn: 1.2396484	total: 146ms	remaining: 1m 13s
1:	learn: 1.2363660	total: 146ms	remaining: 37s
2:	learn: 1.2330910	total: 147ms	remaining: 24.6s
3:	learn: 1.2298349	total: 147ms	remaining: 18.5s
4:	learn: 1.2265861	total: 147ms	remaining: 14.8s
5:	learn: 1.2233568	total: 147ms	remaining: 12.3s
6:	learn: 1.2201339	total: 148ms	remaining: 10.5s
7:	learn: 1.2169239	total: 148ms	remaining: 9.22s
8:	learn: 1.2137326	total: 148ms	remaining: 8.19s
9:	learn: 1.2105546	total: 148ms	remaining: 7.37s
10:	learn: 1.2073889	total: 149ms	remaining: 6.7s
11:	learn: 1.2042359	total: 149ms	remaining: 6.14s
12:	learn: 1.2010899	total: 149ms	remaining: 5.66s
13:	learn: 1.1979508	total: 149ms	remaining: 5.25s
14:	learn: 1.1948300	total: 149ms	remaining: 4.9s
15:	learn: 1.1917279	total: 150ms	remaining: 4.59s
16:	learn: 1.1886323	total: 150ms	remaining: 4.32s
17:	learn: 1.1855550	total: 150ms	remaining: 4.08s
18:	learn: 1.1824901	total: 150ms	remaining: 3.86s
19:	learn: 1.1794379	total: 151ms	remaining:

In [25]:
R2 = evaluate_model(trained_model, total_X_val, total_y_val)
R2

np.float64(0.69006)

In [26]:
R2 = evaluate_model_test(trained_model, X_test_np, y_test_np)
R2

np.float64(0.6934)

In [27]:
MAE = evaluate_model_test2(trained_model, X_test_np, y_test_np)
MAE

np.float64(0.231)

In [28]:
MSE = evaluate_model_test3(trained_model, X_test_np, y_test_np)
MSE

np.float64(0.087)

## 2.3 optimizing and training the model

## ADABOOST

In [55]:
params = {'n_estimators': 49, 'learning_rate': 0.09140896209090747, 'random_state': 40}
model_reg = AdaBoostRegressor(**params)
trained_model = training_model(model_reg, total_X_train, total_y_train)

In [56]:
R2 = evaluate_model(trained_model, total_X_val, total_y_val)
R2

np.float64(0.54768)

In [57]:
R2 = evaluate_model_test(trained_model, X_test_np, y_test_np)
R2

np.float64(0.616)

In [58]:
MAE = evaluate_model_test2(trained_model, X_test_np, y_test_np)
MAE

np.float64(0.2708)

In [59]:
MSE = evaluate_model_test3(trained_model, X_test_np, y_test_np)
MSE

np.float64(0.10900000000000001)

In [62]:
# For tree ensembles (like AdaBoost with tree base estimators), TreeExplainer is efficient
explainer = shap.TreeExplainer(trained_model)
# If X_test is large, sample for speed/clarity in plots
X_display = X_test_np.sample(200, random_state=42)

shap_values = explainer.shap_values(X_display)
expected_value = explainer.expected_value

InvalidModelError: Model type not yet supported by TreeExplainer: <class 'list'>

## KNeighborsRegressor

In [34]:
params = {'n_neighbors': 31, 'leaf_size': 71, 'p': 1, 'n_jobs': 5}
model_reg = KNeighborsRegressor(**params)
trained_model = training_model(model_reg, total_X_train, total_y_train)

In [35]:
R2 = evaluate_model(trained_model, total_X_val, total_y_val)
R2

np.float64(0.19169999999999998)

In [36]:
R2 = evaluate_model_test(trained_model, X_test_np, y_test_np)
R2

np.float64(0.402)

In [37]:
MAE = evaluate_model_test2(trained_model, X_test_np, y_test_np)
MAE

np.float64(0.3556)

In [38]:
MSE = evaluate_model_test3(trained_model, X_test_np, y_test_np)
MSE

np.float64(0.1697)

## RandomForestRegressor

In [39]:
params = {'n_estimators': 163, 'max_depth': 16, 'min_samples_split': 2, 'min_samples_leaf': 2}
model_reg = RandomForestRegressor(**params)
trained_model = training_model(model_reg, total_X_train, total_y_train)

In [40]:
R2 = evaluate_model(trained_model, total_X_val, total_y_val)
R2

np.float64(0.48704000000000003)

In [41]:
R2 = evaluate_model_test(trained_model, X_test_np, y_test_np)
R2

np.float64(0.2626)

In [42]:
MAE = evaluate_model_test2(trained_model, X_test_np, y_test_np)
MAE

np.float64(0.3537)

In [43]:
MSE = evaluate_model_test3(trained_model, X_test_np, y_test_np)
MSE

np.float64(0.20929999999999999)

## SupportVectorRegression

In [44]:
params = {'coef0': 8.355814663808525, 'tol': 0.8122822323306137, 'epsilon': 0.3884557915550403, 'C': 2.427599716333231, 'degree': 8, 'max_iter': 80, 'cache_size': 359}
model_reg = SVR(**params)
trained_model = training_model(model_reg, total_X_train, total_y_train)

In [45]:
R2 = evaluate_model(trained_model, total_X_val, total_y_val)
R2

np.float64(0.37871999999999995)

In [46]:
R2 = evaluate_model_test(trained_model, X_test_np, y_test_np)
R2

np.float64(0.5228)

In [47]:
MAE = evaluate_model_test2(trained_model, X_test_np, y_test_np)
MAE

np.float64(0.2867)

In [48]:
MSE = evaluate_model_test3(trained_model, X_test_np, y_test_np)
MSE

np.float64(0.1354)

## GradientBoostingRegressor

In [49]:
params = {'learning_rate': 0.0026024933851524555, 'alpha': 0.6888940259094042, 'n_estimators': 444, 'min_samples_leaf': 0.03288313515702686, 'min_samples_split': 0.3685733075339944, 'min_weight_fraction_leaf': 0.06567934584629062, 'max_depth': 40, 'min_impurity_decrease': 0.022532313181025488}
model_reg = GradientBoostingRegressor(**params)
trained_model = training_model(model_reg, total_X_train, total_y_train)

In [50]:
R2 = evaluate_model(trained_model, total_X_val, total_y_val)
R2

np.float64(0.33718)

In [51]:
R2 = evaluate_model_test(trained_model, X_test_np, y_test_np)
R2

np.float64(0.5432)

In [52]:
MAE = evaluate_model_test2(trained_model, X_test_np, y_test_np)
MAE

np.float64(0.3059)

In [53]:
MSE = evaluate_model_test3(trained_model, X_test_np, y_test_np)
MSE

np.float64(0.1296)

In [None]:
#study.optimize(bossting_objectiveAdaBoostRegressor, n_trials=1000)

## 2.4 Performance of the model 

In [54]:
pip install shap

Note: you may need to restart the kernel to use updated packages.
