# 2. Training the model 

## 2.1 Loading dataset 

In [1]:
import pandas as pd 
import numpy as np 
import sklearn 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler 
from sklearn.svm import SVR

import matplotlib.pyplot as plt

In [2]:
# ambil dataset dari direktori 
# get data from csv file 
pd_data = pd.read_csv('independent_dataset_baru.csv', sep=';')

In [3]:
pd_data

Unnamed: 0,OSA,OSB,EA,EC,CNA,IRA,IRC,DC,BANDGAP
0,0.25,0.6,0.064935,1.000000,1.000000,0.762195,0.671642,0.000000,2.14
1,0.25,0.6,0.064935,1.000000,1.000000,0.762195,0.671642,0.000000,3.04
2,0.25,0.6,0.064935,0.358209,1.000000,0.762195,1.000000,0.331578,1.02
3,0.00,0.8,0.720779,0.358209,0.333333,0.341463,1.000000,0.331578,0.98
4,0.00,0.8,0.000000,0.335821,1.000000,1.000000,0.000000,0.767575,1.47
...,...,...,...,...,...,...,...,...,...
113,0.00,0.6,0.720779,0.358209,0.555556,0.597561,1.000000,0.331578,2.30
114,0.00,0.8,0.720779,0.298507,0.555556,0.597561,0.000000,0.796428,1.40
115,0.00,0.8,0.720779,0.000000,0.555556,0.597561,0.350746,1.000000,1.00
116,0.00,0.4,0.720779,0.000000,0.555556,0.597561,0.350746,1.000000,1.70


In [4]:
# separating data train : X and label : y 
X = pd_data.iloc[:,0:8]
y = pd_data['BANDGAP']

In [5]:
X

Unnamed: 0,OSA,OSB,EA,EC,CNA,IRA,IRC,DC
0,0.25,0.6,0.064935,1.000000,1.000000,0.762195,0.671642,0.000000
1,0.25,0.6,0.064935,1.000000,1.000000,0.762195,0.671642,0.000000
2,0.25,0.6,0.064935,0.358209,1.000000,0.762195,1.000000,0.331578
3,0.00,0.8,0.720779,0.358209,0.333333,0.341463,1.000000,0.331578
4,0.00,0.8,0.000000,0.335821,1.000000,1.000000,0.000000,0.767575
...,...,...,...,...,...,...,...,...
113,0.00,0.6,0.720779,0.358209,0.555556,0.597561,1.000000,0.331578
114,0.00,0.8,0.720779,0.298507,0.555556,0.597561,0.000000,0.796428
115,0.00,0.8,0.720779,0.000000,0.555556,0.597561,0.350746,1.000000
116,0.00,0.4,0.720779,0.000000,0.555556,0.597561,0.350746,1.000000


In [6]:
y

0      2.14
1      3.04
2      1.02
3      0.98
4      1.47
       ... 
113    2.30
114    1.40
115    1.00
116    1.70
117    1.32
Name: BANDGAP, Length: 118, dtype: float64

In [7]:
# preparing the data 
from sklearn.model_selection import train_test_split

# X = features, y = labels/target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# convert all to numpy array 
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)
y_train_np = np.array(y_train)
y_test_np = np.array(y_test)

In [9]:
X_train_np.shape 

(94, 8)

In [10]:
X_test_np.shape

(24, 8)

In [11]:
y_train_np.shape

(94,)

In [12]:
y_test_np.shape

(24,)

In [13]:
y_train_np.reshape(94,1)
y_test_np.reshape(24,1)

array([[1.3 ],
       [1.56],
       [1.47],
       [2.27],
       [0.78],
       [1.7 ],
       [1.99],
       [0.57],
       [1.  ],
       [1.98],
       [0.91],
       [1.  ],
       [0.9 ],
       [2.1 ],
       [1.02],
       [0.77],
       [1.53],
       [2.14],
       [2.5 ],
       [0.99],
       [1.25],
       [0.93],
       [1.59],
       [1.73]])

In [14]:
# crossfold validation 
kf = KFold(n_splits=5, shuffle=True, random_state=42)
total_X_train = []
total_y_train = []
total_X_val = []
total_y_val = []

for train_index, val_index in kf.split(X_train_np):
    print("Train:", train_index, "Validation:", val_index)
    X_train, X_val = X_train_np[train_index], X_train_np[val_index]
    y_train, y_val = y_train_np[train_index], y_train_np[val_index]
    total_X_train.append(X_train)
    total_X_val.append(X_val)
    total_y_train.append(y_train)
    total_y_val.append(y_val)
    

Train: [ 1  2  3  5  6  7  8  9 11 13 14 15 16 17 19 20 21 23 24 25 27 29 30 31
 32 33 34 36 37 38 41 42 43 45 46 47 48 50 51 52 53 54 56 57 58 59 60 61
 63 64 65 66 68 69 70 71 73 74 75 76 77 78 79 80 81 82 84 85 86 87 88 89
 91 92 93] Validation: [ 0  4 10 12 18 22 26 28 35 39 40 44 49 55 62 67 72 83 90]
Train: [ 0  1  2  3  4  6  7  8 10 12 13 14 17 18 19 20 21 22 23 24 25 26 27 28
 29 32 35 36 37 38 39 40 41 43 44 46 48 49 50 51 52 53 54 55 56 57 58 59
 60 61 62 63 64 67 71 72 73 74 75 76 79 80 81 82 83 84 85 86 87 88 89 90
 91 92 93] Validation: [ 5  9 11 15 16 30 31 33 34 42 45 47 65 66 68 69 70 77 78]
Train: [ 0  1  2  4  5  9 10 11 12 14 15 16 18 20 21 22 23 26 28 29 30 31 32 33
 34 35 37 39 40 41 42 43 44 45 46 47 48 49 50 51 52 54 55 57 58 59 60 61
 62 63 64 65 66 67 68 69 70 71 72 74 75 77 78 79 81 82 83 84 85 86 88 90
 91 92 93] Validation: [ 3  6  7  8 13 17 19 24 25 27 36 38 53 56 73 76 80 87 89]
Train: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22

In [15]:
print(len(total_X_train[0]))
print(len(total_X_train[1]))
print(len(total_X_train[2]))
print(len(total_X_train[3]))
print(len(total_X_train[4]))


75
75
75
75
76


## 2.2 Preparing the model 

In [16]:
from sklearn.metrics import r2_score
import optuna
from sklearn.ensemble import BaggingRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
#import xgboost as xgb
from sklearn.svm import SVR
from catboost import CatBoostRegressor, Pool
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
import math
import matplotlib.pyplot as plt
import shap

In [17]:
def CatBoostRegressorTraining():
#     params = {'iterations': 16937, 'learning_rate': 0.0802044556274633, 'depth': 7, 'random_seed': 397,
#               'metric_period': 407, 'od_wait': 148}
    
    params = {'iterations': 21243, 'learning_rate': 0.05975976829789766, 'depth': 10, 'random_seed': 332,
     'metric_period': 470, 'od_wait': 143}
    
    xgb_reg = CatBoostRegressor(**params)
    xgb_reg.fit(X_train, y_train)

    return xgb_reg

In [18]:
# training the model with Kfold 
def training_model(model_reg, total_data_train, total_target_train):

    Kfold = 5 
    total_model = []
    for index in range(Kfold): 
        model_reg.fit(total_data_train[index], total_target_train[index])
        total_model.append(model_reg)
    return total_model 

In [19]:
def evaluate_model(trained_model, total_data_val, total_target_val):
    result = []
    Kfold = 5
    for index in range(Kfold):
        y_pred = trained_model[index].predict(total_data_val[index])
        result.append(round(r2_score(total_target_val[index],y_pred),4))
    return np.average(result)

In [20]:
def evaluate_model2(trained_model, total_data_val, total_target_val):
    result = []
    Kfold = 5
    for index in range(Kfold):
        y_pred = trained_model[index].predict(total_data_val[index])
        result.append(round(r2_score(total_target_val[index],y_pred),4))
        result.append(round(mean_absolute_error(total_target_val[index],y_pred),4))
        result.append(round(mean_squared_error(total_target_val[index],y_pred),4))
    return np.average(result)

In [21]:
def evaluate_model_test(trained_model, total_data_test, total_target_test):
    result = []
    Kfold = 5
    for index in range(Kfold):
        y_pred = trained_model[index].predict(total_data_test)
        result.append(round(r2_score(total_target_test,y_pred),4))
    return np.average(result)

In [22]:
def evaluate_model_test2(trained_model, total_data_test, total_target_test):
    result = []
    Kfold = 5
    for index in range(Kfold):
        y_pred = trained_model[index].predict(total_data_test)
        result.append(round(mean_absolute_error(total_target_test,y_pred),4))
    return np.average(result)

In [23]:
def evaluate_model_test3(trained_model, total_data_test, total_target_test):
    result = []
    Kfold = 5
    for index in range(Kfold):
        y_pred = trained_model[index].predict(total_data_test)
        result.append(round(mean_squared_error(total_target_test,y_pred),4))
    return np.average(result)

In [24]:
params = {'iterations': 21243, 'learning_rate': 0.05975976829789766, 'depth': 10, 'random_seed': 332,
     'metric_period': 470, 'od_wait': 143}
model_reg = CatBoostRegressor(**params)
trained_model = training_model(model_reg, total_X_train, total_y_train)

0:	learn: 1.2211615	total: 158ms	remaining: 55m 49s
470:	learn: 0.2700899	total: 455ms	remaining: 20.1s
940:	learn: 0.2682084	total: 786ms	remaining: 17s
1410:	learn: 0.2682044	total: 1.13s	remaining: 15.8s
1880:	learn: 0.2682044	total: 1.47s	remaining: 15.2s
2350:	learn: 0.2682044	total: 1.8s	remaining: 14.5s
2820:	learn: 0.2682044	total: 2.1s	remaining: 13.7s
3290:	learn: 0.2682044	total: 2.43s	remaining: 13.3s
3760:	learn: 0.2682044	total: 2.76s	remaining: 12.8s
4230:	learn: 0.2682044	total: 3.09s	remaining: 12.4s
4700:	learn: 0.2682044	total: 3.4s	remaining: 12s
5170:	learn: 0.2682044	total: 3.73s	remaining: 11.6s
5640:	learn: 0.2682044	total: 4.07s	remaining: 11.3s
6110:	learn: 0.2682044	total: 4.42s	remaining: 10.9s
6580:	learn: 0.2682044	total: 4.77s	remaining: 10.6s
7050:	learn: 0.2682044	total: 5.12s	remaining: 10.3s
7520:	learn: 0.2682044	total: 5.47s	remaining: 9.99s
7990:	learn: 0.2682044	total: 5.83s	remaining: 9.66s
8460:	learn: 0.2682044	total: 6.18s	remaining: 9.33s
893

In [25]:
R2 = evaluate_model(trained_model, total_X_val, total_y_val)
R2

np.float64(0.8454200000000001)

In [26]:
R2 = evaluate_model_test(trained_model, X_test_np, y_test_np)
R2

np.float64(0.2265)

In [27]:
MAE = evaluate_model_test2(trained_model, X_test_np, y_test_np)
MAE

np.float64(0.3792)

In [28]:
MSE = evaluate_model_test3(trained_model, X_test_np, y_test_np)
MSE

np.float64(0.21949999999999997)

## 2.3 optimizing and training the model

## ADABOOST

In [29]:
params = {'n_estimators': 28, 'learning_rate': 0.02437865813830957, 'random_state': 97}
model_reg = AdaBoostRegressor(**params)
trained_model = training_model(model_reg, total_X_train, total_y_train)

In [30]:
R2 = evaluate_model(trained_model, total_X_val, total_y_val)
R2

np.float64(0.42564)

In [31]:
R2 = evaluate_model_test(trained_model, X_test_np, y_test_np)
R2

np.float64(0.0388)

In [32]:
MAE = evaluate_model_test2(trained_model, X_test_np, y_test_np)
MAE

np.float64(0.4234)

In [33]:
MSE = evaluate_model_test3(trained_model, X_test_np, y_test_np)
MSE

np.float64(0.2728)

## KNeighborsRegressor

In [34]:
params = {'n_neighbors': 13, 'leaf_size': 31, 'p': 2, 'n_jobs': 8}
model_reg = KNeighborsRegressor(**params)
trained_model = training_model(model_reg, total_X_train, total_y_train)

In [35]:
R2 = evaluate_model(trained_model, total_X_val, total_y_val)
R2

np.float64(0.23066)

In [36]:
R2 = evaluate_model_test(trained_model, X_test_np, y_test_np)
R2

np.float64(0.1396)

In [37]:
MAE = evaluate_model_test2(trained_model, X_test_np, y_test_np)
MAE

np.float64(0.3931)

In [38]:
MSE = evaluate_model_test3(trained_model, X_test_np, y_test_np)
MSE

np.float64(0.24420000000000003)

## RandomForestRegressor

In [39]:
params = {'criterion': 'squared_error', 'n_estimators': 89, 'min_samples_leaf': 0.0477528566476102,
              'min_samples_split': 0.3493938886646444, 'min_weight_fraction_leaf': 0.07364614448551399,
              'max_depth': 11, 'n_jobs': 2,'ccp_alpha': 0.6192017360171039}
model_reg = RandomForestRegressor(**params)
trained_model = training_model(model_reg, total_X_train, total_y_train)

In [40]:
R2 = evaluate_model(trained_model, total_X_val, total_y_val)
R2

np.float64(0.051080000000000014)

In [41]:
R2 = evaluate_model_test(trained_model, X_test_np, y_test_np)
R2

np.float64(0.0413)

In [42]:
MAE = evaluate_model_test2(trained_model, X_test_np, y_test_np)
MAE

np.float64(0.45010000000000006)

In [43]:
MSE = evaluate_model_test3(trained_model, X_test_np, y_test_np)
MSE

np.float64(0.2721)

## SupportVectorRegression

In [44]:
params = {'coef0': 9.775132889897574, 'tol': 0.006493993396391401, 'epsilon': 0.10002960976094372,
              'C': 9.885891357520826, 'degree': 4, 'max_iter': 5,'cache_size': 190}
model_reg = SVR(**params)
trained_model = training_model(model_reg, total_X_train, total_y_train)



In [45]:
R2 = evaluate_model(trained_model, total_X_val, total_y_val)
R2

np.float64(-2.4722200000000005)

In [46]:
R2 = evaluate_model_test(trained_model, X_test_np, y_test_np)
R2

np.float64(-13.883599999999998)

In [47]:
MAE = evaluate_model_test2(trained_model, X_test_np, y_test_np)
MAE

np.float64(1.8806)

In [48]:
MSE = evaluate_model_test3(trained_model, X_test_np, y_test_np)
MSE

np.float64(4.2242)

## GradientBoostingRegressor

In [49]:
params = {'learning_rate': 0.006299158193687641, 'alpha': 0.5373917219469443, 'loss': 'quantile',
              'criterion': 'friedman_mse', 'n_estimators': 489, 'min_samples_leaf': 0.15692793301439362,
              'min_samples_split': 0.1432950302525493, 'min_weight_fraction_leaf': 0.15218461265640065, 
              'max_depth': 36, 'min_impurity_decrease': 0.03560036254060231}
model_reg = GradientBoostingRegressor(**params)
trained_model = training_model(model_reg, total_X_train, total_y_train)

In [50]:
R2 = evaluate_model(trained_model, total_X_val, total_y_val)
R2

np.float64(0.34784)

In [51]:
R2 = evaluate_model_test(trained_model, X_test_np, y_test_np)
R2

np.float64(0.3786)

In [52]:
MAE = evaluate_model_test2(trained_model, X_test_np, y_test_np)
MAE

np.float64(0.328)

In [53]:
MSE = evaluate_model_test3(trained_model, X_test_np, y_test_np)
MSE

np.float64(0.1764)

## 2.5 OPTUNA


In [55]:
#import tensorflow as tf
#from tensorflow import keras

In [59]:
#print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

In [58]:
def objectiveAdaBoostRegressor(trial):

    n_estimators = trial.suggest_int('n_estimators', 1,50)
    learning_rate=trail.suggest_float("learning_rate", 1e-3, 1e-1,log=True)
    random_state= trail.suggest_int('random_state',1,100)
    
    model = AdaBoostRegressor(n_estimators=n_estimators,
    learning_rate=learning_rate,
    random_state=random_state)

    score = cross_val_score(model, X, y, n_jobs=-1, cv=5, scoring='neg_mean_squared_error').mean()

In [60]:
def bossting_objectiveAdaBoostRegressor(trail):
    result=[]
    n_estimators = trail.suggest_int('n_estimators',1,50)
    learning_rate=trail.suggest_float("learning_rate", 1e-3, 1e-1,log=True)
    random_state= trail.suggest_int('random_state',1,100)

    number_of_splits=trail.suggest_int('number_of_splits',4,7)
    k_fold=KFold(n_splits=number_of_splits)
    params={x:y for (x,y) in trail.params.items() if x!='number_of_splits'}
    for tr, tst in k_fold.split(train,label):
        reg = AdaBoostRegressor(**params)
        #X_train, X_test, y_train, y_test = train[tr],train[tst],label[tr],label[tst]
        reg.fit(X_train_np, y_train_np)
        result.append(round(r2_score(y_test_np,reg.predict(X_test_np)),4))
            
    return np.average(result)

In [61]:
#study = optuna.create_study(direction="maximize")

In [62]:
#study.optimize(bossting_objectiveAdaBoostRegressor, n_trials=1000)

## 2.4 Performance of the model 