In [12]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestRegressor

In [13]:
# Read the CHF dataset from disk into a DataFrame and display it.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file location exists.
csv_path = 'D:/Music/CSV/Data_CHF_Zhao_2020_ATE.csv'
data = pd.read_csv(csv_path)
data

Unnamed: 0,id,author,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
0,1,Inasaka,tube,0.39,5600,-0.1041,3.0,3.0,100,11.3
1,2,Inasaka,tube,0.31,6700,-0.0596,3.0,3.0,100,10.6
2,3,Inasaka,tube,0.33,4300,-0.0395,3.0,3.0,100,7.3
3,4,Inasaka,tube,0.62,6400,-0.1460,3.0,3.0,100,12.8
4,5,Inasaka,tube,0.64,4700,-0.0849,3.0,3.0,100,11.0
...,...,...,...,...,...,...,...,...,...,...
1860,1861,Richenderfer,plate,1.01,1500,-0.0218,15.0,120.0,10,9.4
1861,1862,Richenderfer,plate,1.01,1500,-0.0434,15.0,120.0,10,10.4
1862,1863,Richenderfer,plate,1.01,2000,-0.0109,15.0,120.0,10,10.8
1863,1864,Richenderfer,plate,1.01,2000,-0.0218,15.0,120.0,10,10.9


In [14]:
# Count missing values per column (reduce down the rows, one total per column).
data.isna().sum(axis=0)

id                     0
author                 0
geometry               0
pressure [MPa]         0
mass_flux [kg/m2-s]    0
x_e_out [-]            0
D_e [mm]               0
D_h [mm]               0
length [mm]            0
chf_exp [MW/m2]        0
dtype: int64

In [15]:
# Remove the 'id' and 'author' columns so they are not used as model features.
# Note: this rebinds `data`, so re-running the cell without reloading raises
# KeyError because the columns are already gone.
data = data.drop(columns=['id', 'author'])
data

Unnamed: 0,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
0,tube,0.39,5600,-0.1041,3.0,3.0,100,11.3
1,tube,0.31,6700,-0.0596,3.0,3.0,100,10.6
2,tube,0.33,4300,-0.0395,3.0,3.0,100,7.3
3,tube,0.62,6400,-0.1460,3.0,3.0,100,12.8
4,tube,0.64,4700,-0.0849,3.0,3.0,100,11.0
...,...,...,...,...,...,...,...,...
1860,plate,1.01,1500,-0.0218,15.0,120.0,10,9.4
1861,plate,1.01,1500,-0.0434,15.0,120.0,10,10.4
1862,plate,1.01,2000,-0.0109,15.0,120.0,10,10.8
1863,plate,1.01,2000,-0.0218,15.0,120.0,10,10.9


In [23]:
# Shuffle every row (frac=1.0 keeps all of them) with a fixed seed so the
# row order is reproducible.
data = data.sample(
    frac=1.0,
    random_state=2,
)

In [24]:
# Separate the regression target from the feature columns.
target_col = 'chf_exp [MW/m2]'
Y = data[target_col]
X = data.drop(columns=target_col)
X

Unnamed: 0,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm]
1352,tube,13.79,1329,-0.2323,11.1,11.1,457
773,tube,13.79,2563,0.1036,4.6,4.6,295
61,tube,3.86,4136,0.0436,5.6,5.6,216
461,tube,6.89,5479,-0.1100,23.6,23.6,610
574,tube,11.14,2034,-0.0254,9.3,9.3,762
...,...,...,...,...,...,...,...
1280,tube,13.79,1383,-0.1775,7.7,7.7,457
44,tube,2.14,5384,0.1854,1.1,1.1,114
1843,plate,0.20,1500,-0.0385,15.0,120.0,10
1582,annulus,6.89,2292,0.1018,12.7,42.3,2743


In [21]:
# Display the target Series (column 'chf_exp [MW/m2]') split off from `data`.
Y

400     2.8
571     3.2
351     3.2
1479    1.7
1371    3.6
       ... 
905     3.0
1791    4.0
1096    2.1
235     4.0
1061    2.0
Name: chf_exp [MW/m2], Length: 1865, dtype: float64

In [28]:
def build_model():
    """Build a fresh, unfitted preprocessing + random-forest pipeline.

    The 'geometry' column is one-hot encoded with ``handle_unknown='ignore'``;
    all remaining columns are passed through to the regressor unchanged.

    Returns:
        sklearn.pipeline.Pipeline: pipeline with the steps ``'preprocessor'``
        (a ``ColumnTransformer``) and ``'regressor'``
        (``RandomForestRegressor(random_state=1)``).
    """
    # Dense output has to be requested with the boolean False. The original
    # passed the string 'False', which is truthy (so dense output was never
    # actually requested) and is rejected by scikit-learn releases that
    # validate constructor parameters.
    # The keyword was renamed from `sparse` to `sparse_output` in newer
    # scikit-learn; an unknown keyword raises TypeError, so fall back to the
    # old spelling on older installs.
    try:
        onehot = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
    except TypeError:
        onehot = OneHotEncoder(sparse = False, handle_unknown = 'ignore')

    nominal_transformer = Pipeline(steps = [
        ('onehot', onehot)
    ])

    # Only 'geometry' is categorical; every other column goes through as-is.
    preprocessor = ColumnTransformer(transformers=[
        ('nominal', nominal_transformer, ['geometry'])
    ], remainder='passthrough')

    model = Pipeline(steps = [
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(random_state = 1))
    ])

    return model

In [33]:
# Training: 5-fold cross-validation, reporting the mean of the per-fold RMSEs.
kf = KFold(n_splits = 5)

rmses = []

for train_idx, test_idx in kf.split(X):
    # Positional row selection for this fold's train / held-out partitions.
    X_train, Y_train = X.iloc[train_idx, :], Y.iloc[train_idx]
    X_test, Y_test = X.iloc[test_idx, :], Y.iloc[test_idx]

    # A new pipeline per fold, so nothing fitted carries over between folds.
    model = build_model()
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    # Root-mean-squared error on the held-out fold.
    squared_errors = (Y_test - y_pred)**2
    rmse = np.sqrt(np.mean(squared_errors))
    rmses.append(rmse)

# Unweighted average across folds.
final_rmse = np.mean(rmses)
print("RMSE: {:.2f}".format(final_rmse))

RMSE: 0.63
