In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by pandas / scikit-learn calls below.
# Consider narrowing the filter to a specific category once the notebook is stable.
warnings.filterwarnings("ignore")

In [2]:
# raw_data keeps the file contents exactly as read; data is the working frame
# without the 'Unnamed: 0' column (presumably a previously saved row index — confirm).
raw_data = pd.read_csv('propulsion.csv')
data = raw_data.drop(columns='Unnamed: 0')

In [3]:
# Display the working DataFrame (the notebook renders a truncated first/last-rows view).
data

Unnamed: 0,Lever position (lp) [ ],Ship speed (v) [knots],Gas Turbine shaft torque (GTT) [kN m],Gas Turbine rate of revolutions (GTn) [rpm],Gas Generator rate of revolutions (GGn) [rpm],Starboard Propeller Torque (Ts) [kN],Port Propeller Torque (Tp) [kN],HP Turbine exit temperature (T48) [C],GT Compressor inlet air temperature (T1) [C],GT Compressor outlet air temperature (T2) [C],HP Turbine exit pressure (P48) [bar],GT Compressor inlet air pressure (P1) [bar],GT Compressor outlet air pressure (P2) [bar],Gas Turbine exhaust gas pressure (Pexh) [bar],Turbine Injecton Control (TIC) [%],Fuel flow (mf) [kg/s],GT Compressor decay state coefficient.,GT Turbine decay state coefficient.
0,9.300,27.0,72762.205,3560.393,9753.812,644.806,644.806,1086.583,288.0,780.304,4.523,0.998,22.879,1.050,90.435,1.790,0.973,0.978
1,6.175,18.0,29760.552,2306.825,8780.012,246.011,246.011,776.921,288.0,665.511,2.518,0.998,13.438,1.030,34.596,0.685,0.995,0.975
2,3.144,9.0,8375.774,1386.739,7051.621,60.318,60.318,589.764,288.0,580.587,1.392,0.998,7.566,1.020,12.477,0.247,0.980,0.978
3,3.144,9.0,8377.589,1386.748,7098.469,60.339,60.339,570.651,288.0,576.565,1.390,0.998,7.409,1.021,11.678,0.231,0.984,1.000
4,6.175,18.0,29761.091,2306.825,8782.024,246.021,246.021,769.855,288.0,663.682,2.518,0.998,13.374,1.031,34.154,0.676,0.998,0.980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2.088,6.0,5858.780,1349.023,6736.273,23.171,23.171,581.017,288.0,564.922,1.294,0.998,6.991,1.019,21.583,0.245,0.999,0.988
9996,5.140,15.0,21633.743,1924.349,8497.158,175.288,175.288,696.232,288.0,635.894,2.078,0.998,10.947,1.026,24.481,0.485,0.971,1.000
9997,8.206,24.0,50994.819,3087.555,9324.455,438.051,438.051,928.531,288.0,737.443,3.560,0.998,18.200,1.041,60.306,1.194,0.953,0.996
9998,6.175,18.0,29761.078,2306.850,8793.302,245.973,245.973,783.490,288.0,668.558,2.513,0.998,13.412,1.030,34.919,0.691,0.982,0.975


######   

## Splitting into training, validation and test datasets

In [4]:
# The two decay-state coefficients are the prediction targets;
# every remaining column is used as an input feature.
target_columns = ['GT Compressor decay state coefficient.', 'GT Turbine decay state coefficient.']

y = data[target_columns]
x = data.drop(columns=target_columns)

In [5]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 10% of all rows as the final test set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=42
)

# Stage 2: carve the validation set out of what is left (10% of the remaining rows).
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.1, random_state=42
)

# Report the (rows, columns) of the features and targets for each split.
for split_label, split_x, split_y in (
    ("Training set shape: ", x_train, y_train),
    ("Validation set shape: ", x_val, y_val),
    ("Test set shape: ", x_test, y_test),
):
    print(split_label, split_x.shape, split_y.shape)

Training set shape:  (8100, 16) (8100, 2)
Validation set shape:  (900, 16) (900, 2)
Test set shape:  (1000, 16) (1000, 2)


## Scaling the data

In [6]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# that same fitted transform to every split so no validation/test
# information leaks into the scaler.
scaler = StandardScaler().fit(x_train)

x_train_scaled = scaler.transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

###   

First, let's create a DataFrame that will store the validation results of all the models so that they can be compared later on.

In [7]:
# Empty comparison table: one row per model, holding its validation score
# before and after hyper-parameter tuning.
result_columns = [
    'Model',
    'Validation Score(Before tuning)',
    'Validation Score(After tuning)',
]
Results = pd.DataFrame(columns=result_columns)

#### 1. Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression

# Baseline: a linear model with default settings, fitted on the scaled training features.
lr = LinearRegression().fit(x_train_scaled, y_train)

# Score on the validation split; the error is computed over both target columns together.
y_val_pred = lr.predict(x_val_scaled)
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

print("RMSE score: ", rmse)

RMSE score:  0.004489483213433881


In [9]:
# Inspect the raw validation predictions (one column per target coefficient).
y_val_pred

array([[0.9888762 , 0.99931981],
       [0.95199644, 0.97555979],
       [0.96116147, 0.97550758],
       ...,
       [0.9926657 , 0.98965223],
       [0.97296484, 0.97960176],
       [0.96374826, 0.97626839]])

In [10]:
# Record the untuned Linear Regression validation RMSE.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat;
# the 'Validation Score(After tuning)' column is left as NaN for this row.
lr_row = pd.DataFrame([{'Model' : 'Linear Regression', 'Validation Score(Before tuning)': rmse}])
Results = pd.concat([Results, lr_row], ignore_index = True)

In [11]:
# Show the comparison table as it stands so far.
Results

Unnamed: 0,Model,Validation Score(Before tuning),Validation Score(After tuning)
0,Linear Regression,0.004489,


#####   

#### 2. Random Forest

In [12]:
from sklearn.ensemble import RandomForestRegressor

# Random forests are stochastic (bootstrap sampling / feature sub-sampling), so the
# estimator is seeded — with the same seed used for the data splits — to make the
# validation score reproducible across Restart & Run All.
rf = RandomForestRegressor(random_state = 42)

rf.fit(x_train_scaled, y_train)

# Score on the validation split; the error is computed over both target columns together.
y_val_pred = rf.predict(x_val_scaled)

rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

print("RMSE score: ", rmse)

RMSE score:  0.0010582894216611832


In [13]:
# Record the untuned Random Forest validation RMSE.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat;
# the 'Validation Score(After tuning)' column is left as NaN for this row.
rf_row = pd.DataFrame([{'Model' : 'Random Forest', 'Validation Score(Before tuning)': rmse}])
Results = pd.concat([Results, rf_row], ignore_index = True)
Results

Unnamed: 0,Model,Validation Score(Before tuning),Validation Score(After tuning)
0,Linear Regression,0.004489,
1,Random Forest,0.001058,


#####  

#### 3. K Nearest Neighbours

In [15]:
from sklearn.neighbors import KNeighborsRegressor

# K-nearest-neighbours regressor with default hyper-parameters, fitted on the
# scaled training features.
knn = KNeighborsRegressor().fit(x_train_scaled, y_train)

# Score on the validation split; the error is computed over both target columns together.
y_val_pred = knn.predict(x_val_scaled)
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

print("RMSE score: ", rmse)

RMSE score:  0.0017650747544761093


In [17]:
# Record the untuned K Nearest Neighbours validation RMSE.
# DataFrame.append was removed in pandas 2.0, so the row is added with pd.concat;
# the 'Validation Score(After tuning)' column is left as NaN for this row.
knn_row = pd.DataFrame([{'Model' : 'K Nearest Neighbours', 'Validation Score(Before tuning)': rmse}])
Results = pd.concat([Results, knn_row], ignore_index = True)
Results

Unnamed: 0,Model,Validation Score(Before tuning),Validation Score(After tuning)
0,Linear Regression,0.004489,
1,Random Forest,0.001058,
2,K Nearest Neighbours,0.001765,
