In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Data preprocessing
from sklearn import datasets, metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the dataset from BF_data.csv, using the first CSV column as the row index,
# then preview the first five rows.
df = pd.read_csv('BF_data.csv', index_col=[0])
df.head()

Unnamed: 0,DATE_TIME,CB_FLOW,CB_PRESS,CB_TEMP,STEAM_FLOW,STEAM_TEMP,STEAM_PRESS,O2_PRESS,O2_FLOW,O2_PER,...,TOP_TEMP,TOP_PRESS_1,CO,CO2,H2,SKIN_TEMP_AVG,SAT_1,SAT_2,SAT_3,SAT_4
0,2021-07-01 00:10:00,311727.0,3.15,129.0,4.0,213.0,3.34,3.2,7296.0,23.08,...,121.0,2.0,22.22,21.0,3.88,69.940478,73.583364,77.713731,82.332439,85.074844
1,2021-07-01 00:20:00,315163.0,3.16,129.0,4.0,209.0,3.35,3.2,7829.0,23.08,...,125.0,1.0,22.56,21.0,3.94,71.454476,74.666066,77.198904,82.604995,84.877672
2,2021-07-01 00:30:00,314595.0,3.16,128.0,4.0,205.0,3.35,3.21,7904.0,23.08,...,124.0,1.0,22.49,21.08,3.94,70.579462,74.837214,78.518159,84.475989,83.397999
3,2021-07-01 00:40:00,312465.0,3.16,127.0,4.0,200.0,3.35,3.21,7919.0,23.08,...,115.0,1.0,22.36,21.13,3.99,70.179791,75.008361,80.865417,84.880888,83.368013
4,2021-07-01 00:50:00,302981.0,3.11,126.0,4.0,194.0,3.29,3.16,7938.0,23.08,...,125.0,1.0,22.25,21.3,4.1,70.72847,75.799102,82.564532,84.282448,84.592822


In [3]:
# Summarise column dtypes and non-null counts (quick check for missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26471 entries, 0 to 26470
Data columns (total 30 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   DATE_TIME      26471 non-null  object 
 1   CB_FLOW        26471 non-null  float64
 2   CB_PRESS       26471 non-null  float64
 3   CB_TEMP        26471 non-null  float64
 4   STEAM_FLOW     26471 non-null  float64
 5   STEAM_TEMP     26471 non-null  float64
 6   STEAM_PRESS    26471 non-null  float64
 7   O2_PRESS       26471 non-null  float64
 8   O2_FLOW        26471 non-null  float64
 9   O2_PER         26471 non-null  float64
 10  PCI            26471 non-null  float64
 11  ATM_HUMID      26471 non-null  float64
 12  HB_TEMP        26471 non-null  float64
 13  HB_PRESS       26471 non-null  float64
 14  TOP_PRESS      26471 non-null  float64
 15  TOP_TEMP1      26471 non-null  float64
 16  TOP_TEMP2      26471 non-null  float64
 17  TOP_TEMP3      26471 non-null  float64
 18  TOP_TE

In [4]:
# Columns the model predicts; every other numeric column is used as a feature.
TARGET_COLS = ['SAT_1', 'SAT_2', 'SAT_3', 'SAT_4']

# Build the feature frame with a single non-mutating drop: remove the targets and
# DATE_TIME (an object-dtype column, so it cannot be passed to the scaler).
x_vars = df.drop(columns=TARGET_COLS + ['DATE_TIME'])
y_vars = df[TARGET_COLS]

# Sanity check: both frames must have the same number of rows.
print(x_vars.shape, y_vars.shape)

(26471, 25) (26471, 4)


In [5]:
# Preview the feature frame (targets and DATE_TIME removed).
x_vars.head()

Unnamed: 0,CB_FLOW,CB_PRESS,CB_TEMP,STEAM_FLOW,STEAM_TEMP,STEAM_PRESS,O2_PRESS,O2_FLOW,O2_PER,PCI,...,TOP_TEMP2,TOP_TEMP3,TOP_TEMP4,TOP_SPRAY,TOP_TEMP,TOP_PRESS_1,CO,CO2,H2,SKIN_TEMP_AVG
0,311727.0,3.15,129.0,4.0,213.0,3.34,3.2,7296.0,23.08,32.0,...,135.0,107.0,130.0,0.0,121.0,2.0,22.22,21.0,3.88,69.940478
1,315163.0,3.16,129.0,4.0,209.0,3.35,3.2,7829.0,23.08,30.0,...,143.0,109.0,128.0,0.0,125.0,1.0,22.56,21.0,3.94,71.454476
2,314595.0,3.16,128.0,4.0,205.0,3.35,3.21,7904.0,23.08,31.0,...,138.0,110.0,124.0,0.0,124.0,1.0,22.49,21.08,3.94,70.579462
3,312465.0,3.16,127.0,4.0,200.0,3.35,3.21,7919.0,23.08,36.0,...,128.0,102.0,110.0,0.0,115.0,1.0,22.36,21.13,3.99,70.179791
4,302981.0,3.11,126.0,4.0,194.0,3.29,3.16,7938.0,23.08,36.0,...,139.0,112.0,124.0,0.0,125.0,1.0,22.25,21.3,4.1,70.72847


In [6]:
# Preview the four target columns (SAT_1 .. SAT_4).
y_vars.head()

Unnamed: 0,SAT_1,SAT_2,SAT_3,SAT_4
0,73.583364,77.713731,82.332439,85.074844
1,74.666066,77.198904,82.604995,84.877672
2,74.837214,78.518159,84.475989,83.397999
3,75.008361,80.865417,84.880888,83.368013
4,75.799102,82.564532,84.282448,84.592822


In [7]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): rows carry 10-minute DATE_TIME stamps and this split shuffles them,
# so neighbouring timestamps can land on both sides — confirm that is intended.
x_train, x_test, y_train, y_test = train_test_split(
    x_vars,
    y_vars,
    test_size=0.3,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the same
# transformation to both partitions so no test information leaks into the scaler.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [8]:
# Fit a linear regression on the standardized training features.
# y_train has four columns, so a single estimator is fitted for all four targets.
lr = LinearRegression()
lr.fit(x_train, y_train)

In [9]:
# Fitted intercepts: one value per target column.
c = lr.intercept_
c

array([69.75014708, 69.79666831, 69.8043775 , 69.77220529])

In [10]:
# Fitted coefficient matrix: one row per target, one entry per feature.
# The features were standardized before fitting, so coefficients are per
# standard deviation of each feature rather than per raw unit.
m = lr.coef_
m

array([[-1.47585844e+00,  1.07797039e+00,  1.08985272e-01,
        -9.14112233e-01, -1.55299827e-01, -2.00055891e-01,
        -7.81716609e-01, -2.65688469e+00,  1.83146652e+00,
         4.23261920e-01, -2.18632028e-01, -7.59855834e-01,
        -2.36697381e-01,  1.35913223e+00, -3.92198384e+00,
        -2.46435369e+00, -2.67475828e+00, -4.39220547e+00,
         1.03051599e-01,  1.26552096e+01, -1.98219570e-01,
         1.06775349e-02, -1.62018849e-01,  1.30058036e+00,
         2.49895484e+01],
       [-1.19023723e+00,  3.55896174e-01,  2.11287969e-01,
        -1.86018259e+00, -1.89510508e-01, -4.80052676e-01,
        -1.45437350e+00, -4.94160284e+00,  3.65811665e+00,
         6.50222928e-01, -3.03389738e-01, -1.31325173e+00,
        -1.03493052e+00,  2.74767996e+00, -2.65255663e+00,
        -5.11603540e-01, -2.92189666e-01, -2.67049715e+00,
        -7.86352983e-02,  6.48537704e+00, -2.40437395e-01,
         1.48944353e-01,  4.60558154e-01,  2.24781732e+00,
         2.27450339e+01],
    

In [11]:
# Predict all four targets for the held-out (already scaled) test features;
# the result has one column per target.
y_pred = lr.predict(x_test)
y_pred

array([[ 54.86198783,  56.84806571,  58.30170969,  59.51020349],
       [ 61.73938684,  63.5742979 ,  64.92099828,  66.14697267],
       [ 91.51907658,  90.79598544,  89.37433248,  87.71620144],
       ...,
       [ 98.23950941,  94.24841113,  90.34047821,  87.13652465],
       [119.73143134, 115.90115745, 111.92300096, 108.2047196 ],
       [ 90.88839488,  90.90065836,  90.43747501,  89.69827421]])

In [12]:
# R^2 on the held-out set, expressed as a percentage. With four target columns the
# result is still a single number (presumably sklearn's default uniform average
# across outputs — see the r2_score `multioutput` parameter).
r2_score(y_test, y_pred)*100

81.42990332798361

In [13]:
# Persist the fitted scaler alongside the model: `lr` was trained on standardized
# features, so anyone loading the model needs the same scaler to transform raw
# inputs before calling predict.
joblib.dump(scaler, 'standard_scaler.pkl')

# Keep the model dump last so the cell still displays the model's file path.
joblib.dump(lr, 'multiple_linear_regression.pkl')

['multiple_linear_regression.pkl']