In [25]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Locations of the input CSV files, given relative to the notebook's working directory.
training_file = '../智能制造训练集/train/inputs/training.csv'
testing_file = '../智能制造赛道测试集/test/testing.csv'

## 01 - Loading and preprocessing data

In [5]:
# Read the training and testing CSVs (in that order) into DataFrames.
# low_memory=False asks pandas to infer each column's dtype from the whole
# file instead of chunk by chunk.
training, testing = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (training_file, testing_file)
)

# Continuous sensor / command columns that are later min-max scaled.
numerical_features = [
    'oxygen_set_value',                # oxygen set value
    'primary_air_volume',              # primary air volume
    'grate_manual_command',            # grate manual command
    'main_steam_flow_set_value',       # main steam flow set value
    'ejector_manual_command',          # ejector (pusher) manual command
    'ejector_automatic_command',       # ejector (pusher) automatic command
    'nox_content',                     # NOx content
    'grate_actual_operation_command',  # grate actual operation command
    'secondary_air_control_door',      # secondary air control damper
    'hcl_content',                     # HCl content
    'secondary_air_volume',            # secondary air volume
    'water_flow',                      # feed water flow
    'so2_content',                     # SO2 content
    'drum_water_level',                # steam drum water level
    'induced_fan_speed',               # induced draft fan speed
    'co_content',                      # CO content
    'primary_damper',                  # primary air damper
]

# Boolean status columns that are later one-hot encoded.
categorical_features = [
    'ejector_automatic_switching_signal',  # ejector automatic mode engaged / disengaged
    'grate_automatic_switching_signal',    # grate automatic mode engaged / disengaged
    'pusher_on_or_off',                    # pusher start / stop
    'grate_on_or_off',                     # grate start / stop
]

# Prediction target, kept as a single-element list of column names.
label = ['main_steam_flow']

# Fill missing categorical flags with the next valid observation (backfill).
# DataFrame.bfill() is used instead of fillna(method='backfill'), whose
# `method` argument is deprecated in recent pandas releases.
# NOTE(review): NaNs at the very end of a column have no later value to copy
# from and therefore stay NaN — confirm the inputs never end with missing flags.
training[categorical_features] = training[categorical_features].bfill()
testing[categorical_features] = testing[categorical_features].bfill()

In [6]:
# Preview the first rows of the loaded training frame.
training.head()

Unnamed: 0,datetime,oxygen_set_value,primary_air_volume,ejector_automatic_switching_signal,grate_automatic_switching_signal,grate_manual_command,main_steam_flow_set_value,ejector_manual_command,ejector_automatic_command,nox_content,...,secondary_air_volume,water_flow,so2_content,pusher_on_or_off,drum_water_level,induced_fan_speed,grate_on_or_off,co_content,primary_damper,main_steam_flow
0,2021-12-20 00:00:00,5.5,72919.6563,True,True,43.9083,60.0,53.9083,53.9094,57.4444,...,4330.127,73.4054,2.6458,True,1.6872,66.7986,True,1.6042,75.5532,54.3032
1,2021-12-20 00:00:01,5.5,73034.8047,True,True,44.0,60.0,54.0,54.0353,57.3889,...,4898.98,73.5788,2.5833,True,1.6107,66.8056,True,1.5903,75.4144,54.1495
2,2021-12-20 00:00:02,5.5,73111.4609,True,True,44.0452,60.0,54.0452,54.0476,57.5556,...,4663.6895,73.7589,2.6181,True,1.5039,66.8195,True,1.6597,75.4051,54.4899
3,2021-12-20 00:00:03,5.5,73134.4375,True,True,44.0575,60.0,54.0575,54.06,57.537,...,4898.98,73.9473,2.6389,True,1.5284,66.7894,True,1.6736,75.4537,54.6693
4,2021-12-20 00:00:04,5.5,73493.5625,True,True,44.0699,60.0,54.0699,54.0724,57.3889,...,4898.98,74.1118,2.6875,True,1.6921,66.7662,True,1.6319,75.4144,54.5034


In [8]:
# Preview the first rows of the loaded testing frame.
testing.head()

Unnamed: 0,datetime,oxygen_set_value,primary_air_volume,ejector_automatic_switching_signal,grate_automatic_switching_signal,grate_manual_command,main_steam_flow_set_value,ejector_manual_command,ejector_automatic_command,nox_content,...,hcl_content,secondary_air_volume,water_flow,so2_content,pusher_on_or_off,drum_water_level,induced_fan_speed,grate_on_or_off,co_content,primary_damper
0,2021-12-22 23:30:00,25.0,72881.2344,False,False,50.0,60.0,40.0,70.0,111.4074,...,2.8333,18553.9766,68.6522,3.2569,False,7.5292,67.412,False,2.5278,65.537
1,2021-12-22 23:30:01,25.0,73729.4688,False,False,50.0,60.0,40.0,70.0,111.4074,...,2.8194,18472.9531,68.6643,3.2778,False,7.4884,67.4167,False,2.5208,65.5532
2,2021-12-22 23:30:02,25.0,74221.6875,False,False,50.0,60.0,40.0,70.0,111.463,...,2.7593,18553.9766,68.6404,3.3194,False,7.4432,67.4468,False,2.5625,65.5463
3,2021-12-22 23:30:03,25.0,73600.1953,False,False,50.0,60.0,40.0,70.0,109.3333,...,2.5093,18553.9766,68.622,2.8958,False,7.4716,67.4306,False,1.0347,65.5231
4,2021-12-22 23:30:04,25.0,73057.8047,False,False,50.0,60.0,40.0,70.0,109.2593,...,2.5046,18614.5098,68.6118,2.8333,False,7.56,67.4468,False,0.9722,65.5463


## 02 - Feature engineering

In [30]:
# Stack the training and testing rows into one frame so that the same
# feature-engineering steps are applied to both (avoids training/serving skew).
# The testing rows receive a placeholder label of -1; that value is what the
# later split cell uses to tell the two sets apart again.
# assign() returns a new frame, so the loaded `testing` frame is not mutated.
data_all = pd.concat([training, testing.assign(main_steam_flow=-1)], axis=0)
data_all.head()

Unnamed: 0,datetime,oxygen_set_value,primary_air_volume,ejector_automatic_switching_signal,grate_automatic_switching_signal,grate_manual_command,main_steam_flow_set_value,ejector_manual_command,ejector_automatic_command,nox_content,...,secondary_air_volume,water_flow,so2_content,pusher_on_or_off,drum_water_level,induced_fan_speed,grate_on_or_off,co_content,primary_damper,main_steam_flow
0,2021-12-20 00:00:00,5.5,72919.6563,True,True,43.9083,60.0,53.9083,53.9094,57.4444,...,4330.127,73.4054,2.6458,True,1.6872,66.7986,True,1.6042,75.5532,54.3032
1,2021-12-20 00:00:01,5.5,73034.8047,True,True,44.0,60.0,54.0,54.0353,57.3889,...,4898.98,73.5788,2.5833,True,1.6107,66.8056,True,1.5903,75.4144,54.1495
2,2021-12-20 00:00:02,5.5,73111.4609,True,True,44.0452,60.0,54.0452,54.0476,57.5556,...,4663.6895,73.7589,2.6181,True,1.5039,66.8195,True,1.6597,75.4051,54.4899
3,2021-12-20 00:00:03,5.5,73134.4375,True,True,44.0575,60.0,54.0575,54.06,57.537,...,4898.98,73.9473,2.6389,True,1.5284,66.7894,True,1.6736,75.4537,54.6693
4,2021-12-20 00:00:04,5.5,73493.5625,True,True,44.0699,60.0,54.0699,54.0724,57.3889,...,4898.98,74.1118,2.6875,True,1.6921,66.7662,True,1.6319,75.4144,54.5034


In [31]:
# Preview the last rows of the combined frame.
data_all.tail()

Unnamed: 0,datetime,oxygen_set_value,primary_air_volume,ejector_automatic_switching_signal,grate_automatic_switching_signal,grate_manual_command,main_steam_flow_set_value,ejector_manual_command,ejector_automatic_command,nox_content,...,secondary_air_volume,water_flow,so2_content,pusher_on_or_off,drum_water_level,induced_fan_speed,grate_on_or_off,co_content,primary_damper,main_steam_flow
1795,2021-12-22 23:59:55,25.0,79948.1719,True,True,50.0,60.0,100.0,100.0,71.4815,...,6244.998,68.5028,35.1597,False,14.6422,69.6412,False,2.0069,70.9051,-1.0
1796,2021-12-22 23:59:56,25.0,79695.5313,True,True,50.0,60.0,100.0,100.0,71.6481,...,6244.998,68.4625,35.1597,False,14.4919,69.4931,False,2.0347,70.9282,-1.0
1797,2021-12-22 23:59:57,25.0,79067.4688,True,True,50.0,60.0,100.0,100.0,71.6481,...,6244.998,68.4506,35.2014,False,14.4141,69.5486,False,2.0278,70.8194,-1.0
1798,2021-12-22 23:59:58,25.0,78748.0078,True,True,50.0,60.0,100.0,100.0,71.8519,...,6062.1777,68.4846,35.25,False,14.3788,69.7292,False,2.0764,70.6875,-1.0
1799,2021-12-22 23:59:59,25.0,78989.5,True,True,50.0,60.0,100.0,100.0,72.0556,...,5612.4863,68.573,33.75,False,14.0503,69.9769,False,2.0069,70.5995,-1.0


In [35]:
# Min-max normalisation of the numerical features.
scaler = MinMaxScaler()
# Fit on the training rows only (rows whose label is not the -1 placeholder)
# so that the value ranges of the testing rows do not leak into the
# preprocessing statistics.
is_training_row = data_all['main_steam_flow'] != -1
scaler.fit(data_all.loc[is_training_row, numerical_features])
# print log
print(f"Scaler max value is {scaler.data_max_}, min value is {scaler.data_min_}")
# Apply the same fitted transform to every row; testing values that lie
# outside the training range will map outside [0, 1].
data_all[numerical_features] = scaler.transform(data_all[numerical_features])

data_all.head()

Scaler max value is [7.00000000e+01 1.22637047e+05 1.00000000e+02 7.00361000e+01
 1.00000200e+02 1.00000200e+02 1.34537000e+02 1.00000000e+02
 8.99792000e+01 5.80046000e+01 3.57910586e+04 1.00297200e+02
 2.99617800e+02 4.69223000e+01 9.12498000e+01 3.00208100e+02
 1.00023100e+02], min value is [ 2.88280000e+00  1.99712715e+04  3.00000000e+01  4.26552000e+01
  3.00000000e+01  4.00000000e+01  2.02778000e+01  2.99998000e+01
  9.68520000e+00  4.07400000e-01  0.00000000e+00  4.68703000e+01
 -1.45800000e-01 -2.79869000e+01  4.29398000e+01 -4.58100000e-01
  2.49977000e+01]


Unnamed: 0,datetime,oxygen_set_value,primary_air_volume,ejector_automatic_switching_signal,grate_automatic_switching_signal,grate_manual_command,main_steam_flow_set_value,ejector_manual_command,ejector_automatic_command,nox_content,...,secondary_air_volume,water_flow,so2_content,pusher_on_or_off,drum_water_level,induced_fan_speed,grate_on_or_off,co_content,primary_damper,main_steam_flow
0,2021-12-20 00:00:00,0.038994,0.515735,True,True,0.19869,0.633463,0.341546,0.231823,0.325283,...,0.120983,0.496662,0.009313,True,0.396134,0.493869,True,0.006859,0.673845,54.3032
1,2021-12-20 00:00:01,0.038994,0.516857,True,True,0.2,0.633463,0.342856,0.233921,0.324797,...,0.136877,0.499907,0.009104,True,0.395113,0.494014,True,0.006813,0.671995,54.1495
2,2021-12-20 00:00:02,0.038994,0.517604,True,True,0.200646,0.633463,0.343502,0.234126,0.326256,...,0.130303,0.503278,0.00922,True,0.393687,0.494301,True,0.007044,0.671871,54.4899
3,2021-12-20 00:00:03,0.038994,0.517828,True,True,0.200821,0.633463,0.343678,0.234333,0.326094,...,0.136877,0.506805,0.00929,True,0.394014,0.493678,True,0.00709,0.672519,54.6693
4,2021-12-20 00:00:04,0.038994,0.521326,True,True,0.200999,0.633463,0.343855,0.234539,0.324797,...,0.136877,0.509884,0.009452,True,0.3962,0.493198,True,0.006951,0.671995,54.5034


In [41]:
# One-hot encoding: cast the categorical flags to strings and expand each
# distinct value into its own indicator column.
onehot_encoding = pd.get_dummies(data_all[categorical_features].astype(str))
# Swap the original categorical columns for their indicator columns,
# which are appended on the right-hand side of the frame.
data_all = pd.concat(
    [data_all.drop(columns=categorical_features), onehot_encoding],
    axis=1,
)

data_all.head()

Unnamed: 0,datetime,oxygen_set_value,primary_air_volume,grate_manual_command,main_steam_flow_set_value,ejector_manual_command,ejector_automatic_command,nox_content,grate_actual_operation_command,secondary_air_control_door,...,primary_damper,main_steam_flow,ejector_automatic_switching_signal_False,ejector_automatic_switching_signal_True,grate_automatic_switching_signal_False,grate_automatic_switching_signal_True,pusher_on_or_off_False,pusher_on_or_off_True,grate_on_or_off_False,grate_on_or_off_True
0,2021-12-20 00:00:00,0.038994,0.515735,0.19869,0.633463,0.341546,0.231823,0.325283,0.198685,0.134258,...,0.673845,54.3032,0,1,0,1,0,1,0,1
1,2021-12-20 00:00:01,0.038994,0.516857,0.2,0.633463,0.342856,0.233921,0.324797,0.198754,0.135699,...,0.671995,54.1495,0,1,0,1,0,1,0,1
2,2021-12-20 00:00:02,0.038994,0.517604,0.200646,0.633463,0.343502,0.234126,0.326256,0.200629,0.136939,...,0.671871,54.4899,0,1,0,1,0,1,0,1
3,2021-12-20 00:00:03,0.038994,0.517828,0.200821,0.633463,0.343678,0.234333,0.326094,0.200789,0.13717,...,0.672519,54.6693,0,1,0,1,0,1,0,1
4,2021-12-20 00:00:04,0.038994,0.521326,0.200999,0.633463,0.343855,0.234539,0.324797,0.200965,0.139101,...,0.671995,54.5034,0,1,0,1,0,1,0,1


In [44]:
# Separate the combined frame again: rows carrying the -1 placeholder label
# are the testing rows, everything else is training data.
is_testing_row = data_all['main_steam_flow'] == -1
testing = data_all.loc[is_testing_row]
training = data_all.loc[~is_testing_row]

training.head()

Unnamed: 0,datetime,oxygen_set_value,primary_air_volume,grate_manual_command,main_steam_flow_set_value,ejector_manual_command,ejector_automatic_command,nox_content,grate_actual_operation_command,secondary_air_control_door,...,primary_damper,main_steam_flow,ejector_automatic_switching_signal_False,ejector_automatic_switching_signal_True,grate_automatic_switching_signal_False,grate_automatic_switching_signal_True,pusher_on_or_off_False,pusher_on_or_off_True,grate_on_or_off_False,grate_on_or_off_True
0,2021-12-20 00:00:00,0.038994,0.515735,0.19869,0.633463,0.341546,0.231823,0.325283,0.198685,0.134258,...,0.673845,54.3032,0,1,0,1,0,1,0,1
1,2021-12-20 00:00:01,0.038994,0.516857,0.2,0.633463,0.342856,0.233921,0.324797,0.198754,0.135699,...,0.671995,54.1495,0,1,0,1,0,1,0,1
2,2021-12-20 00:00:02,0.038994,0.517604,0.200646,0.633463,0.343502,0.234126,0.326256,0.200629,0.136939,...,0.671871,54.4899,0,1,0,1,0,1,0,1
3,2021-12-20 00:00:03,0.038994,0.517828,0.200821,0.633463,0.343678,0.234333,0.326094,0.200789,0.13717,...,0.672519,54.6693,0,1,0,1,0,1,0,1
4,2021-12-20 00:00:04,0.038994,0.521326,0.200999,0.633463,0.343855,0.234539,0.324797,0.200965,0.139101,...,0.671995,54.5034,0,1,0,1,0,1,0,1


In [48]:
# Summarise the training split.
training.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 257398 entries, 0 to 257397
Data columns (total 27 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   datetime                                  257398 non-null  object 
 1   oxygen_set_value                          257398 non-null  float64
 2   primary_air_volume                        257398 non-null  float64
 3   grate_manual_command                      257398 non-null  float64
 4   main_steam_flow_set_value                 257398 non-null  float64
 5   ejector_manual_command                    257398 non-null  float64
 6   ejector_automatic_command                 257398 non-null  float64
 7   nox_content                               257398 non-null  float64
 8   grate_actual_operation_command            257398 non-null  float64
 9   secondary_air_control_door                257398 non-null  float64
 10  hcl_content         

In [46]:
# Preview the first rows of the testing split (label column holds the -1 placeholder).
testing.head()

Unnamed: 0,datetime,oxygen_set_value,primary_air_volume,grate_manual_command,main_steam_flow_set_value,ejector_manual_command,ejector_automatic_command,nox_content,grate_actual_operation_command,secondary_air_control_door,...,primary_damper,main_steam_flow,ejector_automatic_switching_signal_False,ejector_automatic_switching_signal_True,grate_automatic_switching_signal_False,grate_automatic_switching_signal_True,pusher_on_or_off_False,pusher_on_or_off_True,grate_on_or_off_False,grate_on_or_off_True
0,2021-12-22 23:30:00,0.329531,0.515361,0.285714,0.633463,0.142857,0.499998,0.797569,0.285716,0.495286,...,0.540341,-1.0,1,0,1,0,1,0,1,0
1,2021-12-22 23:30:01,0.329531,0.523623,0.285714,0.633463,0.142857,0.499998,0.797569,0.285716,0.495084,...,0.540557,-1.0,1,0,1,0,1,0,1,0
2,2021-12-22 23:30:02,0.329531,0.528418,0.285714,0.633463,0.142857,0.499998,0.798056,0.285716,0.495545,...,0.540465,-1.0,1,0,1,0,1,0,1,0
3,2021-12-22 23:30:03,0.329531,0.522364,0.285714,0.633463,0.142857,0.499998,0.779416,0.285716,0.495863,...,0.540156,-1.0,1,0,1,0,1,0,1,0
4,2021-12-22 23:30:04,0.329531,0.517081,0.285714,0.633463,0.142857,0.499998,0.778769,0.285716,0.495747,...,0.540465,-1.0,1,0,1,0,1,0,1,0


In [49]:
# Summarise the testing split.
testing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1800 entries, 0 to 1799
Data columns (total 27 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   datetime                                  1800 non-null   object 
 1   oxygen_set_value                          1800 non-null   float64
 2   primary_air_volume                        1800 non-null   float64
 3   grate_manual_command                      1800 non-null   float64
 4   main_steam_flow_set_value                 1800 non-null   float64
 5   ejector_manual_command                    1800 non-null   float64
 6   ejector_automatic_command                 1800 non-null   float64
 7   nox_content                               1800 non-null   float64
 8   grate_actual_operation_command            1800 non-null   float64
 9   secondary_air_control_door                1800 non-null   float64
 10  hcl_content                         

## 03 - Modeling - LightGBM

In [61]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

# Names of the indicator columns produced by the one-hot encoding step.
categorical_columns_onehot = onehot_encoding.columns.tolist()

# Model inputs: scaled numerical features plus the one-hot indicator columns.
x = training[numerical_features + categorical_columns_onehot]
# Select the target as a 1-D Series. training[label] would be a single-column
# DataFrame, which estimators convert with a column-vector warning on every fit.
y = training[label[0]]

# Base regressor; the grid below overrides n_estimators, max_depth and num_leaves.
estimator = LGBMRegressor(learning_rate=0.01,
                          reg_alpha=0.01,
                          reg_lambda=0.01,
                          n_jobs=5,
                          random_state=42)
# NOTE(review): a tree limited to max_depth=4 can hold at most 2**4 = 16 leaves,
# so the num_leaves candidates (30-32) presumably yield identical models for that
# depth — confirm against the LightGBM parameter docs and consider a wider grid.
param_grid = {
    'n_estimators': [500, 510, 550],
    'max_depth': [4, 5],
    'num_leaves':[30,31,32]
}
# Exhaustive search over param_grid using GridSearchCV's default CV splitter and scorer.
# NOTE(review): the rows are time-ordered; verify the default splitter is appropriate.
gbm = GridSearchCV(estimator, param_grid)
gbm.fit(x, y)

print('Best parameters found by grid search are:', gbm.best_params_)
print('Best scores:', gbm.best_score_)


Best parameters found by grid search are: {'max_depth': 4, 'n_estimators': 510, 'num_leaves': 30}
Best scores: 0.5538714844441648


In [63]:
# Build the testing feature matrix with the same columns, in the same order,
# as the training inputs, then predict with the fitted grid-search object.
feature_columns = numerical_features + categorical_columns_onehot
test_x = testing[feature_columns]
y_pred = gbm.predict(test_x)
print(y_pred)

[58.23813451 58.23813451 58.23813451 ... 56.82550398 56.95505707
 56.46885195]


In [64]:
# Assemble the submission table: a 1-based row ID, the original timestamp and
# the predicted steam flow. The ID range is derived from the number of
# predictions instead of a hard-coded 1800, so the cell keeps working if the
# size of the testing set changes.
submission_file = pd.DataFrame(data={'ID': np.arange(1, len(y_pred) + 1),
                                     'Time': testing['datetime'].values,
                                     'Steam_flow': y_pred})
submission_file.head()

Unnamed: 0,ID,Time,Steam_flow
0,1,2021-12-22 23:30:00,58.238135
1,2,2021-12-22 23:30:01,58.238135
2,3,2021-12-22 23:30:02,58.238135
3,4,2021-12-22 23:30:03,58.27966
4,5,2021-12-22 23:30:04,58.27966


In [65]:
# Preview the last rows of the submission table.
submission_file.tail()

Unnamed: 0,ID,Time,Steam_flow
1795,1796,2021-12-22 23:59:55,56.955057
1796,1797,2021-12-22 23:59:56,56.825504
1797,1798,2021-12-22 23:59:57,56.825504
1798,1799,2021-12-22 23:59:58,56.955057
1799,1800,2021-12-22 23:59:59,56.468852


In [66]:
# Persist the submission table as CSV one directory above the notebook,
# leaving out the DataFrame index column.
submission_file.to_csv(path_or_buf='../submission_lgbm.csv', index=False)

In [21]:
# Display the best estimator selected by the grid search.
# NOTE(review): this cell's execution count (21) is lower than the grid-search
# cell's (61), so it was last run against an earlier fit — re-run top to bottom.
gbm.best_estimator_

In [23]:
# Display the best cross-validation score found by the grid search.
# NOTE(review): the stored output differs from the score printed by the
# grid-search cell and the execution count (23) is out of order — the output
# looks stale; re-run top to bottom to refresh it.
gbm.best_score_

0.5573270404872812