In [None]:


# import all libraries
import pandas as pd 
import lightgbm as lgb # ML_mode LightGBM
from sklearn.metrics import mean_absolute_error # MAE score
from sklearn.model_selection import train_test_split
from tqdm import tqdm # display the progress bar during the traversal of iterable objects such as for loops

# ---------------------------------------------------------------------------
# Data loading and global configuration
# ---------------------------------------------------------------------------
train_dataset = pd.read_csv("./data/train.csv")  # original training data
test_dataset = pd.read_csv("./data/test.csv")  # original test data

# Submission frame. It starts with the test set's row identifiers so the
# predictions added later stay aligned with the test rows.
submit = pd.DataFrame()
submit["序号"] = test_dataset["序号"]

# Validation MAE per target column; filled in by the training loop.
MAE_scores = {}

# The last 34 columns of the training file are the prediction targets
# (per the original author's note: 17 upper and 17 lower temperatures).
pred_labels = list(train_dataset.columns[-34:])

# Hold out 20% of the rows for validation. random_state is pinned (to the
# same value as the LightGBM seed below) so that the split, and therefore the
# reported MAE scores, are reproducible from run to run.
train_set, valid_set = train_test_split(train_dataset, test_size=0.2, random_state=2023)

# LightGBM parameters: https://lightgbm.readthedocs.io/en/latest/Parameters.html
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mae',
    'min_child_weight': 5,
    'num_leaves': 2 ** 5,
    'lambda_l2': 10,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 4,
    'learning_rate': 0.05,
    'seed': 2023,
    'nthread': 16,
    'verbose': -1,
}

# Callback that suppresses the per-round evaluation log (a non-positive
# period disables printing), keeping the console output limited to the
# progress bar while the 34 models are trained.
no_info = lgb.callback.log_evaluation(period=-1)

def data_feature(data: pd.DataFrame, pred_labels: list = None, n_zones: int = 17) -> pd.DataFrame:
    """Build the model's feature matrix from a raw train/test frame.

    The input frame is not modified. The result contains the original
    columns (minus "序号" and "时间", and minus the target columns when
    ``pred_labels`` is given) followed by:

    * calendar features derived from "时间";
    * per-zone ratio ("cross") features between flow and the two setpoints;
    * per-zone one-row lag and one-row difference features;
    * per-zone mean of the three preceding rows.

    Lag, difference and window features are positional: they look at the
    rows immediately above the current one, so the frame should be passed in
    chronological order for them to be meaningful.

    Args:
        data: Raw frame holding "序号", "时间" and, for every zone ``i`` in
            ``1..n_zones``, the columns "流量{i}", "上部温度设定{i}" and
            "下部温度设定{i}".
        pred_labels: Target column names to drop from the result. When
            ``None`` or empty, nothing is dropped.
        n_zones: Number of zones to build per-zone features for.

    Returns:
        A new DataFrame with the engineered features appended.
    """
    timestamps = pd.to_datetime(data["时间"])
    day_of_week = timestamps.dt.dayofweek  # Monday=0 ... Sunday=6

    # All new columns are collected here and attached with a single concat.
    # Inserting ~200 columns one by one fragments the frame and triggers
    # pandas' PerformanceWarning.
    new_columns = {
        "month": timestamps.dt.month,
        "day": timestamps.dt.day,
        "hour": timestamps.dt.hour,
        "minute": timestamps.dt.minute,
        # isocalendar() yields a nullable UInt32 column; cast to a plain int
        # so LightGBM can consume it.
        "weekofyear": timestamps.dt.isocalendar().week.astype(int),
        "dayofyear": timestamps.dt.dayofyear,
        "dayofweek": day_of_week,
        # 1 for Saturday (5) and Sunday (6), 0 otherwise. The previous
        # `dayofweek // 6` only flagged Sunday.
        "is_weekend": (day_of_week >= 5).astype(int),
    }

    zones = range(1, n_zones + 1)

    for i in zones:
        flow = data[f'流量{i}']
        upper_set = data[f'上部温度设定{i}']
        lower_set = data[f'下部温度设定{i}']

        # Cross features. NOTE: a zero setpoint produces inf/NaN here, which
        # is passed through unchanged.
        new_columns[f'流量{i}/上部温度设定{i}'] = flow / upper_set
        new_columns[f'流量{i}/下部温度设定{i}'] = flow / lower_set
        new_columns[f'上部温度设定{i}/下部温度设定{i}'] = upper_set / lower_set

        # Historical shift: value of the previous row (first row is NaN).
        new_columns[f'last1_流量{i}'] = flow.shift(1)
        new_columns[f'last1_上部温度设定{i}'] = upper_set.shift(1)
        new_columns[f'last1_下部温度设定{i}'] = lower_set.shift(1)

        # Differential features: current value minus the previous row's value.
        new_columns[f'last1_diff_流量{i}'] = flow.diff(1)
        new_columns[f'last1_diff_上部温度设定{i}'] = upper_set.diff(1)
        new_columns[f'last1_diff_下部温度设定{i}'] = lower_set.diff(1)

    # Window statistics: mean of the three rows preceding the current one.
    # Kept as a second pass so these columns come after all per-zone features
    # above, matching the column order the function has always produced.
    for i in zones:
        for base_name in (f'流量{i}', f'上部温度设定{i}', f'下部温度设定{i}'):
            series = data[base_name]
            new_columns[f'win3_mean_{base_name}'] = (
                series.shift(1) + series.shift(2) + series.shift(3)
            ) / 3

    # "序号" is only a row identifier and "时间" is already represented by
    # the calendar features (LightGBM cannot take a datetime column).
    base = data.drop(columns=["序号", "时间"])
    result = pd.concat([base, pd.DataFrame(new_columns, index=data.index)], axis=1)

    if pred_labels:
        # Remove every target column so none of them leaks in as a feature.
        result = result.drop(columns=[*pred_labels])

    return result

# ---------------------------------------------------------------------------
# Feature engineering (done once, outside the per-target loop)
# ---------------------------------------------------------------------------
test_features = data_feature(test_dataset)

# The lag / difference / window features look at the rows directly above the
# current one, so they must be computed on the full training file in its
# original row order, *before* the shuffled train/validation split is applied.
# Computing them on the already-shuffled subsets would pair each row with a
# random neighbour and make training features inconsistent with test features.
# NOTE(review): this assumes train.csv is stored in chronological order, the
# same assumption the test-set features already rely on. Please confirm.
all_train_features = data_feature(train_dataset, pred_labels=pred_labels)
train_features = all_train_features.loc[train_set.index]
valid_features = all_train_features.loc[valid_set.index]

# ---------------------------------------------------------------------------
# Train one LightGBM regressor per target column and predict the test set
# ---------------------------------------------------------------------------
for pred_label in tqdm(pred_labels):
    train_labels = train_set[pred_label]
    valid_labels = valid_set[pred_label]

    train_data = lgb.Dataset(train_features, label=train_labels)
    # reference=train_data makes the validation set reuse the training set's
    # feature binning, as LightGBM expects for validation data.
    valid_data = lgb.Dataset(valid_features, label=valid_labels, reference=train_data)

    # 200 boosting rounds; the validation set is only monitored, and the
    # no_info callback keeps the per-round log silent.
    model = lgb.train(lgb_params, train_data, 200, valid_sets=[valid_data], callbacks=[no_info])

    # No early-stopping callback is registered, so best_iteration is not
    # expected to select a truncated model here; prediction should use all
    # trained rounds. Confirm against the LightGBM docs if early stopping is
    # added later.
    valid_pred = model.predict(valid_features, num_iteration=model.best_iteration)
    test_pred = model.predict(test_features, num_iteration=model.best_iteration)

    # scikit-learn's signature is (y_true, y_pred).
    MAE_scores[pred_label] = mean_absolute_error(valid_labels, valid_pred)

    submit[pred_label] = test_pred  # one submission column per target

submit.to_csv('submit_result_new.csv', index=False)  # write the final submission file
print(MAE_scores)

  data[f'流量{i}/上部温度设定{i}'] = data[f'流量{i}'] / data[f'上部温度设定{i}']
  data[f'流量{i}/下部温度设定{i}'] = data[f'流量{i}'] / data[f'下部温度设定{i}']
  data[f'上部温度设定{i}/下部温度设定{i}'] = data[f'上部温度设定{i}'] / data[f'下部温度设定{i}']
  data[f'last1_流量{i}'] = data[f'流量{i}'].shift(1)#当前数据往下移动，第 0 行是 NaN
  data[f'last1_上部温度设定{i}'] = data[f'上部温度设定{i}'].shift(1)
  data[f'last1_下部温度设定{i}'] = data[f'下部温度设定{i}'].shift(1)
  data[f'last1_diff_流量{i}'] = data[f'流量{i}'].diff(1)#当前值减去前 1 行的值
  data[f'last1_diff_上部温度设定{i}'] = data[f'上部温度设定{i}'].diff(1)
  data[f'last1_diff_下部温度设定{i}'] = data[f'下部温度设定{i}'].diff(1)
  data[f'流量{i}/上部温度设定{i}'] = data[f'流量{i}'] / data[f'上部温度设定{i}']
  data[f'流量{i}/下部温度设定{i}'] = data[f'流量{i}'] / data[f'下部温度设定{i}']
  data[f'上部温度设定{i}/下部温度设定{i}'] = data[f'上部温度设定{i}'] / data[f'下部温度设定{i}']
  data[f'last1_流量{i}'] = data[f'流量{i}'].shift(1)#当前数据往下移动，第 0 行是 NaN
  data[f'last1_上部温度设定{i}'] = data[f'上部温度设定{i}'].shift(1)
  data[f'last1_下部温度设定{i}'] = data[f'下部温度设定{i}'].shift(1)
  data[f'last1_diff_流量{i}'] = data[f'流量{

  0%|                                                    | 0/34 [00:00<?, ?it/s]