In [None]:
# Mount Google Drive inside the Colab VM so the project folder is reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd "/content/drive/MyDrive/时间预测"

/content/drive/MyDrive/时间预测


In [None]:
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')

读取数据集

In [None]:
# Build a one-row-per-second time axis for the training period; the sensor columns are
# attached to this frame by the loading cell.
# 'value' is only a placeholder column; it is dropped again after the sensors are loaded.
# The lowercase 's' alias is used because the uppercase 'S' alias is deprecated in recent pandas.
rng = pd.date_range('2021/12/20', periods=257400, freq='1s')
data = pd.DataFrame({'时间': rng})
data['value'] = 1

In [None]:
workdir = "inputs/"
# Load every sensor CSV in `workdir` and attach its value column to `data`.
# - Files are visited in sorted order so the resulting column order is reproducible
#   (os.listdir returns entries in arbitrary order).
# - Non-CSV entries (e.g. notebook checkpoint folders) are skipped instead of crashing read_csv.
# - All columns are concatenated in one call instead of growing the frame inside the loop.
# NOTE(review): the file's own timestamp column is discarded; values are aligned to `data`
# purely by CSV row index (after exact-duplicate rows are removed). Confirm this is intended.
sensor_columns = []
for filename in sorted(os.listdir(workdir)):
    if not filename.lower().endswith('.csv'):
        continue
    # The column is named after the file (text before the first dot).
    name = filename.split('.')[0]
    temp = pd.read_csv(workdir + filename)
    # Remove rows that are exact duplicates.
    temp = temp.drop_duplicates()
    temp.columns = ['时间', name]
    sensor_columns.append(temp[name])
data = pd.concat([data] + sensor_columns, axis=1)

In [None]:
# The constant 'value' helper column carries no information; remove it.
data = data.drop(columns=['value'])

In [None]:
# Remember the raw sensor column names (everything except the timestamp);
# they are the inputs of the lag / rolling-window features built later.
original_feat = [col for col in data.columns if col != '时间']

In [None]:
# Per-minute mean of every sensor column, added as extra feature columns named '<col>m'.
data.index = pd.to_datetime(data['时间'])
# The timestamp column is excluded before averaging so the result does not depend on how the
# installed pandas version treats non-numeric columns in mean().
samp = data.drop(columns=['时间']).resample(rule='min').mean().add_suffix('m')
feat = list(samp.columns)
# `samp` has one row per minute; after the index-aligned concat its values sit on the first
# second of each minute and are NaN elsewhere.
data = pd.concat([data, samp], axis=1)
# Forward-fill the gaps (this also fills missing raw readings), then go back to a positional
# index and drop the timestamp column. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill').
data = data.ffill().reset_index(drop=True).drop(columns=['时间'])

In [None]:
# Load the target file and attach it to the features row by row (join on the positional index).
label = pd.read_csv("outputs/主蒸汽流量.csv")
data = data.merge(label, left_index=True, right_index=True)
# Split the target off again: `label` becomes the target Series, `data` keeps the rest.
label = data.pop('主蒸汽流量')

In [None]:
# Remove the timestamp column from the training features.
# NOTE(review): at this point '时间' presumably comes from the label file merged in just before
# (the original timestamp column was already dropped after resampling) — verify.
data = data.drop(columns=['时间'])

下面是读取测试集，和训练集一样地处理步骤

In [None]:
# Test set: same loading steps as for the training set, on an 1800-second time axis.
# The lowercase 's' alias is used because the uppercase 'S' alias is deprecated in recent pandas.
rng = pd.date_range('2021-12-22 23:30:00', periods=1800, freq='1s')
datatest = pd.DataFrame({'时间': rng})

workdir = "test/"
# Files are read in sorted order (reproducible column order), non-CSV entries are skipped,
# and all columns are concatenated once instead of growing the frame inside the loop.
# NOTE(review): as in the training loader, the file's own timestamp column is discarded and
# values are aligned by CSV row index only.
test_columns = []
for filename in sorted(os.listdir(workdir)):
    if not filename.lower().endswith('.csv'):
        continue
    # The column is named after the file (text before the first dot).
    name = filename.split('.')[0]
    temp = pd.read_csv(workdir + filename)
    # Remove rows that are exact duplicates.
    temp = temp.drop_duplicates()
    temp.columns = ['时间', name]
    test_columns.append(temp[name])
datatest = pd.concat([datatest] + test_columns, axis=1)

In [None]:
# Per-minute mean of every test sensor column, added as extra feature columns named '<col>m'.
datatest.index = pd.to_datetime(datatest['时间'])
# The timestamp column is excluded before averaging so the result does not depend on how the
# installed pandas version treats non-numeric columns in mean().
samp = datatest.drop(columns=['时间']).resample(rule='min').mean().add_suffix('m')
feat = list(samp.columns)
# Index-aligned concat: the minute means land on the first second of each minute.
datatest = pd.concat([datatest, samp], axis=1)
# Forward-fill, return to a positional index and drop the timestamp column.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
datatest = datatest.ffill().reset_index(drop=True).drop(columns=['时间'])

In [None]:
# Train and test frames do not have identical columns (root cause not yet understood).
# Collect the columns that exist only in the test frame so they can be removed.
# Only test-only columns are detected here; train-only columns are not.
diff = list(set(datatest.columns).difference(data.columns))

In [None]:
# Remove the test-only columns collected in `diff`.
datatest = datatest.drop(columns=diff)

In [None]:
# Stack train and test rows so both receive exactly the same feature engineering.
# The boolean 'train' column marks the origin of each row and is used later to split again.
data, datatest = data.assign(train=True), datatest.assign(train=False)
all_data = pd.concat([data, datatest], axis=0)

In [None]:
# Raw feature columns that are replaced by their logarithm in the next step.
log_feat = [
    '炉排实际运行指令',
    '推料器手动指令',
    '推料器自动指令',
    '炉排手动指令',
    '氧量设定值',
    '主蒸汽流量设定值',
    '一次风调门',
    '二次风调门',
    '一次风量',
    'NOx含量',
    'SO2含量',
    'CO含量',
    '二次风量',
    '汽包水位',
    'HCL含量',
    '给水流量',
    '引风机转速',
]

In [None]:
# Replace each listed raw feature by its natural logarithm (same column name, overwritten).
# NOTE(review): zeros become -inf and negative readings become NaN under np.log —
# confirm these columns are strictly positive.
for col in log_feat:
  all_data[col] = np.log(all_data[col])

In [None]:
# Lag-1 difference of every column currently in all_data, stored as '<col>diff1'.
# The column list is taken once before the loop, so the new columns are not re-processed here.
# Note that "every column" includes the boolean 'train' marker.
for col in list(all_data.columns):
  all_data[col + 'diff1'] = all_data[col].diff(periods=1)

In [None]:
# Lag-2 difference (value minus the value two rows earlier) of every column currently in
# all_data, stored as '<col>diff2'. The column list is taken once before the loop; it contains
# whatever derived columns already exist in the frame at this point.
for col in list(all_data.columns):
  all_data[col + 'diff2'] = all_data[col].diff(periods=2)

In [None]:
# Rolling mean over a 1800-row window (1800 seconds at one row per second) of every column
# currently in all_data, stored as '<col>roll1800'. The first 1799 rows of each new column
# have no full window and are NaN.
for col in list(all_data.columns):
  window_1800 = all_data[col].rolling(window=1800)
  all_data[col + 'roll1800'] = window_1800.mean()

In [None]:
# 二阶差分，没有使用
# first_feat = map(lambda x: x+'diff1',original_feat)
# for i in [*first_feat]:
#   all_data[i+'2diff'] = all_data[i].diff()

In [None]:
def difftime(feat,time):
  """
  Add lagged-difference columns to the global ``all_data`` frame.

  For every column name in ``feat`` a new column ``<name>diff<time>`` is written,
  holding ``all_data[name].diff(time)`` (value minus the value ``time`` rows earlier).

  feat: iterable of column names to difference. Previously this argument was ignored
        and the global ``original_feat`` was always used; existing calls pass
        ``original_feat``, so their result is unchanged.
  time: lag in rows, e.g. 1, 2, 3, 60, 120, 480, 960, 1200, 1800.

  Returns None; ``all_data`` is modified in place.
  """
  for name in feat:
    all_data[name+'diff'+str(time)] = all_data[name].diff(time)

In [None]:
# Lagged differences of the raw features for a range of lags (in rows).
# Lags 1 and 2 are not requested here; '<col>diff1' / '<col>diff2' columns are already
# produced for every column by the all-column difference cells.
for lag in (3, 60, 120, 480, 960, 1800, 3600):
    difftime(original_feat, lag)

In [None]:
# 滑动窗口
def rollmean(feat,time):
  """
  Add rolling-mean columns to the global ``all_data`` frame.

  For every column name in ``feat`` a new column ``<name>roll<time>`` is written,
  holding the mean over a trailing window of ``time`` rows. Rows without a full
  window are NaN.

  feat: iterable of column names to average. Previously this argument was ignored
        and the global ``original_feat`` was always used; existing calls pass
        ``original_feat``, so their result is unchanged.
  time: window length in rows, e.g. 1, 2, 3, 60, 120, 480, 960, 1200, 1800.

  Returns None; ``all_data`` is modified in place.
  """
  for name in feat:
    all_data[name+'roll'+str(time)] = all_data[name].rolling(window=time).mean()

In [None]:
# Rolling means of the raw features for a range of window lengths (in rows).
for window in (1, 2, 3, 60, 120, 480, 960, 1800, 3600):
    rollmean(original_feat, window)

In [None]:
def rollstd(feat,time):
  """
  Add rolling-standard-deviation columns to the global ``all_data`` frame.

  For every column name in ``feat`` a new column ``<name>rollstd<time>`` is written,
  holding the standard deviation over a trailing window of ``time`` rows. Rows
  without a full window are NaN.

  The columns used to be written as ``<name>roll<time>``, the same name the
  rolling-mean helper uses, so calling this function silently replaced every
  rolling-mean feature. A distinct ``rollstd`` suffix keeps both feature families.

  feat: iterable of column names. Previously this argument was ignored and the
        global ``original_feat`` was always used; existing calls pass
        ``original_feat``.
  time: window length in rows, e.g. 1, 2, 3, 60, 120, 480, 960, 1200, 1800.

  Returns None; ``all_data`` is modified in place.
  """
  for name in feat:
    all_data[name+'rollstd'+str(time)] = all_data[name].rolling(window=time).std()

In [None]:
# Rolling standard deviations of the raw features for a range of window lengths (in rows).
# NOTE(review): a 1-row window has no sample standard deviation, so window=1 presumably
# yields only NaN (turned into 0 by the later fillna) — confirm it is worth keeping.
for window in (1, 2, 3, 60, 120, 480, 960, 1800, 3600):
    rollstd(original_feat, window)

In [None]:
# def reduce_mem_usage(df, verbose=True):
#     numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
#     start_mem = df.memory_usage().sum() / 1024**2    
#     for col in df.columns:
#         col_type = df[col].dtypes
#         if col_type in numerics:
#             c_min = df[col].min()
#             c_max = df[col].max()
#             if str(col_type)[:3] == 'int':
#                 if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
#                     df[col] = df[col].astype(np.int8)
#                 elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
#                     df[col] = df[col].astype(np.int16)
#                 elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
#                     df[col] = df[col].astype(np.int32)
#                 elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
#                     df[col] = df[col].astype(np.int64)  
#             else:
#                 if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
#                     df[col] = df[col].astype(np.float16)
#                 elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
#                     df[col] = df[col].astype(np.float32)
#                 else:
#                     df[col] = df[col].astype(np.float64)    
#     end_mem = df.memory_usage().sum() / 1024**2
#     if verbose: 
#         print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 
#                     100 * (start_mem - end_mem) / start_mem))
#     return df

In [None]:
# Replace every remaining NaN (e.g. the warm-up rows of the diff / rolling features) with 0.
all_data = all_data.fillna(0)
# reduce_mem_usage(all_data)

In [None]:
# Split the engineered frame back into its training and test rows via the 'train' marker.
is_train_row = all_data['train'] == True
is_test_row = all_data['train'] == False
train_data = all_data[is_train_row]
test_data = all_data[is_test_row]

# 模型训练与预测

In [None]:
# from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

In [None]:
# # 这里使用删除某些重要性为0的特征
# drops = [*zip(all_data.columns,estimator.feature_importances_)]
# for i in drops:
#   if int(i[1]) == 0 and i[0] != 'train':
#     all_data.drop(i[0],axis=1,inplace=True)
# train_data = all_data[all_data['train'] == True]
# test_data = all_data[all_data['train'] == False]

In [None]:
import lightgbm as lgb

# Training matrix and target as plain numpy arrays.
# NOTE(review): train_data still contains the constant 'train' marker column (and the columns
# derived from it by the all-column diff / rolling cells) — confirm that is intended.
X = train_data.values
y = label.values

# LightGBM regressor with fixed hyper-parameters; no grid search is active.
lgb_params = dict(
    max_depth=4,
    random_state=47,
    n_estimators=1000,
    n_jobs=-1,
    verbose=-1,
    verbosity=-1,
    learning_rate=0.02,
)
estimator = lgb.LGBMRegressor(**lgb_params)

# Fit on the complete training set; this cell keeps no hold-out split, so no validation
# score is computed here.
estimator.fit(X, y)

LGBMRegressor(learning_rate=0.02, max_depth=4, n_estimators=1000,
              random_state=47, verbose=-1, verbosity=-1)

In [None]:
# Predict the target for the test rows with the fitted LightGBM model.
y_test_lgb = estimator.predict(test_data.values)

In [None]:
y_test_lgb

array([57.20160538, 57.15640649, 57.14502672, ..., 65.46377648,
       65.38054898, 65.47597184])

In [None]:
# NOTE(review): `y_test` is only assigned inside commented-out code in the visible notebook,
# so this cell (and the identical `y_test` display cells after it) presumably shows state left
# over from an earlier session and would fail on a fresh kernel — confirm and remove.
y_test

array([57.20160538, 57.15640649, 57.14502672, ..., 65.46377648,
       65.38054898, 65.47597184])

In [None]:
y_test

array([57.35470712, 57.41809997, 57.4018496 , ..., 64.99587125,
       64.95623369, 65.00315324])

In [None]:
y_test

array([57.69634732, 57.72909207, 57.74003213, ..., 65.50497174,
       65.52409629, 65.79627216])

In [None]:
# np.save("ans.csv",y_test)

In [None]:
y_test

array([57.70500421, 57.7102112 , 57.71002148, ..., 64.88913127,
       64.90213514, 65.29543384])

In [None]:
y_test

array([58.27606915, 58.27134572, 58.288343  , ..., 65.24683008,
       65.25364301, 65.81683208])

In [None]:
# y_test2 = model.predict(test_data.values)

In [None]:
# temp = pd.read_csv('submit273.csv')

In [None]:
# NOTE(review): `y_test` is only assigned inside commented-out code in the visible notebook,
# so this cell presumably relies on state from an earlier session. `y_test1` is not used by
# any later visible cell, and the 0.9 offset is an unexplained hand-picked constant — confirm.
y_test1 = y_test + 0.9

In [None]:
# Load "result.csv" into `submit`.
# NOTE(review): `submit` is not referenced by any later visible cell — possibly a leftover.
submit = pd.read_csv("result.csv")

In [None]:
workdir = "test/"
# Start from one test file and inner-join every file in the folder on the timestamp column.
# NOTE(review): the seed file is also part of the folder listing, so it is joined with itself;
# only the first column of `test` is used by the submission cell — confirm the full join is needed.
test = pd.read_csv("test/CO含量.csv")
for filename in os.listdir(workdir):
    temp = pd.read_csv(workdir + filename)
    test = test.merge(temp, on=['时间'])

In [None]:
# Assemble the submission table: running ID, timestamp (first column of `test`, renamed to
# 'Time') and the LightGBM prediction shifted by a constant 0.95.
# NOTE(review): the 0.95 offset and the fixed length of 1800 rows are hard-coded — confirm.
# mysub['Steam_flow'] = (0.5 * y_test_cat + 0.5 * y_test_lgb) + 0.95
mysub = (
    test.iloc[:, [0]]
    .rename(columns={'时间': 'Time'})
    .assign(ID=range(1800), Steam_flow=y_test_lgb + 0.95)
    .loc[:, ['ID', 'Time', 'Steam_flow']]
)

In [None]:
# Write the submission file without the DataFrame index.
mysub.to_csv("submit1192.csv", index=False)

In [None]:
!pip install catboost

In [None]:
import catboost as cat

# CatBoost regressor with fixed hyper-parameters and an RMSE objective; no grid search is active.
cat_params = dict(
    learning_rate=0.03,
    n_estimators=1500,
    depth=6,
    loss_function="RMSE",
    random_state=47,
)
clf = cat.CatBoostRegressor(**cat_params)

# Fit on the same full training matrix `X` / target `y` that the LightGBM cell builds;
# no hold-out split is used here.
clf.fit(X, y)

In [None]:
# Predict the target for the test rows with the fitted CatBoost model.
y_test_cat = clf.predict(test_data.values)

In [None]:
y_test_cat

array([57.46293057, 57.52461902, 57.42802244, ..., 65.62156226,
       65.6823744 , 65.66650104])

In [None]:
y_test

array([57.36739206, 57.44436997, 57.40338348, ..., 65.28936027,
       65.27078714, 65.33943201])