In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time 
import os
import datetime
import matplotlib.dates as mdate
import matplotlib

from sklearn import datasets, linear_model
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge
import lightgbm as lgb
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

# Use the SimHei font for sans-serif text; the plot titles in this notebook
# are Chinese (presumably the default font cannot render them — confirm on
# the target machine).
plt.rcParams['font.sans-serif']=['SimHei'] 
# Turn off matplotlib's unicode minus sign for axis tick labels.
plt.rcParams['axes.unicode_minus']=False  

# STEP0:导入数据

In [None]:
def GetData(data_path):
    """Load the training and test CSV files of the four stations.

    Args:
        data_path: Path prefix of the dataset folder. It is concatenated
            directly with the file names, so it must end with a path
            separator.

    Returns:
        An 8-tuple ``(train_1, train_2, train_3, train_4,
        test_1, test_2, test_3, test_4)`` of DataFrames.
    """
    print("Get Data Start")
    # Same read order as the returned tuple: all train files, then all test files.
    frames = [
        pd.read_csv(data_path + "%s_%d.csv" % (split, station))
        for split in ("train", "test")
        for station in (1, 2, 3, 4)
    ]
    print("Get Data End")
    return tuple(frames)
 

# STEP1:认识数据

In [None]:
# Exploration plan:
#   (1) basic information for every field (row counts, dtypes, quantiles/mean)
#   (2) time periodicity of the target variable
#   (3) relationship between the target variable and the other fields
plt.rcParams['font.sans-serif']=['SimHei']
matplotlib.style.use("ggplot")
color1=dict(boxes='DarkGreen', whiskers='DarkOrange',medians='DarkBlue', caps='Gray')

def DataExploration(data1,station):
    """Print summary statistics and draw exploratory plots for one station.

    Args:
        data1: Training DataFrame; must contain the columns "时间" and
            "实际功率". The input is copied and never modified.
        station: Label used as prefix/title of the figures.

    Returns:
        None. Output goes to stdout and to matplotlib figures.
    """
    data=data1.copy()
    print(data.info())
    print(data.describe())

    # Box plot of the target variable
    data["实际功率"].plot.box(color=color1)
    plt.title(station)
    plt.show()
    # Add an hour column parsed from the "YYYY-MM-DD HH:MM:SS" timestamp text
    data['hour']=data['时间'].apply(lambda x:int(str(x).split(" ")[1].split(":")[0]))

    # Time trend of the target over the rows labelled up to 2000
    data.loc[:2000,"实际功率"].plot(color="DarkGreen",fontsize=0.7)
    plt.title(station+"目标变量的时间趋势")
    plt.show()

    data.plot(x="hour",y="实际功率",kind="scatter",color="DarkGreen",fontsize=0.5)
    plt.title(station+"目标变量在各小时上的时间分布")
    plt.show()

    # Relationship between the target and columns 1..7 of the frame.
    # x/y/data are passed by keyword: seaborn >= 0.12 made them keyword-only,
    # and the keyword form is also accepted by older releases.
    for i in data.columns[1:8]:
        sns.jointplot(x=i,y="实际功率",data=data,kind="reg",color="DarkGreen")
        plt.show()

        
# Inspect the outliers of station 3
def  DataExploration2(data1):
    """Plot when the high-power outliers occur and how irradiance differs.

    Rows whose "实际功率" exceeds 30.13125 are treated as outliers. The first
    figure counts them per hour of day; the second overlays the density of
    "实发辐照度" for outlier rows (green) and for the remaining rows (red).

    Args:
        data1: DataFrame with the columns "时间", "实际功率" and "实发辐照度".
            The input is copied and never modified.
    """
    data=data1.copy()
    data['hour']=data['时间'].apply(lambda x:int(str(x).split(" ")[1].split(":")[0]))
    # Pass the vector as x=: in seaborn >= 0.12 the first positional
    # parameter of countplot is `data`, not `x`.
    sns.countplot(x=data[data["实际功率"]>30.13125]["hour"],color="DarkGreen")
    plt.title("train_3实际功率离群点对应的时间",fontsize=10)
    plt.show()
    strange=data[data["实际功率"]>=30.13125]["实发辐照度"].rename("异常值对应的实发辐照度")
    normal=data[data["实际功率"]<30.13125]["实发辐照度"].rename("正常值对应的实发辐照度")
    sns.kdeplot(strange,color="DarkGreen")
    sns.kdeplot(normal,color='Red')
    plt.title("train_3实发辐照度分布",fontsize=10)
    plt.show()

In [None]:
# STEP2:数据预处理

In [None]:
def DataProcessing1(data,name):
    """Clean one station's training frame and return the cleaned copy.

    Steps, applied to a copy of ``data``:
      1. add an ``hour`` column parsed from the "时间" text;
      2. drop duplicated rows;
      3. drop rows with negative "实发辐照度";
      4. apply the station-specific outlier rules selected by ``name``;
      5. for "station_1" only, convert "时间" to datetime and shift it back
         by one second.

    Args:
        data: Raw training DataFrame with the columns "时间", "实发辐照度"
            and "实际功率".
        name: Station label ("station_1" ... "station_4"). Labels without a
            rule get only the generic cleaning.

    Returns:
        The cleaned DataFrame (original row labels are kept).
    """
    cleaned = data.copy()
    cleaned['hour'] = cleaned['时间'].apply(
        lambda stamp: int(str(stamp).split(" ")[1].split(":")[0])
    )
    print("Data Preprocessing Start")

    # Generic cleaning: duplicates first, then negative measured irradiance.
    cleaned = cleaned.drop_duplicates()
    cleaned = cleaned[cleaned['实发辐照度'] >= 0]

    if name == "station_1":
        # Remove power outliers, then move every timestamp back one second.
        cleaned = cleaned[cleaned["实际功率"] < 10.4853]
        print("站点一的时间处理开始")
        one_second = datetime.timedelta(seconds=1)
        cleaned["时间"] = pd.to_datetime(cleaned["时间"]) - one_second
    elif name == "station_2":
        # Readings of exactly 4.4 during the listed hours are discarded.
        listed_hours = [0,1,2,3,4,5,20,21,22,23]
        flagged = cleaned["hour"].isin(listed_hours) & (cleaned["实际功率"] == 4.4)
        cleaned = cleaned[~flagged]
    elif name == "station_3":
        # High power together with measured irradiance below 600 is discarded.
        implausible = (cleaned["实发辐照度"] < 600) & (cleaned["实际功率"] >= 30.13125)
        cleaned = cleaned[~implausible]
        # Readings of exactly 29.667 during the listed hours are discarded.
        listed_hours = [0,1,2,3,4,5,6,7,8,17,18,19,20,21,22,23]
        flagged = cleaned["hour"].isin(listed_hours) & (cleaned["实际功率"] == 29.667)
        cleaned = cleaned[~flagged]

    return cleaned



# STEP3:预测实发辐照度

In [None]:
def LGBModel(x_train, y_train,x_test,y_test):
    """Train a LightGBM L1 regressor and return its predictions for x_test.

    Args:
        x_train: Training features.
        y_train: Training target.
        x_test: Features to predict; also used as the validation set.
        y_test: Labels of the validation set. They only affect the metric
            printed during training.

    Returns:
        The predictions of the trained booster for ``x_test``.
    """
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)
    print("Model Trainning Start")
    params = {
      'task': 'train',
      'boosting_type': 'gbdt',  # dart bad
      'objective': 'regression_l1',
      'metric': 'l1',
      'num_leaves': 23,  # 22,23 better, 25 bad
      'learning_rate': 0.05,  # learning rate 0.05; 0.04 bad, 0.06 bad
      'feature_fraction': 0.9,  # fraction of features sampled per tree
      'bagging_fraction': 0.8,  # fraction of rows sampled per tree
      'bagging_freq': 10,  # k means bagging is performed every k iterations
      'num_threads':1
    }
    # `verbose_eval` was removed from lgb.train in LightGBM 4.0. Use the
    # log_evaluation callback when the installed version provides it and
    # fall back to the legacy keyword otherwise.
    if hasattr(lgb, "log_evaluation"):
        logging_kwargs = {"callbacks": [lgb.log_evaluation(period=1000)]}
    else:
        logging_kwargs = {"verbose_eval": 1000}
    gbm = lgb.train(params,lgb_train,num_boost_round=3500,valid_sets=[lgb_eval],**logging_kwargs)
    print('Start predicting...')
    y_pred = gbm.predict(x_test)
    return y_pred


# STEP4:特征工程

In [None]:
def FeatureCreation1(data1):
    """Build calendar, per-day statistic and binned features.

    Works on a copy of ``data1`` and adds, in this order:
      1. ``month``, ``hour``, ``day`` (YYYYMMDD as int) parsed from "时间",
         and ``白天`` (1 for hours 7..19, else 0); "时间" is then dropped;
      2. for each of "辐照度", "温度", "湿度", "压强": the per-day mean and
         std, the max and min within each (day, 白天) group, and their
         difference;
      3. the equal-width bin index of "温度", "压强", "湿度" (3 bins each)
         and "辐照度" (5 bins).

    Args:
        data1: DataFrame with the columns "时间", "辐照度", "温度", "湿度"
            and "压强".

    Returns:
        A new DataFrame with the added columns and without "时间".
    """
    frame = data1.copy()

    # Calendar features from the "YYYY-MM-DD HH:MM:SS" text of the timestamp.
    def _date_text(stamp):
        return str(stamp).split(" ")[0]

    def _clock_text(stamp):
        return str(stamp).split(" ")[1]

    frame["month"] = frame["时间"].apply(lambda s: int(_date_text(s).split("-")[1]))
    frame["hour"] = frame["时间"].apply(lambda s: int(_clock_text(s).split(":")[0]))
    frame["day"] = frame["时间"].apply(lambda s: int(_date_text(s).replace("-", "")))
    frame["白天"] = frame["hour"].apply(lambda h: 1 if 7 <= h <= 19 else 0)
    frame = frame.drop(columns=["时间"])

    # Per-day and per-(day, daytime flag) statistics of the weather fields.
    for col in ('辐照度', '温度', '湿度', "压强"):
        print("Create" + col +"_mean and std...")
        daily = frame.groupby('day')[col]
        daily_mean = daily.transform('mean')
        daily_std = daily.transform('std')
        by_day_part = frame.groupby(["day", "白天"])[col]
        part_max = by_day_part.transform("max")
        part_min = by_day_part.transform("min")

        frame[col + '_mean'] = daily_mean
        frame[col + '_std'] = daily_std
        frame[col + "_max"] = part_max
        frame[col + "_min"] = part_min
        frame[col + "_白天差"] = frame[col + "_max"] - frame[col + "_min"]
        print("Create"+ col +"feature end")

    # Equal-width binning: one-hot encode the bins, then take the position
    # of the hot column as the bin index.
    bin_specs = (
        ('温度', 3, 'temp', "temp_split"),
        ('压强', 3, 'pres', "pres_split"),
        ('湿度', 3, 'humi', "humi_split"),
        ('辐照度', 5, 'temp', "H_split"),
    )
    for source, n_bins, prefix, target in bin_specs:
        indicators = pd.get_dummies(pd.cut(frame[source], n_bins), prefix=prefix)
        frame[target] = np.argmax(indicators.values, axis=1)
    return frame

def FeatureCreation2(data1):
    """Add wind-component and ratio features to a copy of ``data1``.

    New columns:
      * ``sin_wind_d`` / ``cos_wind_d``: wind speed "风速" multiplied by the
        sine / cosine of the direction "风向" (converted from degrees);
      * ``temp_pressure``: (压强 + 1.1) / (温度 + 1.1);
      * ``temp_humi``: (温度 + 1.1) / (湿度 + 1.1).

    Args:
        data1: DataFrame with the columns "风向", "风速", "压强", "温度"
            and "湿度".

    Returns:
        A new DataFrame with the four columns appended.
    """
    out = data1.copy()

    # Wind vector components.
    speed = out.风速.values
    direction_rad = out.风向.values * np.pi / 180.
    out["sin_wind_d"] = np.sin(direction_rad) * speed
    out['cos_wind_d'] = np.cos(direction_rad) * speed

    # Ratios; every operand is offset by 1.1 before dividing.
    shifted_temperature = out.温度.values + 1.1
    out['temp_pressure'] = (out.压强.values + 1.1) / shifted_temperature
    out['temp_humi'] = shifted_temperature / (out.湿度.values + 1.1)
    return out


def FeatureCreation3(data1):
    """Add lag-1, lag-2 and lag-3 copies of five fields to a copy of ``data1``.

    For every source column the values of the previous one, two and three
    rows (row order, not timestamps) are stored in three new columns; rows
    without enough history get -1.0.

    Args:
        data1: DataFrame with the columns "实发辐照度", "温度", "湿度",
            "辐照度" and "风速".

    Returns:
        A new DataFrame with the 15 lag columns appended.
    """
    out = data1.copy()
    n_rows = len(out)
    max_lag = 3

    # (source column, target names for lag 1, 2, 3).
    # "spedd_before_f1" is misspelled on purpose: the column name is part of
    # the feature list consumed elsewhere in this file and must not change.
    lag_specs = (
        ("实发辐照度", ("before_f", "before_f1", "before_f2")),
        ("温度", ("temp_before_f", "temp_before_f1", "temp_before_f2")),
        ("湿度", ("humi_before_f", "humi_before_f1", "humi_before_f2")),
        ("辐照度", ("irr_before_f", "irr_before_f1", "irr_before_f2")),
        ("风速", ("speed_before_f", "spedd_before_f1", "speed_before_f2")),
    )
    for source, targets in lag_specs:
        # Pad the front with -1.0 so that a window starting `lag` positions
        # earlier yields the shifted series.
        padded = [-1.0] * max_lag + list(out[source])
        for lag, target in enumerate(targets, start=1):
            start = max_lag - lag
            out[target] = np.array(padded[start:start + n_rows])
    return out

# STEP5:模型优化

In [None]:
def ModelOptimization(model, params, x_train, y_train):
    """Tune a model with one independent grid search per parameter grid.

    Each grid in ``params`` is searched on its own with 3-fold
    cross-validation, scored by negative mean absolute error and run on all
    cores (``n_jobs=-1``).

    Args:
        model: Estimator to tune.
        params: Iterable of parameter grids, each accepted by
            ``GridSearchCV(param_grid=...)``.
        x_train: Training features.
        y_train: Training target.

    Returns:
        A list with the ``best_params_`` of every search, in the order of
        ``params``.
    """
    print("Model Optimizatioin Start")

    def _best_for(grid):
        # Run a single grid search and report its winning setting.
        print("Optimize param", grid, "...")
        search = GridSearchCV(
            estimator=model,
            param_grid=grid,
            scoring="neg_mean_absolute_error",
            cv=3,
            n_jobs=-1,
        )
        search.fit(x_train, y_train)
        return search.best_params_

    best_params = [_best_for(grid) for grid in params]
    print("Model Optimizatioin Done")
    return best_params

# STEP6:模型评估

In [None]:
"""

  通过交叉验证评估模型
  score函数参数：y_pred是预测的y值，y是含有day属性的真实标签值，name是指站点一、二、三、四

"""

def scorey(y_pred,y,name):
    y["y_pred"]=y_pred
    d=y
    if name=="y_station1":
        d=d[d["y_pred"]>10*0.03]
    if name=="y_station2":
        d=d[d["y_pred"]>10*0.03]
    if name=="y_station3":
        d=d[d["y_pred"]>40*0.03]
    if name=="y_station4":
        d=d[d["y_pred"]>50*0.03]
    mae_d=d.groupby("day").apply(lambda x:(abs(x['y_pred']-x["实际功率"])/10).mean()).reset_index()
    mae_d.columns=["day","mae_d"]
    b=mae_d
    b["month"]=mae_d["day"].apply(lambda x:int(str(x)[:6]))
    b=b.drop("day",axis=1)
    mae_m=b.groupby("month")["mae_d"].apply(lambda x:x.mean()).reset_index()
    mae_m.columns=["month","mae_m"]
    mae_mmean=mae_m["mae_m"].mean()
    return mae_mmean


def record(x_train,model,mae_score):
    """Append one evaluation record to ../Records/Records.txt.

    The record holds the feature names, the string form of the model and
    the validation score, followed by a line of 100 '#' characters.

    Args:
        x_train: DataFrame whose column names are written as the feature list.
        model: Fitted model; written with ``str(model)``.
        mae_score: Validation score; written with ``str(mae_score)``.
    """
    print("Record Start")
    with open("../Records/Records.txt", "a") as f:
        feature_list = "[" + ", ".join(x_train.columns.values) + "]"
        pieces = (
            "features:\t", feature_list, "\n\n",
            "model:\t", str(model), "\n\n",
            "mae_score:\t", str(mae_score), "\n",
            "#" * 100, "\n",
        )
        for piece in pieces:
            f.write(piece)
    print("Record Done")

导入数据

In [None]:
# Load the eight CSV files from ../Data/ and keep the wall-clock time
# before and after the load.
t_get_data_start=time.time()
train_1,train_2,train_3,train_4,test_1,test_2,test_3,test_4=GetData("../Data/")
t_get_data_end=time.time()

In [None]:
站点一预测

In [None]:
# Data exploration (disabled)

#DataExploration(train_1,"station_1")


# Preprocessing
train_1_new=DataProcessing1(train_1,"station_1")

# Predict the measured irradiance ("实发辐照度") of the test set, which the
# test file does not contain.
t_predict_shifa_start = time.time()
# .copy() so that the column assignments below write to an independent frame.
train_shifa_x=train_1_new[['辐照度', '风速','风向', '温度',  '压强','湿度', '时间']].copy()
train_shifa_x["month"]=train_shifa_x["时间"].apply(lambda x: int(str(x).split(" ")[0].split("-")[1]))
train_shifa_x["hour"]=train_shifa_x["时间"].apply(lambda x:int(str(x).split(" ")[1].split(":")[0]))
train_shifa_x=train_shifa_x.drop(["时间"],axis=1)
train_shifa_y=train_1_new["实发辐照度"]
test_shifa_x=test_1[['辐照度', '风速','风向', '温度',  '压强','湿度', '时间']].copy()
test_shifa_x["month"]=test_shifa_x["时间"].apply(lambda x: int(x.split(" ")[0].split("-")[1]))
test_shifa_x["hour"]=test_shifa_x["时间"].apply(lambda x:int(x.split(" ")[1].split(":")[0]))
test_shifa_x=test_shifa_x.drop(["时间"],axis=1)
# Placeholder labels (all zero) for the validation set handed to LGBModel.
test_1["实发辐照度"]=0
test_shifa_y=test_1.pop("实发辐照度")

test_shifa_y=LGBModel(train_shifa_x,train_shifa_y,test_shifa_x,test_shifa_y)
test_1["实发辐照度"]=test_shifa_y


# Feature engineering


train_1_y=train_1_new.pop("实际功率")
train_1_feature1=FeatureCreation1(train_1_new)
train_1_feature2=FeatureCreation2(train_1_feature1)
train_1_feature3=FeatureCreation3(train_1_feature2)
test_1_id=test_1.pop("id")
test_1_feature1=FeatureCreation1(test_1)
test_1_feature2=FeatureCreation2(test_1_feature1)
test_1_feature3=FeatureCreation3(test_1_feature2)

# The target frame carries "day" because scorey groups by it.
train_1_y=train_1_y.reset_index().drop(["index"],axis=1)
train_1_y["day"]=train_1_feature1["day"].values
# Clean ALL three feature sets the same way: reset the row index, remove
# "day" and fill NaN with 0. Previously only feature set 1 was cleaned, so
# sets 2 and 3 were trained with "day" as a feature and with NaN left in
# place, although both are used for training in the later evaluation rounds.
train_1_feature1=train_1_feature1.reset_index(drop=True).drop(["day"],axis=1).fillna(0)
train_1_feature2=train_1_feature2.reset_index(drop=True).drop(["day"],axis=1).fillna(0)
train_1_feature3=train_1_feature3.reset_index(drop=True).drop(["day"],axis=1).fillna(0)

# Hyper-parameter search for feature set 1 (kept commented out).

#Ridge_params=[{"alpha":[0.1,1,2,5,10]}]
#scaler=StandardScaler()
#train_1_ridge=scaler.fit_transform(train_1_feature1.values)
#Ridge_best_params = ModelOptimization(Ridge(), Ridge_params, train_1_ridge, train_1_y["实际功率"])

#xgb_params = [{"learning_rate": [0.01,0.02, 0.05, 0.1, 0.3, 1.0]}, 
#               {"reg_lambda":[0.05,0.1,1,2,10]},
#              {"n_estimators": [100, 150,200,250,300, 400,500]}, 
#              {"max_depth": range(3,10,2)}, 
#               {"min_child_weight": range(1,6,1)}, 
#               {"gamma": [i/10.0 for i in range(0,5)]}, 
#               {"subsample": [i/10.0 for i in range(6,10)]},
#               {"colsample_bytree": [i/10.0 for i in range(5,10)]}, 
#               {"reg_alpha": [1e-5, 1e-2, 0.1, 100]}]
#xgb_best_params = ModelOptimization(XGBRegressor(), xgb_params, train_1_feature1, train_1_y["实际功率"])

# Evaluation with feature set 1: 80/20 hold-out split, fixed seed.
train_1_x,test_1_x,train_1_y_new,test_1_y=train_test_split(train_1_feature1,train_1_y,test_size=0.2,random_state=0)

# Ridge on standardized features; the scaler is fitted on the training part only.
ridge=Ridge(alpha=10)
scaler = StandardScaler()
train_1_ridge_x = scaler.fit_transform(train_1_x.values)
test_1_ridge_x = scaler.transform(test_1_x.values)
ridge.fit(train_1_ridge_x,train_1_y_new["实际功率"])
y_pred_ridge =ridge.predict(test_1_ridge_x)
ridge_mae_score=scorey(y_pred_ridge,test_1_y,"station_1")
record(train_1_feature1, ridge,ridge_mae_score)

# RidgeCV selecting alpha from 0.1 to 9.9 in steps of 0.1.
ridgecv = RidgeCV(alphas=np.arange(0.1,10,0.1))
ridgecv.fit(train_1_ridge_x,train_1_y_new["实际功率"])
y_pred_ridgecv=ridgecv.predict(test_1_ridge_x)
ridgecv_mae_score=scorey(y_pred_ridgecv,test_1_y,"station_1")
record(train_1_feature1, ridgecv,ridgecv_mae_score)

# XGBoost on the unscaled features.
xgb = XGBRegressor(n_estimators = 100, learning_rate = 0.1, max_depth = 5, min_child_weight = 5, 
                    gamma = 0.4, subsample = 0.9, colsample_bytree = 0.8, reg_alpha = 0.1)
xgb.fit(train_1_x,train_1_y_new["实际功率"])
y_pred_xgb=xgb.predict(test_1_x)
xgb_mae_score= scorey(y_pred_xgb,test_1_y,"station_1")
record(train_1_feature1, xgb,xgb_mae_score)

# Hyper-parameter search for feature set 2 (kept commented out).
#train_1_feature2=train_1_feature2.fillna(0)
#train_1_feature2=train_1_feature2.drop(["day"],axis=1)

#Ridge_params=[{"alpha":[0.1,1,2,5,10]}]
#scaler=StandardScaler()
#train_1_ridge=scaler.fit_transform(train_1_feature2.values)
#Ridge_best_params = ModelOptimization(Ridge(), Ridge_params, train_1_ridge, train_1_y["实际功率"])

#xgb_params = [{"learning_rate": [0.01,0.02, 0.05, 0.1, 0.3, 1.0]}, 
#               {"reg_lambda":[0.05,0.1,1,2,10]},
#              {"n_estimators": [100, 150,200,250,300, 400,500]}, 
#              {"max_depth": range(3,10,2)}, 
#               {"min_child_weight": range(1,6,1)}, 
#               {"gamma": [i/10.0 for i in range(0,5)]}, 
#               {"subsample": [i/10.0 for i in range(6,10)]},
#               {"colsample_bytree": [i/10.0 for i in range(5,10)]}, 
#               {"reg_alpha": [1e-5, 1e-2, 0.1, 100]}]
#xgb_best_params = ModelOptimization(XGBRegressor(), xgb_params, train_1_feature2, train_1_y["实际功率"])

# Evaluation with feature set 2: 80/20 hold-out split, fixed seed.
train_1_x,test_1_x,train_1_y_new,test_1_y=train_test_split(train_1_feature2,train_1_y,test_size=0.2,random_state=0)

# Ridge on standardized features; the scaler is fitted on the training part only.
ridge=Ridge(alpha=10)
scaler = StandardScaler()
train_1_ridge_x = scaler.fit_transform(train_1_x.values)
test_1_ridge_x = scaler.transform(test_1_x.values)
ridge.fit(train_1_ridge_x,train_1_y_new["实际功率"])
y_pred_ridge =ridge.predict(test_1_ridge_x)
ridge_mae_score=scorey(y_pred_ridge,test_1_y,"station_1")
# Log the feature list this round was trained on (feature set 2); the
# record previously listed the columns of feature set 1.
record(train_1_feature2, ridge,ridge_mae_score)

# RidgeCV selecting alpha from 0.1 to 9.9 in steps of 0.1.
ridgecv = RidgeCV(alphas=np.arange(0.1,10,0.1))
ridgecv.fit(train_1_ridge_x,train_1_y_new["实际功率"])
y_pred_ridgecv=ridgecv.predict(test_1_ridge_x)
ridgecv_mae_score=scorey(y_pred_ridgecv,test_1_y,"station_1")
record(train_1_feature2, ridgecv,ridgecv_mae_score)

# XGBoost on the unscaled features.
xgb = XGBRegressor(n_estimators = 100, learning_rate = 0.1, max_depth = 7, min_child_weight = 5, 
                    gamma = 0.4, subsample = 0.9, colsample_bytree = 0.8, reg_alpha = 0.1)
xgb.fit(train_1_x,train_1_y_new["实际功率"])
y_pred_xgb=xgb.predict(test_1_x)
xgb_mae_score= scorey(y_pred_xgb,test_1_y,"station_1")
record(train_1_feature2, xgb,xgb_mae_score)


# Hyper-parameter search for feature set 3 (kept commented out).
#train_1_feature3=train_1_feature3.fillna(0)
#train_1_feature3=train_1_feature3.drop(["day"],axis=1)

#Ridge_params=[{"alpha":[0.1,1,2,5,10]}]
#scaler=StandardScaler()
#train_1_ridge=scaler.fit_transform(train_1_feature3.values)
#Ridge_best_params = ModelOptimization(Ridge(), Ridge_params, train_1_ridge, train_1_y["实际功率"])

#xgb_params = [{"learning_rate": [0.01,0.02, 0.05, 0.1, 0.3, 1.0]}, 
#               {"reg_lambda":[0.05,0.1,1,2,10]},
#              {"n_estimators": [100, 150,200,250,300, 400,500]}, 
#              {"max_depth": range(3,10,2)}, 
#               {"min_child_weight": range(1,6,1)}, 
#               {"gamma": [i/10.0 for i in range(0,5)]}, 
#               {"subsample": [i/10.0 for i in range(6,10)]},
#               {"colsample_bytree": [i/10.0 for i in range(5,10)]}, 
#               {"reg_alpha": [1e-5, 1e-2, 0.1, 100]}]
#xgb_best_params = ModelOptimization(XGBRegressor(), xgb_params, train_1_feature3, train_1_y["实际功率"])

# Evaluation with feature set 3: 80/20 hold-out split, fixed seed.
train_1_x,test_1_x,train_1_y_new,test_1_y=train_test_split(train_1_feature3,train_1_y,test_size=0.2,random_state=0)
# Ridge on standardized features; the scaler is fitted on the training part only.
ridge=Ridge(alpha=10)
scaler = StandardScaler()
train_1_ridge_x = scaler.fit_transform(train_1_x.values)
test_1_ridge_x = scaler.transform(test_1_x.values)
ridge.fit(train_1_ridge_x,train_1_y_new["实际功率"])
y_pred_ridge =ridge.predict(test_1_ridge_x)
ridge_mae_score=scorey(y_pred_ridge,test_1_y,"station_1")
# Log the feature list this round was trained on (feature set 3); the
# record previously listed the columns of feature set 1.
record(train_1_feature3, ridge,ridge_mae_score)

# RidgeCV selecting alpha from 0.1 to 9.9 in steps of 0.1.
ridgecv = RidgeCV(alphas=np.arange(0.1,10,0.1))
ridgecv.fit(train_1_ridge_x,train_1_y_new["实际功率"])
y_pred_ridgecv=ridgecv.predict(test_1_ridge_x)
ridgecv_mae_score=scorey(y_pred_ridgecv,test_1_y,"station_1")
record(train_1_feature3, ridgecv,ridgecv_mae_score)


# XGBoost on the unscaled features. This `xgb` object is the one used for
# the final station-1 prediction further down the notebook.
xgb = XGBRegressor(n_estimators = 100, learning_rate = 0.1, max_depth = 7, min_child_weight = 2, 
                    gamma = 0.4, subsample = 0.9, colsample_bytree = 0.9, reg_alpha = 0.1)
xgb.fit(train_1_x,train_1_y_new["实际功率"])
y_pred_xgb=xgb.predict(test_1_x)
xgb_mae_score= scorey(y_pred_xgb,test_1_y,"station_1")
record(train_1_feature3, xgb,xgb_mae_score)

In [None]:
站点二预测

In [None]:
# Data exploration (disabled)
#DataExploration(train_2)

# Preprocessing
train_2_new=DataProcessing1(train_2,"station_2")
# Predict the measured irradiance ("实发辐照度") of the test set from the
# weather fields plus month and hour.
train_2_shifa_x=train_2_new[['辐照度', '风速','风向', '温度',  '压强','湿度', '时间']]
train_2_shifa_x["month"]=train_2_shifa_x["时间"].apply(lambda x: int(x.split(" ")[0].split("-")[1]))
train_2_shifa_x["hour"]=train_2_shifa_x["时间"].apply(lambda x:int(x.split(" ")[1].split(":")[0]))
train_2_shifa_x=train_2_shifa_x.drop(["时间"],axis=1)
train_2_shifa_y=train_2_new["实发辐照度"]

test_2_shifa_x=test_2[['辐照度', '风速','风向', '温度',  '压强','湿度', '时间']]
test_2_shifa_x["month"]=test_2_shifa_x["时间"].apply(lambda x: int(x.split(" ")[0].split("-")[1]))
test_2_shifa_x["hour"]=test_2_shifa_x["时间"].apply(lambda x:int(x.split(" ")[1].split(":")[0]))
test_2_shifa_x=test_2_shifa_x.drop(["时间"],axis=1)
# Placeholder labels (all zero) for the validation set handed to LGBModel.
test_2["实发辐照度"]=0
test_2_shifa_y=test_2.pop("实发辐照度")

test_2_shifa_y=LGBModel(train_2_shifa_x,train_2_shifa_y,test_2_shifa_x,test_2_shifa_y)
test_2["实发辐照度"]=test_2_shifa_y


# Feature engineering


train_2_y=train_2_new.pop("实际功率")
train_2_feature1=FeatureCreation1(train_2_new)
train_2_feature2=FeatureCreation2(train_2_feature1)
train_2_feature3=FeatureCreation3(train_2_feature2)
test_2_id=test_2.pop("id")
test_2_feature1=FeatureCreation1(test_2)
test_2_feature2=FeatureCreation2(test_2_feature1)
test_2_feature3=FeatureCreation3(test_2_feature2)

# Reset the row index, move "day" from the features to the target frame
# (scorey groups by it) and fill NaN with 0.
train_2_feature3=train_2_feature3.reset_index().drop(["index"],axis=1)
train_2_y=train_2_y.reset_index().drop(["index"],axis=1)
train_2_y["day"]=train_2_feature3.pop("day")
train_2_feature3=train_2_feature3.fillna(0)



# Hyper-parameter search (kept commented out)

#xgb_params = [{"learning_rate": [0.01,0.02, 0.05, 0.1, 0.3, 1.0]}, 
#              {"reg_lambda":[0.05,0.1,1,2,10]},
#              {"n_estimators": [100, 150,200,250,300, 400,500]}, 
#              {"max_depth": range(3,10,2)}, 
#              {"min_child_weight": range(1,6,1)}, 
#              {"gamma": [i/10.0 for i in range(0,5)]}, 
#              {"subsample": [i/10.0 for i in range(6,10)]},
#              {"colsample_bytree": [i/10.0 for i in range(5,10)]}, 
#              {"reg_alpha": [1e-5, 1e-2, 0.1, 100]}]
#xgb_best_params_2 = ModelOptimization(XGBRegressor(), xgb_params,train_2_feature3,train_2_y["实际功率"])



# Evaluation: 80/20 hold-out split with a fixed seed, XGBoost on feature set 3.


train_2_x,test_2_x,train_2_y_new,test_2_y=train_test_split(train_2_feature3,train_2_y,test_size=0.2,random_state=0)
xgb2 = XGBRegressor(n_estimators = 100, learning_rate = 0.1, max_depth = 7, min_child_weight = 1, 
                    gamma = 0.1, subsample = 0.8, colsample_bytree = 0.8, reg_alpha = 1e-05)
xgb2.fit(train_2_x,train_2_y_new["实际功率"])
y_pred2=xgb2.predict(test_2_x)
xgb_mae_score_2= scorey(y_pred2,test_2_y,"station_2")
# Bare expression: shows the score when run as a notebook cell.
xgb_mae_score_2


In [None]:
站点三

In [None]:
# Data exploration (disabled)

#DataExploration(train_3)
#DataExploration2(train_3)


# Preprocessing
train_3_new=DataProcessing1(train_3,"station_3")
# Predict the measured irradiance ("实发辐照度") of the test set from the
# weather fields plus month and hour.
train_3_shifa_x=train_3_new[['辐照度', '风速','风向', '温度',  '压强','湿度', '时间']]
train_3_shifa_x["month"]=train_3_shifa_x["时间"].apply(lambda x: int(x.split(" ")[0].split("-")[1]))
train_3_shifa_x["hour"]=train_3_shifa_x["时间"].apply(lambda x:int(x.split(" ")[1].split(":")[0]))
train_3_shifa_x=train_3_shifa_x.drop(["时间"],axis=1)
train_3_shifa_y=train_3_new["实发辐照度"]

test_3_shifa_x=test_3[['辐照度', '风速','风向', '温度',  '压强','湿度', '时间']]
test_3_shifa_x["month"]=test_3_shifa_x["时间"].apply(lambda x: int(x.split(" ")[0].split("-")[1]))
test_3_shifa_x["hour"]=test_3_shifa_x["时间"].apply(lambda x:int(x.split(" ")[1].split(":")[0]))
test_3_shifa_x=test_3_shifa_x.drop(["时间"],axis=1)
# Placeholder labels (all zero) for the validation set handed to LGBModel.
test_3["实发辐照度"]=0
test_3_shifa_y=test_3.pop("实发辐照度")

test_3_shifa_y=LGBModel(train_3_shifa_x,train_3_shifa_y,test_3_shifa_x,test_3_shifa_y)
test_3["实发辐照度"]=test_3_shifa_y


# Feature engineering


train_3_y=train_3_new.pop("实际功率")
train_3_feature1=FeatureCreation1(train_3_new)
train_3_feature2=FeatureCreation2(train_3_feature1)
train_3_feature3=FeatureCreation3(train_3_feature2)
test_3_id=test_3.pop("id")
test_3_feature1=FeatureCreation1(test_3)
test_3_feature2=FeatureCreation2(test_3_feature1)
test_3_feature3=FeatureCreation3(test_3_feature2)

# Reset the row index, move "day" from the features to the target frame
# (scorey groups by it) and fill NaN with 0.
train_3_feature3=train_3_feature3.reset_index().drop(["index"],axis=1)
train_3_y=train_3_y.reset_index().drop(["index"],axis=1)
train_3_y["day"]=train_3_feature3.pop("day")
train_3_feature3=train_3_feature3.fillna(0)



# Hyper-parameter search (kept commented out)

#xgb_params = [{"learning_rate": [0.01,0.02, 0.05, 0.1, 0.3, 1.0]}, 
#              {"reg_lambda":[0.05,0.1,1,2,10]},
#              {"n_estimators": [100, 150,200,250,300, 400,500]}, 
#              {"max_depth": range(3,10,2)}, 
#              {"min_child_weight": range(1,6,1)}, 
#              {"gamma": [i/10.0 for i in range(0,5)]}, 
#              {"subsample": [i/10.0 for i in range(6,10)]},
#              {"colsample_bytree": [i/10.0 for i in range(5,10)]}, 
#              {"reg_alpha": [1e-5, 1e-2, 0.1, 100]}]
#xgb_best_params_3 = ModelOptimization(XGBRegressor(), xgb_params,train_3_feature3,train_3_y["实际功率"])



# Evaluation: 80/20 hold-out split with a fixed seed, XGBoost on feature set 3.


train_3_x,test_3_x,train_3_y_new,test_3_y=train_test_split(train_3_feature3,train_3_y,test_size=0.2,random_state=0)
xgb3 = XGBRegressor(n_estimators = 100, learning_rate = 0.05, max_depth = 9, min_child_weight = 4, 
                    gamma = 0.4, subsample = 0.9, colsample_bytree = 0.9, reg_alpha = 100)
xgb3.fit(train_3_x,train_3_y_new["实际功率"])
y_pred3=xgb3.predict(test_3_x)
xgb_mae_score_3= scorey(y_pred3,test_3_y,"station_3")
# Bare expression: shows the score when run as a notebook cell.
xgb_mae_score_3

In [None]:
站点四

In [None]:
# Data exploration (disabled)

#DataExploration(train_4)


# Preprocessing
train_4_new=DataProcessing1(train_4,"station_4")
# Predict the measured irradiance ("实发辐照度") of the test set from the
# weather fields plus month and hour.
train_4_shifa_x=train_4_new[['辐照度', '风速','风向', '温度',  '压强','湿度', '时间']]
train_4_shifa_x["month"]=train_4_shifa_x["时间"].apply(lambda x: int(x.split(" ")[0].split("-")[1]))
train_4_shifa_x["hour"]=train_4_shifa_x["时间"].apply(lambda x:int(x.split(" ")[1].split(":")[0]))
train_4_shifa_x=train_4_shifa_x.drop(["时间"],axis=1)
train_4_shifa_y=train_4_new["实发辐照度"]

test_4_shifa_x=test_4[['辐照度', '风速','风向', '温度',  '压强','湿度', '时间']]
test_4_shifa_x["month"]=test_4_shifa_x["时间"].apply(lambda x: int(x.split(" ")[0].split("-")[1]))
test_4_shifa_x["hour"]=test_4_shifa_x["时间"].apply(lambda x:int(x.split(" ")[1].split(":")[0]))
test_4_shifa_x=test_4_shifa_x.drop(["时间"],axis=1)
# Placeholder labels (all zero) for the validation set handed to LGBModel.
test_4["实发辐照度"]=0
test_4_shifa_y=test_4.pop("实发辐照度")

test_4_shifa_y=LGBModel(train_4_shifa_x,train_4_shifa_y,test_4_shifa_x,test_4_shifa_y)
test_4["实发辐照度"]=test_4_shifa_y


# Feature engineering


train_4_y=train_4_new.pop("实际功率")
train_4_feature1=FeatureCreation1(train_4_new)
train_4_feature2=FeatureCreation2(train_4_feature1)
train_4_feature3=FeatureCreation3(train_4_feature2)
test_4_id=test_4.pop("id")
test_4_feature1=FeatureCreation1(test_4)
test_4_feature2=FeatureCreation2(test_4_feature1)
test_4_feature3=FeatureCreation3(test_4_feature2)

# Reset the row index, move "day" from the features to the target frame
# (scorey groups by it) and fill NaN with 0.
train_4_feature3=train_4_feature3.reset_index().drop(["index"],axis=1)
train_4_y=train_4_y.reset_index().drop(["index"],axis=1)
train_4_y["day"]=train_4_feature3.pop("day")
train_4_feature3=train_4_feature3.fillna(0)



# Hyper-parameter search (kept commented out)

#xgb_params = [{"learning_rate": [0.01,0.02, 0.05, 0.1, 0.3, 1.0]}, 
#              {"reg_lambda":[0.05,0.1,1,2,10]},
#              {"n_estimators": [100, 150,200,250,300, 400,500]}, 
#              {"max_depth": range(3,10,2)}, 
#              {"min_child_weight": range(1,6,1)}, 
#              {"gamma": [i/10.0 for i in range(0,5)]}, 
#              {"subsample": [i/10.0 for i in range(6,10)]},
#              {"colsample_bytree": [i/10.0 for i in range(5,10)]}, 
#              {"reg_alpha": [1e-5, 1e-2, 0.1, 100]}]
#xgb_best_params_4 = ModelOptimization(XGBRegressor(), xgb_params,train_4_feature3,train_4_y["实际功率"])



# Evaluation: 80/20 hold-out split with a fixed seed, XGBoost on feature set 3.
train_4_x,test_4_x,train_4_y_new,test_4_y=train_test_split(train_4_feature3,train_4_y,test_size=0.2,random_state=0)
xgb4 = XGBRegressor(n_estimators = 100, learning_rate = 0.05, max_depth = 9, min_child_weight = 4, 
                    gamma = 0.4, subsample = 0.9, colsample_bytree = 0.9, reg_alpha = 100)
xgb4.fit(train_4_x,train_4_y_new["实际功率"])
y_pred4=xgb4.predict(test_4_x)
xgb_mae_score_4= scorey(y_pred4,test_4_y,"station_4")
# Bare expression: shows the score when run as a notebook cell.
xgb_mae_score_4

In [None]:
# Bare expression: shows the column index of the station-1 test features
# when run as a notebook cell.
test_1_feature3.columns

In [None]:
拟合预测并生成结果文件

In [None]:
# "day" is not a model input; remove it from every test feature frame.
test_1_feature3=test_1_feature3.drop(["day"],axis=1)
test_2_feature3=test_2_feature3.drop(["day"],axis=1)
test_3_feature3=test_3_feature3.drop(["day"],axis=1)
test_4_feature3=test_4_feature3.drop(["day"],axis=1)
# Feature order expected by the models ("spedd_before_f1" is the spelling
# produced by FeatureCreation3 and must stay as is).
cols=['辐照度', '风速', '风向', '温度', '压强', '湿度', '实发辐照度', 'hour', 'month', '白天', '辐照度_mean', '辐照度_std', '辐照度_max', '辐照度_min', '辐照度_白天差', '温度_mean', '温度_std', '温度_max', '温度_min', '温度_白天差', '湿度_mean', '湿度_std', '湿度_max', '湿度_min', '湿度_白天差', '压强_mean', '压强_std', '压强_max', '压强_min', '压强_白天差', 'temp_split', 'pres_split', 'humi_split', 'H_split', 'sin_wind_d', 'cos_wind_d', 'temp_pressure', 'temp_humi', 'before_f', 'before_f1', 'before_f2', 'temp_before_f', 'temp_before_f1', 'temp_before_f2', 'humi_before_f', 'humi_before_f1', 'humi_before_f2', 'irr_before_f', 'irr_before_f1', 'irr_before_f2', 'speed_before_f', 'spedd_before_f1', 'speed_before_f2']

# Align EVERY test frame with that order. Station 1 was previously skipped
# although its test frame is built like the others (month is created before
# hour there), so its columns did not line up with the training order.
test_1_feature3=test_1_feature3.loc[:,cols]
test_2_feature3=test_2_feature3.loc[:,cols]
test_3_feature3=test_3_feature3.loc[:,cols]
test_4_feature3=test_4_feature3.loc[:,cols]

# Predict with the model of each station.
y_pred1=xgb.predict(test_1_feature3)
y_pred2 =xgb2.predict(test_2_feature3)
y_pred3 =xgb3.predict(test_3_feature3)
y_pred4 =xgb4.predict(test_4_feature3)

# Build the submission; the column header "predicition" is written as is
# (presumably the spelling the submission format expects — confirm before
# renaming).
y1_sub=pd.DataFrame({"id":list(test_1_id),"predicition":list(y_pred1)})
y2_sub=pd.DataFrame({"id":list(test_2_id),"predicition":list(y_pred2)})
y3_sub=pd.DataFrame({"id":list(test_3_id),"predicition":list(y_pred3)})
y4_sub=pd.DataFrame({"id":list(test_4_id),"predicition":list(y_pred4)})
submission=pd.concat([y1_sub,y2_sub,y3_sub,y4_sub],axis=0).set_index(["id"])
submission.to_csv("../Submission/Submission.csv")