# Model

## Log control

In [123]:
import logging
def startLog(modelName):
    """Emit an INFO record announcing that a model run is starting.

    Args:
        modelName: Display name of the model; interpolated into the message.
    """
    announcement = f"Start running {modelName} model"
    logging.getLogger(__name__).info(announcement)
    
def finishLog(modelName, val_mse, fileName):
    """Emit the three closing INFO records for a finished model run.

    The records report, in order: the validation MSE, the generated file
    name, and the completion notice for ``modelName``.

    Args:
        modelName: Display name of the model that finished.
        val_mse: Mean squared error measured on the validation split.
        fileName: File name derived for this run's results.
    """
    run_logger = logging.getLogger(__name__)
    closing_lines = (
        f"MSE of Validation Set: {val_mse}",
        f"Add file name:{fileName}",
        f"Finish running {modelName} model",
    )
    for line in closing_lines:
        run_logger.info(line)

## Model control

In [124]:
from sklearn.metrics import mean_squared_error
def controlModel(X_train, X_val, Y_train, Y_val, random_key, model, modelName):
    """Fit ``model`` on the training split and evaluate it on the validation split.

    Args:
        X_train: Training features.
        X_val: Validation features.
        Y_train: Training targets.
        Y_val: Validation targets.
        random_key: Accepted for signature parity with the callers; not used
            inside this function.
        model: Estimator exposing ``fit`` and ``predict``.
        modelName: Name used as the prefix of the generated file name.

    Returns:
        Tuple ``(Y_pred, val_mse, fileName, model)`` where ``fileName`` is
        ``"<modelName>_<int(val_mse)>.csv"``.
    """
    model.fit(X_train, Y_train)
    predictions = model.predict(X_val)
    mse = mean_squared_error(Y_val, predictions)

    # The MSE is truncated (not rounded) to an integer for the file name.
    truncated_mse = int(mse)
    csv_name = f"{modelName}_{truncated_mse}.csv"

    return predictions, mse, csv_name, model

# Models

In [125]:
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

def runXGBoostRegressor(X_train, X_val, Y_train, Y_val, random_key, modelName, parameters=None):
    """Train an XGBoost regressor, print its scores and return the results.

    Args:
        X_train: Training features.
        X_val: Validation features.
        Y_train: Training targets.
        Y_val: Validation targets.
        random_key: Seed passed to the estimator as ``random_state``.
        modelName: Name used in the log messages and the result file name.
        parameters: Optional mapping that may override ``"n_estimators"``,
            ``"max_depth"`` and ``"learning_rate"``. Keys that are missing
            (or a missing/empty mapping) fall back to 400, 6 and 0.1.

    Returns:
        Tuple ``(model, Y_pred, fileName, val_mse)``.
    """
    # ``None`` replaces the former shared mutable ``{}`` default; individual
    # keys fall back to the defaults instead of raising KeyError.
    overrides = parameters or {}
    n_estimators = overrides.get("n_estimators", 400)
    max_depth = overrides.get("max_depth", 6)
    learning_rate = overrides.get("learning_rate", 0.1)

    startLog(modelName)
    logger = logging.getLogger(__name__)
    logger.info(f"parameters: n_estimators:{n_estimators}, max_depth:{max_depth}, learning_rate:{learning_rate}")

    model = XGBRegressor(
        objective='reg:squarederror',  # objective: squared error
        n_estimators=n_estimators,     # number of trees
        max_depth=max_depth,           # maximum depth of each tree
        learning_rate=learning_rate,   # learning rate
        subsample=0.8,                 # row subsampling ratio
        colsample_bytree=0.8,          # column sampling ratio per tree
        random_state=random_key,       # seed controlling randomness
        device="cuda:0",               # NOTE(review): requests the first CUDA GPU — confirm one is available
    )

    Y_pred, val_mse, fileName, model = controlModel(X_train, X_val, Y_train, Y_val, random_key, model, modelName)

    # The printed labels mean "training set" / "test set"; the second score is
    # computed on the validation split.
    print('訓練集: ', model.score(X_train, Y_train))
    print('測試集: ', model.score(X_val, Y_val))
    finishLog(modelName, val_mse, fileName)

    return model, Y_pred, fileName, val_mse

from sklearn.neighbors import KNeighborsRegressor
def runKNN(X_train, X_val, Y_train, Y_val, random_key, modelName):
    """Train a k-nearest-neighbours regressor, print its scores and return the results.

    Args:
        X_train: Training features.
        X_val: Validation features.
        Y_train: Training targets.
        Y_val: Validation targets.
        random_key: Forwarded to ``controlModel``; the estimator itself takes no seed.
        modelName: Name used in the log messages and the result file name.

    Returns:
        Tuple ``(model, Y_pred, fileName, val_mse)``.
    """
    startLog(modelName)

    knn_settings = {
        "n_neighbors": 95,       # number of neighbours (1 and up)
        "weights": 'uniform',    # neighbour weighting ("uniform" or "distance")
        "algorithm": 'auto',     # search algorithm ("auto", "ball_tree", "kd_tree", "brute")
        "leaf_size": 30,         # leaf size of the search tree (1 and up)
        "p": 2,                  # distance metric (1 = Manhattan, 2 = Euclidean)
    }
    estimator = KNeighborsRegressor(**knn_settings)

    Y_pred, val_mse, fileName, model = controlModel(
        X_train, X_val, Y_train, Y_val, random_key, estimator, modelName
    )

    # Labels mean "training set" / "test set"; the second is the validation split.
    for label, features, targets in (('訓練集: ', X_train, Y_train), ('測試集: ', X_val, Y_val)):
        print(label, model.score(features, targets))
    finishLog(modelName, val_mse, fileName)

    return model, Y_pred, fileName, val_mse

# In[] Decision Tree Regression model
from sklearn.tree import DecisionTreeRegressor

def runDecisionTreeRegressor(X_train, X_val, Y_train, Y_val, random_key, modelName):
    """Train a decision-tree regressor, print its scores and return the results.

    Args:
        X_train: Training features.
        X_val: Validation features.
        Y_train: Training targets.
        Y_val: Validation targets.
        random_key: Seed passed to the estimator as ``random_state``.
        modelName: Name used in the log messages and the result file name.

    Returns:
        Tuple ``(model, Y_pred, fileName, val_mse)``.
    """
    startLog(modelName)

    tree_settings = {
        "criterion": 'squared_error',  # split-quality measure ("squared_error" or "absolute_error")
        "splitter": 'best',            # split-point strategy ("best" or "random")
        "max_depth": 14,               # maximum tree depth (1 and up)
        "min_samples_split": 300,      # minimum samples needed to split an internal node (2 and up)
        "min_samples_leaf": 20,        # minimum samples required in a leaf (1 and up)
        "max_features": None,          # features considered per split (None or 1..n_features)
        "random_state": random_key,    # seed controlling randomness (integer)
    }
    estimator = DecisionTreeRegressor(**tree_settings)

    Y_pred, val_mse, fileName, model = controlModel(
        X_train, X_val, Y_train, Y_val, random_key, estimator, modelName
    )

    # Labels mean "training set" / "test set"; the second is the validation split.
    for label, features, targets in (('訓練集: ', X_train, Y_train), ('測試集: ', X_val, Y_val)):
        print(label, model.score(features, targets))
    finishLog(modelName, val_mse, fileName)

    return model, Y_pred, fileName, val_mse



# Preprocessing

In [126]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
def decomposition(dataset, x_columns, y_columns=None):
    """Select the feature columns and, optionally, the target columns.

    Args:
        dataset: Object indexable by ``x_columns`` / ``y_columns``.
        x_columns: Key(s) selecting the features.
        y_columns: Key(s) selecting the targets; when falsy no targets are
            selected.

    Returns:
        ``(X, Y)`` when targets were selected and are not ``None``,
        otherwise just ``X``.
    """
    features = dataset[x_columns]
    if not y_columns:
        return features

    targets = dataset[y_columns]
    if targets is None:
        return features
    return features, targets

def split_test_data(dataset, isMean=True):
    """Decode the serial-number column into time/location columns.

    The serial number in column ``'序號'`` is read as a string and sliced into
    year (chars 0-4), month (4-6), day (6-8), hour (8-10), minute (10-12) and
    location code (12-14). The new integer columns are added to ``dataset``
    in place.

    Args:
        dataset: DataFrame containing a ``'序號'`` column; it is modified in place.
        isMean: When true, the frame is replicated ten times with the minute
            column shifted by 0..9 and the replicas are stacked.

    Returns:
        Tuple ``(dataset, sequence)`` where ``sequence`` is the original
        ``'序號'`` column and ``dataset`` is either the mutated input
        (``isMean`` false) or the stacked replicas with a fresh index.
    """
    serial_text = dataset['序號'].astype(str)
    field_slices = (
        ('y', 0, 4),
        ('mo', 4, 6),
        ('d', 6, 8),
        ('h', 8, 10),
        ('min', 10, 12),
        ('LocationCode', 12, 14),
    )
    for column, start, stop in field_slices:
        dataset[column] = serial_text.str[start:stop].astype(int).squeeze()

    sequence = dataset["序號"]

    if not isMean:
        return dataset, sequence

    # One replica per minute offset 0..9, stacked in offset order.
    replicas = [dataset.assign(min=dataset['min'] + offset) for offset in range(10)]
    return pd.concat(replicas, ignore_index=True), sequence

def split_date_time(dataset) -> pd.DataFrame:
    """Expand the ``'DateTime'`` column into year/month/day/hour/minute columns.

    The input frame is modified in place: ``'DateTime'`` is converted with
    ``pd.to_datetime`` and the columns ``'y'``, ``'mo'``, ``'d'``, ``'h'`` and
    ``'min'`` are added to it.

    Args:
        dataset: DataFrame containing a ``'DateTime'`` column.

    Returns:
        A new DataFrame equal to the mutated input without ``'DateTime'``.
    """
    dataset['DateTime'] = pd.to_datetime(dataset['DateTime'])

    stamp = dataset['DateTime'].dt
    components = (
        ('y', 'year'),
        ('mo', 'month'),
        ('d', 'day'),
        ('h', 'hour'),
        ('min', 'minute'),
    )
    for column, attribute in components:
        dataset[column] = getattr(stamp, attribute).squeeze()

    return dataset.drop(columns=['DateTime'])


def concat_dataset(directory:str) -> pd.DataFrame:
    """Read every CSV file in ``directory`` and stack them into one DataFrame.

    Args:
        directory: Path of the folder holding the CSV files.

    Returns:
        The row-wise concatenation of all files with a fresh integer index,
        or an empty DataFrame when the folder contains no files.
    """
    frames = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any seeded split made later) reproducible.
    for file_name in sorted(os.listdir(directory)):
        path = os.path.join(directory, file_name)
        # Skip sub-directories (e.g. notebook checkpoint folders), which
        # read_csv cannot open.
        if not os.path.isfile(path):
            continue
        frames.append(pd.read_csv(path))

    if not frames:
        return pd.DataFrame()
    # One concat over the collected list avoids re-copying the accumulated
    # rows on every iteration.
    return pd.concat(frames, ignore_index=True)

def preprocessing(train_dirs, X_cols, Y_cols, test_set_size=0.2, random_key=0):
    """Load the training data and split it into training and validation parts.

    The CSV files under ``train_dirs`` are concatenated, their ``DateTime``
    column is expanded into separate components, the requested feature and
    target columns are selected, and the rows are split with
    ``train_test_split``.

    Args:
        train_dirs: Directory passed to ``concat_dataset``.
        X_cols: Feature column names.
        Y_cols: Target column names.
        test_set_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_key: Forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        Tuple ``(X_train, X_val, Y_train, Y_val)``.
    """
    frame = split_date_time(concat_dataset(train_dirs))
    features, targets = decomposition(frame, x_columns=X_cols, y_columns=Y_cols)

    X_train, X_val, Y_train, Y_val = train_test_split(
        features,
        targets,
        test_size=test_set_size,
        random_state=random_key,
    )
    return X_train, X_val, Y_train, Y_val


# Run

## Parameters

In [127]:
# Logging set-up for the notebook session.
# NOTE(review): ``filemode`` only matters when ``filename`` is also given.
logging.basicConfig(level=logging.INFO, filemode='w')
logger = logging.getLogger(__name__)

# Input and output locations.
train_dirs = "dataset"
test_dir = "submit/test_data.csv"
result_dir = "submit/indirect4_"

# Column groups: time/location basics, intermediate targets, final target.
basic_columns = ['y', 'mo', 'd', 'h', 'min', 'LocationCode']
feature_columns_2 = ['Temperature(°C)', 'Humidity(%)']
feature_columns_3 = ['Sunlight(Lux)']
target_columns = ['Power(mW)']

# NOTE(review): with a fraction this small the validation split is presumably
# a single row, which would explain the ``nan`` validation score — confirm.
val_set_size = 0.0000001

model_names = ["XGBoost"] * 4

isMean = True

logger.info("Starting split testing set")
result_dir += "Average" if isMean else "noAverage"

# Load the test rows, discard the answer column, then decode the serial numbers.
test_dataset = pd.read_csv(test_dir).drop(columns=["答案"])
test_dataset, sequence = split_test_data(test_dataset, isMean)

logger.info("Starting indirect prediction with 4 layers")


INFO:__main__:Starting split testing set
INFO:__main__:Starting indirect prediction with 4 layers


## Layer 1

In [128]:
# random_key = 48763 
# logger.info("Starting preprocessing in layer 1")
# feature_colum = basic_columns 
# X_train, X_val, Y_train, Y_val = preprocessing(train_dirs, feature_colum, feature_columns_1, val_set_size, random_key)
# logger.info("Starting model training in layer 1")
# parameters = { "n_estimators" : 600 , "max_depth" : 8 , "learning_rate" : 0.1}
# random_key = 48763 
# model, Y_pred, filename, val_mse = runXGBoostRegressor(X_train, X_val, Y_train, Y_val, random_key, model_names[0],parameters)
# result_dir = result_dir + filename
# logger.info("Starting prediction on test set in layer 1")
# X_pred = test_dataset[feature_colum]
# Y_test = pd.DataFrame(model.predict(X_pred), columns=feature_columns_1)
# test_dataset = pd.concat([test_dataset, Y_test], axis=1)

n_estimators:1800, max_depth:8, learning_rate:0.1 33 0.62 0.43
0.62 0.83

0.3
訓練集:  0.6447689533233643
測試集:  0.8108588457107544
0.4
訓練集:  0.6466488838195801
測試集:  0.8208843469619751
0.05
訓練集:  0.5954268574714661
測試集:  0.8237078189849854

4334
n_estimators:1800, max_depth:8, learning_rate:0.1 0
0
n_estimators:1800, max_depth:8,  learning_rate:0.1  0.2187 0.62 0.96
n_estimators:1800, max_depth:10, learning_rate:0.08 0.2261 0.66 0.97
n_estimators:1800, max_depth:10, learning_rate:0.09 0.3222 0.66 0.97
n_estimators:1800, max_depth:10, learning_rate:0.1  0.1059 0.66 0.97
n_estimators:1800, max_depth:10, learning_rate:0.2  0.1248 0.67 0.97
n_estimators:1600, max_depth:12, learning_rate:0.1  0.5747 0.69 0.95
n_estimators:1800, max_depth:12, learning_rate:0.1  0.6017 0.69 0.95
n_estimators:1800, max_depth:12, learning_rate:0.2  0.8980 0.70 0.94
n_estimators:1800, max_depth:12, learning_rate:0.08 0.4094 0.69 0.96
n_estimators:1800, max_depth:14, learning_rate:0.08 0.8275 0.70 0.93
48763 0
n_estimators:1800, max_depth:8,  learning_rate:0.1  0.0781 0.62 0.89  
48763 48763

13 5 2.02 1.97 1.88

## Layer 2

In [129]:
# Layer 2: predict temperature and humidity from the time/location columns.
random_key = 48763  # seed for the train/validation split
logger.info("Starting preprocessing in layer 2")
feature_colum = basic_columns
X_train, X_val, Y_train, Y_val = preprocessing(
    train_dirs,
    X_cols=feature_colum,
    Y_cols=feature_columns_2,
    test_set_size=val_set_size,
    random_key=random_key,
)
logger.info("Starting model training in layer 2")
parameters = {"n_estimators": 800, "max_depth": 12, "learning_rate": 0.3}
random_key = 48763  # seed for the model
model, Y_pred, filename, val_mse = runXGBoostRegressor(
    X_train, X_val, Y_train, Y_val, random_key,
    modelName=model_names[1],
    parameters=parameters,
)
# Extend the output path with this layer's file name.
result_dir = f"{result_dir}_{filename}"
logger.info("Starting prediction on test set in layer 2")
X_pred = test_dataset[feature_colum]
# Append the predicted columns so the next layer can use them as features.
Y_test = pd.DataFrame(model.predict(X_pred), columns=feature_columns_2)
test_dataset = pd.concat([test_dataset, Y_test], axis=1)

INFO:__main__:Starting preprocessing in layer 2
INFO:__main__:Starting model training in layer 2
INFO:__main__:Start running XGBoost model
INFO:__main__:parameters: n_estimators:800, max_depth:12, learning_rate:0.3


訓練集:  0.991381824016571


INFO:__main__:MSE of Validation Set: 4.08333615829359
INFO:__main__:Add file name:XGBoost_4.csv
INFO:__main__:Finish running XGBoost model
INFO:__main__:Starting prediction on test set in layer 2


測試集:  nan


48763 0
n_estimators:800, max_depth:8,  learning_rate:0.1   78.9048 0.9475
n_estimators:800, max_depth:10, learning_rate:0.1   39.3299 0.9701
n_estimators:800, max_depth:10, learning_rate:0.15  25.6399 0.9751
n_estimators:800, max_depth:10, learning_rate:0.2   18.8055 0.9790
n_estimators:800, max_depth:10, learning_rate:0.25  10.8917 0.9811
n_estimators:800, max_depth:10, learning_rate:0.275 12.8694 0.9823
n_estimators:800, max_depth:10, learning_rate:0.275 15      0.982
48763 48763
n_estimators:800, max_depth:12, learning_rate:0.25  4.8156  0.9909
n_estimators:800, max_depth:12, learning_rate:0.3   4.0833  0.9913

n_estimators:600, max_depth:13, learning_rate:0.3 1.48
n_estimators:600, max_depth:13, learning_rate:0.2 1.34
n_estimators:600, max_depth:15, learning_rate:0.2 1.31


## Layer 3

In [130]:
# Layer 3: predict sunlight from the basics plus temperature and humidity.
logger.info("Starting preprocessing in layer 3")
random_key = 48763  # seed for the train/validation split
feature_colum = [*basic_columns, *feature_columns_2]
X_train, X_val, Y_train, Y_val = preprocessing(
    train_dirs,
    X_cols=feature_colum,
    Y_cols=feature_columns_3,
    test_set_size=val_set_size,
    random_key=random_key,
)
logger.info("Starting model training in layer 3")
parameters = {"n_estimators": 800, "max_depth": 12, "learning_rate": 0.3}
random_key = 48763  # seed for the model
model, Y_pred, filename, val_mse = runXGBoostRegressor(
    X_train, X_val, Y_train, Y_val, random_key,
    modelName=model_names[2],
    parameters=parameters,
)
# Drop the last four characters (the previous layer's ".csv") before
# appending this layer's file name.
result_dir = f"{result_dir[:-4]}_{filename}"
logger.info("Starting prediction on test set in layer 3")
X_pred = test_dataset[feature_colum]
# Append the predicted column so the next layer can use it as a feature.
Y_test = pd.DataFrame(model.predict(X_pred), columns=feature_columns_3)
test_dataset = pd.concat([test_dataset, Y_test], axis=1)

INFO:__main__:Starting preprocessing in layer 3
INFO:__main__:Starting model training in layer 3
INFO:__main__:Start running XGBoost model
INFO:__main__:parameters: n_estimators:800, max_depth:12, learning_rate:0.3
INFO:__main__:MSE of Validation Set: 136360.04440796515
INFO:__main__:Add file name:XGBoost_136360.csv
INFO:__main__:Finish running XGBoost model


訓練集:  0.9986770749092102
測試集:  nan


INFO:__main__:Starting prediction on test set in layer 3


n_estimators:800, max_depth:8,  learning_rate:0.1 848435 0.9464
n_estimators:800, max_depth:8,  learning_rate:0.2 615394 0.9589
n_estimators:800, max_depth:8,  learning_rate:0.3 595565 0.9644
n_estimators:800, max_depth:10, learning_rate:0.3 540262 0.9892
n_estimators:800, max_depth:12, learning_rate:0.3 136360 0.9986

n_estimators:1600, max_depth:11, learning_rate:0.15 32160030
n_estimators:1800, max_depth:11, learning_rate:0.15 32062256
n_estimators:1800, max_depth:11, learning_rate:0.1  32007405
n_estimators:1800, max_depth:11, learning_rate:0.2  34593299
n_estimators:1800, max_depth:12, learning_rate:0.1  30477769
n_estimators:1800, max_depth:13, learning_rate:0.05 30100012
n_estimators:1800, max_depth:14, learning_rate:0.02 29767614
n_estimators:1800, max_depth:14, learning_rate:0.05 29341979
n_estimators:1800, max_depth:14, learning_rate:0.08 29912468
n_estimators:1800, max_depth:14, learning_rate:0.1  30660590
n_estimators:1800, max_depth:14, learning_rate:0.2  32822677
n_estimators:1400, max_depth:12, learning_rate:0.15 31574809

## Layer 4

In [131]:
# Layer 4: predict power from the basics plus all intermediate predictions.
logger.info("Starting preprocessing in layer 4")
random_key = 48763  # seed for the train/validation split
feature_colum = [*basic_columns, *feature_columns_2, *feature_columns_3]
X_train, X_val, Y_train, Y_val = preprocessing(
    train_dirs,
    X_cols=feature_colum,
    Y_cols=target_columns,
    test_set_size=val_set_size,
    random_key=random_key,
)
logger.info("Starting model training in layer 4")
parameters = {"n_estimators": 800, "max_depth": 10, "learning_rate": 0.25}
random_key = 48763  # seed for the model
model, Y_pred, filename, val_mse = runXGBoostRegressor(
    X_train, X_val, Y_train, Y_val, random_key,
    modelName=model_names[3],
    parameters=parameters,
)
# Drop the last four characters (the previous layer's ".csv") before
# appending this layer's file name.
result_dir = f"{result_dir[:-4]}_{filename}"
logger.info("Starting prediction on test set in layer 4")
X_pred = test_dataset[feature_colum]
# The final prediction becomes the answer column, rounded to two decimals.
Y_test = pd.DataFrame(model.predict(X_pred), columns=["答案"]).round(2)
test_dataset = pd.concat([test_dataset, Y_test], axis=1)

INFO:__main__:Starting preprocessing in layer 4
INFO:__main__:Starting model training in layer 4
INFO:__main__:Start running XGBoost model
INFO:__main__:parameters: n_estimators:800, max_depth:10, learning_rate:0.25
INFO:__main__:MSE of Validation Set: 1.2827553299903869
INFO:__main__:Add file name:XGBoost_1.csv
INFO:__main__:Finish running XGBoost model
INFO:__main__:Starting prediction on test set in layer 4


訓練集:  0.9993041753768921
測試集:  nan


n_estimators:800, max_depth:8,  learning_rate:0.1  2.6847 0.9928
n_estimators:800, max_depth:8,  learning_rate:0.2  1.8866 0.9953
n_estimators:800, max_depth:8,  learning_rate:0.3  322.81 0.9963
n_estimators:800, max_depth:8,  learning_rate:0.25 0.2991 0.9959
n_estimators:800, max_depth:10, learning_rate:0.25 1.2827 0.9993

n_estimators:1000, max_depth:6, learning_rate:0.1  3548
n_estimators:1800, max_depth:6, learning_rate:0.1  2980
n_estimators:1800, max_depth:6, learning_rate:0.1  2915
n_estimators:1800, max_depth:6, learning_rate:0.12 2820
n_estimators:1800, max_depth:6, learning_rate:0.15 2769
n_estimators:1800, max_depth:6, learning_rate:0.2  2545
n_estimators:1800, max_depth:6, learning_rate:0.3  2553
n_estimators:1800, max_depth:6, learning_rate:0.4  2425
n_estimators:1800, max_depth:6, learning_rate:0.5  2541
n_estimators:1800, max_depth:8, learning_rate:0.3  2010
n_estimators:1800, max_depth:10, learning_rate:0.3 1767
n_estimators:1800, max_depth:12, learning_rate:0.3 2073
n_estimators:1800, max_depth:12, learning_rate:0.1 1611
n_estimators:1800, max_depth:14, learning_rate:0.02 1588

## Save

In [132]:
# Build the two-column submission (serial number, answer).
# Negative predictions are clipped to 0 on BOTH paths; previously only the
# averaged path clipped, so the non-averaged file could contain negative values.
if isMean:
    # Average the minute-shifted replicas back to one row per serial number.
    group_keys = ['序號', 'y', 'mo', 'd', 'h', 'LocationCode']
    combine_test_dataset = test_dataset.groupby(group_keys, as_index=False).mean()
    combine_test_dataset['序號'] = combine_test_dataset['序號'].astype('longlong')
    # An ordered categorical keyed on `sequence` lets sort_values restore the
    # order in which the serial numbers appeared in the test file.
    combine_test_dataset['序號'] = pd.Categorical(combine_test_dataset['序號'], categories=sequence, ordered=True)
    combine_test_dataset = combine_test_dataset.sort_values(by='序號', ascending=True).reset_index(drop=True)
    # Vectorised clip replaces the per-element ``max(x, 0)``.
    combine_test_dataset['答案'] = combine_test_dataset['答案'].clip(lower=0).round(2)
    result = pd.concat([combine_test_dataset["序號"], combine_test_dataset["答案"]], axis=1)
else:
    result = pd.concat([test_dataset["序號"], test_dataset["答案"].clip(lower=0)], axis=1)


In [133]:
# Write the submission to the path accumulated across the layers (no index column).
logger.info(f"Saving new results to {result_dir}")
result.to_csv(path_or_buf=result_dir, index=False)

INFO:__main__:Saving new results to submit/indirect4_Average_XGBoost_4_XGBoost_136360_XGBoost_1.csv


In [134]:
# Earlier submission files used as references for a sanity check.
comparedir1 = "submit/713968.02.csv"
comparedir2 = "submit/732292.96.csv"
comparedir3 = "submit/735867.9.csv"
comparedir4 = "submit/802591.63.csv"
comparedir5 = "submit/833177.17.csv"
comparedir6 = "submit/895705.41.csv"
comparedir7 = "submit/919745.63.csv"
comparedir8 = "submit/963002.63.csv"  # NOTE(review): defined but never loaded below
testdir = result_dir

# Load the seven references in order, then the file that was just written.
(
    compare_set1,
    compare_set2,
    compare_set3,
    compare_set4,
    compare_set5,
    compare_set6,
    compare_set7,
) = (
    pd.read_csv(_path)
    for _path in (
        comparedir1,
        comparedir2,
        comparedir3,
        comparedir4,
        comparedir5,
        comparedir6,
        comparedir7,
    )
)
test = pd.read_csv(testdir)

_reference_sets = (
    compare_set1,
    compare_set2,
    compare_set3,
    compare_set4,
    compare_set5,
    compare_set6,
    compare_set7,
)

# Total of the answer column for every reference, then for the new result.
print("數量")
for _frame in (*_reference_sets, test):
    print(_frame["答案"].sum())

# Sum of absolute differences between the new result and each reference.
print("比較")
for _frame in _reference_sets:
    print((test["答案"] - _frame["答案"]).abs().sum())


數量
3745589.49
3832750.0
3695934.62
3891905.7600000002
3597195.47
2911148.850438615
2930931.18
3645909.96
比較
1257988.27
1276564.7200000002
1252931.12
1454959.06
1373820.15
1429291.489713179
1528358.18


indirect4_noAverage_XGBoost_0_XGBoost_1_XGBoost_4807805_XGBoost_1853.csv
random_key = 48763
parameters = { "n_estimators" : 700 , "max_depth" : 6 , "learning_rate" : 0.1}
random_key = 48763

random_key = 48763
parameters = { "n_estimators" : 600 , "max_depth" : 15 , "learning_rate" : 0.2}
random_key = 0

random_key = 48763
parameters = { "n_estimators" : 1800 , "max_depth" : 14 , "learning_rate" : 0.05}
random_key = 48763

random_key = 48763
parameters = { "n_estimators" : 1800 , "max_depth" : 14 , "learning_rate" : 0.05}
random_key = 48763