In [None]:
import itertools
import math
import time
import warnings
from datetime import datetime, timedelta

import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

warnings.filterwarnings('ignore')


In [None]:
#Detection Method - AL Framework (Basic)
# Function implementing the AL framework - timedelta=hours
def alternating_learners_original_hours(data, feature, target, l_start, l_end, v_limit, batch_size=1, w=10, tau=1, delta=0.2, n0=5):
    """Detect concept drift (CD) by alternating a long-window and a short-window learner.

    On every step three slices of ``data`` are taken by index: the long window
    (LW) ``(l_start, l_end]``, the short window (SW) covering the last
    ``batch_size * w`` hours up to ``l_end``, and the new batch (N) covering the
    next ``batch_size`` hours. A linear regression is fitted on LW and on SW and
    both are scored on N with ``mean_absolute_percentage_error``. A vote is then
    pushed to a queue that keeps the last ``w`` votes: 1 when LW is out of
    tolerance and SW is better, otherwise 0. Once the queue holds at least
    ``n0`` votes and the share of 1-votes is at least ``delta``, ``l_end`` is
    recorded as a CD point, the queue is cleared and LW is shrunk to the SW span.

    Steps whose windows are empty, or for which fitting/scoring raises
    ``ValueError`` (e.g. missing timestamps or NaN inputs), are skipped: only
    the window end is advanced.

    Args:
        data: DataFrame whose index can be compared with ``l_start``/``l_end``.
        feature: Column label(s) used as regressors.
        target: Column label of the response.
        l_start: Exclusive start of the initial long window.
        l_end: Inclusive end of the initial long window.
        v_limit: Iteration stops once ``l_end + batch_size`` hours reaches it.
        batch_size: Step length in hours.
        w: Number of batches in the short window; also the vote-queue capacity.
        tau: Error tolerance; LW is kept whenever its error is below it.
        delta: Minimum share of replacement votes that triggers a replacement.
        n0: Minimum number of votes required before a replacement can trigger.

    Returns:
        tuple: ``(cd_date, err_df)``. ``cd_date`` lists the ``l_end`` values at
        which LW was replaced. ``err_df`` is indexed by ``TIME`` and has the
        columns ``error, err_l, err_s, cd, q_length, q_ratio, replacement``;
        ``error`` is ``err_s`` on replacement rows and ``err_l`` otherwise.
    """
    step = timedelta(hours=batch_size)             # one batch
    short_span = timedelta(hours=batch_size*w)     # length of the short window
    idx = data.index

    cd_date = []   # window ends at which a concept drift was declared
    q = []         # most recent replacement votes (at most w of them)
    err_list = []  # per-step error trace (diagnostics only, not used by the framework)

    while l_end + step < v_limit:
        df_l = data[(idx > l_start) & (idx <= l_end)]              # long window (LW)
        df_s = data[(idx > l_end - short_span) & (idx <= l_end)]   # short window (SW)
        df_n = data[(idx > l_end) & (idx <= l_end + step)]         # new batch (N)

        # Nothing to fit or score on: just slide the windows forward.
        if df_l.empty or df_s.empty or df_n.empty:
            l_end = l_end + step
            continue

        try:
            # Linear regression models for LW and SW
            model_l = LinearRegression().fit(df_l[feature], df_l[target])
            model_s = LinearRegression().fit(df_s[feature], df_s[target])

            # Score both models on the new batch
            x_n, y_n = df_n[feature], df_n[target]
            err_l = mean_absolute_percentage_error(y_n, model_l.predict(x_n))
            err_s = mean_absolute_percentage_error(y_n, model_s.predict(x_n))
        except ValueError:
            # Data problems (e.g. NaN inputs) make this step unscorable; skip it.
            # Other exceptions (missing columns, undefined names, Ctrl-C) propagate
            # instead of silently producing an empty result.
            l_end = l_end + step
            continue

        # AL framework vote
        if err_l < tau:
            cd_type = 0  # LW error is within tolerance
        elif err_l <= err_s:
            cd_type = 0  # SW is no better than LW, so it cannot replace it
        else:
            print(l_end, round(err_l,4))
            cd_type = 1  # SW fits the new batch better: vote for replacement

        q.append(cd_type)
        if len(q) > w:
            q = q[1:]
        q_ratio = sum(q)/len(q)

        if len(q) >= n0 and q_ratio >= delta:
            # Enough votes seen and enough of them ask for a replacement
            err_list.append([l_end, err_s, err_l, err_s, cd_type, len(q), q_ratio, 'yes'])
            print(l_start, l_end, q)
            q = []
            cd_date.append(l_end)
            l_start = l_end - short_span  # LW is replaced by SW
        else:
            # Only extend LW
            err_list.append([l_end, err_l, err_l, err_s, cd_type, len(q), q_ratio, 'no'])

        l_end = l_end + step  # move LW end, SW and N forward by one batch

    err_df = pd.DataFrame(err_list, columns=['TIME','error','err_l','err_s','cd','q_length','q_ratio','replacement'])
    err_df.set_index('TIME',inplace=True)
    return cd_date, err_df


In [None]:
# MAPE helper
def mean_absolute_percentage_error(y_test, y_pred):
    """Return the mean absolute percentage error of *y_pred* against *y_test*, in percent.

    Both arguments are converted with ``np.array`` first, so sequences and
    scalars are accepted. Zero entries in *y_test* are not special-cased.
    """
    actual = np.array(y_test)
    predicted = np.array(y_pred)
    relative_error = (actual - predicted) / actual
    return np.mean(np.abs(relative_error)) * 100


In [None]:
# Load the dataset with column 'A' as the index, then convert that index to datetimes.
data = pd.read_csv('df_posco_ver3.csv', index_col='A')
data.index = pd.to_datetime(data.index)
target = 'target'  # label of the response column used throughout the notebook

In [None]:
# Baseline feature set (30 column labels) used by the existing model.
posco_features_ver3 = ['C', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'P', 'Z', 'AB', 'CU', 'CW', 'CX', 'CY', 'CZ', 'DA', 'DB', 'DC', 'DF', 'DG', 'DH', 'DI', 'DJ', 'DM', 'DN', 'DO', 'DQ', 'DR', 'DW']

In [None]:
# Performance of the existing (baseline) model
from sklearn.linear_model import Lasso, LinearRegression

# Rows up to and including the cut-off train the model; later rows evaluate it.
test_cut = pd.to_datetime('2023-01-01')
features = posco_features_ver3

df_fit = data.loc[data.index <= test_cut]
df_val = data.loc[data.index > test_cut]
train_x, train_y = df_fit[features], df_fit[target]
test_x = df_val[features]

model = LinearRegression()
model.fit(train_x, train_y)

# True target next to the baseline prediction, indexed like the test rows.
total_predict = df_val[[target]].rename(columns={target: 'True y'})
total_predict['LA_pre'] = model.predict(test_x)

score = np.round(
    mean_absolute_percentage_error(total_predict['True y'], total_predict['LA_pre']),
    4,
)
std = total_predict['True y'].std()


In [None]:
# Display the baseline prediction table ('True y' and 'LA_pre').
total_predict

In [None]:
# Display the baseline MAPE (rounded to 4 decimals).
score

In [None]:
# Visualise target vs. prediction of the baseline model
plt.figure(figsize=(20,5))
plt.plot(total_predict['True y'], color='black', label='Target')
plt.plot(total_predict['LA_pre'], color='blue', label='Predicted')

plt.title('Target and Predict Value Distribution')
# The curves are target/prediction values, not MAPE, so label the axis accordingly.
plt.ylabel('Target Value')
plt.xlabel('Test Timestamp')
plt.legend()
plt.tight_layout()
plt.savefig('Target and Prediction.jpg')

In [None]:
# Add a per-row MAPE column to the total_predict dataframe.
# For a single row MAPE reduces to |(y - pred) / y| * 100, so it is computed for
# all rows at once instead of calling the helper row by row through apply().
total_predict['MAPE'] = ((total_predict['True y'] - total_predict['LA_pre']) / total_predict['True y']).abs() * 100
total_predict

In [None]:
# MAPE over the test period, with its mean as a horizontal reference line
fig, ax = plt.subplots(figsize=(20,5))
mape_series = total_predict['MAPE']
ax.plot(mape_series, color='black')
ax.axhline(mape_series.mean(), color='red', label='Mean')
ax.set_title('MAPE Distribution')
ax.set_ylabel('MAPE')
ax.set_xlabel('Test Timestamp')
ax.legend()
fig.tight_layout()
fig.savefig('MAPE.jpg')

In [None]:
# Best feature combinations selected after applying the feature-selection and discretization algorithm
# n_m_m: combination selected without considering the existing variables in posco_features_ver3
# p30_m_m: combination selected with all existing variables in posco_features_ver3 taken into account
# p11_m_m: combination selected with only the 11 mandatory variables from posco_features_ver3 taken into account
n_m_m = ['CX', 'G', 'E', 'H', 'AA', 'AG', 'O', 'I', 'AQ', 'N', 'C', 'AI', 'D', 'W', 'Q', 'F', 'DR', 'T', 'AN', 'DJ', 'R', 'DM', 'AO', 'DW', 'S', 'AC', 'U', 'J', 'AB', 'DK', 'Z', 'DE', 'DD', 'AJ', 'AE', 'AD', 'DQ', 'AK', 'Y', 'DO', 'V', 'AH', 'L', 'CZ', 'DA', 'X', 'AP', 'CY', 'DI', 'M']
p30_m_m = ['C', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'P', 'Z', 'AB', 'CU', 'CW', 'CX', 'CY', 'CZ', 'DA', 'DB', 'DC', 'DF', 'DG', 'DH', 'DI', 'DJ', 'DM', 'DN', 'DO', 'DQ', 'DR', 'DW', 'CJ', 'DL', 'DP', 'DS', 'DT', 'DU', 'DV', 'E', 'AG', 'F', 'O', 'AA', 'AQ', 'D', 'AI', 'R', 'U', 'N', 'W', 'AJ']
p11_m_m = ['P', 'L', 'DW', 'G', 'K', 'J', 'M', 'I', 'C', 'H', 'DR', 'CJ', 'DF', 'DL', 'DP', 'DS', 'DT', 'DU', 'DV', 'AG', 'AN', 'AA', 'N', 'O', 'AC', 'AI', 'AQ', 'E', 'W', 'D', 'Q', 'DJ', 'R', 'F', 'DM', 'U', 'DK', 'T', 'S', 'AO', 'DD', 'Z', 'AB', 'DE', 'AJ', 'DQ', 'AE', 'AD', 'DO', 'AK']

In [None]:
# Compare performance across feature sets and prefix lengths
score_onlyfs_total = []

# The train/test split does not depend on the feature subset, so build it once.
df_fit = data[data.index<=test_cut]
df_val = data[data.index>test_cut]
train_y = df_fit[target]

# f is the position of the feature list; it is mapped to a readable name later.
for f, features in enumerate([posco_features_ver3, n_m_m, p30_m_m, p11_m_m]):
    for i in range(2, len(features)+1):  # first 2 features up to the whole list
        train_x = df_fit[features[:i]]
        test_x = df_val[features[:i]]

        model = LinearRegression().fit(train_x, train_y)

        total_predict2 = pd.DataFrame(df_val[target])
        total_predict2.columns = ['True y']
        total_predict2['LA_pre'] = model.predict(test_x)

        mape_fs = np.round(mean_absolute_percentage_error(total_predict2['True y'], total_predict2['LA_pre']), 4)
        score_onlyfs_total.append([f, i, mape_fs])

In [None]:
# Turn the collected [feature-set id, prefix length, MAPE] rows into a table and
# replace the numeric feature-set id by the name of the list it refers to.
feature_mapping = {0: 'posco_features_ver3', 1: 'n_m_m', 2: 'p30_m_m', 3: 'p11_m_m'}
score_onlyfs_total = pd.DataFrame(
    score_onlyfs_total, columns=['Feature_Num', 'i', 'mape_fs']
).assign(Feature_Num=lambda frame: frame['Feature_Num'].map(feature_mapping))
score_onlyfs_total

In [None]:
# For each feature set, pick the row with the lowest MAPE.
# NOTE: despite the "max" in its name, idx_max_mape_fs holds the idxmin() labels.
idx_max_mape_fs = score_onlyfs_total.groupby('Feature_Num')['mape_fs'].idxmin()
best_mape_fs_rows = score_onlyfs_total.loc[idx_max_mape_fs]
best_mape_fs_rows # best performance: feature subsets cut at n_m_m[:39], p11_m_m[:21], p30_m_m[:38]

In [None]:
# AL run using the Posco feature set
param_opt_posco_v3 = []
feature = posco_features_ver3

# Best parameter combination
batch_size, w, tau, delta, n0 = 3, 5, 2, 0.2, 2
cd_posco_v3, err_df_posco_v3 = alternating_learners_original_hours(
    data,
    feature,
    target,
    l_start=pd.to_datetime('2023-01-01'),
    l_end=pd.to_datetime('2023-01-11'),
    v_limit=pd.to_datetime('2023-07-01'),
    batch_size=batch_size,
    w=w,
    tau=tau,
    delta=delta,
    n0=n0,
)
# Keep the settings together with the detected CD dates and the error trace.
param_opt_posco_v3.append([feature, batch_size, w, tau, delta, n0, cd_posco_v3, err_df_posco_v3])

In [None]:
# Convert the collected run (parameters, CD dates, error trace) into a one-row table.
# NOTE: this rebinds param_opt_posco_v3 from a list to a DataFrame.
param_opt_posco_v3=pd.DataFrame(param_opt_posco_v3, columns=['feature', 'batch_size', 'w', 'tau', 'delta', 'n0', 'cd_posco_v3', 'err_df_posco_v3']) 
param_opt_posco_v3

In [None]:
# Per-row MAPE with the CD points detected on the Posco feature set
plt.figure(figsize=(20,5))
plt.plot(total_predict['MAPE'], color='black', label='MAPE')
plt.axhline(total_predict['MAPE'].mean(), color='red', label='Mean MAPE')

# Only the first CD line carries a label so the legend shows a single 'cd' entry.
for i, timestamp in enumerate(cd_posco_v3):
    plt.axvline(x=timestamp, color='gray', linestyle='--', linewidth=0.8, label='cd' if i == 0 else '')

plt.title('Posco Feature Version CD Detection Result')
plt.ylabel('MAPE')
plt.xlabel('Test Timestamp')
# A single legend built from the artists' own labels; an earlier call with a
# hard-coded label list was replaced by this one and has been dropped.
plt.legend()

plt.tight_layout()

In [None]:
# Target vs. baseline prediction with the CD points detected on the Posco feature set
plt.figure(figsize=(20,5))
plt.plot(total_predict['True y'], color='black', label='Target')
plt.plot(total_predict['LA_pre'], color='blue', label='Predicted')

# Only the first CD line carries a label so the legend shows a single 'cd' entry.
for i, timestamp in enumerate(cd_posco_v3):
    plt.axvline(x=timestamp, color='gray', linestyle='--', linewidth=0.8, label='cd' if i == 0 else '')

plt.title('Posco Feature Version CD Detection Result')
# The curves are target/prediction values, not MAPE.
plt.ylabel('Target Value')
plt.xlabel('Test Timestamp')
# A single legend built from the artists' own labels; the earlier hard-coded
# ['MAPE', 'Mean MAPE', 'cd'] list did not match these curves and was overwritten anyway.
plt.legend()

plt.tight_layout()

In [None]:
# AL run using the first 39 variables of the n_m_m combination
param_opt_n_m_m = []
feature = n_m_m[:39]

batch_size, w, tau, delta, n0 = 3, 5, 2, 0.2, 2
cd_n_m_m, err_df_n_m_m = alternating_learners_original_hours(
    data,
    feature,
    target,
    l_start=pd.to_datetime('2023-01-01'),
    l_end=pd.to_datetime('2023-01-11'),
    v_limit=pd.to_datetime('2023-07-01'),
    batch_size=batch_size,
    w=w,
    tau=tau,
    delta=delta,
    n0=n0,
)
# Keep the settings together with the detected CD dates and the error trace.
param_opt_n_m_m.append([feature, batch_size, w, tau, delta, n0, cd_n_m_m, err_df_n_m_m])

In [None]:
# Convert the n_m_m run (parameters, CD dates, error trace) into a one-row table.
# The table must be built from param_opt_n_m_m itself; building it from
# param_opt_posco_v3 would show the Posco run under n_m_m column names.
param_opt_n_m_m=pd.DataFrame(param_opt_n_m_m, columns=['feature', 'batch_size', 'w', 'tau', 'delta', 'n0', 'cd_n_m_m', 'err_df_n_m_m'])
param_opt_n_m_m

In [None]:
# Per-row MAPE (from total_predict) with the CD points detected on the n_m_m feature set
plt.figure(figsize=(20,5))
plt.plot(total_predict['MAPE'], color='black', label='MAPE')
plt.axhline(total_predict['MAPE'].mean(), color='red', label='Mean MAPE')

# Only the first CD line carries a label so the legend shows a single 'cd' entry.
for i, timestamp in enumerate(cd_n_m_m):
    plt.axvline(x=timestamp, color='gray', linestyle='--', linewidth=0.8, label='cd' if i == 0 else '')

plt.title('n_m_m Combination Version CD Detection Result')
plt.ylabel('MAPE')
plt.xlabel('Test Timestamp')
# A single legend built from the artists' own labels; an earlier call with a
# hard-coded label list was replaced by this one and has been dropped.
plt.legend()

plt.tight_layout()

In [None]:
# Target vs. prediction (from total_predict) with the CD points detected on the n_m_m feature set
plt.figure(figsize=(20,5))
plt.plot(total_predict['True y'], color='black', label='Target')
plt.plot(total_predict['LA_pre'], color='blue', label='Predicted')

# Only the first CD line carries a label so the legend shows a single 'cd' entry.
for i, timestamp in enumerate(cd_n_m_m):
    plt.axvline(x=timestamp, color='gray', linestyle='--', linewidth=0.8, label='cd' if i == 0 else '')

plt.title('n_m_m Combination Version CD Detection Result')
# The curves are target/prediction values, not MAPE.
plt.ylabel('Target Value')
plt.xlabel('Test Timestamp')
# A single legend built from the artists' own labels; the earlier hard-coded
# ['MAPE', 'Mean MAPE', 'cd'] list did not match these curves and was overwritten anyway.
plt.legend()

plt.tight_layout()

In [None]:
# Target vs. prediction with the CD points of both feature sets overlaid
plt.figure(figsize=(20,5))
plt.plot(total_predict['True y'], color='black', label='Target')
plt.plot(total_predict['LA_pre'], color='blue', label='Predicted')

# Red: CD points from the Posco feature set (only the first line is labelled).
for i, timestamp in enumerate(cd_posco_v3):
    plt.axvline(x=timestamp, color='red', linestyle='--', linewidth=0.8, label='cd of posco feature' if i == 0 else '')

# Green: CD points from the n_m_m feature set (only the first line is labelled).
for i, timestamp in enumerate(cd_n_m_m):
    plt.axvline(x=timestamp, color='green', linestyle='--', linewidth=0.8, label='cd of n_m_m feature' if i == 0 else '')

# The figure compares both detections, so the title names both.
plt.title('Posco Feature vs n_m_m Combination CD Detection Result')
# The curves are target/prediction values, not MAPE.
plt.ylabel('Target Value')
plt.xlabel('Test Timestamp')
# A single legend built from the artists' own labels; the two earlier calls with
# a hard-coded ['MAPE', 'Mean MAPE', 'cd'] list were overwritten by this one.
plt.legend()

plt.tight_layout()

In [None]:
# Prediction performance when training on the n_m_m[:39] combination
from sklearn.linear_model import Lasso, LinearRegression

test_cut = pd.to_datetime('2023-01-01')
features = n_m_m[:39]

# Rows up to and including the cut-off train the model; later rows evaluate it.
in_train = data.index <= test_cut
in_test = data.index > test_cut
df_fit, df_val = data[in_train], data[in_test]
train_x, train_y = df_fit[features], df_fit[target]
test_x = df_val[features]

model = LinearRegression().fit(train_x, train_y)

# True target next to the n_m_m prediction, indexed like the test rows.
total_predict_n_m_m = df_val[[target]].rename(columns={target: 'True y'})
total_predict_n_m_m['n_m_m_LA_pre'] = model.predict(test_x)

score = np.round(
    mean_absolute_percentage_error(
        total_predict_n_m_m['True y'], total_predict_n_m_m['n_m_m_LA_pre']
    ),
    4,
)
std = total_predict_n_m_m['True y'].std()


In [None]:
# Keep only the second column (the n_m_m prediction) as a one-column DataFrame.
n_m_m_pre = total_predict_n_m_m.iloc[:, 1].to_frame()
n_m_m_pre

In [None]:
# Display the baseline prediction table again before merging.
total_predict

In [None]:
# Combine the baseline table with the n_m_m predictions, keeping rows present in both.
# NOTE(review): 'A' is presumably the index level name left by read_csv(index_col='A') — confirm.
predict = pd.merge(total_predict, n_m_m_pre, on='A', how='inner')

In [None]:
# Drop the 'MAPE' column so only the true target and the two predictions remain.
predict=predict.drop(columns=['MAPE'])

In [None]:
# Display the merged prediction table.
predict

In [None]:
# Prediction comparison: true target vs. baseline prediction vs. n_m_m prediction
import matplotlib.font_manager as fm

# Configure fonts before the figure and its text are created, so every label
# (including the Korean legend entries) is rendered with the same family.
# NOTE(review): 'AppleGothic' is presumably only available on macOS — confirm
# and pick another Hangul-capable font on other platforms.
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.family'] = 'AppleGothic'

x = predict.index
y_true = predict['True y']
y_pred_p = predict['LA_pre']
y_pred_mm = predict['n_m_m_LA_pre']

plt.figure(figsize=(30, 12))
plt.plot(x, y_true, label='True Y', color='black')
plt.plot(x, y_pred_p, label='기존 전략', color='blue')
plt.plot(x, y_pred_mm, label='최적 변수 조합 적용(AL)', color='green')

plt.xlabel('TIME', fontsize=30)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)

# One legend call; an earlier call with fontsize=20 was overwritten by this one.
plt.legend(fontsize=22)
plt.tight_layout()
