In [None]:
%matplotlib inline
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

try:
    from scipy import interp
except ImportError:
    # The NumPy aliases in SciPy's root namespace were deprecated and are
    # missing from recent SciPy releases; numpy.interp is the same function.
    from numpy import interp

import catboost as cb
import lightgbm as lgb

from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.svm import SVC


# Load data
df_afsnt = pd.read_csv("df_afsnt_total_ver6.csv", encoding="cp949")

# Sort chronologically for the time-series CV. The old index is discarded so
# that row labels and row positions coincide afterwards (RangeIndex).
df_afsnt = (
    df_afsnt
    .sort_values(['year', 'month', 'day'])
    .reset_index(drop=True)
)

# Turn the single-letter flag columns into booleans.
df_afsnt["is_arrive"] = df_afsnt["is_arrive"] == "A"
df_afsnt["is_regular"] = df_afsnt["is_regular"] == "Y"
df_afsnt["is_delay"] = df_afsnt["is_delay"] == "Y"
# Combined "<origin>_<dest>" feature.
df_afsnt["origin_dest"] = df_afsnt['origin'] + "_" + df_afsnt['dest']

# Features to standardize
scale_features = [
    "degree_center_weight_origin", "degree_center_weight_dest",
    "degree_center_origin", "degree_center_dest",
    "distance_km", "distance_center", "distance_center_weight",
    "lot_area", "main_air", "sub_air", "air_processing",
    "terminal_width", "terminal_ability",
]

# Features to one-hot encode
ohe_features = ['wday', 'airline', 'origin_dest', 'time_discrete']

# scikit-learn renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# (1.2) and later dropped the old name, so try the new spelling first and
# fall back to the old one on releases that do not know it yet.
try:
    ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Scale the numeric features, one-hot encode the categorical ones, and pass
# every remaining column through untouched.
preprocess = make_column_transformer(
    (StandardScaler(), scale_features),
    (ohe_encoder, ohe_features),
    remainder="passthrough",
)

# CV Class
class BigconTimeSeriesSplit():
    """Expanding-window time-series splitter with a gap before validation.

    The samples are cut into ``n_splits`` consecutive folds of
    ``len(X) // n_splits`` rows. For fold ``i`` (1-based) ending at row
    ``stop = i * fold_size``:

    * validation = the last ``val_size`` rows of the fold,
    * training   = every row from position 0 up to ``margin`` rows before
      the validation window starts.

    The ``len(X) % n_splits`` trailing rows are never yielded.

    Parameters
    ----------
    n_splits : int
        Number of folds.
    margin : int, default 1050*77
        Number of rows left out between the end of the training window and
        the start of the validation window.
        NOTE(review): presumably ~1050 rows per day x 77 days - confirm.
    val_size : int, default 1050*15
        Number of rows in each validation window.
        NOTE(review): presumably ~1050 rows per day x 15 days - confirm.
    """

    def __init__(self, n_splits, margin=1050*77, val_size=1050*15):
        self.n_splits = n_splits
        self.margin = margin
        self.val_size = val_size

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds; all arguments are ignored."""
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield ``(train_indices, validation_indices)`` positional arrays.

        Only ``len(X)`` is used; ``y`` and ``groups`` are ignored. If a fold
        is too short to hold the validation window plus the margin, the
        bounds are clamped at 0 (the training array may then be empty)
        instead of going negative: a negative slice bound would wrap around
        and pull validation/future rows into the training set.
        """
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        for i in range(self.n_splits):
            stop = (i + 1) * k_fold_size
            # Clamp so that short folds never produce negative slice bounds.
            val_start = max(stop - self.val_size, 0)
            train_stop = max(val_start - self.margin, 0)
            yield indices[:train_stop], indices[val_start:stop]
            
# Five-fold time-ordered CV splitter and the seed for the experiments.
bts_cv = BigconTimeSeriesSplit(n_splits=5)
random_state = 8282

# For removing cancelled-flight rows (currently disabled)
# y_cancel = pd.Series(np.where(df_afsnt_total['is_cancel'] == True, True, False))
# idx_cancel = y_cancel[y_cancel == True].index

# Discard the columns that are not used as model inputs.
df_afsnt = df_afsnt.drop(
    columns=[
        'month', 'day', 'flight', 'origin', 'dest', 'cause_cancel',
        'tailnum', 'cause_delay', 'sched_time', 'real_time', 'is_cancel',
        'date', 'sched_datetime', 'real_datetime', 'delay', 'link',
        'cnt_per_day', 'is_holiday',
    ]
)

# Target is the delay flag; every other remaining column is a feature.
y = df_afsnt['is_delay']
X = df_afsnt.loc[:, df_afsnt.columns != 'is_delay']

In [None]:
# LightGBM classifier with the tuned hyper-parameters.
# The column-subsampling key is spelled `colsample_bytree`; the previous
# spelling `colsample_by_tree` is not a LightGBM parameter name, so the tuned
# value was not being applied. The seed declared earlier in the notebook is
# passed explicitly so the run is reproducible.
clf_lgb = lgb.LGBMClassifier(
    colsample_bytree=0.7629052365922411,
    learning_rate=0.05271462978674054,
    max_depth=13,
    n_estimators=1400,
    num_leaves=70,
    reg_lambda=0.5709993092989195,
    random_state=random_state,
)
pipe_lgb = Pipeline([
    ('preprocess', preprocess),
    ('clf', clf_lgb),
])

print("모델을 훈련합니다.")
tprs = []   # per-fold TPR values interpolated onto the common FPR grid
aucs = []   # per-fold AUC
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots()

for fold_num, (train_idx, val_idx) in enumerate(bts_cv.split(X, y), 1):
    # Start time of this fold
    start_time = datetime.now()

    # The splitter yields positions, so index both X and y positionally.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Fit on the training window and score the validation window.
    pipe_lgb.fit(X_train, y_train)
    probas_ = pd.DataFrame({
        'idx_val': val_idx,
        'prob': pipe_lgb.predict_proba(X_val)[:, 1],
    })

    # Removing cancelled-flight rows from the validation set (disabled)
    # val_idx = list(np.setdiff1d(val_idx, idx_cancel))
    # probas_ = probas_[probas_.isin(val_idx)['idx_val']]

    # ROC curve and AUC for this fold
    fpr, tpr, thresholds = roc_curve(y_val, probas_['prob'])
    fold_tpr = np.interp(mean_fpr, fpr, tpr)
    fold_tpr[0] = 0.0   # force the interpolated curve to start at (0, 0)
    tprs.append(fold_tpr)
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    ax.plot(fpr, tpr, lw=1, alpha=0.3,
            label='ROC fold %d (AUC = %0.2f)' % (fold_num, roc_auc))

    # Elapsed time of this fold (fit, predict and ROC computation)
    time_elapsed = datetime.now() - start_time
    print('{0} Fold fitting time (hh:mm:ss.ms) {1}'.format(fold_num, time_elapsed))

# Diagonal reference line
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
        label='Chance', alpha=.8)

# Mean ROC over the folds
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0   # force the mean curve to end at (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color='b',
        label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
        lw=2, alpha=.8)

# +/- 1 standard deviation band, clipped to [0, 1]
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                label=r'$\pm$ 1 std. dev.')

ax.set_xlim([-0.05, 1.05])
ax.set_ylim([-0.05, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver operating characteristic example')
ax.legend(loc="lower right")
plt.show()