# **Import pacakges**

In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from datetime import datetime, timedelta
import time 

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, f1_score

from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.svm import SVR
from scipy import stats
from sklearn.decomposition import PCA

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

import warnings
warnings.filterwarnings(action='ignore')

# **Data**

In [2]:
train = pd.read_csv("../DAT/train.csv")
test = pd.read_csv("../DAT/test.csv")

train.drop(columns = ["ID"], inplace = True)
test.drop(columns = ["ID"], inplace = True)
print(train.shape, test.shape)

(14095, 53) (6041, 18)


In [7]:
train["COMPONENT_ARBITRARY"] = LabelEncoder().fit_transform(train["COMPONENT_ARBITRARY"])
test["COMPONENT_ARBITRARY"] = LabelEncoder().fit_transform(test["COMPONENT_ARBITRARY"])

## **_변수 종류에 따라 분류**
- cat_col: 범주형 변수
- te_col: 테스트 데이터에 있는 변수
- tr_col: 훈련 데이터에"만" 있는 변수

In [9]:
cat_col = ["COMPONENT_ARBITRARY"]
te_col = list(set(test.columns) - set(cat_col))
tr_col = list(set(train.columns) - set(te_col) - set(cat_col))
print(len(tr_col), len(te_col), len(cat_col))

35 17 1


## **_결측치 처리**
- 20%이하는 0으로 대체
- 20%초과는 제거
- use_tr_col: 결측치 제거 후 사용할 변수

In [10]:
null_20 = (train[tr_col].isnull().sum() <= len(train)*0.20)
use_tr_col = sorted(list(null_20[null_20 == True].index))

In [11]:
use_train = train[use_tr_col + te_col + cat_col].fillna(0)
print(use_train.isnull().sum().sum())
use_train.head(3)

0


Unnamed: 0,AL,B,BA,BE,CA,CD,K,LI,MG,NA,...,ZN,CU,NI,V,H2O,YEAR,MO,ANONYMOUS_2,V40,COMPONENT_ARBITRARY
0,3,93,0,0,3059,0.0,27.0,1,14,16,...,75,78,6,0,0.0,2011,1,200,154.0,2
1,2,19,0,0,2978,0.0,0.0,0,1,1,...,652,31,0,0,0.0,2021,0,375,44.0,1
2,110,1,1,0,17,0.0,0.0,0,0,2,...,412,2,0,0,0.0,2015,0,200,72.6,1


## **_변수 종류에 따라 분류**

In [25]:
target = ["Y_LABEL"]

### Train에만 있는 변수
use_tr_col = sorted(list(set(use_train.columns) - set(te_col)))
use_tr_col_ = list(set(use_tr_col) - set(target))

### 모든 변수
use_col = use_tr_col + te_col
use_col_ = sorted(list(set(use_col) - set(target)))

# **Validation**

In [12]:
def MAPE(true, pred) :
    import numpy as np
    mape = np.mean(np.abs((np.array(true) - np.array(pred)) / (np.array(true)+0.000000001))) * 100
    return np.round(mape, 3)

## **_Train/Valid split**

In [13]:
train_X, valid_X, train_y, valid_y = train_test_split(use_train.drop(columns = ["Y_LABEL"]), use_train["Y_LABEL"], 
                                                      test_size = 0.3, shuffle = True, random_state = 717, stratify = use_train["Y_LABEL"])

train_X = train_X.reset_index(drop = True)
train_y = train_y.reset_index(drop = True)
valid_X = valid_X.reset_index(drop = True)
valid_y = valid_y.reset_index(drop = True)

print(train_X.shape, valid_X.shape, train_y.shape, valid_y.shape)

(9866, 35) (4229, 35) (9866,) (4229,)


## **_범주형 변수 처리**

In [14]:
### One Hot Encodng
tr_cat_oh = pd.get_dummies(train_X[cat_col].astype(str))
vd_cat_oh = pd.get_dummies(valid_X[cat_col].astype(str))

### Label Encodng
tr_cat_lb = pd.DataFrame(LabelEncoder().fit_transform(train_X[cat_col]))
vd_cat_lb = pd.DataFrame(LabelEncoder().fit_transform(valid_X[cat_col]))

### One Hot & Label Encoding
tr_cat_ol = pd.concat([tr_cat_lb, tr_cat_oh], axis = 1)
vd_cat_ol = pd.concat([vd_cat_lb, vd_cat_oh], axis = 1)

## **_Train에만 있는 변수 생성**

- Test에서 사용 가능한 변수로 Train에 존재하는 변수 생성
- MAPE 기준 가장 좋았던 "S" 변수만 생성

In [15]:
def make_col(col, train_X, valid_X_, tr_cat, vd_cat, training= True) :
    make_ = pd.DataFrame()
    col_ = valid_X_.columns
    print(col_)
    std = StandardScaler().fit(train_X[col_])
    st_train_X = pd.concat([pd.DataFrame(std.transform(train_X[col_]), columns = col_), tr_cat], axis = 1)
    st_valid_X = pd.concat([pd.DataFrame(std.transform(valid_X_[col_]), columns = col_), vd_cat], axis = 1)
    
    for col__ in tqdm(col) : 
        target_ = col__
        model = ExtraTreesRegressor(random_state = 717, n_estimators = 200).fit(st_train_X, train_X[target_])
        pred = model.predict(st_valid_X)    
        make_[target_] = pred
        
        if training == True :
            print("ET: ", target_, MAPE(valid_X[target_], pred))

    return make_    

In [16]:
# valid_X_ : test에만 있는 변수만으로 구성
valid_X_ = valid_X[te_col]

# "S" 변수 추가
valid_X_["S"] = make_col(["S"], train_X = train_X, valid_X_ = valid_X_, tr_cat = tr_cat_oh, vd_cat = vd_cat_oh)
print(valid_X_.shape)

Index(['PQINDEX', 'MN', 'CO', 'CR', 'TI', 'FE', 'AG', 'ANONYMOUS_1', 'ZN',
       'CU', 'NI', 'V', 'H2O', 'YEAR', 'MO', 'ANONYMOUS_2', 'V40'],
      dtype='object')


  0%|          | 0/1 [00:00<?, ?it/s]

ET:  S 21.89
(4229, 18)


## **_파생변수 생성**

### **__PCA 변수 생성**

- train에만 있는 변수 정보를 PCA로 축약하여 이를 test에만 있는 변수 정보로 생성

In [19]:
### 표준화
st_train_X = pd.DataFrame(StandardScaler().fit_transform(train_X), columns = train_X.columns)
st_valid_X_ = StandardScaler().fit_transform(valid_X_[te_col])

### Train에만 있는 변수를 PCA
pca = PCA(n_components = 7, random_state = 717).fit(st_train_X[use_tr_col_])
pca_fit = pca.transform(st_train_X[use_tr_col_])

train_X["pca1"] = pca_fit[:,0]
train_X["pca2"] = pca_fit[:,1]
train_X["pca3"] = pca_fit[:,2]


### Test에만 있는 변수로 위 PCA 값을 생성
pca1_model = ExtraTreesRegressor(random_state = 717, n_estimators = 100).fit(st_train_X[te_col], pca_fit[:,0])
pca2_model = ExtraTreesRegressor(random_state = 717, n_estimators = 100).fit(st_train_X[te_col], pca_fit[:,1])
pca3_model = ExtraTreesRegressor(random_state = 717, n_estimators = 100).fit(st_train_X[te_col], pca_fit[:,2])
    
valid_X_["pca1"] = pca1_model.predict(st_valid_X_)
valid_X_["pca2"] = pca2_model.predict(st_valid_X_)
valid_X_["pca3"] = pca3_model.predict(st_valid_X_)

In [20]:
print(pca.explained_variance_ratio_)
print(MAPE(pca_fit[:,0], pca1_model.predict(st_train_X[te_col])))
print(MAPE(pca_fit[:,1], pca2_model.predict(st_train_X[te_col])))
print(MAPE(pca_fit[:,2], pca3_model.predict(st_train_X[te_col])))

[0.10524854 0.09831316 0.07932496 0.0708676  0.06734925 0.05868314
 0.05676373]
0.0
0.0
0.0


### **__기타 파생변수 생성**

In [21]:
train_X["total_ppm"] = train_X[["AG", "CO", "CR", "CU", "FE", "MN", "MO", "NI", "TI", "V", "ZN", "H2O"]].sum(axis = 1)
valid_X_["total_ppm"] = valid_X_[["AG", "CO", "CR", "CU", "FE", "MN", "MO", "NI", "TI", "V", "ZN", "H2O"]].sum(axis = 1)

train_X["ANO1"] = train_X["ANONYMOUS_1"] / train_X["ANONYMOUS_2"]
valid_X_["ANO1"] = valid_X_["ANONYMOUS_1"] / valid_X_["ANONYMOUS_2"]

## **_모델 생성**
- Knowledge Deistillation 개념 사용
- 상관관계에 따른 변수 재배열
- 파생 변수 사용
- 앙상블 사용
- 최적의 Threshold 

### **__범주형 Y를 확률값으로 변경**

In [27]:
st_train_X = pd.concat([pd.DataFrame(StandardScaler().fit_transform(train_X[use_col_]), columns = use_col_), tr_cat_oh], axis = 1)

model_proba = RandomForestClassifier(random_state = 717, n_estimators = 100).fit(st_train_X[use_col_], train_y)
pred_proba = model_proba.predict_proba(st_train_X[use_col_])[:,1]

print(np.round(f1_score(train_y, np.round(pred_proba, 0).astype(int), average = "macro"), 5)) 

1.0


### **__모델에 사용할 변수 정의**

In [33]:
### 파생 변수 중 사용할 변수
add_col = ["S", "total_ppm"]
print("사용할 추가 변수: ", add_col)


### 범주형 변수 유형 선택(OH or LB)
tr_cat = tr_cat_oh
vd_cat = vd_cat_oh

### 표준화: train_x와 valid_x의 변수가 다름을 주의
st_train_X = pd.concat([pd.DataFrame(StandardScaler().fit_transform(train_X), columns = train_X.columns), tr_cat], axis = 1)    
std = StandardScaler().fit(train_X[te_col + add_col])
st_valid_X = pd.concat([pd.DataFrame(std.transform(valid_X_[te_col + add_col]), columns = te_col + add_col), vd_cat], axis = 1)

사용할 추가 변수:  ['S', 'total_ppm']


### **__Fit and Predict**

In [34]:
# 변수 순서를 pred_proba와 상관관계가 높은 순으로 배열
corr_df = st_train_X.loc[:,te_col + add_col]
corr_df["proba"] = pred_proba
corr_idx = list(np.abs(corr_df.corr().loc["proba", :]).sort_values(ascending = False)[1:].index)
corr_idx += list(vd_cat.columns)
print("변수 순서:", corr_idx)

변수 순서: ['YEAR', 'FE', 'NI', 'ANONYMOUS_1', 'CR', 'V', 'total_ppm', 'MN', 'TI', 'PQINDEX', 'ANONYMOUS_2', 'S', 'CU', 'AG', 'V40', 'ZN', 'CO', 'MO', 'H2O', 'COMPONENT_ARBITRARY_0', 'COMPONENT_ARBITRARY_1', 'COMPONENT_ARBITRARY_2', 'COMPONENT_ARBITRARY_3']


In [35]:
SEED = 717
model_dict = {}
model_dict["RF"] = RandomForestRegressor(random_state = SEED)
model_dict["ET"] = ExtraTreesRegressor(random_state = SEED)
model_dict["LGBM"] = LGBMRegressor(random_state = SEED)
model_dict["XGB"] = XGBRegressor(random_state = SEED)
model_dict["GB"] = GradientBoostingRegressor(random_state = SEED)
model_dict["CAT"] = CatBoostRegressor(random_state = SEED, silent = True)

In [38]:
pred

array([0.0844, 0.0443, 0.1586, ..., 0.0864, 0.0819, 0.0541])

In [72]:
pred_dict = {}
for name_, model_ in tqdm(model_dict.items()):
    start_time = time.time()
    model_fit = model_.fit(st_train_X[corr_idx], pred_proba)
    pred = model_fit.predict(st_valid_X[corr_idx])
    pred_dict[name_] = pred
    print("%s Done!| 소요시간: %.4fsec"% (name_, time.time()-start_time))

  0%|          | 0/6 [00:00<?, ?it/s]

RF Done!| 소요시간: 15.5529sec
ET Done!| 소요시간: 4.0741sec
LGBM Done!| 소요시간: 0.1835sec
XGB Done!| 소요시간: 0.7709sec
GB Done!| 소요시간: 2.6350sec
CAT Done!| 소요시간: 5.0036sec


### **_Ensemble result**

In [73]:
def ensemble_result(pred_dict, true_y = valid_y, ascending = True):
    import itertools

    ensemble_dict = {}
    arr_name = pred_dict.keys()
    arr_value = pred_dict.values()

    for i in range(1, len(arr_name)+1):
        ncr_name = list(itertools.combinations(arr_name, i))
        ncr_value = list(itertools.combinations(arr_value, i))
        for j in range(len(ncr_name)):
            pred_ens = np.mean(list(ncr_value[j]), axis = 0)       
            ensemble_dict[ncr_name[j]] = list(pred_ens)

    return ensemble_dict

In [74]:
ensemble_dict = ensemble_result(pred_dict)

### **__모델별 최적의 Threshold**

In [75]:
threshold_dict = {}

for th in tqdm(range(0, 1000)) :
    threshold = np.round(th/1000, 3)
  
    for name, pred in ensemble_dict.items() :
        pred_01 = np.select([pred >= threshold], [1], 0)
        threshold_dict[str(name) + str(threshold)] = np.round(f1_score(valid_y, pred_01, average = "macro"), 5)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [76]:
result_df = pd.DataFrame.from_dict(threshold_dict, orient = "index")
result_df.columns = ["F1 Score"]
result_df.sort_values(by = "F1 Score", ascending = False, inplace = True)
result_df.head()

Unnamed: 0,F1 Score
"('ET', 'GB')0.161",0.57171
"('ET', 'GB')0.163",0.57166
"('ET', 'GB')0.177",0.57148
"('ET', 'GB')0.176",0.57139
"('ET', 'GB')0.16",0.57071


# Submission
- Validation 방법 그대로 적용

## **_Test predict**

In [77]:
TR_cat_oh = pd.get_dummies(use_train[cat_col].astype(str))
TE_cat_oh = pd.get_dummies(test[cat_col].astype(str))
TR_cat_oh.columns = ["COMPONENT_ARBITRARY_0", "COMPONENT_ARBITRARY_1", "COMPONENT_ARBITRARY_2", "COMPONENT_ARBITRARY_3"]
TE_cat_oh.columns = ["COMPONENT_ARBITRARY_0", "COMPONENT_ARBITRARY_1", "COMPONENT_ARBITRARY_2", "COMPONENT_ARBITRARY_3"]

In [78]:
use_train_ = use_train.copy()
test_ = test[te_col]
test_["S"] = make_col(["S"], train_X = use_train_, valid_X_ = test_, tr_cat = TR_cat_oh, vd_cat = TE_cat_oh, training = False)
print(test_.shape)

Index(['PQINDEX', 'MN', 'CO', 'CR', 'TI', 'FE', 'AG', 'ANONYMOUS_1', 'ZN',
       'CU', 'NI', 'V', 'H2O', 'YEAR', 'MO', 'ANONYMOUS_2', 'V40'],
      dtype='object')


  0%|          | 0/1 [00:00<?, ?it/s]

(6041, 18)


In [79]:
### 표준화
st_use_train_ = pd.DataFrame(StandardScaler().fit_transform(use_train_), columns = use_train_.columns)
st_test_ = StandardScaler().fit_transform(test_[te_col])

### Train에만 있는 변수를 PCA
Pca = PCA(n_components = 5, random_state = 717).fit(st_use_train_[use_tr_col_])
Pca_fit = Pca.transform(st_use_train_[use_tr_col_])

use_train_["pca1"] = Pca_fit[:,0]
use_train_["pca2"] = Pca_fit[:,1]
use_train_["pca3"] = Pca_fit[:,2]
    

### Test에만 있는 변수로 위 PCA 값을 생성
Pca1_model = ExtraTreesRegressor(random_state = 717).fit(st_use_train_[te_col], Pca_fit[:,0])
Pca2_model = ExtraTreesRegressor(random_state = 717).fit(st_use_train_[te_col], Pca_fit[:,1])
Pca3_model = ExtraTreesRegressor(random_state = 717).fit(st_use_train_[te_col], Pca_fit[:,2])
    
test_["pca1"] = Pca1_model.predict(st_test_)
test_["pca2"] = Pca2_model.predict(st_test_)
test_["pca3"] = Pca3_model.predict(st_test_)

In [80]:
use_train_["total_ppm"] = use_train_[["AG", "CO", "CR", "CU", "FE", "MN", "MO", "NI", "TI", "V", "ZN", "H2O"]].sum(axis = 1)
test_["total_ppm"] = test_[["AG", "CO", "CR", "CU", "FE", "MN", "MO", "NI", "TI", "V", "ZN", "H2O"]].sum(axis = 1)

use_train_["ANO1"] = use_train_["ANONYMOUS_1"] / use_train_["ANONYMOUS_2"]
test_["ANO1"] = test_["ANONYMOUS_1"] / test_["ANONYMOUS_2"]

In [82]:
st_use_train_ = pd.concat([pd.DataFrame(StandardScaler().fit_transform(use_train_), columns = use_train_.columns), TR_cat_oh], axis = 1)

Model_proba = RandomForestClassifier(random_state = 717, n_estimators = 100).fit(st_use_train_[use_col_], use_train_["Y_LABEL"])
Pred_proba = Model_proba.predict_proba(st_use_train_[use_col_])[:,1]

print(np.round(f1_score(use_train_["Y_LABEL"], np.round(Pred_proba, 0).astype(int), average = "macro"), 5)) 

0.99977


In [83]:
### 사용할 파생변수
add_col = ["S", "total_ppm"]

### 표준화
st_use_train_ = pd.concat([pd.DataFrame(StandardScaler().fit_transform(use_train_), columns = use_train_.columns), TR_cat_oh], axis = 1)    
std = StandardScaler().fit(use_train_[te_col + add_col])
st_test_ = pd.concat([pd.DataFrame(std.transform(test_[te_col + add_col]), columns = te_col + add_col), TE_cat_oh], axis = 1)


### 변수 순서를 pred_proba와 상관관계가 높은 순으로 배열
corr_df = st_use_train_.loc[:,te_col + add_col]
corr_df["proba"] = Pred_proba
corr_idx = list(np.abs(corr_df.corr().loc["proba", :]).sort_values(ascending = False)[1:].index)
corr_idx += list(TR_cat_oh.columns)
print("변수 순서:", corr_idx)

변수 순서: ['YEAR', 'FE', 'NI', 'ANONYMOUS_1', 'PQINDEX', 'TI', 'MN', 'ANONYMOUS_2', 'S', 'CU', 'ZN', 'V40', 'V', 'total_ppm', 'AG', 'CR', 'CO', 'MO', 'H2O', 'COMPONENT_ARBITRARY_0', 'COMPONENT_ARBITRARY_1', 'COMPONENT_ARBITRARY_2', 'COMPONENT_ARBITRARY_3']


In [84]:
Pred_dict = {}
for name_, model_ in tqdm(model_dict.items()):
    start_time = time.time()
    model_fit = model_.fit(st_use_train_[corr_idx], Pred_proba)
    Pred = model_fit.predict(st_test_[corr_idx])
    Pred_dict[name_] = Pred
    print("%s Done!| 소요시간: %.4fsec"% (name_, time.time()-start_time))

  0%|          | 0/6 [00:00<?, ?it/s]

RF Done!| 소요시간: 24.1690sec
ET Done!| 소요시간: 6.6353sec
LGBM Done!| 소요시간: 0.2283sec
XGB Done!| 소요시간: 1.0332sec
GB Done!| 소요시간: 4.1011sec
CAT Done!| 소요시간: 10.2315sec


In [85]:
print(result_df.head())

                   F1 Score
('ET', 'GB')0.161   0.57171
('ET', 'GB')0.163   0.57166
('ET', 'GB')0.177   0.57148
('ET', 'GB')0.176   0.57139
('ET', 'GB')0.16    0.57071


In [86]:
Pred = (Pred_dict["ET"] + Pred_dict["GB"]) / 2
Pred = np.select([Pred >= 0.161], [1], 0)

## **_Submission**

In [87]:
sample_submission = pd.read_csv('../OUT/sample_submission.csv')

In [88]:
sample_submission['Y_LABEL'] = Pred
sample_submission.head()

Unnamed: 0,ID,Y_LABEL
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,0


In [89]:
sample_submission["Y_LABEL"].sum()

801

In [91]:
# sample_submission.to_csv("../OUT/"+ str(datetime.today())[:10] + " Final submission.csv", index = False)