In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score

import matplotlib.pyplot as plt

In [3]:
df_train = pd.read_csv("layer_9_train.csv")
df_valid = pd.read_csv("layer_9_valid.csv")

In [4]:
labels = ['label_1', 'label_2', 'label_3', 'label_4']

df_train_X = df_train.copy()
df_train_X.drop(['label_2', 'label_3', 'label_4'], axis=1, inplace=True)
df_train_y = df_train_X.pop('label_1')

df_valid_X = df_valid.copy()
df_valid_X.drop(['label_2', 'label_3', 'label_4'], axis=1, inplace=True)
df_valid_y = df_valid_X.pop('label_1')

In [5]:
scaler = StandardScaler()

df_train_X_scaled = scaler.fit_transform(df_train_X)
df_valid_X_scaled = scaler.transform(df_valid_X)

In [6]:
# For SVC
svc_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf'],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
}

# For LightGBM
lgb_param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'min_child_samples': [10, 20, 30],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# For CatBoost
catboost_param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [100, 200, 300],
    'depth': [3, 4, 5],
    'l2_leaf_reg': [1, 3, 5],
    'colsample_bylevel': [0.6, 0.8, 1.0]
}

# For XGB
xgb_param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 2, 3],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.1, 0.2],
    'reg_lambda': [0, 0.1, 0.2]
}


In [7]:
# For SVC
# svc_grid_search = GridSearchCV(SVC(), svc_param_grid, scoring='accuracy', cv=5)
svc_random_search = RandomizedSearchCV(SVC(), svc_param_grid, scoring='accuracy', cv=5, n_iter=3)

# For LightGBM
# lgb_grid_search = GridSearchCV(LGBMClassifier(), lgb_param_grid, scoring='accuracy', cv=5)
lgb_random_search = RandomizedSearchCV(LGBMClassifier(), lgb_param_grid, scoring='accuracy', cv=5, n_iter=3)

# For CatBoost
# catboost_grid_search = GridSearchCV(CatBoostClassifier(), catboost_param_grid, scoring='accuracy', cv=5)
catboost_random_search = RandomizedSearchCV(CatBoostClassifier(), catboost_param_grid, scoring='accuracy', cv=5, n_iter=3)

# For XGB
xgb_random_search = RandomizedSearchCV(XGBClassifier(), xgb_param_grid, scoring='accuracy', cv=5, n_iter=3)


In [8]:
svc_random_search.fit(df_train_X_scaled, df_train_y)

In [36]:
lgb_random_search.fit(df_train_X_scaled, df_train_y)

[LightGBM] [Info] Number of positive: 18236, number of negative: 4580
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.176053 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 22816, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.799264 -> initscore=1.381699
[LightGBM] [Info] Start training from score 1.381699
[LightGBM] [Info] Number of positive: 18237, number of negative: 4579
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.170607 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 22816, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.799308 -> initscore=1.381972
[LightGBM] [Info] Start training from score 1.381972
[LightGBM] [

In [37]:
catboost_random_search.fit(df_train_X_scaled, df_train_y)

0:	learn: 0.6837767	total: 49.9ms	remaining: 14.9s
1:	learn: 0.6753482	total: 97.1ms	remaining: 14.5s
2:	learn: 0.6665743	total: 143ms	remaining: 14.2s
3:	learn: 0.6590470	total: 190ms	remaining: 14s
4:	learn: 0.6504583	total: 238ms	remaining: 14s
5:	learn: 0.6418000	total: 287ms	remaining: 14.1s
6:	learn: 0.6339315	total: 332ms	remaining: 13.9s
7:	learn: 0.6267875	total: 378ms	remaining: 13.8s
8:	learn: 0.6214789	total: 425ms	remaining: 13.7s
9:	learn: 0.6137683	total: 472ms	remaining: 13.7s
10:	learn: 0.6066471	total: 521ms	remaining: 13.7s
11:	learn: 0.5995949	total: 569ms	remaining: 13.6s
12:	learn: 0.5923885	total: 615ms	remaining: 13.6s
13:	learn: 0.5855887	total: 661ms	remaining: 13.5s
14:	learn: 0.5789954	total: 709ms	remaining: 13.5s
15:	learn: 0.5735145	total: 753ms	remaining: 13.4s
16:	learn: 0.5673264	total: 799ms	remaining: 13.3s
17:	learn: 0.5623329	total: 843ms	remaining: 13.2s
18:	learn: 0.5571162	total: 889ms	remaining: 13.2s
19:	learn: 0.5507248	total: 934ms	remaining

In [38]:
xgb_random_search.fit(df_train_X_scaled, df_train_y)

In [39]:
svc_test_score = svc_random_search.score(df_valid_X_scaled, df_valid_y)
lgb_test_score = lgb_random_search.score(df_valid_X_scaled, df_valid_y)
catboost_test_score = catboost_random_search.score(df_valid_X_scaled, df_valid_y)
xgb_test_score = xgb_random_search.score(df_valid_X_scaled, df_valid_y)



In [40]:
print("SVC: ", svc_random_search.best_params_)
print("LGB: ", lgb_random_search.best_params_)
print("CB: ", catboost_random_search.best_params_)
print("XGB: ", xgb_random_search.best_params_)

SVC:  {'kernel': 'linear', 'gamma': 'scale', 'C': 10}
LGB:  {'n_estimators': 200, 'min_child_samples': 10, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 1.0}
CB:  {'learning_rate': 0.05, 'l2_leaf_reg': 1, 'iterations': 200, 'depth': 5, 'colsample_bylevel': 0.8}
XGB:  {'subsample': 0.9, 'reg_lambda': 0.2, 'reg_alpha': 0.1, 'n_estimators': 200, 'min_child_weight': 2, 'max_depth': 4, 'learning_rate': 0.2, 'gamma': 0.1, 'colsample_bytree': 0.7}


In [41]:
svc_model = SVC(**svc_random_search.best_params_)
lgb_model = LGBMClassifier(**lgb_random_search.best_params_)
cb_model = CatBoostClassifier(**catboost_random_search.best_params_)
xgb_model = XGBClassifier(**xgb_random_search.best_params_)

In [42]:
base_models = [
    ('svc', svc_model),
    ('lgbm', lgb_model),
    ('catboost', cb_model),
    ('xgb', xgb_model)
]

meta_estimator = LogisticRegression(max_iter=1000)

In [43]:
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_estimator)

In [44]:
stacking_classifier.fit(df_train_X_scaled, df_train_y)

[LightGBM] [Info] Number of positive: 22796, number of negative: 5724
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.233202 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 28520, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.799299 -> initscore=1.381917
[LightGBM] [Info] Start training from score 1.381917
0:	learn: 0.6429646	total: 69.1ms	remaining: 13.7s
1:	learn: 0.5981517	total: 147ms	remaining: 14.5s
2:	learn: 0.5613928	total: 225ms	remaining: 14.8s
3:	learn: 0.5290739	total: 310ms	remaining: 15.2s
4:	learn: 0.5024147	total: 388ms	remaining: 15.1s
5:	learn: 0.4822312	total: 465ms	remaining: 15s
6:	learn: 0.4638322	total: 543ms	remaining: 15s
7:	learn: 0.4412801	total: 623ms	remaining: 14.9s
8:	learn: 0.4234937	total: 701ms	remaining: 14.9s
9:	learn: 0.4088649	total: 779ms	remaining: 14.8s
10:	learn

In [45]:
stacking_preds = stacking_classifier.predict(df_valid_X_scaled)




In [46]:
acs = accuracy_score(df_valid_y, stacking_preds)
print("Accuracy Score: ", acs)

Accuracy Score:  0.9973333333333333
