In [10]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score

import matplotlib.pyplot as plt

In [2]:
df_train = pd.read_csv("layer_9_train.csv")
df_valid = pd.read_csv("layer_9_valid.csv")

In [3]:
labels = ['label_1', 'label_2', 'label_3', 'label_4']

df_train_X = df_train.copy()
df_train_X.drop(['label_1', 'label_2', 'label_4'], axis=1, inplace=True)
df_train_y = df_train_X.pop('label_3')

df_valid_X = df_valid.copy()
df_valid_X.drop(['label_1', 'label_2', 'label_4'], axis=1, inplace=True)
df_valid_y = df_valid_X.pop('label_3')

In [4]:
scaler = StandardScaler()

df_train_X_scaled = scaler.fit_transform(df_train_X)
df_valid_X_scaled = scaler.transform(df_valid_X)

In [5]:
# For SVC
svc_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf'],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
}

# For LightGBM
lgb_param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'min_child_samples': [10, 20, 30],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# For CatBoost
catboost_param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [100, 200, 300],
    'depth': [3, 4, 5],
    'l2_leaf_reg': [1, 3, 5],
    'colsample_bylevel': [0.6, 0.8, 1.0]
}

# For XGB
xgb_param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 2, 3],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.1, 0.2],
    'reg_lambda': [0, 0.1, 0.2]
}


In [6]:
# For SVC
# svc_grid_search = GridSearchCV(SVC(), svc_param_grid, scoring='accuracy', cv=5)
svc_random_search = RandomizedSearchCV(SVC(), svc_param_grid, scoring='accuracy', cv=5, n_iter=3)

# For LightGBM
# lgb_grid_search = GridSearchCV(LGBMClassifier(), lgb_param_grid, scoring='accuracy', cv=5)
lgb_random_search = RandomizedSearchCV(LGBMClassifier(), lgb_param_grid, scoring='accuracy', cv=5, n_iter=3)

# For CatBoost
# catboost_grid_search = GridSearchCV(CatBoostClassifier(), catboost_param_grid, scoring='accuracy', cv=5)
catboost_random_search = RandomizedSearchCV(CatBoostClassifier(), catboost_param_grid, scoring='accuracy', cv=5, n_iter=3)

# For XGB
xgb_random_search = RandomizedSearchCV(XGBClassifier(), xgb_param_grid, scoring='accuracy', cv=5, n_iter=3)


In [10]:
svc_random_search.fit(df_train_X_scaled, df_train_y)

In [8]:
lgb_random_search.fit(df_train_X_scaled, df_train_y)

[LightGBM] [Info] Number of positive: 18236, number of negative: 4580
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.206745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 22816, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.799264 -> initscore=1.381699
[LightGBM] [Info] Start training from score 1.381699
[LightGBM] [Info] Number of positive: 18237, number of negative: 4579
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.163249 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 22816, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.799308 -> initscore=1.381972
[LightGBM] [Info] Start training from score 1.381972
[LightGBM] [

In [9]:
catboost_random_search.fit(df_train_X_scaled, df_train_y)

0:	learn: 0.6857868	total: 200ms	remaining: 19.8s
1:	learn: 0.6779154	total: 240ms	remaining: 11.8s
2:	learn: 0.6708618	total: 290ms	remaining: 9.36s
3:	learn: 0.6632276	total: 336ms	remaining: 8.07s
4:	learn: 0.6562227	total: 407ms	remaining: 7.73s
5:	learn: 0.6494034	total: 465ms	remaining: 7.28s
6:	learn: 0.6408376	total: 535ms	remaining: 7.11s
7:	learn: 0.6344453	total: 593ms	remaining: 6.82s
8:	learn: 0.6277927	total: 668ms	remaining: 6.75s
9:	learn: 0.6213985	total: 729ms	remaining: 6.56s
10:	learn: 0.6139827	total: 805ms	remaining: 6.51s
11:	learn: 0.6068479	total: 882ms	remaining: 6.47s
12:	learn: 0.6007726	total: 946ms	remaining: 6.33s
13:	learn: 0.5938725	total: 1.03s	remaining: 6.31s
14:	learn: 0.5876833	total: 1.08s	remaining: 6.15s
15:	learn: 0.5816562	total: 1.17s	remaining: 6.12s
16:	learn: 0.5762849	total: 1.23s	remaining: 6s
17:	learn: 0.5707072	total: 1.31s	remaining: 5.98s
18:	learn: 0.5653235	total: 1.37s	remaining: 5.84s
19:	learn: 0.5601851	total: 1.45s	remaining:

In [7]:
xgb_random_search.fit(df_train_X_scaled, df_train_y)

  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


In [11]:
svc_test_score = svc_random_search.score(df_valid_X_scaled, df_valid_y)
lgb_test_score = lgb_random_search.score(df_valid_X_scaled, df_valid_y)
catboost_test_score = catboost_random_search.score(df_valid_X_scaled, df_valid_y)
xgb_test_score = xgb_random_search.score(df_valid_X_scaled, df_valid_y)



In [12]:
print("SVC: ", svc_random_search.best_params_)
print("LGB: ", lgb_random_search.best_params_)
print("CB: ", catboost_random_search.best_params_)
print("XGB: ", xgb_random_search.best_params_)

SVC:  {'kernel': 'rbf', 'gamma': 0.001, 'C': 10}
LGB:  {'n_estimators': 100, 'min_child_samples': 10, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 0.6}
CB:  {'learning_rate': 0.1, 'l2_leaf_reg': 5, 'iterations': 300, 'depth': 5, 'colsample_bylevel': 0.6}
XGB:  {'subsample': 0.7, 'reg_lambda': 0, 'reg_alpha': 0.1, 'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.2, 'gamma': 0, 'colsample_bytree': 0.7}


In [13]:
svc_model = SVC(**svc_random_search.best_params_)
lgb_model = LGBMClassifier(**lgb_random_search.best_params_)
cb_model = CatBoostClassifier(**catboost_random_search.best_params_)
xgb_model = XGBClassifier(**xgb_random_search.best_params_)

In [14]:
base_models = [
    ('svc', svc_model),
    ('lgbm', lgb_model),
    ('catboost', cb_model),
    ('xgb', xgb_model)
]

meta_estimator = LogisticRegression(max_iter=1000)

In [15]:
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_estimator)

In [16]:
stacking_classifier.fit(df_train_X_scaled, df_train_y)

[LightGBM] [Info] Number of positive: 22796, number of negative: 5724
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.234877 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 28520, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.799299 -> initscore=1.381917
[LightGBM] [Info] Start training from score 1.381917
0:	learn: 0.6151751	total: 69.7ms	remaining: 20.8s
1:	learn: 0.5534931	total: 153ms	remaining: 22.8s
2:	learn: 0.5003241	total: 227ms	remaining: 22.4s
3:	learn: 0.4493972	total: 311ms	remaining: 23s
4:	learn: 0.4142938	total: 396ms	remaining: 23.4s
5:	learn: 0.3766240	total: 470ms	remaining: 23s
6:	learn: 0.3535409	total: 546ms	remaining: 22.8s
7:	learn: 0.3334622	total: 623ms	remaining: 22.7s
8:	learn: 0.3130682	total: 695ms	remaining: 22.5s
9:	learn: 0.2995405	total: 768ms	remaining: 22.3s
10:	learn

In [17]:
stacking_preds = stacking_classifier.predict(df_valid_X_scaled)




In [18]:
acs = accuracy_score(df_valid_y, stacking_preds)
print("Accuracy Score: ", acs)

Accuracy Score:  0.996


In [22]:
test_data = pd.read_csv('layer_9_test.csv')
test_data.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
df_test_X_scaled = scaler.transform(test_data)

In [23]:
y_pred_test = stacking_classifier.predict(df_test_X_scaled)
predictions_df = pd.DataFrame({'Predictions': y_pred_test})
predictions_df.to_csv('predictions_label_1_layer9.csv', index=False)

