In [1]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, precision_score, recall_score

import matplotlib.pyplot as plt

In [3]:
df_train = pd.read_csv("layer_10_train.csv")
df_valid = pd.read_csv("layer_10_valid.csv")

In [4]:
labels = ['label_1', 'label_2', 'label_3', 'label_4']

df_train_X = df_train.copy()
df_train_X.drop(['label_1', 'label_2', 'label_4'], axis=1, inplace=True)
df_train_y = df_train_X.pop('label_3')

df_valid_X = df_valid.copy()
df_valid_X.drop(['label_1', 'label_2', 'label_4'], axis=1, inplace=True)
df_valid_y = df_valid_X.pop('label_3')

In [5]:
scaler = StandardScaler()

df_train_X_scaled = scaler.fit_transform(df_train_X)
df_valid_X_scaled = scaler.transform(df_valid_X)

In [6]:
# For SVC
svc_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf'],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]
}

# For LightGBM
lgb_param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'min_child_samples': [10, 20, 30],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# For CatBoost
catboost_param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [100, 200, 300],
    'depth': [3, 4, 5],
    'l2_leaf_reg': [1, 3, 5],
    'colsample_bylevel': [0.6, 0.8, 1.0]
}

# For XGB
xgb_param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 2, 3],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.1, 0.2],
    'reg_lambda': [0, 0.1, 0.2]
}


In [7]:
# For SVC
# svc_grid_search = GridSearchCV(SVC(), svc_param_grid, scoring='accuracy', cv=5)
svc_random_search = RandomizedSearchCV(SVC(), svc_param_grid, scoring='accuracy', cv=5, n_iter=3)

# For LightGBM
# lgb_grid_search = GridSearchCV(LGBMClassifier(), lgb_param_grid, scoring='accuracy', cv=5)
lgb_random_search = RandomizedSearchCV(LGBMClassifier(), lgb_param_grid, scoring='accuracy', cv=5, n_iter=3)

# For CatBoost
# catboost_grid_search = GridSearchCV(CatBoostClassifier(), catboost_param_grid, scoring='accuracy', cv=5)
catboost_random_search = RandomizedSearchCV(CatBoostClassifier(), catboost_param_grid, scoring='accuracy', cv=5, n_iter=3)

# For XGB
xgb_random_search = RandomizedSearchCV(XGBClassifier(), xgb_param_grid, scoring='accuracy', cv=5, n_iter=3)


In [8]:
svc_random_search.fit(df_train_X_scaled, df_train_y)

In [9]:
lgb_random_search.fit(df_train_X_scaled, df_train_y)

[LightGBM] [Info] Number of positive: 18236, number of negative: 4580
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.223000 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 22816, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.799264 -> initscore=1.381699
[LightGBM] [Info] Start training from score 1.381699
[LightGBM] [Info] Number of positive: 18237, number of negative: 4579
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.171375 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 22816, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.799308 -> initscore=1.381972
[LightGBM] [Info] Start training from score 1.381972
[LightGBM] [

In [10]:
catboost_random_search.fit(df_train_X_scaled, df_train_y)

0:	learn: 0.6512301	total: 220ms	remaining: 43.8s
1:	learn: 0.6136047	total: 288ms	remaining: 28.5s
2:	learn: 0.5742183	total: 360ms	remaining: 23.7s
3:	learn: 0.5430568	total: 439ms	remaining: 21.5s
4:	learn: 0.5136352	total: 520ms	remaining: 20.3s
5:	learn: 0.4850022	total: 603ms	remaining: 19.5s
6:	learn: 0.4602760	total: 685ms	remaining: 18.9s
7:	learn: 0.4436632	total: 767ms	remaining: 18.4s
8:	learn: 0.4239925	total: 856ms	remaining: 18.2s
9:	learn: 0.4078203	total: 941ms	remaining: 17.9s
10:	learn: 0.3928612	total: 1.02s	remaining: 17.6s
11:	learn: 0.3777405	total: 1.11s	remaining: 17.5s
12:	learn: 0.3628693	total: 1.2s	remaining: 17.3s
13:	learn: 0.3504228	total: 1.29s	remaining: 17.2s
14:	learn: 0.3365001	total: 1.38s	remaining: 17s
15:	learn: 0.3266135	total: 1.47s	remaining: 16.9s
16:	learn: 0.3158971	total: 1.55s	remaining: 16.7s
17:	learn: 0.3056826	total: 1.64s	remaining: 16.6s
18:	learn: 0.2963232	total: 1.73s	remaining: 16.5s
19:	learn: 0.2885306	total: 1.82s	remaining:

KeyboardInterrupt: 

In [None]:
xgb_random_search.fit(df_train_X_scaled, df_train_y)

In [None]:
svc_test_score = svc_random_search.score(df_valid_X_scaled, df_valid_y)
lgb_test_score = lgb_random_search.score(df_valid_X_scaled, df_valid_y)
catboost_test_score = catboost_random_search.score(df_valid_X_scaled, df_valid_y)
xgb_test_score = xgb_random_search.score(df_valid_X_scaled, df_valid_y)



In [None]:
print("SVC: ", svc_random_search.best_params_)
print("LGB: ", lgb_random_search.best_params_)
print("CB: ", catboost_random_search.best_params_)
print("XGB: ", xgb_random_search.best_params_)

In [None]:
svc_model = SVC(**svc_random_search.best_params_)
lgb_model = LGBMClassifier(**lgb_random_search.best_params_)
cb_model = CatBoostClassifier(**catboost_random_search.best_params_)
xgb_model = XGBClassifier(**xgb_random_search.best_params_)

In [None]:
base_models = [
    ('svc', svc_model),
    ('lgbm', lgb_model),
    ('catboost', cb_model),
    ('xgb', xgb_model)
]

meta_estimator = LogisticRegression(max_iter=1000)

In [None]:
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=meta_estimator)

In [None]:
stacking_classifier.fit(df_train_X_scaled, df_train_y)

In [None]:
stacking_preds = stacking_classifier.predict(df_valid_X_scaled)




In [None]:
acs = accuracy_score(df_valid_y, stacking_preds)
print("Accuracy Score: ", acs)

Accuracy Score:  0.996


In [None]:
test_data = pd.read_csv('layer_10_test.csv')
test_data.drop(['label_1', 'label_2', 'label_3', 'label_4'], axis=1, inplace=True)
df_test_X_scaled = scaler.transform(test_data)

In [None]:
y_pred_test = stacking_classifier.predict(df_test_X_scaled)
predictions_df = pd.DataFrame({'Predictions': y_pred_test})
predictions_df.to_csv('predictions_label_3_layer10.csv', index=False)

