In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.model_selection as ms
import warnings
warnings.simplefilter("ignore", UserWarning)
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from collections import namedtuple
import sys
np.set_printoptions(threshold=sys.maxsize)

## Introduction
My approach to this month’s challenge was to stack three gradient-boosting models: LightGBM GBDT, LightGBM GOSS, and CatBoost. Each model was trained with 10-fold cross-validation, and its per-fold test predictions were averaged using weights proportional to each fold’s validation AUC. The out-of-fold predictions on the training data were collected along with these weighted test predictions, and both were then passed into a secondary (meta) model.

## Data Preparation
The function below is the primary data preparation step. It compresses the dataset read by pandas by downcasting each numeric column to a smaller dtype that still covers the column's range of values.

In [None]:

def reduce_df(df):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Signed-integer columns are tried against int8, int16, int32 and int64;
    floating-point columns against float16, float32 and float64. The first
    candidate whose representable range contains both the column minimum and
    maximum (limits inclusive) is applied. Columns that are not plain NumPy
    signed-integer or float dtypes are left untouched. Memory usage before
    and after is printed (bytes / 1024**2).

    Note: only the value *range* is checked, never the precision. A float
    column that ends up as float16 keeps roughly three significant decimal
    digits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its columns downcast where possible.
    """
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32, np.float64)

    print(f"orginal dataset :{df.memory_usage().sum() / 1024 ** 2} mb")
    for name in df.columns:
        col_type = df[name].dtypes

        # Restrict to real NumPy dtypes so that extension dtypes whose name
        # merely starts with "i"/"f" (e.g. "interval[...]") are skipped.
        if not isinstance(col_type, np.dtype):
            continue
        if col_type.kind == "i":
            candidates, limits_of = int_candidates, np.iinfo
        elif col_type.kind == "f":
            candidates, limits_of = float_candidates, np.finfo
        else:
            continue

        col_min, col_max = np.min(df[name]), np.max(df[name])
        for candidate in candidates:
            limits = limits_of(candidate)
            # Inclusive bounds: a column spanning exactly -128..127 fits int8
            # and must not be pushed to the next wider type.
            if col_min >= limits.min and col_max <= limits.max:
                df[name] = df[name].astype(candidate)
                break

    print(f"dataset reduced to :{df.memory_usage().sum() / 1024 ** 2} mb")
    print()
    return df


In [None]:
# Training data: indexed by `id`, then downcast to save memory.
train = (
    pd.read_csv("../input/tabular-playground-series-oct-2021/train.csv")
    .set_index("id")
    .pipe(reduce_df)
)
# Test data: `id` stays a regular column because the submission file needs it.
test_o = pd.read_csv("../input/tabular-playground-series-oct-2021/test.csv").pipe(reduce_df)

In [None]:
# Feature matrix and label vector for training.
x = train.drop("target", axis=1)
y = train["target"]

# Test features only; the ids remain available in `test_o`.
test = test_o.drop("id", axis=1)

# Number of cross-validation folds used at both stacking levels.
folds = 10


## Hyperparameters
All hyperparameters were found with Optuna. The search itself has been removed from this notebook because it takes a long time to run.

In [None]:


# LightGBM with the default boosting type.
gbdt_para = {
    "n_jobs": -1,
    "n_estimators": 847,
    "max_depth": 7,
    "learning_rate": 0.04617423130344099,
    "lambda_l1": 1.6029632425074436,
    "lambda_l2": 0.0010928490115681689,
    "num_leaves": 124,
    "min_child_samples": 93,
    "feature_fraction": 0.7917548593828119,
    "bagging_fraction": 0.96375720421119,
    "bagging_freq": 3,
}

# LightGBM with GOSS boosting.
goss_para = {
    "boosting_type": "goss",
    "n_jobs": -1,
    "n_estimators": 888,
    "max_depth": 3,
    "lambda_l1": 0.045547053858182196,
    "lambda_l2": 1.290891976923166,
    "num_leaves": 614,
    "min_child_samples": 261,
    "min_child_weight": 15.811750102552908,
}

# CatBoost.
cat_para = {
    "colsample_bylevel": 0.05606508594613661,
    "depth": 4,
    "learning_rate": 0.3840012528742531,
    "bootstrap_type": "Bernoulli",
    "subsample": 0.645075461245303,
}

# Unfitted estimators; each one is refitted on every fold later on.
gbdt_ = LGBMClassifier(**gbdt_para)
goss_ = LGBMClassifier(**goss_para)
cat_ = CatBoostClassifier(**cat_para)



## Step 1

In this step the meta data for the second-level model was generated using the 3 algorithms outlined in the introduction. Ten folds were used, and an AUC score was recorded for each fold. After the last fold, these scores were normalised and used as weights to average the per-fold test predictions. This process was repeated for all 3 models. The meta data (the out-of-fold predictions on the training set) was stored in a NumPy array, and the weighted test prediction of each model was stored in a second NumPy array.


In [None]:
# Registry of base learners. `ind` is the column each model owns in every
# per-model array allocated below.
models = namedtuple("models", "ind type fit_")
models_lst = [
    models(ind=0, type="gbdt", fit_=gbdt_),
    models(ind=1, type="goss", fit_=goss_),
    models(ind=2, type="cat", fit_=cat_),
]
n_models = len(models_lst)

df_split = ms.StratifiedKFold(n_splits=folds, shuffle=True, random_state=0)

# Out-of-fold predictions (x) and the matching labels (y), one column per model.
train_meta_x = np.zeros((len(train.index), n_models))
train_meta_y = np.zeros((len(train.index), n_models))
# Normalised per-fold weights and the raw per-fold validation AUC.
weights = np.zeros((folds, n_models))
fold_score = np.zeros((folds, n_models))

# Scratch buffer: test predictions of the current model, one column per fold.
fold_pred_cv = np.zeros((len(test.index), folds))
# AUC-weighted test prediction of each model.
fold_pred = np.zeros((len(test.index), n_models))

for m in models_lst:

    for counter, (trn, val) in enumerate(df_split.split(x, y)):

        mod_ = m.fit_.fit(x.iloc[trn, :].values, y.iloc[trn])
        # Predict from plain arrays, matching the input type used for fitting.
        meta_pred = mod_.predict_proba(x.iloc[val, :].values)[:, 1]
        fold_pred_cv[:, counter] = mod_.predict_proba(test.values)[:, 1]

        # Write each out-of-fold prediction at its original row position, so
        # the meta arrays stay aligned with `train` and the per-model columns
        # line up without relying on every model seeing identical splits.
        train_meta_x[val, m.ind] = meta_pred
        train_meta_y[val, m.ind] = y.iloc[val].values

        fold_score[counter, m.ind] = roc_auc_score(y.iloc[val], meta_pred)
        print(counter)

    # All folds are done for this model: turn the AUCs into weights that sum
    # to 1 and blend the per-fold test predictions with them.
    weights[:, m.ind] = fold_score[:, m.ind] / np.sum(fold_score[:, m.ind], axis=0)
    fold_pred[:, m.ind] = np.dot(fold_pred_cv, weights[:, m.ind])

In [None]:
# Raw validation AUC: one row per fold, one column per base model (column = model `ind`).
print(fold_score)

## Step 2

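Below is the final step. The meta data is split into 10 folds and a second-level model is trained on each fold. Its predictions for the test set are then averaged with weights proportional to each fold's validation AUC, so that folds with a low AUC will have a lower effect than folds with a higher AUC score.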


In [None]:
# Buffers for the second-level ensemble.
n_test_rows = len(fold_pred)
final_data = np.zeros(shape=(n_test_rows, 1))    # blended test prediction
meta_pred = np.zeros(shape=(n_test_rows, folds)) # one column of test predictions per fold
score_weight = np.zeros(shape=(folds, 1))        # validation AUC of each fold


In [None]:


import sklearn

# SGDClassifier's logistic loss was renamed from "log" to "log_loss" in
# scikit-learn 1.1, and the old name was removed in 1.3. Pick whichever
# spelling the installed version accepts.
_sk_version = tuple(int(part) for part in sklearn.__version__.split(".")[:2])
sgd_logistic_loss = "log_loss" if _sk_version >= (1, 1) else "log"

df_split = ms.StratifiedKFold(n_splits=folds, shuffle=True, random_state=45)
for counter, (trn, val) in enumerate(df_split.split(train_meta_x, train_meta_y[:, 0])):
    # Fixed seed so the SGD fit gives the same result on every run.
    model = SGDClassifier(max_iter=10000, loss=sgd_logistic_loss, random_state=45)
    model.fit(train_meta_x[trn, :], train_meta_y[trn, 0])
    # Test-set prediction of this fold's meta-model.
    meta_pred[:, counter] = model.predict_proba(fold_pred)[:, 1]
    # The held-out AUC becomes this fold's (unnormalised) blending weight.
    pred = model.predict_proba(train_meta_x[val, :])[:, 1]
    score_weight[counter] = roc_auc_score(train_meta_y[val, 0], pred)

# Normalise the weights to sum to 1 and blend the per-fold test predictions.
score_weight = score_weight / np.sum(score_weight, axis=0)
final_data = np.dot(meta_pred, score_weight)



## Output

In [None]:

# Pair every test id with its blended prediction (row-aligned on the index)
# and write the submission file.
submission_ids = test_o[["id"]]
blended_pred = pd.DataFrame(final_data)
final = submission_ids.merge(blended_pred, left_index=True, right_index=True)

final.columns = ["id", "target"]
final.to_csv("sub.csv", index=False)
