In [1]:
import os
import zipfile
from pathlib import Path

import kaggle
import numpy as np
import optuna
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, PowerTransformer, StandardScaler
from xgboost import XGBClassifier


# Lift pandas' display limits: setting both options to None removes the cap on
# the number of columns shown and on the width of each cell.
for _display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

# Non-empty when the KAGGLE_KERNEL_RUN_TYPE environment variable is set
# (presumably only inside a Kaggle kernel, where ../input is already populated).
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

# The same relative layout is used on Kaggle and locally, so the paths do not
# depend on `iskaggle`; only the download step does.
path = Path('../input/playground-series-s3e23')
orig_path = Path('../input/software-defect-prediction')

# "<owner>/<dataset-slug>" identifier of the original dataset.
# NOTE(review): the owner prefix is an assumption — confirm the slug on Kaggle.
ORIG_DATASET = f'semustafacevik/{orig_path.name}'

if not iskaggle:
    if not path.exists():
        # The competition slug is the directory name, not the whole relative
        # path; download the archive next to the target directory so the
        # extraction step can find it.
        path.parent.mkdir(parents=True, exist_ok=True)
        kaggle.api.competition_download_cli(path.name, path=str(path.parent))
        with zipfile.ZipFile(path.parent / f'{path.name}.zip') as archive:
            archive.extractall(path)
    if not orig_path.exists():
        # The original data is a Kaggle *dataset*, not a competition, and it
        # must be extracted into `orig_path` (the previous code re-downloaded
        # the competition archive here and never created `orig_path`).
        orig_path.parent.mkdir(parents=True, exist_ok=True)
        kaggle.api.dataset_download_cli(ORIG_DATASET, path=str(orig_path.parent))
        with zipfile.ZipFile(orig_path.parent / f'{orig_path.name}.zip') as archive:
            archive.extractall(orig_path)

# --- Run configuration -------------------------------------------------------
# Seed NumPy's global RNG right where the seed is defined.
SEED = 42
np.random.seed(SEED)

# Not referenced in the visible cells; presumably leftovers from the
# cross-validation / tuning stage (TODO confirm).
N_TRIALS = 30
SPLITS = 5

# The submission file name embeds the version number.
VERSION = 2
OUTPUT = f'ensemble_1_clf_submisson_v{VERSION}.csv'

  from .autonotebook import tqdm as notebook_tqdm




In [2]:
# Competition files: the `id` column becomes the index of both frames.
train = pd.read_csv(path.joinpath('train.csv'), index_col='id')
test = pd.read_csv(path.joinpath('test.csv'), index_col='id')

# Original dataset: read as-is, keeping the default integer index.
orig_train = pd.read_csv(orig_path.joinpath('jm1.csv'))

In [3]:
# Object-dtype columns of the original data are converted to float64, with the
# '?' placeholder treated as a missing value (NaN).
object_columns = orig_train.columns[orig_train.dtypes == 'O'].tolist()
for column_name in object_columns:
    without_placeholder = orig_train[column_name].replace({'?' : np.nan})
    orig_train[column_name] = without_placeholder.astype('float64')

In [4]:
# Stack the competition and original training rows, then split the `defects`
# target off from the feature columns.
combined_train = pd.concat([train, orig_train])
y = combined_train['defects']
X = combined_train.drop(columns='defects')

# HistGradientBoostingClassifier

In [5]:
# Fixed hyperparameters for the histogram gradient-boosting model
# (presumably the result of an earlier tuning run — origin not shown here).
hist_grad_boost_clf_params = dict(
    random_state=42,
    learning_rate=0.010212498830869023,
    l2_regularization=53,
    max_iter=5167,
    max_leaf_nodes=24,
    min_samples_leaf=300,
    max_bins=186,
    class_weight='balanced',
)

# Preprocessing chain: impute -> log1p -> standardise -> classifier.
hist_grad_boost_clf_pipeline = make_pipeline(
    SimpleImputer(),
    FunctionTransformer(func=np.log1p, validate=False),
    StandardScaler(),
    HistGradientBoostingClassifier(**hist_grad_boost_clf_params),
)

# Hard-coded score, used further down to weight this model in the ensemble.
# NOTE(review): presumably a cross-validation metric from a previous run — confirm.
hist_grad_boost_clf_score = 0.7861680221392143

# XGBClassifier

In [6]:
# Fixed hyperparameters for XGBoost. A dict literal is required here because
# 'lambda' is a Python keyword and cannot be written as a keyword argument.
xg_boost_clf_params = {
    'n_jobs': -1,
    'random_state': 42,
    'lambda': 0.04066835180694219,
    'alpha': 0.04259428467448357,
    'colsample_bytree': 1.0,
    'subsample': 0.6,
    'learning_rate': 0.02,
    'n_estimators': 400,
    'max_depth': 5,
    'min_child_weight': 80,
}

# Preprocessing chain: impute -> log1p -> power transform -> classifier.
_xg_boost_steps = [
    SimpleImputer(),
    FunctionTransformer(func=np.log1p, validate=False),
    PowerTransformer(),
    XGBClassifier(**xg_boost_clf_params),
]
xg_boost_clf_pipeline = make_pipeline(*_xg_boost_steps)

# Hard-coded score, used further down to weight this model in the ensemble.
# NOTE(review): presumably a cross-validation metric from a previous run — confirm.
xg_boost_clf_score = 0.7863815140733784

# CatBoostClassifier

In [7]:
# Fixed hyperparameters for CatBoost; `verbose=False` silences training logs.
cat_boost_clf_params = dict(
    n_estimators=647,
    learning_rate=0.062126136754221545,
    depth=3,
    l2_leaf_reg=0.1675463052495161,
    model_size_reg=0.003576422478697353,
    random_strength=4.2456958775047355e-07,
    subsample=0.6625232716854229,
    verbose=False,
)

# Preprocessing chain: impute -> log1p -> power transform -> classifier.
cat_boost_clf_pipeline = make_pipeline(
    SimpleImputer(),
    FunctionTransformer(func=np.log1p, validate=False),
    PowerTransformer(),
    CatBoostClassifier(**cat_boost_clf_params),
)

# Hard-coded score, used further down to weight this model in the ensemble.
# NOTE(review): presumably a cross-validation metric from a previous run — confirm.
cat_boost_clf_score = 0.7858636509474153

# ExtraTreesClassifier

In [8]:
# Fixed hyperparameters for the extra-trees model.
extra_trees_clf_params = dict(
    n_estimators=500,
    n_jobs=-1,
    random_state=42,
    max_depth=12,
    min_samples_split=101,
    min_samples_leaf=47,
    criterion='gini',
    max_features=1.0,
)

# Features removed for this model only; every other column passes through.
# Selecting by column name means the pipeline must be fed a DataFrame.
_extra_trees_dropped_columns = [
    'iv(g)', 't', 'b', 'n',
    'lOCode', 'v', 'branchCount',
    'e', 'i', 'lOComment',
]
_extra_trees_column_filter = ColumnTransformer(
    [('drop', 'drop', _extra_trees_dropped_columns)],
    remainder='passthrough',
)

# Chain: drop columns -> impute -> log1p -> power transform -> classifier.
extra_trees_clf_pipeline = make_pipeline(
    _extra_trees_column_filter,
    SimpleImputer(),
    FunctionTransformer(func=np.log1p, validate=False),
    PowerTransformer(),
    ExtraTreesClassifier(**extra_trees_clf_params),
)

# Hard-coded score, used further down to weight this model in the ensemble.
# NOTE(review): presumably a cross-validation metric from a previous run — confirm.
extra_trees_clf_score = 0.785717273334426

# Voting Classifier

In [9]:
# (name, pipeline) pairs for the ensemble; `model_scores` below must list the
# scores in exactly this order.
models = [
    ('HistGradientBoostingClassifier', hist_grad_boost_clf_pipeline),
    ('XGBClassifier', xg_boost_clf_pipeline),
    ('CatBoostClassifier', cat_boost_clf_pipeline),
    ('ExtraTreesClassifier', extra_trees_clf_pipeline),
]

model_scores = [
    hist_grad_boost_clf_score,
    xg_boost_clf_score,
    cat_boost_clf_score,
    extra_trees_clf_score,
]
total_score = np.sum(model_scores)

# Each weight is the model's share of the summed scores, so the weights add up
# to 1. The four scores differ by less than 0.001, so every weight is close
# to 0.25.
model_weights = [model_score / total_score for model_score in model_scores]

In [10]:
# Soft voting combines the pipelines' predicted probabilities using the
# score-derived weights; the ensemble is fitted on the full training data.
voting_clf = VotingClassifier(
    estimators=models,
    voting='soft',
    weights=model_weights,
)
voting_clf.fit(X, y)

In [11]:
# Work on a copy so `test` itself never gains the prediction column.
submission = test.copy()

# Column 1 of predict_proba is stored as the prediction
# (presumably the probability of the positive `defects` class — confirm
# against voting_clf.classes_).
class_probabilities = voting_clf.predict_proba(submission)
submission['defects'] = class_probabilities[:, 1]

# Only the index and the `defects` column are written out.
submission['defects'].to_csv(OUTPUT)