In [None]:
import pandas as pd
import numpy as np
import random
import time
import os

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

from tqdm import tqdm

from catboost import CatBoostRegressor, CatBoostClassifier
from xgboost import XGBRegressor, XGBClassifier
from lightgbm import LGBMClassifier, LGBMRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, QuantileTransformer
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.ensemble import StackingRegressor, StackingClassifier

import optuna
from optuna.samplers import TPESampler

In [None]:
# Global random seed; passed to seed_everything() so that runs are repeatable.
SEED = 2021
# Name of the label column; used as the `hue` when plotting class-split distributions.
TARGET = "target"

In [None]:
def seed_everything(seed=2021):
    """Seed the global random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports the same value as the ``PYTHONHASHSEED`` environment variable.

    Parameters
    ----------
    seed : int, default 2021
        Value handed to every seeding call.

    Notes
    -----
    NOTE(review): ``PYTHONHASHSEED`` is presumably only read at interpreter
    start-up, so the export likely affects child processes rather than the
    running kernel -- confirm before relying on it.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Seed the global RNGs once, up front, with the notebook-wide SEED.
seed_everything(SEED)

# Using parquet format... much faster!

Reference notebook: https://www.kaggle.com/data2science/save-some-time-switch-to-parquet/notebook?kernelSessionId=76100662

In [None]:
# Directory holding the three parquet files read below; defined once so that
# pointing the notebook at another location is a single edit.
DATA_DIR = "../input/tps-10-21-dataset-parquet"

# X / y / X_test are presumably the training features, the training target and
# the test features -- inferred from the file names, confirm against the dataset.
X = pd.read_parquet(f"{DATA_DIR}/X.parquet")
y = pd.read_parquet(f"{DATA_DIR}/y.parquet")
X_test = pd.read_parquet(f"{DATA_DIR}/X_test.parquet")

## EDA

In [None]:
# Preview the first 10 rows of X.
X.head(10)

* scaling seems OK: values lie between 0 and 1
* features "f22", "f43", and those from "f242" onwards have a median equal to 0 or 1 -> binary features?

In [None]:
# Summary statistics with one row per feature, styled for quick scanning:
# bars on the mean, a red gradient on the std, a diverging gradient on the median.
(
    X.describe()
    .T
    .style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

* binary features

In [None]:
# A column counts as binary when every one of its values equals 0 or 1.
binary_features = [
    col
    for col in X.columns
    if ((X[col] == 0) | (X[col] == 1)).all()
]

print(f'Number of binary features : {len(binary_features)}')
if binary_features:
    print(binary_features)

* No missing values

In [None]:
# Count NaNs per column once, then reuse the table for the total and the display.
missing_per_col = pd.DataFrame(X.isna().sum(), columns=["# missing values"])
print("Total of missing values:", missing_per_col.sum().values)
missing_per_col.T

* target distribution: even distribution (pokeball style 😊)

In [None]:
# Relative frequency of each distinct row of y.
y_cnt = y.value_counts(normalize=True)
# The index entries are indexable keys; keep only their first element as the label.
y_cnt.index = [key[0] for key in y_cnt.index]
y_cnt.plot.pie(label="ratio", colors=['red', 'snow'])
plt.show()

* features distribution

Some features might be categorical: f10, f18, ...

In [None]:
# Non-binary features: every column of X that was not flagged as binary.
binary_set = set(binary_features)
features_non_bin = [col for col in X.columns if col not in binary_set]

# Plot on a 10% sample of the rows. The sample is seeded explicitly so the
# figure does not depend on how much of the global RNG earlier cells consumed.
df_frac = pd.concat((X[features_non_bin], y), axis=1).sample(frac=0.1, random_state=SEED)

# 8 panels per row, rounded up; at least one row so the grid is never empty.
n_cols = 8
n_raw = max(1, -(-len(features_non_bin) // n_cols))
fig, axs = plt.subplots(n_raw, n_cols, figsize=(18, 2*n_raw))
axes = axs.flatten()

# One density plot per feature, split by TARGET. Iterate over the feature list
# rather than df_frac.columns: the latter also holds the column(s) concatenated
# from y, which must not be drawn as if they were features.
for f, ax in zip(tqdm(features_non_bin), axes):
    sns.kdeplot(data=df_frac, x=f, hue=TARGET, ax=ax, legend=False)
    ax.set_title(f)
    ax.set_yticks([])
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.spines[['left', 'top', 'right']].set_visible(False)

# Blank out the panels left over at the end of the last row.
for ax in axes[len(features_non_bin):]:
    ax.set_xticks([])
    ax.set_yticks([])
    ax.spines[:].set_visible(False)

fig.tight_layout()

* binary features (f22 seems promising ...)

In [None]:
# Plot on a 10% sample of the rows. The sample is seeded explicitly so the
# figure does not depend on how much of the global RNG earlier cells consumed.
df_frac = pd.concat((X[binary_features], y), axis=1).sample(frac=0.1, random_state=SEED)

# 8 panels per row, rounded up; at least one row so the grid is never empty.
n_cols = 8
n_raw = max(1, -(-len(binary_features) // n_cols))
fig, axs = plt.subplots(n_raw, n_cols, figsize=(18, 2*n_raw))
axes = axs.flatten()

# One count plot per binary feature, with bars split by TARGET.
for f, ax in zip(tqdm(binary_features), axes):
    sns.countplot(data=df_frac, x=f, ax=ax, hue=TARGET)
    # get_legend() returns None when no legend was drawn; only remove a real one.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()
    ax.set_title(f)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.spines[['left', 'top', 'right']].set_visible(False)

# Blank out the panels left over at the end of the last row.
for ax in axes[len(binary_features):]:
    ax.set_xticks([])
    ax.set_yticks([])
    ax.spines[:].set_visible(False)

fig.tight_layout()