In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import itertools
import matplotlib.pyplot as plt
import functools

from sklearn.preprocessing import LabelEncoder, StandardScaler

import warnings
warnings.simplefilter('ignore')

from sklearn import model_selection
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.svm import SVC

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

from mlxtend.classifier import StackingCVClassifier
import shap
import optuna

In [2]:
RANDOM_SEED = 2021
PROBAS = True
FOLDS = 5
N_ESTIMATORS = 1000

TARGET = 'target'

In [3]:
train = pd.read_csv('../input/tabular-playground-series-may-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-may-2021/test.csv')
submission = pd.read_csv('../input/tabular-playground-series-may-2021/sample_submission.csv')

# Pseudo labels taken from great BIZEN notebook: https://www.kaggle.com/hiro5299834/tps-apr-2021-pseudo-labeling-voting-ensemble
# pseudo_labels = pd.read_csv("../input/sub-top/sub_top.csv")
# test[TARGET] = pseudo_labels[TARGET]

In [4]:
all_df = pd.concat([train, test], axis=0).drop(['id', 'target'], axis=1)

In [5]:
def label_encoder(c):
    le = LabelEncoder()
    return le.fit_transform(c)

scaler = StandardScaler()

train['target'] = label_encoder(train['target'])

for col in all_df.columns:
    all_df[col] = label_encoder(all_df[col])

for col in all_df.columns:
    all_df[col] = np.log1p(all_df[col])

In [6]:
train_df = all_df[:len(train)]
train_df['target'] = train['target']
test_df = all_df[len(train):]
all_df = train_df

In [7]:
X = all_df.drop([TARGET], axis = 1)
y = all_df[TARGET]

print (f'X:{X.shape} y: {y.shape} \n')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = RANDOM_SEED)
print (f'X_train:{X_train.shape} y_train: {y_train.shape}')
print (f'X_test:{X_test.shape} y_test: {y_test.shape}')

X:(100000, 50) y: (100000,) 

X_train:(90000, 50) y_train: (90000,)
X_test:(10000, 50) y_test: (10000,)


In [8]:
test = test_df
print (f'test:{test.shape}')

test:(50000, 50)


In [9]:
def opt2(X_train, y_train, X_test, y_test, trial):
    #param_list
    iterations = trial.suggest_int('iterations', 50, 300)                       
    depth = trial.suggest_int('depth', 4, 10)                                   
    learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.3)               
    random_strength = trial.suggest_int('random_strength', 0, 100)                     
    bagging_temperature = trial.suggest_loguniform('bagging_temperature', 0.01, 100.00)
#     learning_rate = trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
    od_type = trial.suggest_categorical('od_type', ['IncToDec', 'Iter'])


    cat_model = CatBoostClassifier(
        loss_function="Logloss",
        task_type="CPU",
        l2_leaf_reg=50,
        random_seed=2021,
        border_count=64,
        iterations = iterations,
        depth = depth,
        learning_rate = learning_rate,
        random_strength = random_strength,
        bagging_temperature = bagging_temperature,
        od_type = od_type,
        objective = 'MultiClass',
        verbose=False,
        classes_count = 4
    )        
    cat_model.fit(X_train, y_train)
    cat_tuna_pred_test = cat_model.predict_proba(X_test)
    
    return (log_loss(y_test, cat_tuna_pred_test))

In [10]:
study_cat = optuna.create_study()
study_cat.optimize(functools.partial(opt2, X_train, y_train, X_test, y_test), n_trials=100)

[32m[I 2021-05-06 05:13:48,805][0m A new study created in memory with name: no-name-d7ac1ba3-4335-47b0-b5cb-124bf07406c3[0m
[32m[I 2021-05-06 05:15:40,714][0m Trial 0 finished with value: 1.0848341859927393 and parameters: {'iterations': 278, 'depth': 10, 'learning_rate': 0.10543595551302834, 'random_strength': 41, 'bagging_temperature': 16.328623700406315, 'od_type': 'IncToDec'}. Best is trial 0 with value: 1.0848341859927393.[0m
[32m[I 2021-05-06 05:16:31,469][0m Trial 1 finished with value: 1.1059776204065426 and parameters: {'iterations': 128, 'depth': 10, 'learning_rate': 0.020010562125706045, 'random_strength': 82, 'bagging_temperature': 0.332809466620475, 'od_type': 'Iter'}. Best is trial 0 with value: 1.0848341859927393.[0m
[32m[I 2021-05-06 05:16:49,298][0m Trial 2 finished with value: 1.10371021301156 and parameters: {'iterations': 283, 'depth': 4, 'learning_rate': 0.01452255172978548, 'random_strength': 93, 'bagging_temperature': 0.7808986793225677, 'od_type': 'It

In [11]:
study_cat.best_params

{'iterations': 273,
 'depth': 4,
 'learning_rate': 0.25419918125985175,
 'random_strength': 65,
 'bagging_temperature': 0.14944250141671522,
 'od_type': 'IncToDec'}

In [12]:
cat = CatBoostClassifier(loss_function="Logloss",
        task_type="GPU",
        l2_leaf_reg=50,
        random_seed=2021,
        border_count=64, 
        objective = 'MultiClass',
        verbose=False,
        classes_count = 4,
        **study_cat.best_params)
cat.fit(X, y)

<catboost.core.CatBoostClassifier at 0x7fabaa3b00d0>

In [13]:
cat_preds = cat.predict_proba(test)

In [14]:
submission[submission.columns[1:]] = cat_preds

In [15]:
submission.to_csv('cat_optuna.csv', index=False)