# IMPORTS

'pip install polars'

In [4]:
import numpy as np
import pandas as pd
#import polars as pl #Optimization for pandas
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

#For all combinations of list
from itertools import combinations

#For runtime calculation
from time import time

# Missing values visualization and  imputer
import missingno as msno
from sklearn.impute import KNNImputer, SimpleImputer

#To predict Gender from Name
from genderize import Genderize

#Multivariate Outlier Detection Algorithms
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

# Scaling
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    Normalizer,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler
)

# Categorical to Numerical transformation
from category_encoders import  (OneHotEncoder,
                                OrdinalEncoder,
                                BinaryEncoder,
                                CountEncoder,
                                TargetEncoder,
                                WOEEncoder,
                                CatBoostEncoder)


# Model Evaluation Metrics
from sklearn.metrics import (roc_auc_score,
                             classification_report,
                             accuracy_score,
                             precision_score,
                             recall_score,
                             f1_score)

#Visualizations of metrics 
from sklearn.metrics import (RocCurveDisplay,
                             ConfusionMatrixDisplay,
                             PrecisionRecallDisplay)

#Feature Selection
from sklearn.feature_selection import RFE,mutual_info_classif,SelectKBest,SelectPercentile,SelectFromModel

#Dimensionality Reduction
from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import Isomap,MDS, LocallyLinearEmbedding, SpectralEmbedding, TSNE

#Model Selection and CV
from sklearn.model_selection import train_test_split,cross_val_score,KFold
from sklearn.linear_model  import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,VotingClassifier
from sklearn.svm import SVC
from sklearn import neighbors
from sklearn.naive_bayes import GaussianNB

#from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
# Optuna
import optuna

#Pipeline Creation
#from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from imblearn import FunctionSampler
from sklearn.compose import ColumnTransformer

# To save models
import joblib

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 50)
from typing import Optional

# CUSTOM FUNCTIONS

## SELECT DTYPES

In [5]:
def checkNumCat(data,printVars=False,printClasses=False,needReturn=True):
    '''
    Print numerical, categorical, continuous and discrete variables and total classes of categories
    
    data => DataFrame
    
    printVars => Default True; True or False, to print numerical, categorical, continuous and discrete variables set True otherwise set False
    
    printClasses => Default True; True or False, to print number of classes of categorical variables set True otherwise set False
    
    needReturn => Default True; True or False, to return types of categories set True otherwise set False
    
    return : numerical_variables,continuous_variables,num_discrete_variables,cat_discrete_variables,binary_variables,categorical_variables
    '''
    numerical_variables = [feature for feature in data.columns if data[feature].dtype in ['int_', 'int8', 'int16', 'int32', 'int64', 'uint8', 'uint16',
                               'uint32', 'uint64','float_', 'float16', 'float32','float64']]
    continuous_variables =[feature for feature in numerical_variables if data[feature].nunique() > 25] 
    num_discrete_variables =[feature for feature in numerical_variables if ((data[feature].nunique() < 25) & (data[feature].nunique() > 2) )]     
    categorical_variables = [feature for feature in data.columns if data[feature].dtype in ['O','bool_','category']]
    cat_discrete_variables =[feature for feature in categorical_variables if ((data[feature].nunique() < 25) & (data[feature].nunique() > 2) )] 
    binary_variables = [feature for feature in data.columns if data[feature].nunique()==2]
    
    if printVars:    
        print('Number of numerical variables =>',len(numerical_variables),'\nNumerical Variables=>',numerical_variables)
        print('======================')    
        print('Number of continuous variables =>',len(continuous_variables),'\nContinuous Variables=>',continuous_variables)
        print('======================')    
        print('Number of numerical discrete variables =>',len(num_discrete_variables),'\nNumerical Discrete Variables=>',num_discrete_variables)
        print('======================')
        print('Number of categorical discrete variables =>',len(cat_discrete_variables),'\nCategorical Discrete Variables=>',cat_discrete_variables)
        print('======================')
        print('Number of binary variables =>',len(binary_variables),'\nBinary Variables=>',binary_variables)
        print('======================')  
        print('Number of categorical variables =>',len(categorical_variables),'\nCategorical Variables=>',categorical_variables)
    
        
    if printClasses:
        #Uniques only in categories and sum of them
        print('\n'*3,'*'*5,'Number of Unique in Categories','*'*5)
        df_uniqo = pd.DataFrame(data=data[categorical_variables].nunique(),columns=['Number_of_Uniques'])
        df_uniqo.loc[df_uniqo.shape[0]] = df_uniqo.sum()
        indexos = list(df_uniqo.index)
        indexos.remove(df_uniqo.shape[0]-1)
        indexos.append('Total_Number_of_Classes')
        df_uniqo = df_uniqo.set_axis(indexos)
        display(df_uniqo)
    if needReturn:
        return numerical_variables,continuous_variables,num_discrete_variables,cat_discrete_variables,binary_variables,categorical_variables

## FEATURE ENGINEERING CLASS

In [6]:
class FeatureEngineering():
    
    def __init__(self):
       pass
    def fit(self, X: pd.DataFrame, y=None):
        pass
    def fit_transform(self, X: pd.DataFrame, y=None):
        self.fit(X)
        return self.transform(X)
    def transform(self, X: pd.DataFrame, y=None):        
        X_new = X.copy()
        
        X_new['Group'] = X_new['PassengerId'].str.split('_').str[0].astype('uint16')
        X_new['GroupSize'] = X_new.groupby(['Group']).transform('size')
        X_new['GroupMeanAge'] = X_new.groupby(['Group'])['Age'].transform('mean')
        X_new['IsAlone'] = np.where(X_new['GroupSize']==1, 1, 0)
        
        X_new['LastName'] = X_new['Name'].str.split(' ').str[1]
        #X_new['FirstName'] = X_new['Name'].str.split(' ').str[0]
        X_new['FamilySize'] = X_new.groupby(['Group','LastName']).transform('size').astype('float32')
        X_new['HasFamily'] = np.where(X_new['FamilySize']>1,1,0).astype('uint8')
        X_new['FamilyName'] = np.where(X_new['HasFamily']==1,X_new['LastName']+'_Family','NoFamily')
        
        X_new['Expense'] = X_new[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1).astype('float32')
        X_new['HasExpense'] = np.where(X_new['Expense']>0,1,0).astype('uint8')
        X_new['GroupTotalExpense'] = X_new.groupby(['Group'])['Expense'].transform('sum')
        X_new['FamilyExpense'] = X_new.groupby(['Group','LastName'])['Expense'].transform('sum')
        X_new['NumAmenitiesUsed'] = X_new[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].gt(0).sum(axis=1)
        X_new['NecessityExpense'] = X_new[['RoomService', 'FoodCourt', 'ShoppingMall']].sum(axis=1)
        X_new['LuxuryExpense'] = X_new[['Spa', 'VRDeck']].sum(axis=1)
        
        X_new['CabinDeck'] = X_new['Cabin'].str.split('/').str[0].astype('category')
        X_new['CabinNum'] = X_new['Cabin'].str.split('/').str[1].astype('float32')
        X_new['CabinSide'] = X_new['Cabin'].str.split('/').str[2].astype('category')
        
        X_new['CabinGroupSize'] = X_new.groupby('Cabin')['PassengerId'].transform('count')
        
        X_new['IsMinor'] = np.where(X_new['Age']<18,1,0)
        
        bins = [0, 12, 18, 35, 60, 120]
        labels = ['Child', 'Teen', 'Young Adult', 'Adult', 'Senior']
        X_new['AgeBin'] = pd.cut(X_new['Age'], bins=bins, labels=labels,include_lowest=True).astype('category')
        
        groupLabels = ["GroupBin{}".format(i) for i in range(1,21)]
        X_new['GroupBin'] = pd.cut(X_new['Group'], bins=20, labels=groupLabels)
        
        #Combinations
        X_new['HomePlanet_Destination'] = X_new.apply(lambda row: row['HomePlanet'] + '->' + row['Destination'] 
                                                if pd.notna(row['HomePlanet']) and pd.notna(row['Destination']) 
                                                else np.nan, axis=1)
        #PassengerId,Cabin,Name,Surname
        #X_new.drop(columns=drop_cols,inplace=True)
        #display(pd.DataFrame(X_new,columns=X_new.columns))
        x_new_columns = list(X_new.select_dtypes(exclude=['category','object']).columns)+list(X_new.select_dtypes(include=['category','object']).columns)
        return pd.DataFrame(X_new,columns=x_new_columns)        

In [7]:
class FeatureRemover():
    # can use  in Column Transformer
    def __init__(self,df_columns,only_necessaries):
        self.df_columns=df_columns              
        self.only_necessaries = only_necessaries
    def fit(self, X: pd.DataFrame, y=None):
        pass
    def fit_transform(self, X: pd.DataFrame, y=None):
        self.fit(X)
        return self.transform(X)
    def transform(self, X: pd.DataFrame, y=None):   
        df = X.copy()        
        dicto = {}
        for i,cat in enumerate(self.df_columns):
            dicto.update({cat:i})
        if self.only_necessaries:
            features = ['PassengerId','Cabin','Name','LastName','FamilyName','CabinNum','Age','GroupMeanAge','VIP']
        else:
            features = ['PassengerId','Cabin','Name','LastName','FamilyName', 'HomePlanet_Destination' ,'CabinNum' ,'CabinGroupSize' ,'GroupMeanAge' ,'Age' ,'AgeBin' ,'IsAlone' ,'GroupSize' ,'CabinSide' ,'Destination' ,'IsMinor' ,'FamilySize' ,'GroupBin' ,'VIP' ,'GroupTotalExpense' ,'FamilyExpense'] 
        col_indexes =  [dicto[feature] for feature in self.df_columns if feature in features]
        #deleted_cols = [feature for feature in self.df_columns if feature in features]
        #print(col_indexes,deleted_cols)
        
        df = np.delete(df,col_indexes,axis=1)
        return df

In [11]:
numerical_columns = list(FeatureEngineering().fit_transform(X_train,y_train).select_dtypes(exclude=['category','object']).columns)
categorical_columns = list(FeatureEngineering().fit_transform(X_train,y_train).select_dtypes(include=['category','object']).columns)

In [12]:
dicto = {}
for i,cat in enumerate(numerical_columns+categorical_columns):
    dicto.update({cat:i})

In [13]:
[(dicto[feature],feature) for feature in numerical_columns+categorical_columns]

[(0, 'Age'),
 (1, 'RoomService'),
 (2, 'FoodCourt'),
 (3, 'ShoppingMall'),
 (4, 'Spa'),
 (5, 'VRDeck'),
 (6, 'Group'),
 (7, 'GroupSize'),
 (8, 'GroupMeanAge'),
 (9, 'IsAlone'),
 (10, 'FamilySize'),
 (11, 'HasFamily'),
 (12, 'Expense'),
 (13, 'HasExpense'),
 (14, 'GroupTotalExpense'),
 (15, 'FamilyExpense'),
 (16, 'NumAmenitiesUsed'),
 (17, 'NecessityExpense'),
 (18, 'LuxuryExpense'),
 (19, 'CabinNum'),
 (20, 'CabinGroupSize'),
 (21, 'IsMinor'),
 (22, 'PassengerId'),
 (23, 'HomePlanet'),
 (24, 'CryoSleep'),
 (25, 'Cabin'),
 (26, 'Destination'),
 (27, 'VIP'),
 (28, 'Name'),
 (29, 'LastName'),
 (30, 'FamilyName'),
 (31, 'CabinDeck'),
 (32, 'CabinSide'),
 (33, 'AgeBin'),
 (34, 'GroupBin'),
 (35, 'HomePlanet_Destination')]

In [312]:
np.delete(np.array(df_fe),[0,1],axis=1)

array([[False, 'B/0/P', 'TRAPPIST-1e', ..., 0, 'GroupBin1',
        'Europa->TRAPPIST-1e'],
       [False, 'F/0/S', 'TRAPPIST-1e', ..., 0, 'GroupBin1',
        'Earth->TRAPPIST-1e'],
       [False, 'A/0/S', 'TRAPPIST-1e', ..., 0, 'GroupBin1',
        'Europa->TRAPPIST-1e'],
       ...,
       [False, 'G/1500/S', 'TRAPPIST-1e', ..., 0, 'GroupBin20',
        'Earth->TRAPPIST-1e'],
       [False, 'E/608/S', '55 Cancri e', ..., 0, 'GroupBin20',
        'Europa->55 Cancri e'],
       [False, 'E/608/S', 'TRAPPIST-1e', ..., 0, 'GroupBin20',
        'Europa->TRAPPIST-1e']], dtype=object)

## RBI

In [8]:
class rule_based_imputation():
    from joblib import delayed,parallel
    def __init__(self):
       pass
    def fit(self, X: pd.DataFrame, y=None):
        pass
    def fit_transform(self, X: pd.DataFrame, y=None,printing=False):
        self.fit(X)
        return self.transform(X)
    def transform(self, X: pd.DataFrame, y=None,printing=False):
        df = X.copy()
        # rbi_cols must be pre-defined for every dataset
        rbi_cols=[('Group','HomePlanet'),
                ('Group','CabinSide'),
                ('Cabin','HomePlanet'),
                ('LastName','HomePlanet')]
        if printing:
            before_nan = df.isna().sum().sum()
            start_time = time()
            print('Number of NaN before RBI:',before_nan)
        
        df.loc[(df['GroupSize'] == 8), 'HomePlanet'] = 'Earth' 
        df.loc[(df['GroupSize'] == 8), 'VIP'] = False
        df.loc[(df['FamilySize'] == 7), 'VIP'] = False   
        df.loc[df['CryoSleep'] == True, ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']] = [0, 0, 0, 0, 0]
        df.loc[df['AgeBin'] == 'Child', ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']] = [0, 0, 0, 0, 0]
        df.loc[df['NumAmenitiesUsed'].isin([1, 2, 3, 4, 5]), 'CryoSleep'] = False  
        df.loc[(df['HasExpense'] == 1), 'CryoSleep'] = False
        #df.loc[df['CabinGroupSize'] == 8, 'VIP'] = False
        #df.loc[df['CabinGroupSize'] == 8, 'CabinDeck'] = 'G'
        #df.loc[df['CabinDeck'] == 'T', 'CryoSleep'] = False    
        df.loc[df['CabinDeck'] == 'G', 'VIP'] = False
        df.loc[df['AgeBin'] == 'Child', 'VIP'] = False
        df.loc[df['HomePlanet'] == 'Earth', 'VIP'] = False
        #df.loc[(df['HomePlanet_Destination'] == 'Mars->55 Cancri e'), 'VIP'] = False
        df.loc[df['CabinDeck'].isin(['A', 'B', 'C']), 'HomePlanet'] = 'Europa'
        df.loc[df['CabinDeck'] == 'G', 'HomePlanet'] = 'Earth'    
        
        if printing:    
            print('Middle Execution time:',time()-start_time)
        for col1,col2 in rbi_cols:  
            if printing:
                temp_start_time = time()
            #To check below codes, uncomment prints below. If after RBI, NaN numbers are very low be sure its correctness
            #print(col2,'Number of NaN before RBI:',df[col2].isna().sum())   
            col1_col2_null = df[[col1,col2]].loc[df[col2].isna()][col1].unique()
            col1df = df[[col1,col2]].loc[df[col1].isin(col1_col2_null)][[col1,col2]] # df includes col2 values given col1
            #col1df[col1].unique()
            for col1_value in col1df[col1].unique():
                if not pd.isna(col1_value):
                    col1_col2 = df.loc[df[col1] == col1_value, col2].mode()
                    if not col1_col2.empty:
                        df.loc[df[col1] == col1_value, col2] = col1_col2[0]
            if printing : 
                print(f'({col1},{col2}) Exe Time:',time()-temp_start_time)
            #print(col2,'Number of NaN after RBI:',df[col2].isna().sum())
    
        # Add here features created with combinations (If combinated columns in row , write this above rbi_cols processes or seperate 
        # combinated columns and others)
        df.loc[df['HomePlanet_Destination'].isna(),'HomePlanet_Destination'] = df.loc[df['HomePlanet_Destination'].isna()]\
                                                    .apply(lambda row: row['HomePlanet'] + '->' + row['Destination'] 
                                                    if pd.notna(row['HomePlanet']) and pd.notna(row['Destination']) 
                                                    else np.nan, axis=1)        
        if printing:
            end_time = time()
            after_nan = df.isna().sum().sum()
            print('Number of NaN after RBI:',after_nan)
            print('%',(1-after_nan/before_nan)*100,'(count:{})'.format(before_nan-after_nan),'of NaN filled with RBI !')
            print('Execution Time:',end_time-start_time)
        return df
    
        

In [19]:
rbi_pipe =Pipeline([
    ('feature_eng',FeatureEngineering()),
    ('rbi',rule_based_imputation()),
    ('feature_remover',FeatureRemover(numerical_columns))
])

In [20]:
pd.DataFrame(rbi_pipe.fit_transform(df_train)) 

[0, 7, 8, 9, 10, 14, 15, 19, 20, 21] ['Age', 'GroupSize', 'GroupMeanAge', 'IsAlone', 'FamilySize', 'GroupTotalExpense', 'FamilyExpense', 'CabinNum', 'CabinGroupSize', 'IsMinor']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,Maham Ofracculy,False,39.0,1,Ofracculy,0.0,0,0.0,0.0,0,0.0,0.0,B,0.0,P,1.0,0,Adult,GroupBin1,Europa->TRAPPIST-1e
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,44.0,Juanna Vines,True,24.0,1,Vines,736.0,1,736.0,736.0,5,143.0,593.0,F,0.0,S,1.0,0,Young Adult,GroupBin1,Earth->TRAPPIST-1e
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,49.0,Altark Susent,False,45.5,0,Susent,10383.0,1,15559.0,15559.0,4,3619.0,6764.0,A,0.0,S,2.0,0,Adult,GroupBin1,Europa->TRAPPIST-1e
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,193.0,Solam Susent,False,45.5,0,Susent,5176.0,1,15559.0,15559.0,4,1654.0,3522.0,A,0.0,S,2.0,0,Young Adult,GroupBin1,Europa->TRAPPIST-1e
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,2.0,Willy Santantines,True,16.0,1,Santantines,1091.0,1,1091.0,1091.0,5,524.0,567.0,F,1.0,S,1.0,1,Teen,GroupBin1,Earth->TRAPPIST-1e
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,A/98/P,55 Cancri e,41.0,True,74.0,Gravior Noxnuther,False,41.0,1,Noxnuther,8536.0,1,8536.0,8536.0,3,6819.0,1717.0,A,98.0,P,1.0,0,Adult,GroupBin20,Europa->55 Cancri e
8689,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,Kurta Mondalley,False,18.0,1,Mondalley,0.0,0,0.0,0.0,0,0.0,0.0,G,1499.0,S,1.0,0,Teen,GroupBin20,Earth->PSO J318.5-22
8690,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,Fayey Connon,True,26.0,1,Connon,1873.0,1,1873.0,1873.0,2,1872.0,1.0,G,1500.0,S,1.0,0,Young Adult,GroupBin20,Earth->TRAPPIST-1e
8691,Europa,False,E/608/S,55 Cancri e,32.0,False,3235.0,Celeon Hontichre,False,38.0,0,Hontichre,4637.0,1,9463.0,9463.0,3,1049.0,3588.0,E,608.0,S,2.0,0,Young Adult,GroupBin20,Europa->55 Cancri e


In [27]:
'Age' in (numerical_columns+categorical_columns)

True

In [44]:
rbi_pipe =Pipeline([
    ('feature_eng',FeatureEngineering()),
    ('rbi',rule_based_imputation()),
    ('feature_remover',FeatureRemover(categorical_columns))
])

In [48]:
df_train[list(df_train.select_dtypes(include=['object','category']))]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,VIP,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,False,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,False,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,True,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,False,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,False,Willy Santantines
...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,True,Gravior Noxnuther
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,False,Kurta Mondalley
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,False,Fayey Connon
8691,9280_01,Europa,False,E/608/S,55 Cancri e,False,Celeon Hontichre


In [47]:
pd.DataFrame(rbi_pipe.fit_transform()) 

KeyError: 'Column not found: Age'

## PIPELINE GENERATOR

In [9]:
def pipeline_generator(num_variables,cat_variables,feature_engineering,scaler_name,knn_imputer_n_neighbors,cat_encoder_name,dim_red_name,model_name,
                       dim_red_params=None,model_params=None,first_scaler='passthrough'):
    '''
    num_variables => Numerical variables [List]
    cat_variables => Categorical variables [List]
    feature_engineering => Feature engineering Function (Class Instance)
    dim_red_params => Dimensionality Reduction Algorithm's parameters {Dictionary}
    model_params => Model's parameters {Dictionary}
    first_scaler => 'passthrough' or 'bMinMax'  'str' 
    All names mean names 'str'
    '''

    
    if cat_encoder_name == 'OneHotEncoder':
        encoder_obj = OneHotEncoder()
    elif cat_encoder_name == 'OrdinalEncoder':
        encoder_obj = OrdinalEncoder()
    elif cat_encoder_name == 'BinaryEncoder':
        encoder_obj = BinaryEncoder()
    elif cat_encoder_name == 'CountEncoder':
        encoder_obj = CountEncoder()
    else:
        encoder_obj = WOEEncoder() 
#====================================================
    
    if scaler_name == 'StandardScaler':
        scaler_obj = StandardScaler(with_mean=False)
    elif scaler_name == 'MinMaxScaler':
        scaler_obj = MinMaxScaler()
    elif scaler_name == 'QuantileTransformer':
        scaler_obj = QuantileTransformer(output_distribution='normal')
    elif scaler_name == 'RobustScaler':
        scaler_obj = RobustScaler(quantile_range=(0.05,0.95))
    else: 
        scaler_obj = PowerTransformer()
    
    if first_scaler == 'bMinMax':
        first_scaler= MinMaxScaler()
#====================================================
    if dim_red_name == 'PCA':
        dim_red_obj = PCA(n_components=0.95)
    elif dim_red_name == 'KernelPCA':
        dim_red_obj = KernelPCA(n_components=4 , kernel = 'sigmoid')
    elif dim_red_name == 'LinearDiscriminantAnalysis':
        dim_red_obj = LinearDiscriminantAnalysis()
    elif dim_red_name == 'Isomap':
        dim_red_obj = Isomap(n_components=4)
    elif dim_red_name == 'LocallyLinearEmbedding':
        dim_red_obj = LocallyLinearEmbedding(n_neighbors=10,n_components=4)

    if dim_red_params != None: 
        dim_red_obj.set_params(**dim_red_params)
#====================================================
    if model_name == 'RidgeClassifier':
        model_obj = RidgeClassifier()
    elif model_name == 'LogisticRegression':
        model_obj = LogisticRegression()
    elif model_name == 'KNeighborsClassifier':
        model_obj = KNeighborsClassifier()
    elif model_name == 'AdaBoostClassifier':
        model_obj = AdaBoostClassifier(learning_rate=0.5, random_state=42)
    elif model_name == 'GradientBoostingClassifier':
        model_obj = GradientBoostingClassifier()
    elif model_name == 'SVC':
        model_obj = SVC()
    elif model_name == 'NaiveBayes':
        model_obj = GaussianNB()
    else:
        model_obj = RandomForestClassifier()

    if model_params != None:
        model_obj.set_params(**model_params)
#====================================================
    #Sayısal veriler için gerekli işlemler
    numeric_transformer = Pipeline(
        steps=[('imputer_num',KNNImputer(n_neighbors=knn_imputer_n_neighbors))]
    )
    
    #kategorik veriler için gerekli işlemler
    categorical_transformer = Pipeline(
        steps = [('imputer_cat',SimpleImputer(strategy='most_frequent')),
                 ('encoder',encoder_obj),
                ]
    )
    
    #Farklı veri tipleri için yapılan işlemlerin birleştirilmesi
    preprocessors = ColumnTransformer(
        transformers=[
            ("numerical",numeric_transformer,num_variables),
            ('categorical',categorical_transformer,cat_variables)        
        ]
    )
    
    predictor_pipe = Pipeline(
        steps=[('feature_engineering',feature_engineering),
               ('preprocessors',preprocessors),
               ('first_scaler',first_scaler),
               ('second_scaler', scaler_obj),
               ('dimensionality_reduction',dim_red_obj),
                ('model',model_obj)
               ],
    )

    return predictor_pipe

## METRIC MEASUREMENT

In [10]:
def model_metrics_clf(model,X_train,y_train,X_test,y_test):
    fit_start_train =time()
    model.fit(X_train.copy(),y_train)
    fit_end_train = time()
    
    predict_start_test =time()
    y_pred = model.predict(X_test)
    predict_end_test = time()
    print('\n\n\nTest Score with \n{}'.format(classification_report(y_test,y_pred)))
    print('Total Fit Time (Train):{} second\nTotal Predict Time(Test):{} second'.format(fit_end_train-fit_start_train,predict_end_test-predict_start_test))
    #print(str(classification_report(y_tst,y_pred)))
    f,_ = plt.subplots(1,2)
    f.set_figheight(6)
    f.set_figwidth(15)
    ax1= plt.subplot(1,2,1)
    ax2 = plt.subplot(1,2,2)
    RocCurveDisplay.from_predictions(y_true=y_test, y_pred=y_pred,ax=ax1)
    ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_pred,ax=ax2)
    f.suptitle(t='Visualization of Test Scores\nLeft Roc AUC Score , Right Confusion Matrix')
    plt.show()
    
    predict_start_train =time()
    y_pred_tr = model.predict(X_train.copy())
    predict_end_train = time()
    print('\n\n\nTrain Score with \n{}'.format(classification_report(y_train,y_pred_tr)))
    print('Total Fit Time (Train):{} second\nTotal Predict Time(Test):{} second'.format(fit_end_train-fit_start_train,predict_end_train-predict_start_train))
    #print(str(classification_report(y_tr,y_pred_tr)))
    f,_ = plt.subplots(1,2)
    f.set_figheight(6)
    f.set_figwidth(15)
    f.suptitle(t='Visualization of Train Scores\nLeft Roc AUC Score , Right Confusion Matrix')
    ax1= plt.subplot(1,2,1)
    ax2 = plt.subplot(1,2,2)
    RocCurveDisplay.from_predictions(y_true=y_train, y_pred=y_pred_tr,ax=ax1)
    ConfusionMatrixDisplay.from_predictions(y_true=y_train, y_pred=y_pred_tr,ax=ax2)
    plt.show()

# GET DATA

In [11]:
df_train = pd.read_csv('train.csv')
df_test =pd.read_csv('test.csv')
df_sub = pd.read_csv('sample_submission.csv')

In [12]:
X_train,y_train,X_test,y_test = df_train.drop(columns='Transported'),df_train['Transported'],df_test,df_sub['Transported']

# Baseline Models and Hyperparameter Optimization with Optuna

## Baseline Model

### Pipe Funcs

In [13]:
def instantiate_cat_encoder(trial):
    #Choosing the Categorical encoder
    cat_encoder_name = trial.suggest_categorical('cat_encoder_name',['OneHotEncoder','OrdinalEncoder','BinaryEncoder','CountEncoder','WOEEncoder'])
    if cat_encoder_name == 'OneHotEncoder':
        encoder_obj = OneHotEncoder()
    elif cat_encoder_name == 'OrdinalEncoder':
        encoder_obj = OrdinalEncoder()
    elif cat_encoder_name == 'BinaryEncoder':
        encoder_obj = BinaryEncoder()
    elif cat_encoder_name == 'CountEncoder':
        encoder_obj = CountEncoder()
    else:
        encoder_obj = WOEEncoder()

    return encoder_obj

In [14]:
def instantiate_scaler(trial):
    scaler_name = trial.suggest_categorical('scaler_name',['StandardScaler','MinMaxScaler','QuantileTransformer','RobustScaler','PowerTransformer'])
    before_transformer = 'passthrough'
    if scaler_name == 'StandardScaler':
        scaler_obj = StandardScaler(with_mean=False)
    elif scaler_name == 'MinMaxScaler':
        scaler_obj = MinMaxScaler()
    elif scaler_name == 'QuantileTransformer':
        scaler_obj = QuantileTransformer(output_distribution='normal')
    elif scaler_name == 'RobustScaler':
        scaler_obj = RobustScaler(quantile_range=(0.05,0.95))
    else:
        bTransformer_name = trial.suggest_categorical('bTransformer_name',['passthrough','bMinMax'])
        if bTransformer_name == 'bMinMax':
            before_transformer = MinMaxScaler()
        scaler_obj = PowerTransformer()

    return scaler_obj,before_transformer

In [15]:
def instantiate_dim_red(trial):
    ## Choosing the dimensionality reduction algorithm //'SpectralEmbedding','TSNE'
    dim_red_name = trial.suggest_categorical('dim_red_name',['PCA','KernelPCA','LinearDiscriminantAnalysis','Isomap','LocallyLinearEmbedding'])
    
    if dim_red_name == 'PCA':
        dim_red_obj = PCA(n_components=0.95)
    elif dim_red_name == 'KernelPCA':
        dim_red_obj = KernelPCA(n_components=4 , kernel = 'sigmoid')
    elif dim_red_name == 'LinearDiscriminantAnalysis':
        dim_red_obj = LinearDiscriminantAnalysis()
    elif dim_red_name == 'Isomap':
        dim_red_obj = Isomap(n_components=4)
    elif dim_red_name == 'LocallyLinearEmbedding':
        dim_red_obj = LocallyLinearEmbedding(n_neighbors=10,n_components=4)
    elif dim_red_name == 'SpectralEmbedding':
        dim_red_obj = SpectralEmbedding(n_components=4)
    else:
        dim_red_obj = TSNE(n_components=4)

    return dim_red_obj

In [16]:
def instantiate_model(trial):
    model_name = trial.suggest_categorical('classifier_name',['RidgeClassifier','LogisticRegression','KNeighborsClassifier',\
                                                          'AdaBoostClassifier','GradientBoostingClassifier','SVC',\
                                                          'RandomForestClassifier','NaiveBayes','XGBoost','CatBoost'])
    if model_name == 'RidgeClassifier':
        model_obj = RidgeClassifier()
    elif model_name == 'LogisticRegression':
        model_obj = LogisticRegression()
    elif model_name == 'KNeighborsClassifier':
        model_obj = KNeighborsClassifier()
    elif model_name == 'AdaBoostClassifier':
        model_obj = AdaBoostClassifier(learning_rate=0.5, random_state=42)
    elif model_name == 'GradientBoostingClassifier':
        model_obj = GradientBoostingClassifier()
    elif model_name == 'SVC':
        model_obj = SVC()
    elif model_name == 'NaiveBayes':
        model_obj = GaussianNB()
    elif model_name == 'XGBoost':
        model_obj = XGBClassifier()
    elif model_name == 'CatBoost':
        model_obj = CatBoostClassifier()
    else:
        model_obj = RandomForestClassifier()

    return model_obj

### Objective Func

In [17]:
def objective_base_functionized(trial):
    
    #KNNImputer n neighbors
    knn_imp_n_neighbors = trial.suggest_int('knn_imp_n_neighbors',2,6)
    #FeatureRemover most necessary and necessary choose
    feature_remover_necessary = trial.suggest_categorical('feature_remover_necessary',[True,False])
    #Choosing the Categorical encoder
    encoder_obj = instantiate_cat_encoder(trial)
    #Choosing the scaler
    scaler_obj,before_transformer = instantiate_scaler(trial)
    ## Choosing the dimensionality reduction algorithm //'SpectralEmbedding','TSNE'
    dim_red_obj = instantiate_dim_red(trial)
    ## Choosing the predictor algorithm
    model_obj = instantiate_model(trial)
    
    #Selecting features after feature engineering steps
    numerical_columns = FeatureEngineering().fit_transform(X_train,y_train).select_dtypes(exclude=['category','object']).columns
    categorical_columns = FeatureEngineering().fit_transform(X_train,y_train).select_dtypes(include=['category','object']).columns
    
    ##Creating pipeline
    numerical_pipeline = Pipeline([
        ('imputer_num',KNNImputer(n_neighbors=knn_imp_n_neighbors)),        
        ('feature_remover_num',FeatureRemover(df_columns=numerical_columns,only_necessaries=feature_remover_necessary))
    ])
    categorical_pipeline = Pipeline([
        ('feature_remover_cat',FeatureRemover(df_columns=categorical_columns,only_necessaries=feature_remover_necessary)),
        ('imputer_cat',SimpleImputer(strategy='most_frequent')),        
        ('encoder',encoder_obj)
    ])
    preprocessor = ColumnTransformer([
        ('numerical_pipeline',numerical_pipeline,numerical_columns),
        ('categorical_pipeline',categorical_pipeline,categorical_columns)
    ])

    model_pipeline = Pipeline([
        ('feature_engineering',FeatureEngineering()),
        ('rbi',rule_based_imputation()),
        ('preprocessor',preprocessor),
        ('before_scaler',before_transformer),
        ('scaler',scaler_obj),
        ('dimensionality_reduction',dim_red_obj),
        ('model',model_obj)
    ])

    kf = KFold(n_splits=5, shuffle=True, random_state=42)    
    scores = cross_val_score(model_pipeline, X_train, y_train, scoring='accuracy', cv=kf)
    #print('min:{}\nmax:{}\nall scores:{}'.format(np.min(scores),np.max(scores),scores))    
    return np.min([np.mean(scores), np.median(scores)])

In [18]:
study_functionized = optuna.create_study(direction='maximize')

[I 2024-07-23 13:05:14,635] A new study created in memory with name: no-name-f646d8e8-31f0-4562-a3a8-d38a08df908f


In [19]:
study_functionized.optimize(func=objective_base_functionized,n_trials=400,n_jobs=-1)

Learning rate set to 0.023581
0:	learn: 0.6790582	total: 188ms	remaining: 3m 7s
1:	learn: 0.6673442	total: 204ms	remaining: 1m 42s
2:	learn: 0.6551864	total: 220ms	remaining: 1m 13s
3:	learn: 0.6440742	total: 240ms	remaining: 59.8s
4:	learn: 0.6338326	total: 291ms	remaining: 57.8s
5:	learn: 0.6249449	total: 306ms	remaining: 50.7s
6:	learn: 0.6156294	total: 323ms	remaining: 45.8s
7:	learn: 0.6070332	total: 342ms	remaining: 42.4s
8:	learn: 0.5975209	total: 361ms	remaining: 39.8s
9:	learn: 0.5891430	total: 387ms	remaining: 38.3s
10:	learn: 0.5819098	total: 432ms	remaining: 38.9s
11:	learn: 0.5743518	total: 454ms	remaining: 37.3s
12:	learn: 0.5677772	total: 475ms	remaining: 36s
13:	learn: 0.5615367	total: 488ms	remaining: 34.3s
14:	learn: 0.5550943	total: 535ms	remaining: 35.2s
15:	learn: 0.5493692	total: 561ms	remaining: 34.5s
16:	learn: 0.5439383	total: 586ms	remaining: 33.9s
17:	learn: 0.5382816	total: 609ms	remaining: 33.2s
18:	learn: 0.5337319	total: 645ms	remaining: 33.3s
19:	learn: 

[I 2024-07-23 13:05:43,332] Trial 6 finished with value: 0.7391020724713157 and parameters: {'knn_imp_n_neighbors': 4, 'feature_remover_necessary': False, 'cat_encoder_name': 'OneHotEncoder', 'scaler_name': 'StandardScaler', 'dim_red_name': 'LinearDiscriminantAnalysis', 'classifier_name': 'KNeighborsClassifier'}. Best is trial 6 with value: 0.7391020724713157.


486:	learn: 0.4595165	total: 10.8s	remaining: 11.3s
459:	learn: 0.3635271	total: 15s	remaining: 17.6s
487:	learn: 0.4594487	total: 10.8s	remaining: 11.3s
460:	learn: 0.3633843	total: 15s	remaining: 17.5s
488:	learn: 0.4593373	total: 10.8s	remaining: 11.3s
461:	learn: 0.3632024	total: 15s	remaining: 17.5s
489:	learn: 0.4592745	total: 10.8s	remaining: 11.2s
490:	learn: 0.4591870	total: 10.8s	remaining: 11.2s
462:	learn: 0.3631205	total: 15s	remaining: 17.4s
491:	learn: 0.4590375	total: 10.8s	remaining: 11.1s
463:	learn: 0.3629680	total: 15s	remaining: 17.4s
492:	learn: 0.4588812	total: 10.8s	remaining: 11.1s
464:	learn: 0.3628610	total: 15s	remaining: 17.3s
493:	learn: 0.4587770	total: 10.8s	remaining: 11.1s
465:	learn: 0.3627501	total: 15s	remaining: 17.2s
494:	learn: 0.4586672	total: 10.8s	remaining: 11s
495:	learn: 0.4585731	total: 10.8s	remaining: 11s
466:	learn: 0.3625165	total: 15s	remaining: 17.2s
496:	learn: 0.4584911	total: 10.8s	remaining: 11s
467:	learn: 0.3624087	total: 15.1s

[I 2024-07-23 13:05:58,794] Trial 3 finished with value: 0.7686654433489877 and parameters: {'knn_imp_n_neighbors': 3, 'feature_remover_necessary': False, 'cat_encoder_name': 'OneHotEncoder', 'scaler_name': 'RobustScaler', 'dim_red_name': 'LinearDiscriminantAnalysis', 'classifier_name': 'GradientBoostingClassifier'}. Best is trial 3 with value: 0.7686654433489877.
[I 2024-07-23 13:06:03,165] Trial 1 finished with value: 0.7682576193214491 and parameters: {'knn_imp_n_neighbors': 6, 'feature_remover_necessary': False, 'cat_encoder_name': 'CountEncoder', 'scaler_name': 'StandardScaler', 'dim_red_name': 'LinearDiscriminantAnalysis', 'classifier_name': 'GradientBoostingClassifier'}. Best is trial 3 with value: 0.7686654433489877.


Learning rate set to 0.023581
0:	learn: 0.6793008	total: 63.8ms	remaining: 1m 3s
1:	learn: 0.6655792	total: 92ms	remaining: 45.9s
2:	learn: 0.6543126	total: 129ms	remaining: 42.8s
3:	learn: 0.6419934	total: 164ms	remaining: 40.8s
4:	learn: 0.6319289	total: 182ms	remaining: 36.2s
5:	learn: 0.6222845	total: 201ms	remaining: 33.2s
6:	learn: 0.6134883	total: 254ms	remaining: 36s
7:	learn: 0.6041993	total: 273ms	remaining: 33.8s
8:	learn: 0.5950026	total: 404ms	remaining: 44.5s
9:	learn: 0.5860643	total: 420ms	remaining: 41.6s
10:	learn: 0.5780057	total: 443ms	remaining: 39.9s
11:	learn: 0.5702722	total: 619ms	remaining: 51s
12:	learn: 0.5641120	total: 639ms	remaining: 48.5s
13:	learn: 0.5574975	total: 665ms	remaining: 46.8s
14:	learn: 0.5514519	total: 693ms	remaining: 45.5s
15:	learn: 0.5450161	total: 743ms	remaining: 45.7s
16:	learn: 0.5391167	total: 759ms	remaining: 43.9s
17:	learn: 0.5339107	total: 781ms	remaining: 42.6s
18:	learn: 0.5296942	total: 816ms	remaining: 42.1s
19:	learn: 0.52

[I 2024-07-23 13:06:05,779] Trial 4 finished with value: 0.7767548906789413 and parameters: {'knn_imp_n_neighbors': 6, 'feature_remover_necessary': True, 'cat_encoder_name': 'OrdinalEncoder', 'scaler_name': 'StandardScaler', 'dim_red_name': 'LinearDiscriminantAnalysis', 'classifier_name': 'GradientBoostingClassifier'}. Best is trial 4 with value: 0.7767548906789413.


33:	learn: 0.4837734	total: 1.32s	remaining: 37.6s
34:	learn: 0.4822508	total: 1.35s	remaining: 37.4s
35:	learn: 0.4806268	total: 1.41s	remaining: 37.8s
36:	learn: 0.4784537	total: 1.45s	remaining: 37.8s
37:	learn: 0.4763222	total: 1.49s	remaining: 37.7s
38:	learn: 0.4747615	total: 1.55s	remaining: 38.1s
39:	learn: 0.4732201	total: 1.58s	remaining: 38s
40:	learn: 0.4714314	total: 1.6s	remaining: 37.6s
41:	learn: 0.4700810	total: 1.62s	remaining: 37s
42:	learn: 0.4689404	total: 1.65s	remaining: 36.6s
43:	learn: 0.4676076	total: 1.68s	remaining: 36.4s
44:	learn: 0.4659939	total: 1.76s	remaining: 37.3s
45:	learn: 0.4648438	total: 1.78s	remaining: 37s
46:	learn: 0.4636954	total: 1.97s	remaining: 40s
47:	learn: 0.4624208	total: 2.04s	remaining: 40.4s
48:	learn: 0.4613918	total: 2.07s	remaining: 40.1s
49:	learn: 0.4604751	total: 2.13s	remaining: 40.5s
50:	learn: 0.4596686	total: 2.18s	remaining: 40.5s
51:	learn: 0.4585630	total: 2.38s	remaining: 43.4s
52:	learn: 0.4574933	total: 2.46s	remain

[I 2024-07-23 13:06:09,150] Trial 0 finished with value: 0.7274295572167913 and parameters: {'knn_imp_n_neighbors': 5, 'feature_remover_necessary': False, 'cat_encoder_name': 'OrdinalEncoder', 'scaler_name': 'PowerTransformer', 'bTransformer_name': 'bMinMax', 'dim_red_name': 'KernelPCA', 'classifier_name': 'RidgeClassifier'}. Best is trial 4 with value: 0.7767548906789413.


112:	learn: 0.4969522	total: 2.05s	remaining: 16.1s
142:	learn: 0.4147354	total: 4.72s	remaining: 28.3s
113:	learn: 0.4967471	total: 2.06s	remaining: 16s
143:	learn: 0.4144963	total: 4.74s	remaining: 28.2s
114:	learn: 0.4964259	total: 2.1s	remaining: 16.1s
144:	learn: 0.4142326	total: 4.76s	remaining: 28.1s
115:	learn: 0.4962401	total: 2.12s	remaining: 16.2s
145:	learn: 0.4139589	total: 4.8s	remaining: 28.1s
116:	learn: 0.4961159	total: 2.17s	remaining: 16.4s
146:	learn: 0.4136854	total: 4.83s	remaining: 28.1s
117:	learn: 0.4959698	total: 2.18s	remaining: 16.3s
147:	learn: 0.4134335	total: 4.85s	remaining: 27.9s
118:	learn: 0.4958120	total: 2.19s	remaining: 16.2s
148:	learn: 0.4131564	total: 4.86s	remaining: 27.8s
119:	learn: 0.4957014	total: 2.2s	remaining: 16.2s
149:	learn: 0.4128961	total: 4.88s	remaining: 27.6s
120:	learn: 0.4954797	total: 2.22s	remaining: 16.1s
150:	learn: 0.4126620	total: 4.89s	remaining: 27.5s
121:	learn: 0.4953556	total: 2.23s	remaining: 16.1s
151:	learn: 0.412

[I 2024-07-23 13:06:30,582] Trial 5 finished with value: 0.7624533894127214 and parameters: {'knn_imp_n_neighbors': 3, 'feature_remover_necessary': False, 'cat_encoder_name': 'WOEEncoder', 'scaler_name': 'PowerTransformer', 'bTransformer_name': 'passthrough', 'dim_red_name': 'LinearDiscriminantAnalysis', 'classifier_name': 'SVC'}. Best is trial 4 with value: 0.7767548906789413.


191:	learn: 0.4852094	total: 4.34s	remaining: 18.4s
214:	learn: 0.3991222	total: 6.96s	remaining: 25.6s
192:	learn: 0.4850906	total: 4.38s	remaining: 18.4s
193:	learn: 0.4849558	total: 4.42s	remaining: 18.4s
215:	learn: 0.3989350	total: 7.01s	remaining: 25.6s
194:	learn: 0.4848134	total: 4.43s	remaining: 18.4s
216:	learn: 0.3987199	total: 7.03s	remaining: 25.5s
195:	learn: 0.4847221	total: 4.45s	remaining: 18.3s
217:	learn: 0.3986111	total: 7.05s	remaining: 25.4s
196:	learn: 0.4845741	total: 4.46s	remaining: 18.3s
218:	learn: 0.3984045	total: 7.07s	remaining: 25.3s
197:	learn: 0.4845057	total: 4.47s	remaining: 18.2s
198:	learn: 0.4843644	total: 4.5s	remaining: 18.2s
219:	learn: 0.3981323	total: 7.12s	remaining: 25.4s
199:	learn: 0.4842721	total: 4.52s	remaining: 18.2s
220:	learn: 0.3980317	total: 7.13s	remaining: 25.3s
200:	learn: 0.4841302	total: 4.62s	remaining: 18.5s
221:	learn: 0.3978798	total: 7.29s	remaining: 25.7s
201:	learn: 0.4840097	total: 4.67s	remaining: 18.6s
222:	learn: 0

[I 2024-07-23 13:07:04,099] Trial 12 finished with value: 0.7715765247410817 and parameters: {'knn_imp_n_neighbors': 2, 'feature_remover_necessary': False, 'cat_encoder_name': 'OneHotEncoder', 'scaler_name': 'RobustScaler', 'dim_red_name': 'LinearDiscriminantAnalysis', 'classifier_name': 'LogisticRegression'}. Best is trial 4 with value: 0.7767548906789413.


870:	learn: 0.4185316	total: 17.3s	remaining: 2.56s
750:	learn: 0.3261460	total: 19.8s	remaining: 6.6s
871:	learn: 0.4184701	total: 17.3s	remaining: 2.55s
751:	learn: 0.3260286	total: 19.9s	remaining: 6.57s
872:	learn: 0.4183949	total: 17.3s	remaining: 2.53s
752:	learn: 0.3259450	total: 19.9s	remaining: 6.55s
873:	learn: 0.4183541	total: 17.4s	remaining: 2.51s
753:	learn: 0.3258080	total: 19.9s	remaining: 6.52s
874:	learn: 0.4182937	total: 17.4s	remaining: 2.49s
754:	learn: 0.3257690	total: 20s	remaining: 6.49s
875:	learn: 0.4182061	total: 17.4s	remaining: 2.47s
755:	learn: 0.3256644	total: 20s	remaining: 6.46s
876:	learn: 0.4180840	total: 17.4s	remaining: 2.45s
877:	learn: 0.4179996	total: 17.5s	remaining: 2.43s
756:	learn: 0.3255379	total: 20s	remaining: 6.44s
757:	learn: 0.3254619	total: 20s	remaining: 6.41s
878:	learn: 0.4179155	total: 17.5s	remaining: 2.41s
758:	learn: 0.3253768	total: 20.1s	remaining: 6.38s
879:	learn: 0.4178404	total: 17.5s	remaining: 2.39s
759:	learn: 0.3252873

[I 2024-07-23 13:07:42,573] Trial 9 finished with value: 0.7303589023492065 and parameters: {'knn_imp_n_neighbors': 2, 'feature_remover_necessary': False, 'cat_encoder_name': 'WOEEncoder', 'scaler_name': 'QuantileTransformer', 'dim_red_name': 'KernelPCA', 'classifier_name': 'AdaBoostClassifier'}. Best is trial 4 with value: 0.7767548906789413.


250:	learn: 0.4807739	total: 3.19s	remaining: 9.51s
229:	learn: 0.3985929	total: 5.69s	remaining: 19.1s
251:	learn: 0.4806812	total: 3.2s	remaining: 9.49s
230:	learn: 0.3984052	total: 5.71s	remaining: 19.1s
252:	learn: 0.4806175	total: 3.21s	remaining: 9.49s
231:	learn: 0.3982123	total: 5.74s	remaining: 19.1s
253:	learn: 0.4804944	total: 3.25s	remaining: 9.56s
232:	learn: 0.3980078	total: 5.76s	remaining: 19.1s
254:	learn: 0.4804034	total: 3.27s	remaining: 9.56s
255:	learn: 0.4803061	total: 3.29s	remaining: 9.56s
233:	learn: 0.3978180	total: 5.79s	remaining: 19s
256:	learn: 0.4802264	total: 3.3s	remaining: 9.54s
234:	learn: 0.3977296	total: 5.8s	remaining: 19s
257:	learn: 0.4801558	total: 3.31s	remaining: 9.51s
258:	learn: 0.4800703	total: 3.31s	remaining: 9.48s
235:	learn: 0.3976065	total: 5.81s	remaining: 18.9s
259:	learn: 0.4800233	total: 3.33s	remaining: 9.47s
236:	learn: 0.3974289	total: 5.83s	remaining: 18.8s
260:	learn: 0.4799555	total: 3.33s	remaining: 9.44s
261:	learn: 0.47981

[I 2024-07-23 13:08:23,037] Trial 15 finished with value: 0.7371460655866797 and parameters: {'knn_imp_n_neighbors': 2, 'feature_remover_necessary': False, 'cat_encoder_name': 'OrdinalEncoder', 'scaler_name': 'MinMaxScaler', 'dim_red_name': 'PCA', 'classifier_name': 'RidgeClassifier'}. Best is trial 4 with value: 0.7767548906789413.


687:	learn: 0.3374671	total: 20.4s	remaining: 9.27s
925:	learn: 0.4175546	total: 17.7s	remaining: 1.42s
688:	learn: 0.3373242	total: 20.4s	remaining: 9.24s
926:	learn: 0.4174572	total: 17.8s	remaining: 1.4s
689:	learn: 0.3372538	total: 20.4s	remaining: 9.2s
927:	learn: 0.4173826	total: 17.8s	remaining: 1.38s
690:	learn: 0.3371375	total: 20.4s	remaining: 9.17s
928:	learn: 0.4173261	total: 17.8s	remaining: 1.36s
691:	learn: 0.3370216	total: 20.5s	remaining: 9.13s
929:	learn: 0.4172473	total: 17.8s	remaining: 1.34s
692:	learn: 0.3369379	total: 20.5s	remaining: 9.1s
930:	learn: 0.4171907	total: 17.8s	remaining: 1.32s
693:	learn: 0.3368643	total: 20.5s	remaining: 9.06s
931:	learn: 0.4171334	total: 17.9s	remaining: 1.3s
694:	learn: 0.3367645	total: 20.5s	remaining: 9.04s
932:	learn: 0.4170581	total: 17.9s	remaining: 1.29s
695:	learn: 0.3366930	total: 20.6s	remaining: 9.01s
933:	learn: 0.4169988	total: 17.9s	remaining: 1.27s
934:	learn: 0.4169421	total: 18s	remaining: 1.25s
696:	learn: 0.3366

[I 2024-07-23 13:08:32,917] Trial 14 finished with value: 0.7478439191339811 and parameters: {'knn_imp_n_neighbors': 2, 'feature_remover_necessary': False, 'cat_encoder_name': 'OrdinalEncoder', 'scaler_name': 'RobustScaler', 'dim_red_name': 'KernelPCA', 'classifier_name': 'AdaBoostClassifier'}. Best is trial 4 with value: 0.7767548906789413.


Learning rate set to 0.023583
0:	learn: 0.6797368	total: 13.7ms	remaining: 13.7s
1:	learn: 0.6660351	total: 24.5ms	remaining: 12.2s
2:	learn: 0.6552622	total: 37.6ms	remaining: 12.5s
3:	learn: 0.6434119	total: 52.1ms	remaining: 13s
4:	learn: 0.6337042	total: 67.5ms	remaining: 13.4s
5:	learn: 0.6238568	total: 83.8ms	remaining: 13.9s
6:	learn: 0.6135454	total: 109ms	remaining: 15.4s
7:	learn: 0.6047037	total: 126ms	remaining: 15.6s
8:	learn: 0.5955034	total: 147ms	remaining: 16.2s
9:	learn: 0.5873506	total: 169ms	remaining: 16.8s
10:	learn: 0.5788094	total: 182ms	remaining: 16.3s
11:	learn: 0.5721699	total: 214ms	remaining: 17.6s
12:	learn: 0.5658445	total: 243ms	remaining: 18.4s
13:	learn: 0.5592873	total: 257ms	remaining: 18.1s
14:	learn: 0.5531365	total: 275ms	remaining: 18.1s
15:	learn: 0.5476880	total: 300ms	remaining: 18.4s
16:	learn: 0.5426160	total: 356ms	remaining: 20.6s
17:	learn: 0.5372768	total: 376ms	remaining: 20.5s
18:	learn: 0.5332133	total: 473ms	remaining: 24.4s
19:	lea

[I 2024-07-23 13:09:21,037] Trial 13 finished with value: 0.767400877850649 and parameters: {'knn_imp_n_neighbors': 4, 'feature_remover_necessary': True, 'cat_encoder_name': 'BinaryEncoder', 'scaler_name': 'PowerTransformer', 'bTransformer_name': 'passthrough', 'dim_red_name': 'LinearDiscriminantAnalysis', 'classifier_name': 'SVC'}. Best is trial 4 with value: 0.7767548906789413.


112:	learn: 0.4980473	total: 1.72s	remaining: 13.6s
115:	learn: 0.4219087	total: 3.21s	remaining: 24.9s
113:	learn: 0.4979313	total: 1.73s	remaining: 13.6s
114:	learn: 0.4976742	total: 1.74s	remaining: 13.5s
115:	learn: 0.4975680	total: 1.74s	remaining: 13.4s
116:	learn: 0.4215256	total: 3.24s	remaining: 24.9s
116:	learn: 0.4973842	total: 1.75s	remaining: 13.3s
117:	learn: 0.4971858	total: 1.84s	remaining: 13.9s
117:	learn: 0.4211984	total: 3.34s	remaining: 25.4s
118:	learn: 0.4970227	total: 1.87s	remaining: 14s
119:	learn: 0.4968592	total: 1.91s	remaining: 14.1s
118:	learn: 0.4208001	total: 3.43s	remaining: 25.9s
120:	learn: 0.4967657	total: 1.97s	remaining: 14.4s
119:	learn: 0.4205998	total: 3.46s	remaining: 25.8s
121:	learn: 0.4963699	total: 2s	remaining: 14.5s
120:	learn: 0.4203053	total: 3.49s	remaining: 25.8s
122:	learn: 0.4960917	total: 2.01s	remaining: 14.5s
121:	learn: 0.4200619	total: 3.51s	remaining: 25.7s
123:	learn: 0.4959407	total: 2.07s	remaining: 14.7s
122:	learn: 0.419

[I 2024-07-23 13:09:26,857] Trial 16 finished with value: 0.7667095026373238 and parameters: {'knn_imp_n_neighbors': 3, 'feature_remover_necessary': False, 'cat_encoder_name': 'OneHotEncoder', 'scaler_name': 'RobustScaler', 'dim_red_name': 'LinearDiscriminantAnalysis', 'classifier_name': 'XGBoost'}. Best is trial 4 with value: 0.7767548906789413.


391:	learn: 0.4666911	total: 7.32s	remaining: 11.4s
392:	learn: 0.4665914	total: 7.33s	remaining: 11.4s
341:	learn: 0.3797590	total: 8.69s	remaining: 16.8s
393:	learn: 0.4664677	total: 7.36s	remaining: 11.3s
342:	learn: 0.3796918	total: 8.71s	remaining: 16.8s
394:	learn: 0.4664023	total: 7.38s	remaining: 11.3s
343:	learn: 0.3795256	total: 8.73s	remaining: 16.7s
395:	learn: 0.4661955	total: 7.4s	remaining: 11.3s
344:	learn: 0.3794558	total: 8.76s	remaining: 16.7s
396:	learn: 0.4660844	total: 7.42s	remaining: 11.3s
345:	learn: 0.3792956	total: 8.79s	remaining: 16.7s
397:	learn: 0.4659446	total: 7.45s	remaining: 11.3s
346:	learn: 0.3791411	total: 8.8s	remaining: 16.7s
398:	learn: 0.4658235	total: 7.46s	remaining: 11.3s
347:	learn: 0.3790210	total: 8.83s	remaining: 16.6s
399:	learn: 0.4657334	total: 7.48s	remaining: 11.3s
348:	learn: 0.3789137	total: 8.86s	remaining: 16.6s
400:	learn: 0.4655926	total: 7.5s	remaining: 11.2s
349:	learn: 0.3787213	total: 8.87s	remaining: 16.6s
401:	learn: 0.4

[I 2024-07-23 13:09:49,491] Trial 11 finished with value: 0.7274295572167913 and parameters: {'knn_imp_n_neighbors': 3, 'feature_remover_necessary': True, 'cat_encoder_name': 'BinaryEncoder', 'scaler_name': 'QuantileTransformer', 'dim_red_name': 'Isomap', 'classifier_name': 'AdaBoostClassifier'}. Best is trial 4 with value: 0.7767548906789413.


455:	learn: 0.3636708	total: 11.3s	remaining: 13.5s
534:	learn: 0.4521256	total: 9.99s	remaining: 8.71s
456:	learn: 0.3635261	total: 11.3s	remaining: 13.5s
535:	learn: 0.4520313	total: 10s	remaining: 8.7s
457:	learn: 0.3633288	total: 11.3s	remaining: 13.5s
536:	learn: 0.4519508	total: 10s	remaining: 8.68s
458:	learn: 0.3632101	total: 11.3s	remaining: 13.4s
537:	learn: 0.4518459	total: 10s	remaining: 8.66s
459:	learn: 0.3630630	total: 11.3s	remaining: 13.4s
538:	learn: 0.4517856	total: 10.1s	remaining: 8.64s
460:	learn: 0.3628925	total: 11.4s	remaining: 13.4s
539:	learn: 0.4516734	total: 10.1s	remaining: 8.62s
461:	learn: 0.3628016	total: 11.4s	remaining: 13.3s
540:	learn: 0.4516003	total: 10.1s	remaining: 8.59s
462:	learn: 0.3626970	total: 11.4s	remaining: 13.3s
541:	learn: 0.4514705	total: 10.1s	remaining: 8.57s
463:	learn: 0.3625635	total: 11.4s	remaining: 13.3s
542:	learn: 0.4513759	total: 10.1s	remaining: 8.57s
464:	learn: 0.3624593	total: 11.4s	remaining: 13.2s
543:	learn: 0.45130

[I 2024-07-23 13:10:24,561] Trial 8 finished with value: 0.5753739930955121 and parameters: {'knn_imp_n_neighbors': 4, 'feature_remover_necessary': True, 'cat_encoder_name': 'BinaryEncoder', 'scaler_name': 'MinMaxScaler', 'dim_red_name': 'LocallyLinearEmbedding', 'classifier_name': 'RidgeClassifier'}. Best is trial 4 with value: 0.7767548906789413.


Learning rate set to 0.023583
0:	learn: 0.6803799	total: 40.4ms	remaining: 40.4s
1:	learn: 0.6687870	total: 64.5ms	remaining: 32.2s
2:	learn: 0.6578531	total: 90ms	remaining: 29.9s
3:	learn: 0.6459637	total: 126ms	remaining: 31.5s
4:	learn: 0.6358994	total: 182ms	remaining: 36.3s
5:	learn: 0.6260743	total: 367ms	remaining: 1m
Learning rate set to 0.023583
6:	learn: 0.6163125	total: 395ms	remaining: 56s
0:	learn: 0.6823032	total: 20.5ms	remaining: 20.5s
7:	learn: 0.6059382	total: 415ms	remaining: 51.5s
1:	learn: 0.6737304	total: 44.1ms	remaining: 22s
8:	learn: 0.5979085	total: 438ms	remaining: 48.2s
2:	learn: 0.6654045	total: 51.3ms	remaining: 17s
9:	learn: 0.5908914	total: 448ms	remaining: 44.3s
3:	learn: 0.6573444	total: 61.6ms	remaining: 15.3s
4:	learn: 0.6490303	total: 70.5ms	remaining: 14s
10:	learn: 0.5833919	total: 461ms	remaining: 41.5s
5:	learn: 0.6410010	total: 82.5ms	remaining: 13.7s
11:	learn: 0.5757455	total: 474ms	remaining: 39s
6:	learn: 0.6346150	total: 89.7ms	remaining:

[W 2024-07-23 13:11:22,819] Trial 7 failed with parameters: {'knn_imp_n_neighbors': 4, 'feature_remover_necessary': False, 'cat_encoder_name': 'OneHotEncoder', 'scaler_name': 'PowerTransformer', 'bTransformer_name': 'passthrough', 'dim_red_name': 'KernelPCA', 'classifier_name': 'CatBoost'} because of the following error: The value nan is not acceptable.
[W 2024-07-23 13:11:22,822] Trial 7 failed with value nan.


851:	learn: 0.3176028	total: 11.8s	remaining: 2.06s
852:	learn: 0.3174955	total: 11.8s	remaining: 2.04s
853:	learn: 0.3173825	total: 11.8s	remaining: 2.02s
854:	learn: 0.3173192	total: 11.9s	remaining: 2.02s
855:	learn: 0.3172298	total: 11.9s	remaining: 2s
856:	learn: 0.3171158	total: 11.9s	remaining: 1.99s
857:	learn: 0.3170389	total: 11.9s	remaining: 1.98s
858:	learn: 0.3169349	total: 11.9s	remaining: 1.96s
859:	learn: 0.3168210	total: 11.9s	remaining: 1.95s
860:	learn: 0.3167352	total: 11.9s	remaining: 1.93s
861:	learn: 0.3166397	total: 11.9s	remaining: 1.92s
862:	learn: 0.3165460	total: 11.9s	remaining: 1.9s
863:	learn: 0.3164535	total: 11.9s	remaining: 1.89s
864:	learn: 0.3163398	total: 12s	remaining: 1.87s
865:	learn: 0.3162426	total: 12s	remaining: 1.85s
866:	learn: 0.3161580	total: 12s	remaining: 1.84s
867:	learn: 0.3160719	total: 12s	remaining: 1.82s
868:	learn: 0.3159670	total: 12s	remaining: 1.81s
869:	learn: 0.3158648	total: 12s	remaining: 1.79s
870:	learn: 0.3157897	total:

[W 2024-07-23 13:11:26,130] Trial 2 failed with parameters: {'knn_imp_n_neighbors': 4, 'feature_remover_necessary': False, 'cat_encoder_name': 'OneHotEncoder', 'scaler_name': 'PowerTransformer', 'bTransformer_name': 'bMinMax', 'dim_red_name': 'PCA', 'classifier_name': 'CatBoost'} because of the following error: The value nan is not acceptable.
[W 2024-07-23 13:11:26,143] Trial 2 failed with value nan.
[I 2024-07-23 13:11:50,964] Trial 17 finished with value: 0.7286314569104767 and parameters: {'knn_imp_n_neighbors': 6, 'feature_remover_necessary': True, 'cat_encoder_name': 'BinaryEncoder', 'scaler_name': 'StandardScaler', 'dim_red_name': 'Isomap', 'classifier_name': 'NaiveBayes'}. Best is trial 4 with value: 0.7767548906789413.
[I 2024-07-23 13:13:23,160] Trial 10 finished with value: 0.7116034968445419 and parameters: {'knn_imp_n_neighbors': 4, 'feature_remover_necessary': True, 'cat_encoder_name': 'BinaryEncoder', 'scaler_name': 'StandardScaler', 'dim_red_name': 'LocallyLinearEmbeddi

KeyboardInterrupt: 

[I 2024-07-23 13:14:11,335] Trial 24 finished with value: 0.5025876940770557 and parameters: {'knn_imp_n_neighbors': 5, 'feature_remover_necessary': True, 'cat_encoder_name': 'CountEncoder', 'scaler_name': 'RobustScaler', 'dim_red_name': 'LocallyLinearEmbedding', 'classifier_name': 'LogisticRegression'}. Best is trial 4 with value: 0.7767548906789413.
[I 2024-07-23 13:14:12,343] Trial 23 finished with value: 0.5813856752720206 and parameters: {'knn_imp_n_neighbors': 6, 'feature_remover_necessary': True, 'cat_encoder_name': 'CountEncoder', 'scaler_name': 'StandardScaler', 'dim_red_name': 'LocallyLinearEmbedding', 'classifier_name': 'LogisticRegression'}. Best is trial 4 with value: 0.7767548906789413.


In [349]:
study_functionized.trials_dataframe(attrs=('number','value','params','duration')).sort_values(by='value',ascending=False)[:50]

Unnamed: 0,number,value,params_bTransformer_name,params_cat_encoder_name,params_classifier_name,params_dim_red_name,params_knn_imp_n_neighbors,params_scaler_name,duration
3,3,0.749225,,CountEncoder,RandomForestClassifier,Isomap,5,RobustScaler,0 days 00:04:34.782319
8,8,0.739677,,BinaryEncoder,SVC,PCA,4,MinMaxScaler,0 days 00:01:46.607638
2,2,0.737721,,CountEncoder,RidgeClassifier,PCA,3,MinMaxScaler,0 days 00:00:29.235870
1,1,0.703853,,WOEEncoder,RidgeClassifier,Isomap,5,QuantileTransformer,0 days 00:03:47.684164
7,7,0.686528,,CountEncoder,NaiveBayes,PCA,2,RobustScaler,0 days 00:00:31.078818
5,5,0.534678,,CountEncoder,NaiveBayes,LocallyLinearEmbedding,5,MinMaxScaler,0 days 00:03:22.870803
6,6,0.514664,bMinMax,OrdinalEncoder,NaiveBayes,LinearDiscriminantAnalysis,6,PowerTransformer,0 days 00:01:55.598971
0,0,,bMinMax,CountEncoder,RandomForestClassifier,PCA,5,PowerTransformer,0 days 00:01:26.032308
4,4,,,OneHotEncoder,GradientBoostingClassifier,LocallyLinearEmbedding,5,QuantileTransformer,0 days 00:05:48.120568
9,9,,passthrough,WOEEncoder,LogisticRegression,PCA,3,PowerTransformer,0 days 00:01:23.244148


In [None]:
study_functionized.trials_dataframe(attrs=('number','value','params','duration')).sort_values(by='value',ascending=False)[50:100]

In [None]:
optuna.visualization.plot_param_importances(study=study_functionized)

In [None]:
#Number of selection of predictors and transformers in top 100 and top 70 trials
top_100_trials = study_functionized.trials_dataframe(attrs=('number','value','params','duration')).sort_values(by='value',ascending=False)[:100]
top_70_trials = study_functionized.trials_dataframe(attrs=('number','value','params','duration')).sort_values(by='value',ascending=False)[:70]
display(top_100_trials['params_cat_encoder_name'].value_counts(),'---top 70',top_70_trials['params_cat_encoder_name'].value_counts(),'-'*50)
display(top_100_trials['params_classifier_name'].value_counts(),'---top 70',top_70_trials['params_classifier_name'].value_counts(),'-'*50)
display(top_100_trials['params_dim_red_name'].value_counts(),'---top 70',top_70_trials['params_dim_red_name'].value_counts(),'-'*50)
display(top_100_trials['params_knn_imp_n_neighbors'].value_counts(),'---top 70',top_70_trials['params_knn_imp_n_neighbors'].value_counts(),'-'*50)
display(top_100_trials['params_bTransformer_name'].value_counts(),'---top 70',top_70_trials['params_bTransformer_name'].value_counts(),'-'*50)
display(top_100_trials['params_scaler_name'].value_counts(),'---top 70',top_70_trials['params_scaler_name'].value_counts(),'-'*50)

In [None]:
#Number of selection of predictors and transformers in top 50 and top 20 non-duplicated trials
non_duplicated_trials = study_functionized.trials_dataframe(attrs=('value','params')).sort_values(by='value',ascending=False).drop_duplicates()
display(non_duplicated_trials['params_cat_encoder_name'][:50].value_counts(),'---top 20',non_duplicated_trials['params_cat_encoder_name'][:20].value_counts(),'-'*50)
display(non_duplicated_trials['params_classifier_name'][:100].value_counts(),'---top 75',non_duplicated_trials['params_classifier_name'][:75].value_counts(),'-'*50)
display(non_duplicated_trials['params_dim_red_name'][:50].value_counts(),'---top 20',non_duplicated_trials['params_dim_red_name'][:20].value_counts(),'-'*50)
display(non_duplicated_trials['params_knn_imp_n_neighbors'][:50].value_counts(),'---top 20',non_duplicated_trials['params_knn_imp_n_neighbors'][:20].value_counts(),'-'*50)
display(non_duplicated_trials['params_bTransformer_name'][:50].value_counts(),'---top 20',non_duplicated_trials['params_bTransformer_name'][:20].value_counts(),'-'*50)
display(non_duplicated_trials['params_scaler_name'][:50].value_counts(),'---top 20',non_duplicated_trials['params_scaler_name'][:20].value_counts(),'-'*50)

# 

In [24]:
rbi_pipe =Pipeline([
    ('feature_eng',FeatureEngineering()),
    ('rbi',rule_based_imputation()),
    ('feature_remover',FeatureRemover())
])

In [25]:
rbi_pipe.fit_transform(df_train)

Number of NaN before RBI: 4375
Middle Execution time: 0.006994962692260742
(Group,HomePlanet) Exe Time: 3.1642398834228516
(Group,CabinSide) Exe Time: 6.249605178833008
(Cabin,HomePlanet) Exe Time: 15.118814945220947
(LastName,HomePlanet) Exe Time: 18.511082649230957
Number of NaN after RBI: 3246
% 25.805714285714288 (count:1129) of NaN filled with RBI !
Execution Time: 18.607084035873413


Unnamed: 0,HomePlanet,CryoSleep,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group,HasFamily,Expense,HasExpense,NumAmenitiesUsed,NecessityExpense,LuxuryExpense,CabinDeck
0,Europa,False,0.0,0.0,0.0,0.0,0.0,False,1,0,0.0,0,0,0.0,0.0,B
1,Earth,False,109.0,9.0,25.0,549.0,44.0,True,2,0,736.0,1,5,143.0,593.0,F
2,Europa,False,43.0,3576.0,0.0,6715.0,49.0,False,3,1,10383.0,1,4,3619.0,6764.0,A
3,Europa,False,0.0,1283.0,371.0,3329.0,193.0,False,3,1,5176.0,1,4,1654.0,3522.0,A
4,Earth,False,303.0,70.0,151.0,565.0,2.0,True,4,0,1091.0,1,5,524.0,567.0,F
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,0.0,6819.0,0.0,1643.0,74.0,False,9276,0,8536.0,1,3,6819.0,1717.0,A
8689,Earth,True,0.0,0.0,0.0,0.0,0.0,False,9278,0,0.0,0,0,0.0,0.0,G
8690,Earth,False,0.0,0.0,1872.0,1.0,0.0,True,9279,0,1873.0,1,2,1872.0,1.0,G
8691,Europa,False,0.0,1049.0,0.0,353.0,3235.0,False,9280,1,4637.0,1,3,1049.0,3588.0,E


# OPTUNA HYPERPARAMETER OPTIMIZATION