## Imports

In [1]:
# data manulplation
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

# metrcs
from sklearn.metrics import f1_score

# general
import os
from typing import List
import warnings
warnings.filterwarnings('ignore')

## Read Data

In [2]:
DATA_DIR = os.path.join("Dataset","2_processed_data","train_iteratoin4.csv")
df = pd.read_csv(DATA_DIR)

In [3]:
df.shape

(891, 33)

In [4]:
df.columns

Index(['Survived', 'Name_Words', 'Name_Length', 'Name_Init_labelencode',
       'Name_Init_ordinalencode', 'Name_Init_capt', 'Name_Init_col',
       'Name_Init_countess', 'Name_Init_don', 'Name_Init_dr',
       'Name_Init_jonkheer', 'Name_Init_lady', 'Name_Init_major',
       'Name_Init_master', 'Name_Init_miss', 'Name_Init_mlle', 'Name_Init_mme',
       'Name_Init_mr', 'Name_Init_mrs', 'Name_Init_ms', 'Name_Init_rev',
       'Name_Init_sir', 'Sex_labelencode', 'Embraked_labelencoded',
       'Embarked_ordinalencode', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Pclass', 'SibSp', 'Parch', 'Age_Power_MinMax', 'Fare_Power'],
      dtype='object')

In [5]:
TARGET_COL = ["Survived"]

## Feature Sets

In [6]:
FEATURES_SET = dict()

### Filter Based Methods

##### Correlated Features

In [7]:
FEATURES_SET["correlated_fe"] = ['Fare_Power',
 'Name_Init_mrs',
 'Name_Init_miss',
 'Name_Init_ordinalencode',
 'Sex_labelencode',
 'Name_Length',
 'Pclass',
 'Name_Init_mr']

##### Less Inter Correlated Features

In [8]:
FEATURES_SET["inter_corr_fe"] = ['Name_Words',
 'Name_Length',
 'Parch',
 'Age_Power_MinMax',
 'Fare_Power',
 'Name_Init_labelencode',
 'Name_Init_ordinalencode',
 'Name_Init_master',
 'Name_Init_miss',
 'Name_Init_mr',
 'Name_Init_mrs',
 'Sex_labelencode',
 'Embraked_labelencoded',
 'Embarked_ordinalencode',
 'Embarked_C',
 'Embarked_S',
 'Pclass']

##### Statistically Significant Features

In [9]:
FEATURES_SET["stats_fe"] = ['Name_Words',
 'Name_Length',
 'Parch',
 'Age_Power_MinMax',
 'Fare_Power',
 'Name_Init_labelencode',
 'Name_Init_ordinalencode',
 'Name_Init_master',
 'Name_Init_miss',
 'Name_Init_mr',
 'Name_Init_mrs',
 'Sex_labelencode',
 'Embraked_labelencoded',
 'Embarked_ordinalencode',
 'Embarked_C',
 'Embarked_S',
 'Pclass']

##### Forward Selection

In [10]:
FEATURES_SET["forward_selection_complex"] = ['Name_Init_ordinalencode', 'Name_Init_capt', 'Name_Init_col',
       'Name_Init_countess', 'Name_Init_don', 'Name_Init_jonkheer',
       'Name_Init_lady', 'Name_Init_major', 'Name_Init_master',
       'Name_Init_mlle', 'Name_Init_mme', 'Name_Init_mr', 'Name_Init_mrs',
       'Name_Init_rev', 'Sex_labelencode', 'SibSp']

In [11]:
FEATURES_SET["forward_selection_simple"] = ['Name_Init_capt', 'Name_Init_col', 'Name_Init_countess',
       'Name_Init_don', 'Name_Init_jonkheer', 'Name_Init_lady',
       'Name_Init_major', 'Name_Init_mlle', 'Name_Init_mme',
       'Name_Init_mr', 'Name_Init_mrs', 'Name_Init_ms', 'Name_Init_rev',
       'Name_Init_sir', 'SibSp', 'Parch']

##### Backward Selection

In [12]:
FEATURES_SET["backward_selection_complex"] = ['Name_Length', 'Name_Init_labelencode', 'Name_Init_ordinalencode',
       'Name_Init_don', 'Name_Init_master', 'Name_Init_miss',
       'Name_Init_mlle', 'Name_Init_mme', 'Name_Init_mr', 'Name_Init_ms',
       'Name_Init_sir', 'Embraked_labelencoded', 'Embarked_Q',
       'Embarked_S', 'Pclass', 'Fare_Power']

In [13]:
FEATURES_SET["backward_selection_simple"] = ['Name_Words', 'Name_Length', 'Name_Init_ordinalencode',
       'Name_Init_master', 'Name_Init_miss', 'Name_Init_mr',
       'Name_Init_mrs', 'Name_Init_rev', 'Sex_labelencode',
       'Embraked_labelencoded', 'Embarked_C', 'Pclass', 'SibSp', 'Parch',
       'Age_Power_MinMax', 'Fare_Power']

##### Recursive Feature Elimination

In [14]:
FEATURES_SET["rfe_selection_complex"] = ['Name_Init_labelencode', 'Name_Init_ordinalencode',
       'Name_Init_master', 'Name_Init_mr', 'Name_Init_rev',
       'Sex_labelencode', 'Embraked_labelencoded',
       'Embarked_ordinalencode', 'Pclass', 'SibSp', 'Age_Power_MinMax',
       'Fare_Power']

In [15]:
FEATURES_SET["rfe_selection_simple"] = ['Name_Words', 'Name_Init_labelencode', 'Name_Init_capt',
       'Name_Init_col', 'Name_Init_countess', 'Name_Init_don',
       'Name_Init_dr', 'Name_Init_jonkheer', 'Name_Init_lady',
       'Name_Init_major', 'Name_Init_master', 'Name_Init_miss',
       'Name_Init_mlle', 'Name_Init_mr', 'Name_Init_mrs', 'Name_Init_ms',
       'Name_Init_rev', 'Name_Init_sir', 'Sex_labelencode',
       'Embraked_labelencoded', 'Embarked_ordinalencode', 'Embarked_C',
       'Embarked_S', 'Pclass', 'SibSp', 'Parch', 'Age_Power_MinMax',
       'Fare_Power']

##### Decision Tree: Important Features

In [16]:
FEATURES_SET["Dtree_fe"] = ['Name_Init_mr', 'Fare_Power', 'Age_Power_MinMax', 
'Name_Length', 'Pclass', 'Name_Init_ordinalencode', 'Name_Words', 'Sex_labelencode', 
'SibSp', 'Embarked_C', 'Parch', 'Embraked_labelencoded', 'Name_Init_mrs', 'Embarked_S', 
'Embarked_ordinalencode']

##### Random Forest: Important Features

In [17]:
FEATURES_SET["Rforest_fe"] = ['Fare_Power', 'Name_Length', 'Age_Power_MinMax', 
'Name_Init_ordinalencode', 'Name_Init_mr', 'Pclass', 'Sex_labelencode', 'SibSp', 
'Name_Words', 'Name_Init_labelencode', 'Parch', 'Name_Init_mrs', 'Name_Init_miss', 
'Embraked_labelencoded', 'Embarked_ordinalencode', 'Embarked_C', 'Embarked_S', 
'Name_Init_master', 'Embarked_Q', 'Name_Init_rev', 'Name_Init_dr', 'Name_Init_don', 
'Name_Init_major', 'Name_Init_col', 'Name_Init_jonkheer', 'Name_Init_capt', 'Name_Init_sir', 
'Name_Init_ms', 'Name_Init_countess', 'Name_Init_mlle', 'Name_Init_mme', 'Name_Init_lady']

##### PCA: Forward Selection

In [18]:
FEATURES_SET["pca_forward_complex"] = ['Name_Init_capt', 'Name_Init_col', 'Name_Init_countess',
       'Name_Init_don', 'Name_Init_dr', 'Name_Init_jonkheer',
       'Name_Init_lady', 'Name_Init_major', 'Name_Init_master',
       'Name_Init_mme', 'Name_Init_mr', 'Name_Init_mrs', 'Name_Init_ms',
       'Name_Init_rev', 'Sex_labelencode', 'SibSp']

In [19]:
FEATURES_SET["pca_forward_simple"] = ['Name_Init_capt', 'Name_Init_col', 'Name_Init_countess',
       'Name_Init_don', 'Name_Init_jonkheer', 'Name_Init_lady',
       'Name_Init_major', 'Name_Init_mlle', 'Name_Init_mme',
       'Name_Init_mr', 'Name_Init_mrs', 'Name_Init_ms', 'Name_Init_rev',
       'Name_Init_sir', 'SibSp', 'Parch']

##### PCA: Backward Selection

In [20]:
FEATURES_SET["pca_backward_complex"] = ['Name_Init_ordinalencode', 'Name_Init_don', 'Name_Init_dr',
       'Name_Init_jonkheer', 'Name_Init_lady', 'Name_Init_major',
       'Name_Init_master', 'Name_Init_mlle', 'Name_Init_mme',
       'Name_Init_ms', 'Name_Init_sir', 'Embarked_S', 'Pclass', 'Parch',
       'Age_Power_MinMax', 'Fare_Power']

In [21]:
FEATURES_SET["pca_backward_simple"] = ['Name_Words', 'Name_Length', 'Name_Init_ordinalencode',
       'Name_Init_master', 'Name_Init_miss', 'Name_Init_mr',
       'Name_Init_mrs', 'Name_Init_rev', 'Sex_labelencode',
       'Embraked_labelencoded', 'Embarked_C', 'Pclass', 'SibSp', 'Parch',
       'Age_Power_MinMax', 'Fare_Power']

##### PCA: RFE Selection

In [22]:
FEATURES_SET["pca_rfe_complex"] = ['Name_Init_labelencode', 'Name_Init_ordinalencode',
       'Name_Init_master', 'Name_Init_mr', 'Name_Init_rev',
       'Sex_labelencode', 'Embraked_labelencoded',
       'Embarked_ordinalencode', 'Pclass', 'SibSp', 'Age_Power_MinMax',
       'Fare_Power']

In [23]:
FEATURES_SET["pca_rfe_simple"] = ['Name_Words', 'Name_Init_labelencode', 'Name_Init_capt',
       'Name_Init_col', 'Name_Init_countess', 'Name_Init_don',
       'Name_Init_dr', 'Name_Init_jonkheer', 'Name_Init_lady',
       'Name_Init_major', 'Name_Init_master', 'Name_Init_miss',
       'Name_Init_mlle', 'Name_Init_mr', 'Name_Init_mrs', 'Name_Init_ms',
       'Name_Init_rev', 'Name_Init_sir', 'Sex_labelencode',
       'Embraked_labelencoded', 'Embarked_ordinalencode', 'Embarked_C',
       'Embarked_S', 'Pclass', 'SibSp', 'Parch', 'Age_Power_MinMax',
       'Fare_Power']

# Model Selection

In [145]:
# linear model
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
# Byes model
from sklearn.naive_bayes import GaussianNB
# support vector machine
from sklearn.svm import SVC
# distance based model
from sklearn.neighbors import KNeighborsClassifier
# tree based
from sklearn.tree import DecisionTreeClassifier
# bagging special case
from sklearn.ensemble import RandomForestClassifier
# ensemble
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt

# logger
import logging

logging.basicConfig(filemode="error.log", 
                    level=logging.ERROR,  # Set the logging level to ERROR
                    format='%(asctime)s - %(levelname)s - %(message)s',  # Log format
                          )
# Create a logger instance
logger = logging.getLogger('my_logger')

In [229]:
# global values
min_value = 0
max_value = 10
def highlight_scores(value):
    # Define blue shades from light to dark (reversed for higher values darker)
    colors = ['#F9FCFE', '#F7FBFD', '#F6F9FB', '#F0F5F9', '#E6E8F0',
          '#D1D9EC', '#A7C1EB', '#7EB9EA', '#4EB1E9', '#1EB1E8',
          '#00C4F0', '#00B0F1', '#00A3F5', '#0091E6', '#0081D6',
          '#0072B5', '#005B82', '#004C6D', '#003F5C', '#003F5C']
    
    # Ensure min_value and max_value are valid
    if min_value >= max_value:
        raise ValueError("min_value must be less than max_value.")

    # Calculate the size of each bucket
    bucket_size = (max_value - min_value) / 20

    # Check if the value is within the range
    if value < min_value or value > max_value:
        return 'background-color: #FFFFFF; color: black'  # Default color for out-of-range values

    # Determine the bucket index based on the value
    bucket_index = int((value - min_value) / bucket_size)
    if bucket_index >= 20:
        bucket_index = 19  # Cap the bucket index to the last color

    # Apply color based on the bucket index
    return f'background-color: {colors[bucket_index]}; color: black'

In [218]:
class Model_Selection_Methods:

    def __init__(self, df:pd.DataFrame, features:List[str], target:List[str], feature_tags:List[str], feature_list:List[List[str]], scoring:str, cv:int, seed:int, shuffle:bool, base_line:float):
        self.df = df
        self.features = features
        self.target = target
        self.scoring = scoring
        self.cv = cv
        self.seed = seed
        self.shuffle = shuffle
        self.X, self.y = self.load_data()
        self.feature_tags = feature_tags
        self.feature_list = feature_list
        self.base_line = base_line

    def load_data(self):
        X = self.df[self.features].values
        y = self.df[self.target].iloc[:,0].values
        return X, y
    
    def get_model(self, model_tag):
        
        if model_tag == "LR":
            return {"LR": ("Logistic Regression", LogisticRegression(solver="liblinear" ,max_iter=2000))}
        
        if model_tag == "RDG":
            return {"RDG": {"Ridge Classifier", RidgeClassifier()}}
        
        if model_tag == "SGD":
            return {"SGD": ("Stocastic Gradient", SGDClassifier())}
        
        if model_tag == "NB":        
            return {"NB": ("Naive Bayes", GaussianNB())}

        if model_tag == "SVM":        
            return {"SVM": ("Support Vector", SVC(kernel='linear'))} # this kernel works with RFE

        if model_tag == "KNN":         
            return {"KNN": ("K-Neighbors", KNeighborsClassifier(n_neighbors=2))}

        if model_tag == "DT":        
            return {"DT": ("Decision Tree", DecisionTreeClassifier())}

        if model_tag == "RF":        
            return {"RF": ("Random Forest", RandomForestClassifier())}

        if model_tag == "GB":        
            return {"GB": ("Gradient Boosting", GradientBoostingClassifier())}

        if model_tag == "AB":        
            return {"AB": ("Ada Boosting", AdaBoostClassifier())}

        if model_tag == "ETC":        
            return {"ETC": ("Extra Tres Boosting", ExtraTreesClassifier())}

        if model_tag == "XGB":        
            return {"XGB": ("Extra Gradient Boosting", XGBClassifier())}

        if model_tag == "XGRFB":        
            return {"XGRFB": ("Extra Random Forest Boosting", XGBRFClassifier())}

        if model_tag == "LGB":        
            return {"LGB": ("Light Boosting",LGBMClassifier(learning_rate=0.01, n_estimators=1000))}

        if model_tag == "CB":        
            return {"CB": ("Cat Boosting", CatBoostClassifier(verbose=0))}


    def evaluate_model(self, model, n_features:List[int]):
        # define the model evaluation procedure
        cv = StratifiedKFold(n_splits=self.cv, random_state=self.seed, shuffle=self.shuffle)
        # evaluate the model: n-jobs=-1 [uses all cores]
        scores = cross_val_score(model, self.X[:,n_features], self.y, scoring=self.scoring, cv=cv, n_jobs=-1)
        return scores
    
    def get_feature_index(self, feature_subset:List[str]):
        features_index = []
        for sub_feature in feature_subset:
            features_index.append(self.features.index(sub_feature))
        # print(features_index)
        # print("\n")
        return features_index
    
    def get_custome_scores(self, scores: List[float]):
        mean_score = np.mean(scores)
        std_score = np.std(scores)

        return mean_score, std_score
    
    def box_plot(self, names, results, model_name:str):
        """ 
        plots box plot for each feature combination
        """
        # plot model performance for comparison
        plt.boxplot(results, labels=names, vert=False, showmeans=True)
        plt.axvline(x=self.base_line, color='r', linestyle='--', linewidth=2, label="Baseline")
        plt.xlabel(f"Score: {self.scoring}")
        plt.ylabel("feature Subsets")
        plt.title(f"Performance of Model: {model_name}")
        plt.legend()
        plt.grid()
        plt.show()

    def model_selection(self, model_tags:List[str], single_model:bool = True, print_status:bool = False):
        """ 
        Objective:

        | feature selection | model 1 | model 2 | model 3 | ... | model 4 |
        -----------------------------------------------------------------
        |   selection 1     |   f1    |   f1    |   f1    | ... |   f1    |
        |   selection 2     |   f1    |   f1    |   f1    | ... |   f1    |
        |   selection 3     |   f1    |   f1    |   f1    | ... |   f1    |

        Parameters:

        feature_subset (List[str]): subset of features

        print_status (bool): print model training status

        Returns:

        model_report (dict): models performance on each subset of features
        """
        # run model evaluation for all sub set of features
        model_report = dict()
        subset_features_list = []
        features_tags_list = []
        model_names = []
        all_scores = []

        if not single_model:
            model_report["Selection_Criteria"] = self.feature_tags

        # testing all features
        for model_tag in model_tags:
            # get model 
            try:
                mean_scores = []
                std_scores = []
                model_name, model = self.get_model(model_tag)[model_tag]
            except Exception as e:
                logger.error("problem with model Loading: %s", e)

            print('='*15,model_name,'='*15)
            
            for feature_tag, feature_subset in zip(self.feature_tags, self.feature_list):
                # subset of features
                print('-'*10,feature_tag,'-'*10)

                try:
                    # evaluating for score on selected subset of features
                    s = self.evaluate_model( model, self.get_feature_index(feature_subset))
                except Exception as e:
                    logger.error("problem with Model Evaluation: %s", e)

                # mean scores
                mean, std = self.get_custome_scores(s)
                
                # print those scores
                if print_status:
                    msg = f"{len(feature_subset)} : Mean: {mean} Std: {std}\n"
                    print(msg)
                
                if single_model:
                    # collecting all scores
                    features_tags_list.append(feature_tag)
                    mean_scores.append(mean)
                    std_scores.append(std)
                    all_scores.append(list(s))
                    subset_features_list.append(feature_subset)
                else:
                    mean_scores.append(mean)

            if single_model:
                # plotting results
                self.box_plot(features_tags_list, all_scores, model_name)
            else:
                model_report[f"{model_name}"] = mean_scores

        if single_model:             
            # collecting all scores
            model_report["Selection_Criteria"] = features_tags_list
            model_report[f"{model_name} (mean:{self.scoring})"] = mean_scores
            model_report[f"{model_name} (std:{self.scoring})"] = std_scores
            model_report["cv_scores"] = all_scores
            model_report["subset_features"] = subset_features_list

        return model_report

## Input Parameters

In [159]:
# parameters
DATA = df.copy()
FEATURES = [i for i in df.columns if i not in TARGET_COL]
SCORING = "f1"
CV = 3
SEED = 0
SHUFFLE = True
BASELINE_SCORE = 0.62200
FEATURES_TAGS = list(FEATURES_SET.keys())
FEATURES_SUBSET_LIST = list(FEATURES_SET.values())

In [160]:
model_tags = ["LR", 
              "RDG", 
              "NB", "SVM", "KNN", "DT", "RF", "SGD", "GB", "AB", "ETC", "XGB", "XGRFB", 
            #   "LGB", 
              "CB"]

In [161]:
model_selection_obj = Model_Selection_Methods(
    DATA,
    FEATURES,
    TARGET_COL,
    FEATURES_TAGS,
    FEATURES_SUBSET_LIST,
    SCORING,
    CV,
    SEED,
    SHUFFLE,
    BASELINE_SCORE
)

## Selecting models

In [162]:
report = model_selection_obj.model_selection(model_tags, False, True)

---------- correlated_fe ----------
8 : Mean: 0.7200147409663091 Std: 0.05332764760879653

---------- inter_corr_fe ----------
17 : Mean: 0.7473088336133045 Std: 0.04317829696310947

---------- stats_fe ----------
17 : Mean: 0.7473088336133045 Std: 0.04317829696310947

---------- forward_selection_complex ----------
16 : Mean: 0.7447458040074432 Std: 0.03583963000510542

---------- forward_selection_simple ----------
16 : Mean: 0.7522194674428958 Std: 0.02696382243843028

---------- backward_selection_complex ----------
16 : Mean: 0.7307863338218997 Std: 0.03330937681480319

---------- backward_selection_simple ----------
16 : Mean: 0.767242649638196 Std: 0.03788910645739794

---------- rfe_selection_complex ----------
12 : Mean: 0.7588395372714135 Std: 0.021335354597793562

---------- rfe_selection_simple ----------
28 : Mean: 0.7624125757294665 Std: 0.029382572522465667

---------- Dtree_fe ----------
15 : Mean: 0.7568684055322957 Std: 0.03789275061158522

---------- Rforest_fe -----

ERROR:my_logger:problem with Model Evaluation: The 'estimator' parameter of check_scoring must be an object implementing 'fit'. Got 'Ridge Classifier' instead.
ERROR:my_logger:problem with Model Evaluation: The 'estimator' parameter of check_scoring must be an object implementing 'fit'. Got 'Ridge Classifier' instead.
ERROR:my_logger:problem with Model Evaluation: The 'estimator' parameter of check_scoring must be an object implementing 'fit'. Got 'Ridge Classifier' instead.
ERROR:my_logger:problem with Model Evaluation: The 'estimator' parameter of check_scoring must be an object implementing 'fit'. Got 'Ridge Classifier' instead.
ERROR:my_logger:problem with Model Evaluation: The 'estimator' parameter of check_scoring must be an object implementing 'fit'. Got 'Ridge Classifier' instead.
ERROR:my_logger:problem with Model Evaluation: The 'estimator' parameter of check_scoring must be an object implementing 'fit'. Got 'Ridge Classifier' instead.
ERROR:my_logger:problem with Model Evalu

16 : Mean: 0.6103239294076368 Std: 0.021854030709798765

---------- pca_backward_simple ----------
16 : Mean: 0.767242649638196 Std: 0.03788910645739794

---------- pca_rfe_complex ----------
12 : Mean: 0.7588395372714135 Std: 0.021335354597793562

---------- pca_rfe_simple ----------
28 : Mean: 0.7624125757294665 Std: 0.029382572522465667

---------- correlated_fe ----------
8 : Mean: 0.7624125757294665 Std: 0.029382572522465667

---------- inter_corr_fe ----------
17 : Mean: 0.7624125757294665 Std: 0.029382572522465667

---------- stats_fe ----------
17 : Mean: 0.7624125757294665 Std: 0.029382572522465667

---------- forward_selection_complex ----------
16 : Mean: 0.7624125757294665 Std: 0.029382572522465667

---------- forward_selection_simple ----------
16 : Mean: 0.7624125757294665 Std: 0.029382572522465667

---------- backward_selection_complex ----------
16 : Mean: 0.7624125757294665 Std: 0.029382572522465667

---------- backward_selection_simple ----------
16 : Mean: 0.76241257

## Reporting Models Scores

In [164]:
models_report = pd.DataFrame(report)

In [230]:
# global values
max_value = np.max(np.array(models_report)[1:,1:])
min_value = max_value - 0.5

In [231]:
# Example usage
models_report.style.applymap(highlight_scores, subset=[col for col in models_report.columns if col not in ["Selection_Criteria"]])

Unnamed: 0,Selection_Criteria,Logistic Regression,RidgeClassifier(),Naive Bayes,Support Vector,K-Neighbors,Decision Tree,Random Forest,Stocastic Gradient,Gradient Boosting,Ada Boosting,Extra Tres Boosting,Extra Gradient Boosting,Extra Random Forest Boosting,Cat Boosting
0,correlated_fe,0.720015,0.762413,0.716931,0.718496,0.618845,0.694078,0.713282,0.716483,0.747308,0.703731,0.72034,0.722896,0.7705,0.742677
1,inter_corr_fe,0.747309,0.762413,0.735525,0.722398,0.590542,0.684253,0.747106,0.682596,0.745557,0.720397,0.712795,0.713925,0.752952,0.751663
2,stats_fe,0.747309,0.762413,0.735525,0.722398,0.590542,0.686594,0.727616,0.457601,0.743764,0.720397,0.724276,0.713925,0.752952,0.751663
3,forward_selection_complex,0.744746,0.762413,0.74703,0.73156,0.665504,0.751021,0.753207,0.689061,0.749716,0.744531,0.754631,0.757503,0.756888,0.749004
4,forward_selection_simple,0.752219,0.762413,0.28449,0.73327,0.617485,0.745312,0.744221,0.67709,0.753564,0.758973,0.750053,0.747718,0.752822,0.750738
5,backward_selection_complex,0.730786,0.762413,0.245608,0.722259,0.610632,0.702165,0.741272,0.646739,0.730583,0.731808,0.714046,0.732066,0.763913,0.747076
6,backward_selection_simple,0.767243,0.762413,0.701028,0.746492,0.622854,0.686082,0.757278,0.739656,0.744559,0.74162,0.723833,0.727611,0.757047,0.758454
7,rfe_selection_complex,0.75884,0.762413,0.562456,0.739551,0.713201,0.692578,0.754819,0.706424,0.750864,0.727233,0.730184,0.735129,0.764671,0.767599
8,rfe_selection_simple,0.762413,0.762413,0.625601,0.745362,0.685187,0.695305,0.744523,0.541099,0.75495,0.738758,0.732163,0.732552,0.761983,0.755923
9,Dtree_fe,0.756868,0.762413,0.737203,0.754263,0.606323,0.679256,0.74884,0.47678,0.745639,0.731315,0.728701,0.733617,0.753492,0.759018


## Final Models

Simple Model - Logistic Regression - `backward_selection_simple` - `0.767243`

Complec Model - Extra Random Forest Boosting - `correlated_fe` - `0.770500`

In [232]:
FEATURES_SET["backward_selection_simple"]

['Name_Words',
 'Name_Length',
 'Name_Init_ordinalencode',
 'Name_Init_master',
 'Name_Init_miss',
 'Name_Init_mr',
 'Name_Init_mrs',
 'Name_Init_rev',
 'Sex_labelencode',
 'Embraked_labelencoded',
 'Embarked_C',
 'Pclass',
 'SibSp',
 'Parch',
 'Age_Power_MinMax',
 'Fare_Power']

In [233]:
FEATURES_SET["correlated_fe"]

['Fare_Power',
 'Name_Init_mrs',
 'Name_Init_miss',
 'Name_Init_ordinalencode',
 'Sex_labelencode',
 'Name_Length',
 'Pclass',
 'Name_Init_mr']