<a href="https://www.kaggle.com/code/piyushjain572/select-best-imputer-params?scriptVersionId=200271762" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the modified Titanic training set and discard four columns this
# notebook does not use. The drop is chained onto the read (rather than
# using inplace=True) so `df` is created in one expression and is never
# mutated afterwards.
df = (
    pd.read_csv('/kaggle/input/modified-titanic-dataset/train.csv')
    .drop(columns=['PassengerId','Name','Ticket','Cabin'])
)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [3]:
# Hold out 20% of the rows for testing; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Survived']),  # features: every column except the target
    df['Survived'],                 # target
    test_size=0.2,
    random_state=2,
)

# Number of missing values per feature in the training portion.
X_train.isnull().sum()

Pclass        0
Sex           0
Age         148
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [9]:
# Numeric branch: fill missing values, then standardise.
# The step name 'impute' is referenced by the grid-search parameter keys,
# so it must not be renamed.
numerical_features = ['Age','Fare']
num_trf = Pipeline(steps=[
    ('impute',SimpleImputer(strategy='median')),
    ('scaling',StandardScaler())
])

# Categorical branch: fill missing values, then one-hot encode
# (dropping the first level of each feature).
#
# handle_unknown='ignore' matters because the grid search also tries
# strategy='constant' for this imputer. That strategy replaces missing
# values with a placeholder category which therefore exists only in rows
# that had a missing value -- and the training split has very few of those.
# A cross-validation training fold can end up without the placeholder while
# its validation fold contains it; with the default handle_unknown='error'
# the encoder would raise at transform time. With 'ignore' the unseen
# category is encoded as all zeros instead.
# NOTE: combining `drop` with handle_unknown='ignore' needs scikit-learn >= 1.0.
categorical_features = ['Sex','Embarked']
cat_trf = Pipeline(steps=[
    ('impute',SimpleImputer(strategy='most_frequent')),
    ('ohe',OneHotEncoder(drop='first',handle_unknown='ignore'))
])

In [10]:
# Route each feature group through its own sub-pipeline.
# Columns that appear in neither list are discarded: remainder='drop' is the
# ColumnTransformer default and is spelled out here so the omission is
# visible to the reader.
trf1 = ColumnTransformer(
    transformers=[
        ('num', num_trf, numerical_features),
        ('cat', cat_trf, categorical_features),
    ],
    remainder='drop',
)

In [11]:
# Full model: preprocessing followed by a logistic-regression classifier.
# The step names 'preprocessing' and 'classifier' are the prefixes used by
# the grid-search parameter keys.
pipe = Pipeline([
    ('preprocessing', trf1),
    ('classifier', LogisticRegression()),
])

# Display the assembled pipeline.
pipe

In [15]:
# Search space. Keys follow the "<step>__<sub-step>__<parameter>" path
# through the nested pipeline: 2 numeric strategies x 2 categorical
# strategies x 4 values of C = 16 candidates.
params_grid = {
    'preprocessing__num__impute__strategy': ['mean', 'median'],
    'preprocessing__cat__impute__strategy': ['most_frequent', 'constant'],
    'classifier__C': [0.1, 1, 10, 100],
}

# Exhaustive search with 10-fold cross-validation per candidate.
gridsearch = GridSearchCV(estimator=pipe, param_grid=params_grid, cv=10)

In [16]:
# Run the grid search on the training split only.
gridsearch.fit(X_train, y_train)

# Report the winning parameter combination. The heading is a plain string:
# it has no placeholders, so an f-string prefix is unnecessary.
print("Best params:")
print(gridsearch.best_params_)

Best params:
{'classifier__C': 1, 'preprocessing__cat__impute__strategy': 'most_frequent', 'preprocessing__num__impute__strategy': 'mean'}


In [18]:
# Mean cross-validated score of the best parameter combination found by the
# search. It comes from validation folds carved out of X_train during the
# fit above; the held-out X_test is not involved in this number.
# NOTE(review): no `scoring` argument was passed to GridSearchCV, so this is
# presumably the classifier's default score (accuracy) -- confirm.
print(f"Internal CV score: {gridsearch.best_score_:.3f}")

Internal CV score: 0.788


In [22]:
# Tabulate every grid-search candidate, highest mean CV score first.
# pandas is already imported as `pd` in the notebook's import cell, so it is
# not re-imported here.
cv_results = pd.DataFrame(gridsearch.cv_results_).sort_values(
    "mean_test_score", ascending=False
)

# Show only the searched parameters next to their mean validation score.
cv_results[[
    'param_classifier__C',
    'param_preprocessing__cat__impute__strategy',
    'param_preprocessing__num__impute__strategy',
    'mean_test_score',
]]

Unnamed: 0,param_classifier__C,param_preprocessing__cat__impute__strategy,param_preprocessing__num__impute__strategy,mean_test_score
4,1.0,most_frequent,mean,0.787852
5,1.0,most_frequent,median,0.787852
6,1.0,constant,mean,0.787852
7,1.0,constant,median,0.787852
8,10.0,most_frequent,mean,0.787852
9,10.0,most_frequent,median,0.787852
10,10.0,constant,mean,0.787852
11,10.0,constant,median,0.787852
12,100.0,most_frequent,mean,0.787852
13,100.0,most_frequent,median,0.787852
