<a href="https://colab.research.google.com/github/SangamSilwal/Machine-learning-Series/blob/main/Day_8_A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Automatically Selecting the Imputation Strategy with Grid Search

In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.linear_model import LogisticRegression

In [14]:
# Load the raw training data from the working directory.
# Judging by the columns shown below, this looks like the Kaggle Titanic training set — TODO confirm the source.
df = pd.read_csv(filepath_or_buffer="train.csv")

In [15]:
# Remove the identifier / free-text columns that are not used as model features.
# Reassigning instead of `inplace=True`, together with errors='ignore', keeps the
# cell idempotent: running it a second time no longer raises KeyError because the
# columns have already been removed.
df = df.drop(columns=['PassengerId','Name','Ticket','Cabin'], errors='ignore')

In [16]:
# Preview the first two rows of the frame after the column drop.
df.head(n=2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C


In [17]:
# Separate the target from the predictors: `y` is the 'Survived' column,
# `X` is every remaining column of `df`.
y = df['Survived']
X = df.drop('Survived', axis=1)

In [18]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [19]:
# Preview the first five training rows (5 is DataFrame.head's default, spelled out here).
X_train.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
30,1,male,40.0,0,0,27.7208,C
10,3,female,4.0,1,1,16.7,S
873,3,male,47.0,0,0,9.0,S
182,3,male,9.0,4,2,31.3875,S
876,3,male,20.0,0,0,9.8458,S


In [20]:
# Numeric columns: fill missing values, then standardise.
# The imputer strategies chosen here are only starting values; the grid search
# further down overrides them per candidate through the `...__imputer__strategy`
# keys, so the step names ('imputer', 'scaler', 'ohe') must stay as they are.
numerical_features = ['Age', 'Fare']
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values, then one-hot encode.
# handle_unknown='ignore' makes a category that was not seen during fit encode
# as all zeros instead of raising at transform time.
categorical_features = ['Embarked', 'Sex']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

In [21]:
# Route each column group through its own sub-pipeline and concatenate the results.
# NOTE(review): remainder='drop' is ColumnTransformer's default and is spelled out
# here on purpose: every column of X that is not listed in the two feature lists
# (Pclass, SibSp, Parch) is discarded before the classifier sees it — confirm
# that this is intended.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

In [22]:
# End-to-end model: preprocessing followed by a logistic regression with default
# hyper-parameters. The step names are what the `preprocessor__...` and
# `classifier__...` keys of the search grid refer to.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

In [23]:
# Display the assembled pipeline; as the cell's last expression it is rendered
# with the estimator's notebook repr.
clf

In [25]:
from sklearn.model_selection import GridSearchCV

In [26]:
# Search space. Each key walks the nested estimator with `__`:
#   pipeline step -> ColumnTransformer entry -> sub-pipeline step -> parameter.
# 2 numeric strategies x 2 categorical strategies x 4 values of C = 16 candidates.
param_grid = dict(
    preprocessor__num__imputer__strategy=['mean', 'median'],
    preprocessor__cat__imputer__strategy=['most_frequent', 'constant'],
    classifier__C=[0.1, 1.0, 10, 100],
)

# Exhaustive search over the grid, scoring every candidate with 10-fold cross-validation.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10)

In [27]:
# Fit every candidate on the training split only; the test split is not touched here.
grid_search.fit(X_train, y_train)

# Report the winning parameter combination (header and dict on separate lines).
print("Best Params: ", grid_search.best_params_, sep="\n")

Best Params: 
{'classifier__C': 1.0, 'preprocessor__cat__imputer__strategy': 'most_frequent', 'preprocessor__num__imputer__strategy': 'mean'}


In [28]:
# Mean cross-validated score of the best candidate, rounded to three decimals.
print("Internal CV Score: " + format(grid_search.best_score_, ".3f"))

Internal CV Score: 0.788


In [32]:
# Tabulate the per-candidate cross-validation results, best mean test score first.
cv_result = (
    pd.DataFrame(grid_search.cv_results_)
    .sort_values(by="mean_test_score", ascending=False)
)

In [33]:
# Show the sorted results table. In the recorded output the displayed candidates
# share the same mean_test_score and rank, so their order is not informative.
cv_result

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__C,param_preprocessor__cat__imputer__strategy,param_preprocessor__num__imputer__strategy,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
7,0.020494,0.002177,0.009002,0.000835,1.0,constant,median,"{'classifier__C': 1.0, 'preprocessor__cat__imp...",0.763889,0.861111,...,0.802817,0.788732,0.760563,0.84507,0.732394,0.760563,0.788732,0.787852,0.037656,1
6,0.022118,0.005228,0.009303,0.000825,1.0,constant,mean,"{'classifier__C': 1.0, 'preprocessor__cat__imp...",0.763889,0.861111,...,0.802817,0.788732,0.760563,0.84507,0.732394,0.760563,0.788732,0.787852,0.037656,1
5,0.021253,0.001897,0.011155,0.002235,1.0,most_frequent,median,"{'classifier__C': 1.0, 'preprocessor__cat__imp...",0.763889,0.861111,...,0.802817,0.788732,0.760563,0.84507,0.732394,0.760563,0.788732,0.787852,0.037656,1
4,0.019461,0.001621,0.00987,0.001833,1.0,most_frequent,mean,"{'classifier__C': 1.0, 'preprocessor__cat__imp...",0.763889,0.861111,...,0.802817,0.788732,0.760563,0.84507,0.732394,0.760563,0.788732,0.787852,0.037656,1
11,0.020595,0.000872,0.009529,0.001125,10.0,constant,median,"{'classifier__C': 10, 'preprocessor__cat__impu...",0.763889,0.861111,...,0.802817,0.788732,0.760563,0.84507,0.732394,0.760563,0.788732,0.787852,0.037656,1
10,0.019649,0.001204,0.008906,0.000262,10.0,constant,mean,"{'classifier__C': 10, 'preprocessor__cat__impu...",0.763889,0.861111,...,0.802817,0.788732,0.760563,0.84507,0.732394,0.760563,0.788732,0.787852,0.037656,1
9,0.022501,0.006504,0.010536,0.002589,10.0,most_frequent,median,"{'classifier__C': 10, 'preprocessor__cat__impu...",0.763889,0.861111,...,0.802817,0.788732,0.760563,0.84507,0.732394,0.760563,0.788732,0.787852,0.037656,1
8,0.019123,0.001861,0.009691,0.001506,10.0,most_frequent,mean,"{'classifier__C': 10, 'preprocessor__cat__impu...",0.763889,0.861111,...,0.802817,0.788732,0.760563,0.84507,0.732394,0.760563,0.788732,0.787852,0.037656,1
12,0.020175,0.002729,0.010191,0.003335,100.0,most_frequent,mean,"{'classifier__C': 100, 'preprocessor__cat__imp...",0.763889,0.861111,...,0.802817,0.788732,0.760563,0.84507,0.732394,0.760563,0.788732,0.787852,0.037656,1
13,0.020221,0.002066,0.009825,0.001175,100.0,most_frequent,median,"{'classifier__C': 100, 'preprocessor__cat__imp...",0.763889,0.861111,...,0.802817,0.788732,0.760563,0.84507,0.732394,0.760563,0.788732,0.787852,0.037656,1
