# Numerical Data

## Imputing numerical data

In [None]:
# Arbitrary-value imputation: fill missing ages with a fixed sentinel,
# keeping each variant in its own column so they can be compared.
age = df['Age']
df['Age_99'] = age.fillna(value=99)
df['Age_minus1'] = age.fillna(value=-1)

Filling missing cells with an arbitrary value such as 99 or -1 marks them as "originally missing", so the model can learn to treat those rows differently from rows with genuinely observed values.

### Using Sklearn

In [None]:
# Imported here because this is the first cell that uses SimpleImputer;
# without it the cell raises NameError on a fresh kernel.
from sklearn.impute import SimpleImputer

# Constant-value imputers: every missing entry is replaced by `fill_value`.
imputer1 = SimpleImputer(strategy='constant', fill_value=99)
imputer2 = SimpleImputer(strategy='constant', fill_value=999)

In [None]:
# Imported here because this is the first cell that uses ColumnTransformer;
# without it the cell raises NameError on a fresh kernel.
from sklearn.compose import ColumnTransformer

# Apply one imputer per column; all other columns pass through untouched.
trf = ColumnTransformer([
    ('imputer1', imputer1, ['Age']),
    ('imputer2', imputer2, ['Fare'])
], remainder='passthrough')

In [None]:
trf.fit(df)

# Only the last expression of a cell is displayed, so both fill values are
# returned together as one tuple instead of two separate bare expressions.
(
    trf.named_transformers_['imputer1'].statistics_,
    trf.named_transformers_['imputer2'].statistics_,
)

## Mean-Median_imputation

In [None]:
# Central-tendency statistics of the observed (non-missing) ages.
age = df['Age']
mean_age = age.mean()
median_age = age.median()

In [None]:
# Store each imputed variant in its own column so their distributions
# can be compared against the original `Age`.
age = df['Age']
df['Age_median'] = age.fillna(value=median_age)
df['Age_mean'] = age.fillna(value=mean_age)

### Using Sklearn

In [None]:
# imputer1 fills with the column median, imputer2 with the column mean.
imputer1 = SimpleImputer(
    strategy="median",
)
imputer2 = SimpleImputer(
    strategy="mean",
)

In [None]:
# Median-impute Age, mean-impute Fare, and pass every other column through.
column_imputers = [
    ("imputer1", imputer1, ["Age"]),
    ("imputer2", imputer2, ["Fare"]),
]
trf = ColumnTransformer(column_imputers, remainder="passthrough")

In [None]:
# Learn the fill statistics from the training split only.
trf.fit(X_train)

# Only the last expression of a cell is displayed, so both learned statistics
# are returned together as one tuple instead of two separate bare expressions.
(
    trf.named_transformers_['imputer1'].statistics_,
    trf.named_transformers_['imputer2'].statistics_,
)

In [None]:
# Apply the already-fitted transformer to both splits.
# Both right-hand sides are evaluated before either name is rebound.
X_train, X_test = (trf.transform(split) for split in (X_train, X_test))

# Categorical Data

## frequent-value-imputation

In [None]:
# Replace missing entries with the most frequent value of the column.
# `mode()` returns a Series (there may be ties), so take its first entry.
most_frequent_value = df['column_name'].mode().iloc[0]
df['column_name'] = df['column_name'].fillna(most_frequent_value)

### Using Sklearn

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Fill each column's missing entries with that column's most frequent value.
imputer = SimpleImputer(
    strategy="most_frequent",
)

In [None]:
# Learn the most frequent values on the training split, then reuse them
# (no refitting) on the test split.
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

## missing-category-imputation

In [None]:
# Treat missingness as its own category.
# Assign the result back instead of using `inplace=True` on a column
# selection, which may operate on a temporary copy and leave `df` unchanged.
df['column_name'] = df['column_name'].fillna('Missing')

### Using Sklearn

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Fill every missing entry with the literal category 'Missing'.
imputer = SimpleImputer(
    fill_value='Missing',
    strategy='constant',
)

In [None]:
# Fit on the training split, then apply the same fitted imputer to the
# test split.
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

## automatically-select-imputer-parameters

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression

In [None]:
# Numeric columns: median-impute, then standardise.
numerical_features = ['Age', 'Fare']
numerical_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(numerical_steps)

# Categorical columns: fill with the most frequent value, then one-hot encode.
# Categories unseen during fit are ignored at transform time.
categorical_features = ['Embarked', 'Sex']
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

In [None]:
# Route each feature group through its own sub-pipeline.
# Step names 'num' and 'cat' are referenced by the grid-search parameter keys.
feature_routes = [
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(feature_routes)

In [None]:
# End-to-end model: preprocessing followed by a logistic-regression classifier.
model_steps = [
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
]
clf = Pipeline(model_steps)

In [None]:
from sklearn import set_config

# Render estimators as an interactive diagram instead of a plain-text repr.
set_config(display="diagram")

# The last expression in the cell is what gets displayed.
clf

In [None]:
# GridSearchCV is not imported anywhere else in the notebook; import it here
# so the cell runs on a fresh kernel.
from sklearn.model_selection import GridSearchCV

# Keys follow the `<step>__<substep>__<param>` convention, so the imputation
# strategies are tuned together with the classifier's regularisation strength.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'preprocessor__cat__imputer__strategy': ['most_frequent', 'constant'],
    'classifier__C': [0.1, 1.0, 10, 100]
}

grid_search = GridSearchCV(clf, param_grid, cv=10)

In [None]:
# Fit every parameter combination, then report the best one.
grid_search.fit(X_train, y_train)

print("Best params:")
print(grid_search.best_params_)

In [None]:
# Mean cross-validated score of the best parameter combination.
best_cv_score = grid_search.best_score_
print(f"Internal CV score: {best_cv_score:.3f}")

In [None]:
import pandas as pd

# Rank every tried combination from best to worst mean test score.
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(
    by="mean_test_score", ascending=False
)

# Show only the tuned parameters next to their score.
summary_columns = [
    'param_classifier__C',
    'param_preprocessor__cat__imputer__strategy',
    'param_preprocessor__num__imputer__strategy',
    'mean_test_score',
]
cv_results[summary_columns]

## random-sample-imputation

In [None]:
# Fill each missing entry with a value drawn at random from the observed
# training values. The test split also samples from the training values.
# `.loc` writes directly into the frame; the chained form `df[col][mask] = ...`
# may assign to a temporary copy and leave the frame unchanged.
# The sample size is taken from the same mask that is written to, so the
# number of drawn values always matches the number of cells being filled.
train_missing = X_train['column_name_imputed'].isnull()
X_train.loc[train_missing, 'column_name_imputed'] = (
    X_train['column_name'].dropna().sample(train_missing.sum()).values
)

test_missing = X_test['column_name_imputed'].isnull()
X_test.loc[test_missing, 'column_name_imputed'] = (
    X_train['column_name'].dropna().sample(test_missing.sum()).values
)

## knn-imputer


In [None]:
from sklearn.impute import KNNImputer,SimpleImputer

In [None]:
# Impute from the 3 nearest neighbours, weighting closer neighbours more.
knn = KNNImputer(
    weights='distance',
    n_neighbors=3,
)

# Fit on the training split only; reuse the fitted imputer on the test split.
X_train_trf = knn.fit_transform(X_train)
X_test_trf = knn.transform(X_test)

In [None]:
# Comparison baseline: SimpleImputer with mean imputation
# ('mean' is the default strategy; spelled out here for clarity).
si = SimpleImputer(strategy='mean')

X_train_trf2 = si.fit_transform(X_train)
X_test_trf2 = si.transform(X_test)

## Iterative imputer

In [None]:
# Step 1 (iteration 0): fill every missing value with its column's mean.
df0 = pd.DataFrame()

for col in ['R&D Spend', 'Administration', 'Marketing Spend']:
    df0[col] = df[col].fillna(df[col].mean())

In [None]:
# numpy is not imported anywhere else in the notebook; import it here so the
# cell runs on a fresh kernel.
import numpy as np

# Remove the imputed value in the first column (row 1) so it can be
# re-estimated from the other columns. Work on a copy so `df0` (iteration 0)
# stays available for the later comparison.
df1 = df0.copy()

# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
df1.iloc[1,0] = np.nan

df1

In [None]:
# LinearRegression is not imported anywhere else in the notebook; import it
# here so the cell runs on a fresh kernel.
from sklearn.linear_model import LinearRegression

# Train on the four rows whose first column is filled (rows 0, 2, 3, 4) and
# predict the first-column value for row 1 from the other two columns.
X = df1.iloc[[0,2,3,4],1:3]

y = df1.iloc[[0,2,3,4],0]

lr = LinearRegression()
lr.fit(X,y)

# Keep the query row as a one-row DataFrame so its column names match the
# ones seen during fit. Store the scalar prediction under the name the
# following cell reads.
value_obtained_above = lr.predict(df1.iloc[[1],1:3])[0]
value_obtained_above

In [None]:
# Write the estimate into row 1 of the first column of `df1`.
# NOTE(review): `value_obtained_above` is expected to hold the prediction
# produced by the preceding cell — TODO confirm it is assigned there.
df1.iloc[1,0] = value_obtained_above

Repeat these steps for every other value that was originally replaced by the column mean.

In [None]:
# Subtract 0th iteration from 1st iteration

df1 - df0

Repeat the whole process until the difference between successive iterations becomes close to zero.