### Pipelines

In [42]:
import pandas as pd
import numpy as np
from copy import deepcopy

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.impute import SimpleImputer
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

### Data

In [2]:
df = pd.read_csv('adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [3]:
#convert question mark symbols '?' to NaN
df.replace('?', np.nan, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      30718 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  31978 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
#map the target column from string to number
le = LabelEncoder()
df.income = le.fit_transform(df.income)

In [5]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,0
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,0
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,0


### Train - Test Split

In [6]:
X = df.drop('income', axis=1)
y = df.income

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Building Pipeline

In [7]:
# Extracting columns based on Data Type

num_cols = df.select_dtypes(['int64','float64']).columns.tolist()
cat_cols = df.select_dtypes(['object']).columns.tolist()

one_hot_encoder_cols = [c for c in cat_cols if df[c].nunique() <= 5]
ordinal_encoder_cols = [c for c in cat_cols if df[c].nunique() > 5]

In [8]:
# Extracting categories

one_hot_encoder_cats = [df[c].unique().tolist() for c in one_hot_encoder_cols]
ordinal_encoder_cats = [df[c].unique().tolist() for c in ordinal_encoder_cols]

In [9]:
# Defining Encoders for Categorical Columns

ohe = OneHotEncoder(categories=one_hot_encoder_cats)
oe = OrdinalEncoder(categories=ordinal_encoder_cats)

In [10]:
ohe

OneHotEncoder(categories=[['White', 'Black', 'Asian-Pac-Islander', 'Other',
                           'Amer-Indian-Eskimo'],
                          ['Female', 'Male']])

In [11]:
oe

OrdinalEncoder(categories=[[nan, 'Private', 'State-gov', 'Federal-gov',
                            'Self-emp-not-inc', 'Self-emp-inc', 'Local-gov',
                            'Without-pay', 'Never-worked'],
                           ['HS-grad', 'Some-college', '7th-8th', '10th',
                            'Doctorate', 'Prof-school', 'Bachelors', 'Masters',
                            '11th', 'Assoc-acdm', 'Assoc-voc', '1st-4th',
                            '5th-6th', '12th', '9th', 'Preschool'],
                           ['Widowed', 'Divorced', 'Separated', 'Ne...
                            'Other-relative', 'Husband', 'Wife'],
                           ['United-States', nan, 'Mexico', 'Greece', 'Vietnam',
                            'China', 'Taiwan', 'India', 'Philippines',
                            'Trinadad&Tobago', 'Canada', 'South',
                            'Holand-Netherlands', 'Puerto-Rico', 'Poland',
                            'Iran', 'England', 'Germany', 'Italy'

In [12]:
### Handling Numeric Columns

numerical_pipe = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', StandardScaler()),
])

In [13]:
### Handling Categorical Columns - One Hot Encoder

ohe_categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('one_hot', ohe)
])

In [14]:
### Handling Categorical Columns - Ordinal Encoder

oe_categorical_pipe = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('ordinal', oe)
])

In [15]:
### Defining the Pre-Processing Transformer

preprocessor = ColumnTransformer(transformers= [('numerical', numerical_pipe, num_cols),
    ('ohe', ohe_categorical_pipe, one_hot_encoder_cols),
    ('oe', oe_categorical_pipe, ordinal_encoder_cols)],
    remainder = 'passthrough')

In [16]:
### Building Combined Pipeline

pipe = Pipeline([
    ('column_transformer', preprocessor),
    ('model', KNeighborsClassifier())
])

In [17]:
### Defining a function to fit the pipeline and return evaluation metric

def fit_and_print(input_pipeline,
                 X_train = X_train,
                 y_train = y_train,
                 X_test = X_test,
                 y_test = y_test):
    
    input_pipeline.fit(X_train,y_train)
    test_preds = input_pipeline.predict(X_test)
    
    print("Testing Accuracy : " + str(accuracy_score(y_test,test_preds)))

In [18]:
fit_and_print(pipe)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Testing Accuracy : 0.8217010980830076


### Hyperparameter Tuning for a Single Model

In [22]:
pipe.get_params()

{'memory': None,
 'steps': [('column_transformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('numerical',
                                    Pipeline(steps=[('imputer', SimpleImputer()),
                                                    ('scaler', StandardScaler())]),
                                    ['age', 'fnlwgt', 'education.num',
                                     'capital.gain', 'capital.loss',
                                     'hours.per.week']),
                                   ('ohe',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('one_hot',
                                                     OneHotEncoder(categories=[['White',
                                                                                'Blac...
                                               

In [23]:
parameters = {
    'column_transformer__numerical__imputer__strategy': ['mean', 'median'],
    'column_transformer__numerical__scaler': [StandardScaler(), MinMaxScaler()],
    'model__n_neighbors': [3, 6]
}

In [24]:
my_scorer = make_scorer(accuracy_score, greater_is_better=True)
search = GridSearchCV(pipe, parameters, cv=2, scoring=my_scorer, n_jobs=-1, verbose=2)

In [25]:
search.fit(X_train, y_train)

Fitting 2 folds for each of 8 candidates, totalling 16 fits


GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('column_transformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('numerical',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['age',
                                                                          'fnlwgt',
                                                                          'education.num',
                                                                          

In [26]:
# Change pipeline parameters
pipe.set_params(**search.best_params_)

fit_and_print(pipe)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Testing Accuracy : 0.8247720081890936


In [27]:
pipe

Pipeline(steps=[('column_transformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('numerical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'fnlwgt',
                                                   'education.num',
                                                   'capital.gain',
                                                   'capital.loss',
                                                   'hours.per.week']),
                                                 ('ohe',
                                                  Pipeline(steps=[('imputer',
                                  