In [1]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

# Load the whitespace-delimited data file. Column names A1..A15 are supplied
# explicitly via `names`.
# `sep=r"\s+"` is the documented equivalent of `delim_whitespace=True`, which
# is deprecated as of pandas 2.2.
raw = pd.read_table(
    'australian.dat',
    sep=r"\s+",
    names=[f"A{i}" for i in range(1, 16)],
)
raw.head(2)

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
0,1,22.08,11.46,2,4,4,1.585,0,0,0,1,2,100,1213,0
1,0,22.67,7.0,2,8,4,0.165,0,0,0,0,2,160,1,0


In [3]:
# Feature matrix = every column except 'A15'; target vector = 'A15'.
X, y = raw.drop(columns=['A15']), raw.loc[:, 'A15']

### Use ColumnTransformer by selecting columns by name

In [4]:
# Column groups: one list is routed to the scaler branch, the other to the
# one-hot encoder branch.
columns_to_scale  = ['A2', 'A3', 'A7', 'A10', 'A13', 'A14']
columns_to_encode = ['A1', 'A4', 'A5', 'A6', 'A8', 'A9', 'A11', 'A12']

# Numeric branch: median imputation followed by standardization.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: one-hot encoding that tolerates categories not seen
# during fit.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Send each column group through its own branch.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, columns_to_scale),
    ("cat", categorical_transformer, columns_to_encode),
])

# Full prediction pipeline: preprocessing first, then the classifier.
clf = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression()),
])

# Hold out 20% of the rows for evaluation, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows and report the pipeline's score on the held-out rows.
clf.fit(X_train, y_train)
print(f"model score: {clf.score(X_test, y_test):.3f}")

model score: 0.891


In [5]:
from sklearn.metrics import classification_report

# Predict on the held-out rows and print the per-class precision/recall/F1 table.
y_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90        73
           1       0.92      0.85      0.88        65

    accuracy                           0.89       138
   macro avg       0.89      0.89      0.89       138
weighted avg       0.89      0.89      0.89       138



In [9]:
from sklearn import set_config

# Switch scikit-learn's estimator display mode to "diagram", then make the
# pipeline the cell's last expression so it is rendered as the cell output.
set_config(display="diagram")
clf

#### The `SimpleImputer` step is not really necessary here, so we can just skip it:

In [6]:
# Same column routing as before, but the numeric branch is a bare scaler
# (no imputation step).
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), columns_to_scale),
    ("cat", OneHotEncoder(handle_unknown="ignore"), columns_to_encode),
])

# Full prediction pipeline: preprocessing first, then the classifier.
clf = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression()),
])

# Hold out 20% of the rows for evaluation, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows and report the pipeline's score on the held-out rows.
clf.fit(X_train, y_train)
print(f"model score: {clf.score(X_test, y_test):.3f}")

model score: 0.855


In [7]:
# Predict on the held-out rows and print the per-class precision/recall/F1 table.
y_pred = clf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88        87
           1       0.79      0.82      0.81        51

    accuracy                           0.86       138
   macro avg       0.84      0.85      0.85       138
weighted avg       0.86      0.86      0.86       138



In [10]:
from sklearn import set_config

# Switch scikit-learn's estimator display mode to "diagram", then make the
# pipeline the cell's last expression so it is rendered as the cell output.
set_config(display="diagram")
clf

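#### It looks like `SimpleImputer` does something behind the scenes, as these results are different from the previous ones

Caveat: the two classification reports have different class supports (73/65 vs. 87/51), even though both cells call `train_test_split` with the same `test_size` and `random_state`. The two scores were therefore computed on different test sets (for example, because `X`/`y` changed between runs or the cells were executed out of order), so the gap cannot be attributed to the imputer alone. Restart the kernel and run all cells in order before comparing the two pipelines.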

### Using the prediction pipeline in a grid search

In [12]:
# Candidate values for the `C` parameter of the pipeline's "classifier" step.
param_grid = {"classifier__C": [0.1, 1.0, 10, 100]}

# Exhaustive search over the grid, scored with 10-fold cross-validation.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10)
grid_search

In [14]:
import warnings
# Silence warnings only while the search is fitting. A bare
# `warnings.filterwarnings('ignore')` installs a global filter that would hide
# every warning raised by any later cell for the rest of the kernel session.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    grid_search.fit(X_train, y_train)

# Report the parameter combination selected by the search.
print("Best params:")
print(grid_search.best_params_)

Best params:
{'classifier__C': 0.1}


In [15]:
# Score of the winning parameter setting from the search's internal
# cross-validation, formatted to three decimals.
print(f"Internal CV score: {grid_search.best_score_:.3f}")

Internal CV score: 0.852


#### We can also introspect the top grid search results as a pandas dataframe:

In [17]:
# Tabulate every grid point and rank by mean cross-validated score, best first.
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(
    "mean_test_score", ascending=False
)

# Show only the summary columns for the top five rows.
cv_results.loc[
    :, ["mean_test_score", "std_test_score", "param_classifier__C"]
].head(5)

Unnamed: 0,mean_test_score,std_test_score,param_classifier__C
0,0.851526,0.052167,0.1
1,0.846104,0.05358,1.0
2,0.8425,0.053432,10.0
3,0.837078,0.047485,100.0
