# Logistic Regression with Scikit-Learn Pipeline & GridSearchCV

This notebook shows an example of how to use Logistic Regression with Pipelines and GridSearchCV, just like any other Scikit-Learn model.

In [None]:
import pathlib
import pickle

import pandas as pd
import pandera as pa
import requests
from matplotlib import pyplot
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

## Dataset

For this example we'll use a simple dataset: [Breast Cancer Wisconsin (Diagnostic) Data Set](https://www.kaggle.com/uciml/breast-cancer-wisconsin-data).

In [None]:
# Locate and load the dataset CSV (path is relative to the notebook's working directory).
data_path: pathlib.Path = pathlib.Path("../data")
breast_cancer_data_path: pathlib.Path = data_path / "breast-cancer-wisconsin-data.csv"
df = pd.read_csv(breast_cancer_data_path)

# Features: every column except the row identifier, the "Unnamed: 32" column
# (presumably a CSV parsing artifact -- confirm against the file) and the target.
X = df.drop(columns=["id", "Unnamed: 32", "diagnosis"])

# Target: encode the diagnosis labels as B -> 0, M -> 1 (1 is used as the positive class when scoring).
y = df["diagnosis"].map({'B': 0, 'M': 1})
# Series.map turns any label missing from the dict into NaN; fail here with a clear
# message instead of letting NaN targets surface later inside the grid search.
assert y.notna().all(), "Unexpected 'diagnosis' labels: expected only 'B' or 'M'"

# Hold out 33% of the rows for the final evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=8)

## Define the Pipeline and GridSearch

In [None]:
# Final estimator of the pipeline.
model = LogisticRegression(solver='lbfgs')

# Steps run in order: standardise the features, project them with PCA, then classify.
pipeline = Pipeline(
    steps=[
        ('standard_scaler', StandardScaler()),
        ('pca', PCA()),
        ('model', model),
    ]
)

# Only the number of retained principal components is tuned: 5, 10, ..., 30.
param_grid = {'pca__n_components': list(range(5, 31, 5))}

# 5-fold cross-validated search scored by ROC AUC, using all available cores.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)

In [None]:
%%time

# Run the grid search on the training split only; the %%time cell magic above
# (which must stay on the first line of the cell) reports how long the fit takes.
grid.fit(X_train, y_train)

## CV results

Here are the results of the model that gave the best mean score in the 5-fold cross-validation (scored by ROC AUC).

In [None]:
# Mean and standard deviation of the per-fold test scores for the
# best-ranked parameter combination.
mean_score = grid.cv_results_["mean_test_score"][grid.best_index_]
std_score = grid.cv_results_["std_test_score"][grid.best_index_]

# The bare tuple expression that used to sit here was never displayed (only a
# cell's last expression is rendered), so the prints below are the sole output.
# The format spec is ".6f" rather than " .6f": the leading space is a sign flag
# that inserted a second blank after the colon.
print(f"Best parameters: {grid.best_params_}")
print(f"Mean CV score: {mean_score:.6f}")
print(f"Standard deviation of CV score: {std_score:.6f}")

## Plot Performance

In [None]:
# Predicted probability of the positive class (column 1) for each held-out sample.
lr_probs = grid.predict_proba(X_test)[:, 1]
# "No skill" baseline: the same constant score (0) for every sample.
ns_probs = [0] * len(y_test)

# ROC AUC on the held-out test split for the baseline and the tuned pipeline.
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
print(f'No Skill: ROC AUC={ns_auc:.3f}')
print(f'Logistic Regression: ROC AUC={lr_auc:.3f}')

# ROC curve coordinates; the returned thresholds are not needed for the plot.
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

# Draw on an explicit Axes rather than the implicit pyplot state, and give the
# figure a title so it can be understood when the notebook is skimmed.
fig, ax = pyplot.subplots()
ax.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
ax.plot(lr_fpr, lr_tpr, marker='.', label='Logistic Regression')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC curve on the held-out test set')
ax.legend()
pyplot.show()

## Persist the model

In [None]:
pickle_path = pathlib.Path("../model_artifacts/")
# Create the artifact directory if it is missing; otherwise open() below raises
# FileNotFoundError on a fresh checkout.
pickle_path.mkdir(parents=True, exist_ok=True)

# Serialise the whole fitted GridSearchCV object, so the saved artifact carries
# the preprocessing steps together with the classifier.
# NOTE: only unpickle this file from a trusted location -- pickle.load can execute arbitrary code.
with open(pickle_path / "logistic-classifier.pkl", "wb") as oh:
    pickle.dump(grid, oh)

## Export Schema

In [None]:
# Infer a pandera schema from the held-out feature frame and print the script
# returned by to_script(). `pa` is imported in the notebook's import cell.
print(pa.infer_schema(X_test).to_script())

In [None]:
# Column-oriented payload: {column name: [value for every row of X_test]}.
request_dictionary = X_test.to_dict(orient='list')

# POST the payload to the locally running prediction service (`requests` is
# imported in the notebook's import cell). Without a timeout the cell would
# block indefinitely if the server accepts the connection but never answers.
resp = requests.post("http://localhost:8000/predict", json=request_dictionary, timeout=30)
# Raise on 4xx/5xx instead of displaying an error payload as if it were predictions.
resp.raise_for_status()
resp.json()