In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the breast-cancer dataset as a pandas DataFrame (features) and Series (target).
X_df, y_series = load_breast_cancer(return_X_y=True, as_frame=True)

# The rest of the notebook works on plain NumPy arrays.
X, y = X_df.to_numpy(), y_series.to_numpy()

# Hold out 20% of the samples for testing. Stratifying on y keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=5,
)

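Let's first fit a logistic regression model on the raw, unscaled features. We can expect this baseline to perform worse than the model we will build on scaled data: the features span very different numeric ranges, which makes it harder for the solver to converge and causes the default regularization to penalize features unevenly.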

In [3]:
# Baseline: logistic regression with default settings, trained on the raw
# (unscaled) features.
# NOTE(review): the default solver settings may not fully converge on unscaled
# features — check whether a ConvergenceWarning is emitted here.
logit = LogisticRegression().fit(X_train, y_train)

# Score the baseline on the held-out test set.
y_pred = logit.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

0.9385964912280702


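Now let's standardize the input features. The scaler is fit on the training set only and then reused on the test set, so the scaled training features have mean 0 and standard deviation 1 exactly (up to floating-point error), while the scaled test features are only close to those values.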

In [4]:
# Learn the per-feature mean and standard deviation from the training set only,
# then apply that same transformation to both splits so no test-set statistics
# leak into the preprocessing.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Per-feature mean and std of each scaled split: training set first, then test set.
for scaled_split in (X_train_scaled, X_test_scaled):
    print(scaled_split.mean(axis=0))
    print(scaled_split.std(axis=0))

[-3.19841833e-15 -1.61653352e-15  3.33957526e-15  4.84594050e-16
 -1.03110438e-15 -1.34117381e-15  6.95414422e-18  5.46083325e-16
 -8.18505825e-15  7.55903276e-15 -4.73613822e-16 -1.73121590e-15
  1.32079939e-15 -2.09966354e-16 -2.27656721e-16  5.19730779e-17
  7.61387290e-16 -1.99291133e-15  1.63044181e-15  3.72168718e-16
  1.03470346e-15  1.80685747e-15 -9.61379938e-16  2.04354238e-16
  3.49488448e-15  3.58260430e-16  1.73426597e-16  1.32811954e-15
 -2.73871280e-15 -2.70174603e-16]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]
[-0.09621519  0.06684396 -0.08547477 -0.07798067  0.13537252  0.04897378
 -0.01922362 -0.03745892  0.1375977   0.09793901 -0.07538227  0.03260011
 -0.04514388 -0.04490852 -0.00164389 -0.12694501 -0.15813643 -0.18362065
  0.17556356 -0.13096238 -0.09187105  0.07984449 -0.07114219 -0.08211845
  0.20085737  0.10107909  0.03350835  0.01014935  0.24266178  0.09607921]
[1.03411828 1.15721713 1.04319507 1.03689999 1.15697

In [5]:
# Same default estimator as the unscaled baseline, this time trained on the
# standardized features.
new_logit = LogisticRegression().fit(X_train_scaled, y_train)
new_y_pred = new_logit.predict(X_test_scaled)

# Test-set accuracy, for comparison with the unscaled baseline.
new_accuracy = accuracy_score(y_true=y_test, y_pred=new_y_pred)
print(new_accuracy)

0.9824561403508771
