In [28]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Fetch the breast cancer dataset (a dict-like object) and list the fields it exposes
dataset = load_breast_cancer()
print(dataset.keys())

# Class balance of the raw target vector, printed as label -> number of samples
target_counts = pd.Series(dataset.target).value_counts()
print(target_counts)

# Wrap the feature matrix and the labels in pandas objects so they are easy to inspect;
# the DataFrame columns carry the dataset's own feature names
feature_names = dataset["feature_names"]
X = pd.DataFrame(dataset["data"], columns=feature_names)
y = pd.Series(dataset["target"])

# Partition the data: 80% of the rows go to the training set, the rest to the test set.
# stratify=y keeps the class ratio of the full label vector in both partitions,
# and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
    stratify=y,
)
# Optional check of the class ratios after splitting:
# print(y_train.value_counts(normalize=True))
# print(y_test.value_counts(normalize=True))

# Preprocess (standardize) features.
# The scaler's statistics are estimated from the training set only; the same
# fitted transformation is then applied to both the training and the test set.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Sanity checks. A bare expression in the middle of a cell is evaluated and then
# thrown away (only the cell's last expression is rendered), so the expectations
# are written as assertions that fail loudly instead of being silently ignored.

# Training features must be centred on 0 with unit spread; the tolerance absorbs
# floating-point rounding.
np.testing.assert_allclose(X_train_scaled.mean(axis=0), 0, atol=1e-8)
np.testing.assert_allclose(X_train_scaled.std(axis=0), 1, atol=1e-8)

# The test set was not used for fitting, so its per-feature means are expected to
# deviate from 0. Means of exactly 0 would suggest the scaler saw the test data.
assert not np.allclose(X_test_scaled.mean(axis=0), 0, atol=1e-8), (
    "test-set means are ~0; was the scaler fitted on the test data?"
)

# Displayed as the cell output: per-feature std of the scaled test set
# (not exactly 1, because the scaler was not fitted on this data).
X_test_scaled.std(axis=0)

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
1    357
0    212
Name: count, dtype: int64


array([1.03294198, 0.87567807, 1.02884543, 1.09510867, 1.20758302,
       0.93233448, 0.8701484 , 0.98319138, 0.8732951 , 0.92994167,
       1.16256232, 0.81285813, 1.11370043, 1.38554606, 0.76442912,
       0.83934132, 0.58549243, 0.89815747, 0.66091627, 0.72955563,
       1.06208533, 0.93566972, 1.05746064, 1.13782566, 1.06203108,
       0.91894755, 0.81332145, 0.91868313, 0.76305558, 0.90333135])