### Hauptkomponentenanalyse (PCA) am Beispiel der Breast-Cancer-Daten
#### 1) Daten laden

In [6]:
import pandas as pd
import numpy as np
#np.set_printoptions(precision=3, suppress=True)

#pd.set_option('display.max_columns', 6)

# Location of the Wisconsin breast-cancer CSV on GitHub; the "?raw=true"
# suffix requests the file contents rather than the HTML page around it.
data_url = (
    r'https://github.com/tplusone/hanser_ml_zeitreihen/blob/master/Daten/breast_cancer_wisconsin.csv?raw=true'
)
df = pd.read_csv(data_url)

# Preview the first five rows to check that the columns were parsed as expected.
df.head(n=5)

Unnamed: 0,id,clump thickness,uniformity cell size,uniformity cell shape,marginal adhesion,epithelial cell size,bare nuclei,bland chromatin,normal nucleoli,mitoses,label
0,1000025,5,1,1,1,2,1.0,3,1,1,benign
1,1002945,5,4,4,5,7,10.0,3,2,1,benign
2,1015425,3,1,1,1,2,2.0,3,1,1,benign
3,1016277,6,8,8,1,3,4.0,3,7,1,benign
4,1017023,4,1,1,3,2,1.0,3,1,1,benign


#### 2) Vorbereitung der Daten
a) Relevante Datenpartitionen extrahieren

In [7]:
# Feature matrix: every column except the sample identifier and the target.
X = df.drop(columns=['id', 'label'])

# Binary target: 1 where the label is 'malignant', 0 for every other value.
y = df['label'].eq('malignant').astype('int64')

# Echo both shapes so that a row/column mismatch is visible immediately.
X.shape, y.shape

((683, 9), (683,))

b) Train-Test-Split durchführen

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the samples for evaluation; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)

c) Standardisieren der Features

In [9]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Learn the per-feature mean and standard deviation from the training split
# only, then apply the very same scaling to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Sanity check: after standardisation the training features should have
# mean ~0 and standard deviation 1.
('Mittelwert:', X_train.mean(axis=0), 'Standardabw.:', X_train.std(axis=0))

('Mittelwert:',
 array([ 1.36642834e-16,  1.04108826e-16,  6.50680161e-17,  0.00000000e+00,
         7.32015181e-17, -6.50680161e-17, -3.25340080e-17,  6.50680161e-18,
         4.71743117e-17]),
 'Standardabw.:',
 array([1., 1., 1., 1., 1., 1., 1., 1., 1.]))

#### 3) PCA durchführen

PCA aufsetzen und anlernen

In [10]:
from sklearn.decomposition import PCA

# A float n_components between 0 and 1 is treated by scikit-learn as a
# variance target: it keeps the smallest number of components whose
# cumulative explained variance exceeds that fraction (here 90 %).
pca = PCA(n_components=0.9)
# Fit on the training data; as the last expression of the cell, the fitted
# estimator returned by fit() is also echoed as the cell output.
pca.fit(X_train)

Erklärte Varianz

In [14]:
# Share of the total variance captured by each retained principal component.
exp_var = pca.explained_variance_ratio_

# Cumulative share over the retained components only, so this stays below 1
# whenever PCA dropped components.
sum_exp_var = exp_var.sum()

# Derive the factor range from the array instead of hard-coding "1-6": with a
# fractional n_components the number of retained components depends on the data.
print('explained variance factor 1-{}:'.format(len(exp_var)), exp_var)
print('sum explained variance all factors:{:.3f}'.format(sum_exp_var))

explained variance factor 1-6: [0.6535626  0.08532566 0.06332663 0.05044189 0.04574469 0.03395826]
sum explained variance all factors:0.932


Transformation durchführen

In [12]:
# Project both splits onto the principal components. `pca` has already been
# fitted on X_train in the "PCA aufsetzen und anlernen" cell, so only
# transform() is needed here; calling fit_transform() would refit the model
# a second time and silently replace the fitted state inspected earlier.
X_train_fact = pca.transform(X_train)
X_test_fact = pca.transform(X_test)

# Both results must have the same number of columns (one per retained component).
X_train_fact.shape, X_test_fact.shape

((546, 6), (137, 6))

#### 4) Breast-Cancer-Analyse mit extrahierten Dimensionen statt der Originalvariablen durchführen

In [13]:
from sklearn.linear_model import LinearRegression, LogisticRegression

# Train a logistic-regression classifier (default hyper-parameters) on the
# PCA scores of the training split; fit() returns the estimator itself.
logistic = LogisticRegression().fit(X=X_train_fact, y=y_train)

# For classifiers, score() reports the mean accuracy on the given samples,
# here the held-out test split projected onto the same components.
accuracy = logistic.score(X=X_test_fact, y=y_test)
print('accuracy: {:.3f}'.format(accuracy))

accuracy: 0.956
