# WS24 DMML 25.11.2024: Pipeline (sklearn)<a class="jp-toc-ignore"></a>

# Einführung

https://scikit-learn.org/1.5/modules/generated/sklearn.pipeline.Pipeline.html<br>
https://scikit-learn.org/1.5/glossary.html#term-predictor<br>
https://scikit-learn.org/1.5/glossary.html#term-transformer

Begriffsklärungen:<br>
<b>Transformer: </b>Transformiert einen Input (normalerweise nur X) - bspw. StandardScaler, PCA, ...<br>
<b>Predictor: </b>Kann aus Inputdaten (X) Schätzungen von y erzeugen - bspw. Classifier, Regressor, Outlier-Detektor und Clusterer.

<b>Eigenschaften einer Pipeline</b><br>

<ul>
<li>Mit einer Pipeline kann man mit Transformern die Inputdaten X transformieren (preprocessing) und (optional) danach einen Predictor einsetzen.
<li>Eine Pipeline kann wie jeder andere Predictor verwendet werden.
<li>Verhindert, dass Test- und Trainingsdaten vermischt werden (alle Schritte der Pipeline werden nur mit den Trainingsdaten gefittet).
<li>Nutzen: die verschiedenen Schritte können zusammen kreuzvalidiert werden, zu jedem Schritt können verschiedene Parameter gesetzt werden.
<li>Parametersetzung für verschiedene Schritte: Name des Schrittes, gefolgt von "__" und dem Namen des Parameters (z. B. <code>pca__n_components</code>).
<li>Ein Transformer kann übersprungen werden, indem der Schritt auf den Wert 'passthrough' gesetzt wird.
</li>
</ul>

# Pipeline mit Parkinson-Daten

<b>Code adapted from</b><br>
https://www.kaggle.com/code/annatshngryan/pipeline-scaling-pca-logistic-regression<br>
and<br>
https://scikit-learn.org/stable/auto_examples/compose/plot_digits_pipe.html#sphx-glr-download-auto-examples-compose-plot-digits-pipe-py

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn.decomposition   import PCA
from sklearn.linear_model    import LogisticRegression
from sklearn.metrics         import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV, ParameterGrid, RepeatedKFold, train_test_split
from sklearn.pipeline        import Pipeline
from sklearn.preprocessing   import StandardScaler
from sklearn.svm             import SVC

## Prepare Data

In [None]:
# Load the complete data set as well as the separate train and test files.
# All three files are read with the same options: skiprows=1 and every
# column except 'id'.
read_options = {'skiprows': 1, 'usecols': lambda column: column != 'id'}

df       = pd.read_csv('pd_speech_features.csv', **read_options)
df_train = pd.read_csv('pd_speech_features-train.csv', **read_options)
df_test  = pd.read_csv('pd_speech_features-test.csv', **read_options)

In [None]:
# Rename the label column 'class' to 'target' in all three data frames.
df, df_train, df_test = (
    frame.rename(columns={'class': 'target'})
    for frame in (df, df_train, df_test)
)

In [None]:
# Feature matrices: every column except the label column 'target'.
X, X_train, X_test = (
    frame.drop(columns='target') for frame in (df, df_train, df_test)
)

# Label vectors: the 'target' column of each data frame.
y, y_train, y_test = (
    frame['target'] for frame in (df, df_train, df_test)
)

## Pipeline (without Gridsearch)

<b>Define Pipeline</b>

In [None]:
# Three named steps: scaling, PCA, then logistic regression as classifier.
steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('clf', LogisticRegression(max_iter=1000)),
]

pipe = Pipeline(steps=steps)

<b>Fit Pipeline</b>

In [None]:
# Fit the pipeline on the training split only; the test split is not used here.
pipe.fit(X_train, y_train)

<b>Check Performance</b>

In [None]:
# Predict the test split with the fitted pipeline and report the results.
y_test_pred = pipe.predict(X_test)

# Both metrics take the arguments in the order (y_true, y_pred).
test_confusion = confusion_matrix(y_test, y_test_pred)
test_accuracy  = accuracy_score(y_test, y_test_pred)

print('Confusion Matrix(test):')
print(test_confusion)
print('Accuracy(test):', round(test_accuracy*100),'%')

## GridSearch + Pipeline

<b>Definition Pipeline</b>

In [None]:
# Same three steps as before; the classifier step is now named 'LogReg',
# which is the prefix used for its parameters in the grid.
steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),
    ('LogReg', LogisticRegression(max_iter=1000)),
]

pipe = Pipeline(steps=steps)

<b>Definition Grid</b>

In [None]:
# Parameter names follow the pattern '<step name>__<parameter name>'.
param_grid = dict(
    pca__n_components=[15, 30, 45, 60],
    LogReg__C=[0.001, 1, 10, 100],
)

# Number of parameter combinations contained in the grid.
gridpoints = len(ParameterGrid(param_grid))

print('Anzahl Parameterkombinationen: ', gridpoints)

<b>Art der Kreuzvalidierung (hier: RepeatedKFold) wird für Gridsearch definiert</b>

In [None]:
# Cross-validation scheme handed to the grid search: repeated k-fold.
n_splits  = 5
n_repeats = 10
splits    = n_splits * n_repeats   # number of runs per parameter combination

# NOTE(review): no random_state is passed, so the folds presumably differ
# between executions of the notebook - confirm whether that is intended.
cv = RepeatedKFold(n_splits= n_splits, n_repeats= n_repeats)

<b>Fit Gridsearch</b>

In [None]:
# https://scikit-learn.org/1.5/modules/generated/sklearn.model_selection.GridSearchCV.html

import time

# Grid search over param_grid with the pipeline as estimator, scored by accuracy.
search = GridSearchCV(pipe, param_grid, n_jobs= -1, scoring= 'accuracy', cv= cv, refit= True)

# Time only the fit itself. perf_counter() is the clock meant for measuring
# intervals; the stop time is taken directly after fit() so that the prints
# below are not part of the measured duration.
start = time.perf_counter()
search.fit(X_train, y_train)
end = time.perf_counter()

print('Anzahl Parameterkombis:', gridpoints)
print('Anzahl Training-Validation-Splits:', splits)
print('Anzahl Durchläufe:', gridpoints*splits)
print('Dauer:', round(end - start), 'Sekunden')

print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

<b>Check Performance</b>

In [None]:
# Predict the test split with the best estimator found by the grid search.
y_test_pred = search.best_estimator_.predict(X_test)

# Both metrics take the arguments in the order (y_true, y_pred).
test_confusion = confusion_matrix(y_test, y_test_pred)
test_accuracy  = accuracy_score(y_test, y_test_pred)

print('Confusion Matrix(test):')
print(test_confusion)
print('Accuracy(test):', round(test_accuracy*100),'%')

<b>Detailanalyse Gridsearch</b>

In [None]:
# Display the raw cross-validation results of the grid search.
search.cv_results_

<b>Parameterkombinationen als Beschriftung der x-Achse</b>

In [None]:
# One x-axis label per grid point in the form '<n_components> / <C>'.
labels = [
    f"{params['pca__n_components']} / {round(params['LogReg__C'], 4)}"
    for params in search.cv_results_['params'][:gridpoints]
]
labels

In [None]:
# Build the list of cv_results_ keys that hold the test score of each split:
# 'split0_test_score', 'split1_test_score', ...
split_names = []
for i in range(splits):
    split_names.append(f'split{i}_test_score')
split_names[:3]

In [None]:
# Create a list of lists that collects the accuracy of every run.
# The outer list runs over the splits (plus three extra rows that later hold
# median, mean and std), the inner lists run over the grid points.
print('Splits (außen):',splits,', Gridpoints (innen):',gridpoints)
# The comprehension creates a separate inner list for every row.
# `[[0]*n]*m` would instead repeat one and the same list object m times, so
# that writing a single element would change every row.
acc = [[0] * gridpoints for _ in range(splits + 3)]
print(acc[0:5])

In [None]:
# Fill acc with the accuracy of every run: rows 0 .. splits-1 receive the
# per-split test scores of all grid points.
for run, score_key in enumerate(split_names[:splits]):
    acc[run] = list(search.cv_results_[score_key])

# The three rows after the per-split rows hold median, mean and std per grid
# point. They are addressed through `splits` directly instead of through a
# loop variable that is still alive after the loop.
run_scores = np.asarray(acc[:splits])
acc[splits]     = np.median(run_scores, axis=0)   # median over the runs
acc[splits + 1] = np.mean(run_scores, axis=0)     # mean over the runs
acc[splits + 2] = np.std(run_scores, axis=0)      # std over the runs

acc

In [None]:
# Plot the accuracy of every run per grid point together with median, mean
# and mean +/- std.
alpha = 0.3
color = 'black'

plt.figure(figsize=(10,6))
title = 'Accuracy in Abhängigkeit der Hyperparameter-Kombinationen (splits:' + str(splits) + ')'
plt.title(title)
plt.xlabel('PCA-Components / C_values')
plt.ylabel('Accuracy')

# Rows 0 .. splits-1 of acc: one grey point cloud per run.
for scores_of_run in acc[:splits]:
    plt.plot(labels, scores_of_run, 'o', color='lightgrey', alpha=alpha)

# Rows splits, splits+1 and splits+2 of acc hold median, mean and std. They
# are indexed through `splits` directly rather than through the loop variable
# left over from the loop above.
median_row = acc[splits]
mean_row   = acc[splits + 1]
std_row    = acc[splits + 2]
plt.plot(labels, median_row, '-o', color='red', alpha=1, label='median')
plt.plot(labels, mean_row, '-o', color='blue', alpha=1, label='mean')
plt.errorbar(labels, mean_row, yerr=std_row, linestyle='None', color=color, linewidth=2, marker='', capsize=10, label='mean+-std')
plt.xticks(rotation=90)
plt.ylim(0.7,1)
plt.legend()
plt.grid();

## GridSearch + Pipeline mit/ohne PCA

<b>Definition Pipeline</b>

In [None]:
# PCA is created without arguments here; the grid sets n_components or
# replaces the whole 'pca' step.
steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('LogReg', LogisticRegression(max_iter=1000)),
]

pipe = Pipeline(steps=steps)

<b>Definition Grid</b>

In [None]:
param_grid = [
    # Grid 1: PCA with a varying number of components.
    dict(
        pca__n_components=[1, 15, 30, 45, 60, 75, 100],
        LogReg__C=[0.001, 1, 10, 100],
    ),
    # Grid 2: the 'pca' step itself is set to 'passthrough' (skip PCA).
    dict(
        pca=['passthrough'],
        LogReg__C=[0.001, 1, 10, 100],
    ),
]


In [None]:
grid_dicts = len(param_grid)

# Per dictionary of the grid:
paras_names  = [list(grid) for grid in param_grid]                  # names of the parameters
paras_number = [len(names) for names in paras_names]                # number of parameters
gridpoints   = [len(ParameterGrid(grid)) for grid in param_grid]    # number of combinations

print('Namen der Parameter:', paras_names)
print('Anzahl Parameter:', paras_number)
print('Anzahl Gridpoints:', gridpoints)

<b>Kreuzvalidierung für Gridsearch wird definiert</b>

In [None]:
# Cross-validation scheme handed to the grid search: repeated k-fold.
n_splits  = 5
n_repeats = 4
splits    = n_splits * n_repeats   # number of splits per parameter combination

# NOTE(review): no random_state is passed, so the folds presumably differ
# between executions of the notebook - confirm whether that is intended.
cv = RepeatedKFold(n_splits= n_splits, n_repeats= n_repeats)

<b>Fit Gridsearch</b>

In [None]:
# Grid search over both parameter dictionaries, scored by accuracy.
# NOTE(review): the following cell builds and fits the same search again
# (with n_jobs=-1 and timing), so this run looks redundant - confirm.
search = GridSearchCV(pipe, param_grid, n_jobs = 2, scoring = 'accuracy', cv = cv)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

In [None]:
import time

# Grid search over both parameter dictionaries, scored by accuracy.
search = GridSearchCV(pipe, param_grid, n_jobs= -1, scoring= 'accuracy', cv= cv, refit= True)

# Time only the fit itself. perf_counter() is the clock meant for measuring
# intervals; the stop time is taken directly after fit() so that the prints
# below are not part of the measured duration.
start = time.perf_counter()
search.fit(X_train, y_train)
end = time.perf_counter()

# gridpoints is a list with one entry per grid dictionary, hence the sum.
print('Anzahl Parameterkombis:', sum(gridpoints))
print('Anzahl Training-Validation-Splits:', splits)
print('Anzahl Durchläufe:', sum(gridpoints)*splits)
print('Dauer:', round(end - start), 'Sekunden')

print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

In [None]:
# Display the raw cross-validation results of the grid search.
search.cv_results_

<b>Parameterkombinationen als Beschriftung der x-Achse</b>

In [None]:
# One x-axis label per grid point: the values of that grid dictionary's
# parameters joined by ' / '. `row` is the running position in
# cv_results_['params'] across the grid dictionaries.
labels = []
row    = 0

for names, n_points in zip(paras_names, gridpoints):
    for params in search.cv_results_['params'][row:row + n_points]:
        labels.append(' / '.join(str(params[name]) for name in names))
    row += n_points
labels

In [None]:
# Keys of cv_results_ that hold the test score of each split:
# 'split0_test_score', 'split1_test_score', ...
split_names = []
for i in range(splits):
    split_names.append(f'split{i}_test_score')
split_names[:3]

In [None]:
# Create a list of lists that collects the accuracy of every run.
# The outer list runs over the splits (plus three extra rows that later hold
# median, mean and std), the inner lists run over all grid points.
print('Splits (außen):',splits,', Gridpoints (innen):',sum(gridpoints))
# The comprehension creates a separate inner list for every row.
# `[[0]*n]*m` would instead repeat one and the same list object m times, so
# that writing a single element would change every row.
acc = [[0] * sum(gridpoints) for _ in range(splits + 3)]
print(acc[0:5])

In [None]:
# Fill acc with the accuracy of every run: rows 0 .. splits-1 receive the
# per-split test scores of all grid points.
for run, score_key in enumerate(split_names[:splits]):
    acc[run] = list(search.cv_results_[score_key])

# The three rows after the per-split rows hold median, mean and std per grid
# point. They are addressed through `splits` directly instead of through a
# loop variable that is still alive after the loop.
run_scores = np.asarray(acc[:splits])
acc[splits]     = np.median(run_scores, axis=0)   # median over the runs
acc[splits + 1] = np.mean(run_scores, axis=0)     # mean over the runs
acc[splits + 2] = np.std(run_scores, axis=0)      # std over the runs

In [None]:
# Plot the accuracy of every run per grid point together with median, mean
# and mean +/- std.
alpha = 0.3
color = 'black'
colormedian = 'red'
colormean   = 'blue'

plt.figure(figsize=(10,6))
title = 'Accuracy in Abhängigkeit der Hyperparameter-Kombinationen (splits:' + str(splits) + ')'
plt.title(title)
plt.xlabel('PCA-Components / C_values')
plt.ylabel('Accuracy')

# Rows 0 .. splits-1 of acc: one grey point cloud per run.
for scores_of_run in acc[:splits]:
    plt.plot(labels, scores_of_run, 'o', color='lightgrey', alpha=alpha)

# Rows splits, splits+1 and splits+2 of acc hold median, mean and std. They
# are indexed through `splits` directly rather than through the loop variable
# left over from the loop above.
median_row = acc[splits]
mean_row   = acc[splits + 1]
std_row    = acc[splits + 2]
plt.plot(labels, median_row, '-o', color=colormedian, alpha=1, label='median')
plt.plot(labels, mean_row, '-o', color=colormean, alpha=1, label='mean')
plt.errorbar(labels, mean_row, yerr=std_row, linestyle='None', color=color, linewidth=2, marker='', capsize=6, label='mean+-std')
plt.xticks(rotation=90)
plt.ylim(0.7,1)
plt.legend()
plt.grid();

## GridSearch + Pipeline verschiedene Klassifizierer

<b>Definition Pipeline</b>

In [None]:
# The estimator in the 'clf' step is only a placeholder; the grid replaces
# the whole step with the classifiers to compare.
steps = [
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('clf', LogisticRegression(max_iter=1000)),
]

pipe = Pipeline(steps=steps)

<b>Definition Grid</b>

In [None]:
param_grid = [
    # Grid 1: logistic regression in the 'clf' step, with varying C.
    dict(
        pca__n_components=[1, 15, 30, 45, 60, 75, 100],
        clf=[LogisticRegression(max_iter=1000)],
        clf__C=[0.001, 1, 10, 100],
    ),
    # Grid 2: SVC in the 'clf' step, created without arguments.
    dict(
        pca__n_components=[1, 15, 30, 45, 60, 75, 100],
        clf=[SVC()],
    ),
]

In [None]:
grid_dicts = len(param_grid)

# Per dictionary of the grid:
paras_names  = [list(grid) for grid in param_grid]                  # names of the parameters
paras_number = [len(grid) for grid in param_grid]                   # number of parameters
gridpoints   = [len(ParameterGrid(grid)) for grid in param_grid]    # number of combinations

print('Namen der Parameter:', paras_names)
print('Anzahl Parameter:', paras_number)
print('Anzahl Gridpoints:', gridpoints)

<b>Kreuzvalidierung für GridSearch wird definiert</b>

In [None]:
# Cross-validation scheme handed to the grid search: repeated k-fold.
n_splits  = 5
n_repeats = 4
splits    = n_splits * n_repeats   # number of splits per parameter combination

# NOTE(review): no random_state is passed, so the folds presumably differ
# between executions of the notebook - confirm whether that is intended.
cv = RepeatedKFold(n_splits= n_splits, n_repeats= n_repeats)

<b>Fit GridSearch</b>

In [None]:
# Grid search over both classifier grids, scored by accuracy.
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    n_jobs=-1,
    scoring='accuracy',
    cv=cv,
)
search.fit(X_train, y_train)

print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

In [None]:
# Display the raw cross-validation results of the grid search.
search.cv_results_

<b>Parameterkombinationen als Beschriftung der x-Achse</b>

In [None]:
# One x-axis label per grid point: the values of that grid dictionary's
# parameters joined by ' / '. `row` is the running position in
# cv_results_['params'] across the grid dictionaries.
labels = []
row    = 0

for names, n_points in zip(paras_names, gridpoints):
    for params in search.cv_results_['params'][row:row + n_points]:
        labels.append(' / '.join(str(params[name]) for name in names))
    row += n_points
labels

In [None]:
# Keys of cv_results_ that hold the test score of each split:
# 'split0_test_score', 'split1_test_score', ...
split_names = []
for i in range(splits):
    split_names.append(f'split{i}_test_score')
split_names[:3]

In [None]:
# Create a list of lists that collects the accuracy of every run.
# The outer list runs over the splits (plus three extra rows that later hold
# median, mean and std), the inner lists run over all grid points.
print('Splits (außen):',splits,', Gridpoints (innen):',sum(gridpoints))
# The comprehension creates a separate inner list for every row.
# `[[0]*n]*m` would instead repeat one and the same list object m times, so
# that writing a single element would change every row.
acc = [[0] * sum(gridpoints) for _ in range(splits + 3)]
print(acc[0:5])

In [None]:
# Fill acc with the accuracy of every run: rows 0 .. splits-1 receive the
# per-split test scores of all grid points.
for run, score_key in enumerate(split_names[:splits]):
    acc[run] = list(search.cv_results_[score_key])

# The three rows after the per-split rows hold median, mean and std per grid
# point. They are addressed through `splits` directly instead of through a
# loop variable that is still alive after the loop.
run_scores = np.asarray(acc[:splits])
acc[splits]     = np.median(run_scores, axis=0)   # median over the runs
acc[splits + 1] = np.mean(run_scores, axis=0)     # mean over the runs
acc[splits + 2] = np.std(run_scores, axis=0)      # std over the runs

In [None]:
# Plot the accuracy of every run per grid point together with median, mean
# and mean +/- std.
alpha = 0.3
color = 'black'
colormedian = 'red'
colormean   = 'blue'

plt.figure(figsize=(10,6))
title = 'Accuracy in Abhängigkeit der Hyperparameter-Kombinationen (splits:' + str(splits) + ')'
plt.title(title)
# The tick labels of this section are built from the grid keys
# pca__n_components / clf / clf__C, so the axis label names the classifier too.
plt.xlabel('PCA-Components / Classifier / C_values')
plt.ylabel('Accuracy')

# Rows 0 .. splits-1 of acc: one grey point cloud per run.
for scores_of_run in acc[:splits]:
    plt.plot(labels, scores_of_run, 'o', color='lightgrey', alpha=alpha)

# Rows splits, splits+1 and splits+2 of acc hold median, mean and std. They
# are indexed through `splits` directly rather than through the loop variable
# left over from the loop above.
median_row = acc[splits]
mean_row   = acc[splits + 1]
std_row    = acc[splits + 2]
plt.plot(labels, median_row, '-o', color=colormedian, alpha=1, label='median')
plt.plot(labels, mean_row, '-o', color=colormean, alpha=1, label='mean')
plt.errorbar(labels, mean_row, yerr=std_row, linestyle='None', color=color, linewidth=2, marker='', capsize=6, label='mean+-std')
plt.xticks(rotation=90)
plt.ylim(0.7,1)
plt.legend()
plt.grid();

# Beenden der Aufzeichnung nicht vergessen