In diesem Notebook werden sämtliche Dimensionsreduktionsverfahren auf einen zufällig generierten Datensatz mit 10000 samples und 100 features angewandt. Die samplesize entspricht der in dieser Arbeit verwendeten sample size und die 100 features enspricht dem maximum an features nach der feature selection dieser arbeit.

In [71]:
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
import numpy as np
from sklearn.datasets import make_classification
from constants import *
from sklearn.decomposition import PCA
import time
from sklearn.model_selection import train_test_split
from sklearn.decomposition import KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


In [72]:
def make_dataset(n_rows, n_features, train_size):
    X, y = make_classification(n_samples=n_rows, random_state=RANDOM_STATE, n_features=n_features, n_informative=int(n_features/2), n_classes=3)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, random_state=42)
    return X_train, X_test, y_train, y_test

def run_benchmark(transformer, n_runs):
    start = time.time()

    for i in range(n_runs):
        print(f"run {i}")
        transformer.fit(X_train, y_train)

    end = time.time()
    runtime = end- start
    print(f"runtime: {runtime} per run {runtime/n_runs}")

In [73]:
n_rows=200000
n_features=100
train_size=10000

X_train, X_test, y_train, y_test = make_dataset(n_rows=n_rows, n_features=n_features, train_size=train_size)
n_runs = 10

# Benchmark PCA

In [74]:
transformer = PCA(**PCA_PARAMS)
transformer.set_params(**{"n_components": 2})
run_benchmark(transformer, n_runs)

run 0
run 1
run 2
run 3
run 4
run 5
run 6
run 7
run 8
run 9
runtime: 0.6361427307128906 per run 0.06361427307128906


# Benchmark Kernel PCA

In [75]:
transformer = KernelPCA(**KPCA_PARAMS)
transformer.set_params(**{"n_components": 2})
run_benchmark(transformer, n_runs)

run 0
run 1
run 2
run 3
run 4
run 5
run 6
run 7
run 8
run 9
runtime: 30.274785041809082 per run 3.027478504180908


# Benchmark LDA

In [76]:
transformer = LinearDiscriminantAnalysis(**LDA_PARAMS)
transformer.set_params(**{"n_components": 2})
run_benchmark(transformer, n_runs)

run 0
run 1
run 2
run 3
run 4
run 5
run 6
run 7
run 8
run 9
runtime: 1.523341178894043 per run 0.1523341178894043


# Benchmark K-MEANS

In [77]:
from sklearn.cluster import MiniBatchKMeans

transformer = MiniBatchKMeans(**KMEANS_PARAMS)
transformer.set_params(**{"n_clusters": 2})
run_benchmark(transformer, n_runs)

run 0
run 1
run 2
run 3
run 4
run 5
run 6
run 7
run 8
run 9
runtime: 0.4961211681365967 per run 0.049612116813659665


# Benchmark UMAP

In [78]:
from umap import UMAP

transformer = UMAP(**UMAP_PARAMS)
transformer.set_params(**{"n_components": 2})
run_benchmark(transformer, n_runs)

run 0
run 1
run 2
run 3
run 4
run 5
run 6
run 7
run 8
run 9
runtime: 139.54862093925476 per run 13.954862093925476


# Benchmark Autoencoder

In [79]:
import keras
from keras import layers, regularizers
import tensorflow as tf
import math

params = AUTOENCODER_PARAMS

# amount features in X
#X_n_features = len(X_train.columns)
X_n_features = n_features

# This is the size of our encoded representations = n new features
encoding_dim = min(10, int(math.sqrt(X_n_features)))

# Encoder
input_layer = keras.Input(shape=(X_n_features,), name="input_layer")
x = layers.Dense(int(X_n_features / 2), activation=params["activation"], name="hidden_encode", activity_regularizer=regularizers.l1(10e-5))(input_layer)
x = layers.Dense(encoding_dim, activation=params["activation"], name="encode_layer", activity_regularizer=regularizers.l1(10e-5))(x)
encoder_model = keras.Model(input_layer, x)

# Decoder layer
x = layers.Dense(int(X_n_features / 2), activation=params["activation"], name="hidden_decode", activity_regularizer=regularizers.l1(10e-5))(x)
x = layers.Dense(X_n_features, activation=params["activation"], name="decode", activity_regularizer=regularizers.l1(10e-5))(x)

# autoencoder model
autoencoder = keras.Model(input_layer, x)
autoencoder.compile(optimizer=params["optimizer"], loss=params["loss"])

# specify how early stopping works
callback_early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',  # "loss" -> train loss, "val_loss" -> validation loss
    patience=params["early_stopping_patience"],
    verbose=1,
    restore_best_weights=True
)

start = time.time()

for i in range(n_runs):
    # fit
    autoencoder.fit(
        X_train,
        X_train,
        epochs=params["epochs"],
        batch_size=params["batch_size"],
        shuffle=True,
        validation_split=params["validation_split"],
        # use_multiprocessing=True, # seems to have no effect. afaik tensorflow uses all cores by default on a single pc
        callbacks=[callback_early_stopping],
    )

end = time.time()

runtime = end- start
print(f"runtime: {runtime} per run {runtime/n_runs}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

X, y = make_classification(n_samples=n_rows, random_state=RANDOM_STATE, n_features=n_features, n_informative=int(n_features/2), n_classes=3)
n_rows=200000
n_features=100
train_size=10000

--------

pca:            runtime: 0.6361427307128906 per run 0.06361427307128906
kpca:           runtime: 30.274785041809082 per run 3.027478504180908
lda:            runtime: 1.523341178894043 per run 0.1523341178894043
k-means:        runtime: 0.4961211681365967 per run 0.049612116813659665
umap:           runtime: 139.54862093925476 per run 13.954862093925476
autoencoder:    runtime: 61.51878070831299 per run 6.151878070831299


--------
pca runtime = 1 setzen

pca:            1
kpca:           7.59
lda:            2.39
k-means:        0.78
umap:           219.37
autoencoder:    96.71
