# Import everything

In [12]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import numpy as np
# Path of the combined time-series CSV, relative to this notebook's directory.
input_file_path = "./../../timeseries/combined/combined_timeseries.csv"

# Load the raw table; every later cell derives its data from `df`.
df = pd.read_csv(filepath_or_buffer=input_file_path)

## Deal with NaN values

In [13]:
# Replace every NaN with the mean of its column (another strategy may suit better).
# NOTE(review): the imputer is fitted on the whole frame - the target column and
# the rows that later become test data included - because the train/test split
# only happens further down. Confirm this is acceptable for these experiments.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(df)

# With a non-constant strategy SimpleImputer drops columns that hold no observed
# value at all, so the result can be narrower than `df`. Labelling it with
# `df.columns` would then fail with a length mismatch; ask the fitted imputer
# which columns it actually kept instead.
df_imputed = pd.DataFrame(
    imputed_values,
    columns=imputer.get_feature_names_out(),
    index=df.index,
)

In [14]:
# Sanity check: show the first ten rows of the imputed frame.
df_imputed.head(10)

Unnamed: 0,Timestamp,Requests je Sekunde,Durchschnittliche Antwortzeitintervalle,three_i_o_read,three_i_o_write,three_memory,system-pod-1,system-pod-2,system-pod-3,system-pod-4,...,"cart_pod_restart_{container=""redis"", instance=""kube","cart_pod_restart_{container=""server"", instance=""kube","cart_pod_restart_{container=""server"", instance=""kube.2","cart_pod_restart_{container=""server"", instance=""kube.3","cart_pod_restart_{container=""server"", instance=""kube.4","cart_pod_restart_{container=""server"", instance=""kube.5","cart_pod_restart_{container=""server"", instance=""kube.6","cart_pod_restart_{container=""server"", instance=""kube.7","cart_pod_restart_{container=""server"", instance=""kube.8","cart_pod_restart_{container=""server"", instance=""kube.9"
0,10.0,0.0,0.0,0.0,0.0,11289750000.0,111.0,139.0,178.0,153.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12.0,0.0,0.0,0.0,0.0,11288670000.0,110.0,139.0,177.0,153.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,14.0,0.0,0.0,0.0,0.0,11289320000.0,110.0,139.0,177.0,197.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16.0,0.0,0.0,0.0,0.0,11292660000.0,110.0,139.0,177.0,203.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,18.0,7.0,262.0,0.0,0.0,11298970000.0,110.0,134.0,177.0,203.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,20.0,11.0,320.909091,0.0,0.0,11305430000.0,110.0,243.0,177.0,203.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,22.0,22.0,114.363636,0.0,0.0,11309550000.0,110.0,243.0,177.0,203.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,24.0,22.0,115.272727,0.0,0.0,11312220000.0,110.0,243.0,177.0,203.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,26.0,13.0,312.230769,0.0,4009.0,11403480000.0,110.0,243.0,177.0,701.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,28.0,23.0,729.869565,0.0,4009.0,11423620000.0,106.0,243.0,353.0,701.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Prepare and split data

In [15]:
# The regression target is the average-response-time column; it is removed from
# the feature matrix together with the request-rate and timestamp columns.
target_column = 'Durchschnittliche Antwortzeitintervalle'
excluded_columns = [target_column, 'Requests je Sekunde', 'Timestamp']

y = df_imputed[target_column]
X = df_imputed.drop(columns=excluded_columns)

# Hold out 20 % of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): rows are shuffled before splitting. If the rows are consecutive
# time-series samples, consider a chronological split - confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Remove features with low variance

In [15]:
# Filter out low-variance features; the cut-off is .8 * (1 - .8), roughly 0.16.
# NOTE(review): the selector is fitted on the full X rather than X_train - confirm
# that this is intended.
variance_cutoff = .8 * (1 - .8)
sel = VarianceThreshold(threshold=variance_cutoff)
X_selected = sel.fit_transform(X)

# Boolean mask over the columns of X: True means the feature was kept.
features = np.array(X.columns)
features_bool = sel.get_support()

# Report the kept features first, then the removed ones.
kept_features = features[features_bool]
removed_features = features[~features_bool]
print("Behaltene Features:", kept_features)
print("Entfernte Features:", removed_features)

Behaltene Features: ['three_i_o_read' 'three_i_o_write' 'three_memory' 'system-pod-1'
 'system-pod-2' 'system-pod-3' 'system-pod-4' 'system-pod-5'
 'system-pod-6' 'system-pod-7' 'system-pod-8' 'system-pod-9'
 'system-pod-10' 'system-pod-11' 'system-pod-12' 'system-pod-13'
 'network_outgoing_system' 'one_i_o_read' 'one_i_o_write' 'one_memory'
 'network_outgoing_pod_adservice' 'network_outgoing_pod_cartservice'
 'network_outgoing_pod_checkoutservice'
 'network_outgoing_pod_currencyservice'
 'network_outgoing_pod_emailservice' 'network_outgoing_pod_frontend'
 'network_outgoing_pod_loadgenerator'
 'network_outgoing_pod_loadgenerator.1'
 'network_outgoing_pod_paymentservice'
 'network_outgoing_pod_productcatalogservice'
 'network_outgoing_pod_recommendationservice' 'network_outgoing_pod_redis'
 'network_outgoing_pod_shippingservice' 'two_i_o_read' 'two_i_o_write'
 'two_memory' 'slower_i_o_read' 'slower_i_o_write' 'slower_memory'
 'faster_i_o_read' 'faster_i_o_write' 'faster_memory' 'medium_

## Recursive Feature Selection with decision tree regressor

In [16]:
# Base model for the recursive elimination. A decision tree permutes the
# candidate features randomly at every split, so an unseeded tree can rank
# features differently from run to run; the seed (same value as the
# train/test split) keeps the selected feature set reproducible.
estimator = DecisionTreeRegressor(random_state=42)

# Recursive feature elimination with cross-validation:
# drop 2 features per iteration and score each subset with 3-fold CV.
selector = RFECV(estimator, step=2, cv=3)
selector = selector.fit(X_train, y_train)

print(f"Optimale Anzahl von Features : {selector.n_features_:d}")

# Columns that RFECV decided to keep ...
selected_features = X_train.columns[selector.support_]
print("Ausgewählte Features:", selected_features)

# ... and the columns it eliminated.
not_selected_features = X_train.columns[~selector.support_]
print("Nicht ausgewählte Features:", not_selected_features)

Optimale Anzahl von Features : 88
Ausgewählte Features: Index(['three_i_o_read', 'three_i_o_write', 'three_memory', 'system-pod-1',
       'system-pod-2', 'system-pod-3', 'system-pod-4', 'system-pod-5',
       'system-pod-6', 'system-pod-7', 'system-pod-8', 'system-pod-9',
       'system-pod-10', 'system-pod-11', 'system-pod-12', 'system-pod-13',
       'network_outgoing_system', 'system-pod-1.1', 'system-pod-2.1',
       'system-pod-3.1', 'system-pod-4.1', 'system-pod-5.1', 'system-pod-6.1',
       'system-pod-7.1', 'system-pod-8.1', 'system-pod-9.1', 'system-pod-10.1',
       'system-pod-11.1', 'system-pod-12.1', 'system-pod-13.1', 'cpu_system',
       'one_i_o_read', 'one_i_o_write', 'one_memory',
       'network_outgoing_pod_adservice', 'network_outgoing_pod_cartservice',
       'network_outgoing_pod_checkoutservice',
       'network_outgoing_pod_currencyservice',
       'network_outgoing_pod_emailservice', 'network_outgoing_pod_frontend',
       'network_outgoing_pod_loadgenerator

## Select from model

In [10]:
# Decision tree whose feature importances drive the selection. (Earlier comments
# in this cell mentioned LassoCV, but no Lasso model is used here.) The seed
# makes the importances - and therefore the selected features - reproducible.
estimator = DecisionTreeRegressor(random_state=42)

# SelectFromModel keeps the features whose importance reaches its default threshold.
selector = SelectFromModel(estimator=estimator)

# Pipeline: importance-based feature selection followed by a tree regressor.
# The same `estimator` object appears in both steps; per the scikit-learn docs
# SelectFromModel (with prefit=False) fits its own clone, so the two fits do
# not overwrite each other.
pipeline = Pipeline([
    ('feature_selection', selector),
    ('regression', estimator),
])

# Fit both steps on the training split.
pipeline.fit(X_train, y_train)

# Once fitted, the selection step exposes a boolean mask over the input columns.
selected_features = X_train.columns[pipeline.named_steps['feature_selection'].get_support()]
print("Ausgewählte Features:", selected_features)

Ausgewählte Features: Index(['system-pod-6', 'network_outgoing_system', 'system-pod-6.1',
       'cpu_system', 'one_i_o_read', 'one_i_o_write', 'one_memory',
       'network_outgoing_pod_adservice',
       'network_outgoing_pod_checkoutservice', 'network_outgoing_pod_frontend',
       'network_outgoing_pod_productcatalogservice', 'cpu_pod_checkoutservice',
       'cpu_pod_productcatalogservice', 'cpu_pod_recommendationservice',
       'two_i_o_read', 'slower_i_o_read', 'slower_memory', 'faster_i_o_read'],
      dtype='object')


## Test Own Class

In [18]:
import os
import sys

# Make the parent directory importable so the project package `app` resolves.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# With the path in place the project module imports like a local one.
from app.feature_selection.recursive_feature_selector import RecursiveFeatureSelector

# Same target and excluded columns as in the manual experiments above.
response_time_column = 'Durchschnittliche Antwortzeitintervalle'
columns_to_exclude = [response_time_column, 'Requests je Sekunde', 'Timestamp']

rfc = RecursiveFeatureSelector(
    DecisionTreeRegressor(),
    input_file_path,
    response_time_column,
    columns_to_exclude,
)

# NOTE(review): the recorded run of this cell ended with
# "TypeError: BaseEstimator._get_tags() missing 1 required positional argument: 'self'".
# The selector's source is not part of this notebook; presumably it hands an
# estimator class rather than an instance to scikit-learn - verify in the class.
# The arguments presumably mirror step=2 and cv=3 of the RFECV cell - confirm.
rfc.select_features(2, 3)

TypeError: BaseEstimator._get_tags() missing 1 required positional argument: 'self'