# Import everything

In [1]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import numpy as np
import sys
import os
# Put the parent directory on the module search path so that project-local
# packages can be imported further down in the notebook.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

# Path of the combined time-series CSV, resolved relative to the working directory.
input_file_path = "./../../timeseries/combined/combined_timeseries.csv"
df = pd.read_csv(filepath_or_buffer=input_file_path)

## Handle NaN values

In [2]:
# Replace missing values with the per-column mean (other strategies are possible).
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(df)

# SimpleImputer discards columns that contain no observed values at all (their
# fitted statistic is NaN). Label the result with the surviving columns only;
# passing every name in df.columns would raise a shape mismatch in that case.
kept_columns = df.columns[~np.isnan(imputer.statistics_)]
df_imputed = pd.DataFrame(imputed_values, columns=kept_columns, index=df.index)

In [3]:
# Preview the first ten rows of the imputed frame.
df_imputed.head(n=10)

Unnamed: 0,Timestamp,Requests je Sekunde,Durchschnittliche Antwortzeitintervalle,i_o_read,i_o_write,memory,network_outgoing_pod-pod-1,network_outgoing_pod-pod-2,network_outgoing_pod-pod-3,network_outgoing_pod-pod-4,...,pod-restart-count-pod-16,pod-restart-count-pod-17,pod-restart-count-pod-18,pod-restart-count-pod-19,pod-restart-count-pod-20,pod-restart-count-pod-21,pod-restart-count-pod-22,pod-restart-count-pod-23,pod-restart-count-pod-24,pod-restart-count-pod-25
0,10.0,0.0,0.0,0.0,0.0,11289750000.0,111.0,139.0,178.0,153.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12.0,0.0,0.0,0.0,0.0,11288670000.0,110.0,139.0,177.0,153.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,14.0,0.0,0.0,0.0,0.0,11289320000.0,110.0,139.0,177.0,197.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16.0,0.0,0.0,0.0,0.0,11292660000.0,110.0,139.0,177.0,203.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,18.0,7.0,262.0,0.0,0.0,11298970000.0,110.0,134.0,177.0,203.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,20.0,11.0,320.909091,0.0,0.0,11305430000.0,110.0,243.0,177.0,203.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,22.0,22.0,114.363636,0.0,0.0,11309550000.0,110.0,243.0,177.0,203.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,24.0,22.0,115.272727,0.0,0.0,11312220000.0,110.0,243.0,177.0,203.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,26.0,13.0,312.230769,0.0,4009.0,11403480000.0,110.0,243.0,177.0,701.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,28.0,23.0,729.869565,0.0,4009.0,11423620000.0,106.0,243.0,353.0,701.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Prepare and split data

In [4]:
# Column to predict, and the columns left out of the feature matrix.
target_column = 'Durchschnittliche Antwortzeitintervalle'
non_feature_columns = [target_column, 'Requests je Sekunde', 'Timestamp']

y = df_imputed[target_column]
X = df_imputed.drop(columns=non_feature_columns)

# Hold out 20% of the rows as test data; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## Remove features with low variance

In [11]:
# Variance cut-off of the form p * (1 - p) with p = 0.8.
# NOTE(review): scikit-learn's user guide presents this formula for boolean
# features. The columns here are continuous and on very different scales, so
# small-valued columns may be removed purely because of their scale - confirm
# that this threshold is intended.
majority_fraction = .8
sel = VarianceThreshold(threshold=majority_fraction * (1 - majority_fraction))
X_selected = sel.fit_transform(X)

# Boolean mask over X's columns: True where the feature passed the filter.
features_bool = sel.get_support()
features = X.columns.to_numpy()

# Split the column names into kept and removed features and report both.
kept_features = features[features_bool]
removed_features = features[np.logical_not(features_bool)]
print("Behaltene Features:", kept_features)
print("Entfernte Features:", removed_features)

Behaltene Features: ['i_o_read' 'i_o_write' 'memory' 'network_outgoing_pod-pod-1'
 'network_outgoing_pod-pod-2' 'network_outgoing_pod-pod-3'
 'network_outgoing_pod-pod-4' 'network_outgoing_pod-pod-5'
 'network_outgoing_pod-pod-6' 'network_outgoing_pod-pod-7'
 'network_outgoing_pod-pod-8' 'network_outgoing_pod-pod-9'
 'network_outgoing_pod-pod-10' 'network_outgoing_pod-pod-11'
 'network_outgoing_pod-pod-12' 'network_outgoing_pod-pod-13'
 'network_outgoing_system' 'network_outgoing_pod-pod-14'
 'network_outgoing_pod-pod-15' 'network_outgoing_pod-pod-16'
 'network_outgoing_pod-pod-17' 'network_outgoing_pod-pod-18'
 'network_outgoing_pod-pod-19' 'network_outgoing_pod-pod-20'
 'network_outgoing_pod-pod-21' 'network_outgoing_pod-pod-22'
 'network_outgoing_pod-pod-23' 'network_outgoing_pod-pod-24'
 'network_outgoing_pod-pod-25']
Entfernte Features: ['cpu_pod-pod-1' 'cpu_pod-pod-2' 'cpu_pod-pod-3' 'cpu_pod-pod-4'
 'cpu_pod-pod-5' 'cpu_pod-pod-6' 'cpu_pod-pod-7' 'cpu_pod-pod-8'
 'cpu_pod-pod-9'

## Recursive Feature Selection with decision tree regressor

In [6]:
# The parent directory was appended to sys.path earlier in the notebook, so the
# project package can be imported as if it lived next to this notebook.
from app.feature_selection.recursive_feature_selector import RecursiveFeatureSelector

# Presumably the target column and the columns to exclude from the features;
# the same names are used for the manual split earlier - verify against the class.
target_column = 'Durchschnittliche Antwortzeitintervalle'
excluded_columns = [target_column, 'Requests je Sekunde', 'Timestamp']

rfc = RecursiveFeatureSelector(
    DecisionTreeRegressor(),
    input_file_path,
    target_column,
    excluded_columns,
)

# NOTE(review): the meaning of the positional arguments (2, 3) is defined by
# RecursiveFeatureSelector.select_features and is not visible here.
rfc.select_features(2, 3)

Optimale Anzahl von Features : 67
Ausgewählte Features: Index(['i_o_read', 'i_o_write', 'memory', 'network_outgoing_pod-pod-1',
       'network_outgoing_pod-pod-2', 'network_outgoing_pod-pod-3',
       'network_outgoing_pod-pod-4', 'network_outgoing_pod-pod-5',
       'network_outgoing_pod-pod-6', 'network_outgoing_pod-pod-7',
       'network_outgoing_pod-pod-8', 'network_outgoing_pod-pod-9',
       'network_outgoing_pod-pod-10', 'network_outgoing_pod-pod-11',
       'network_outgoing_pod-pod-12', 'network_outgoing_pod-pod-13',
       'network_outgoing_system', 'cpu_pod-pod-1', 'cpu_pod-pod-2',
       'cpu_pod-pod-3', 'cpu_pod-pod-4', 'cpu_pod-pod-5', 'cpu_pod-pod-6',
       'cpu_pod-pod-7', 'cpu_pod-pod-8', 'cpu_pod-pod-9', 'cpu_pod-pod-10',
       'cpu_pod-pod-11', 'cpu_pod-pod-12', 'cpu_pod-pod-13', 'cpu_system',
       'pod-restart-count-pod-1', 'pod-restart-count-pod-2',
       'pod-restart-count-pod-7', 'pod-restart-count-pod-9',
       'network_outgoing_pod-pod-14', 'cpu_pod-pod

## Select from model

In [10]:
# Fixed seed so the tree-based selection is repeatable across runs
# (same value as the random_state used for the train/test split).
tree_seed = 42

# SelectFromModel picks features based on the importances of a decision tree
# that it fits itself, using its default importance threshold.
selector = SelectFromModel(estimator=DecisionTreeRegressor(random_state=tree_seed))

# Separate regressor instance for the final pipeline step, instead of sharing
# one estimator object between the selector and the regression step.
estimator = DecisionTreeRegressor(random_state=tree_seed)

# Pipeline: tree-based feature selection followed by a decision tree regressor.
pipeline = Pipeline([
    ('feature_selection', selector),
    ('regression', estimator)
])

# Fit the selector and the regressor on the training data.
pipeline.fit(X_train, y_train)

# Boolean mask over X_train's columns; computed once and reused below.
support_mask = pipeline.named_steps['feature_selection'].get_support()

# Features kept by the selector.
selected_features = X_train.columns[support_mask]
print("Ausgewählte Features:", selected_features)

# Features rejected by the selector.
not_selected_features = X_train.columns[~support_mask]
print("Nicht ausgewählte Features:", not_selected_features)

Ausgewählte Features: Index(['system-pod-6', 'network_outgoing_system', 'system-pod-6.1',
       'cpu_system', 'one_i_o_read', 'one_i_o_write', 'one_memory',
       'network_outgoing_pod_adservice',
       'network_outgoing_pod_checkoutservice', 'network_outgoing_pod_frontend',
       'network_outgoing_pod_productcatalogservice', 'cpu_pod_checkoutservice',
       'cpu_pod_productcatalogservice', 'cpu_pod_recommendationservice',
       'two_i_o_read', 'slower_i_o_read', 'slower_memory', 'faster_i_o_read'],
      dtype='object')


## Test the custom ModelFeatureSelector class

In [4]:
# The parent directory was appended to sys.path earlier in the notebook, so the
# project package can be imported as if it lived next to this notebook.
from app.feature_selection.model_feature_selector import ModelFeatureSelector

# Presumably the target column and the columns to exclude from the features;
# the same names are used for the manual split earlier - verify against the class.
target_column = 'Durchschnittliche Antwortzeitintervalle'
excluded_columns = [target_column, 'Requests je Sekunde', 'Timestamp']

# NOTE(review): the recorded output of this cell is a TypeError about cloning a
# class instead of an estimator instance. The call below passes an instance, so
# the saved output may be stale, or the error may originate inside
# ModelFeatureSelector - confirm by re-running on a fresh kernel.
mfc = ModelFeatureSelector(
    DecisionTreeRegressor(),
    input_file_path,
    target_column,
    excluded_columns,
)

mfc.select_features()

TypeError: Cannot clone object. You should provide an instance of scikit-learn estimator instead of a class.