# Import everything

In [1]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.svm import SVR
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import numpy as np
import sys
import os
# Put the parent directory on the module search path (only once) so that the
# project-local `app` package can be imported further down in the notebook.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

# Load the combined time-series CSV; the path is relative to the notebook's
# working directory.
input_file_path = "./../../timeseries/combined/combined_timeseries.csv"
df = pd.read_csv(filepath_or_buffer=input_file_path)

## Deal with NaN values

In [2]:
# Replace missing values with the per-column mean (other strategies are possible).
# NOTE(review): the imputer is fitted on the complete frame -- including the
# target column, and before the train/test split further down -- so rows that
# later end up in the test set contribute to the fill values. Confirm that this
# is acceptable for the analysis.
imputer = SimpleImputer(strategy='mean')
imputed_values = imputer.fit_transform(df)

# SimpleImputer discards every column whose fitted statistic is NaN (a column
# without a single observed value), so the transformed array can have fewer
# columns than `df`. Label the result with the surviving columns only, and keep
# the original row index, instead of assuming the shape is unchanged (which
# would raise a shape-mismatch ValueError in the DataFrame constructor).
kept_columns = df.columns[~np.isnan(imputer.statistics_)]
df_imputed = pd.DataFrame(imputed_values, columns=kept_columns, index=df.index)

In [3]:
# Preview the first five rows of the imputed frame (head() defaults to n=5).
df_imputed.head()

Unnamed: 0,Timestamp,Requests je Sekunde,Durchschnittliche Antwortzeitintervalle,i_o_read,i_o_write,memory,network_outgoing_pod-pod-1,network_outgoing_pod-pod-2,network_outgoing_pod-pod-3,network_outgoing_pod-pod-4,...,pod-restart-count-pod-16,pod-restart-count-pod-17,pod-restart-count-pod-18,pod-restart-count-pod-19,pod-restart-count-pod-20,pod-restart-count-pod-21,pod-restart-count-pod-22,pod-restart-count-pod-23,pod-restart-count-pod-24,pod-restart-count-pod-25
0,10.0,0.0,0.0,0.0,0.0,11289750000.0,111.0,139.0,178.0,153.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12.0,0.0,0.0,0.0,0.0,11288670000.0,110.0,139.0,177.0,153.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,14.0,0.0,0.0,0.0,0.0,11289320000.0,110.0,139.0,177.0,197.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16.0,0.0,0.0,0.0,0.0,11292660000.0,110.0,139.0,177.0,203.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,18.0,7.0,262.0,0.0,0.0,11298970000.0,110.0,134.0,177.0,203.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Prepare and split data

In [4]:
# 'Durchschnittliche Antwortzeitintervalle' is the regression target. It is
# removed from the feature matrix together with the request rate and the
# timestamp column.
target_column = 'Durchschnittliche Antwortzeitintervalle'
excluded_columns = [target_column, 'Requests je Sekunde', 'Timestamp']

y = df_imputed[target_column]
X = df_imputed.drop(columns=excluded_columns)

# Split into training and test data: 20 % of the rows are held out, and the
# fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


## Remove features with low variance

In [5]:
# Fit a variance filter on the full feature matrix and keep the reduced array.
# NOTE(review): 0.8 * (1 - 0.8) is the variance of a Bernoulli(0.8) variable;
# confirm that this cutoff is intended for the (apparently continuous and
# unscaled) metric columns used here.
variance_cutoff = .8 * (1 - .8)
sel = VarianceThreshold(threshold=variance_cutoff)
X_selected = sel.fit_transform(X)

# Column names, and the boolean mask marking the columns that passed the filter.
features = np.array(X.columns)
features_bool = sel.get_support()

# Uncomment to list the features that were kept:
# print("Behaltene Features:", features[features_bool])

# Uncomment to list the features that were removed:
# print("Entfernte Features:", features[~features_bool])

## Recursive Feature Selection with decision tree regressor

In [6]:
# The parent directory was appended to sys.path in the first cell, so the
# project-local `app` package can be imported as if it lived next to this notebook.
from app.feature_selection.recursive_feature_selector import RecursiveFeatureSelector

# Seed the tree: DecisionTreeRegressor randomly permutes the candidate features
# at every split, so without a fixed random_state the selected feature set can
# change from one run to the next. 42 matches the seed of the train/test split.
rfc = RecursiveFeatureSelector(
    DecisionTreeRegressor(random_state=42),
    input_file_path,
    'Durchschnittliche Antwortzeitintervalle',
    ['Durchschnittliche Antwortzeitintervalle', 'Requests je Sekunde', 'Timestamp'],
)

# NOTE(review): the meaning of the positional arguments (1, 3) is defined by
# RecursiveFeatureSelector.select_features, which is not visible here -- confirm
# against that class before changing them.
rfc.select_features(1, 3)

Optimale Anzahl von Features : 44


Index(['pod-restart-count-pod-1', 'pod-restart-count-pod-2',
       'pod-restart-count-pod-3', 'pod-restart-count-pod-4',
       'pod-restart-count-pod-5', 'pod-restart-count-pod-6',
       'pod-restart-count-pod-8', 'pod-restart-count-pod-10',
       'pod-restart-count-pod-11', 'pod-restart-count-pod-12',
       'pod-restart-count-pod-13', 'pod_restart',
       'network_outgoing_pod-pod-15', 'network_outgoing_pod-pod-16',
       'network_outgoing_pod-pod-17', 'network_outgoing_pod-pod-18',
       'network_outgoing_pod-pod-19', 'network_outgoing_pod-pod-21',
       'network_outgoing_pod-pod-22', 'cpu_pod-pod-16', 'cpu_pod-pod-17',
       'cpu_pod-pod-19', 'cpu_pod-pod-20', 'cpu_pod-pod-22', 'cpu_pod-pod-25',
       'pod-restart-count-pod-14', 'pod-restart-count-pod-15',
       'pod-restart-count-pod-16', 'pod-restart-count-pod-17',
       'pod-restart-count-pod-18', 'pod-restart-count-pod-19',
       'pod-restart-count-pod-20', 'pod-restart-count-pod-21',
       'pod-restart-count-pod-

## Select-from-model feature selection (scikit-learn)

In [7]:
# The parent directory was appended to sys.path in the first cell, so the
# project-local `app` package can be imported as if it lived next to this notebook.
from app.feature_selection.model_feature_selector import ModelFeatureSelector

# Seed the tree: DecisionTreeRegressor randomly permutes the candidate features
# at every split, so without a fixed random_state the selected feature set can
# change from one run to the next. 42 matches the seed of the train/test split.
mfc = ModelFeatureSelector(
    DecisionTreeRegressor(random_state=42),
    input_file_path,
    'Durchschnittliche Antwortzeitintervalle',
    ['Durchschnittliche Antwortzeitintervalle', 'Requests je Sekunde', 'Timestamp'],
)

# Last expression of the cell, so the returned selection is displayed as output.
mfc.select_features()

Index(['network_outgoing_pod-pod-1', 'network_outgoing_pod-pod-3',
       'network_outgoing_pod-pod-4', 'network_outgoing_pod-pod-5',
       'network_outgoing_pod-pod-6', 'network_outgoing_pod-pod-9',
       'network_outgoing_pod-pod-10', 'network_outgoing_pod-pod-11',
       'network_outgoing_system', 'cpu_pod-pod-1', 'cpu_pod-pod-2',
       'cpu_pod-pod-3', 'cpu_pod-pod-5', 'cpu_pod-pod-6', 'cpu_pod-pod-8',
       'cpu_pod-pod-9', 'cpu_pod-pod-11', 'cpu_pod-pod-13', 'cpu_system',
       'pod-restart-count-pod-1', 'pod-restart-count-pod-2',
       'pod-restart-count-pod-3', 'pod-restart-count-pod-4',
       'pod-restart-count-pod-5', 'pod-restart-count-pod-6',
       'pod-restart-count-pod-8', 'pod-restart-count-pod-10',
       'pod-restart-count-pod-11', 'pod-restart-count-pod-12',
       'pod-restart-count-pod-13', 'pod_restart',
       'network_outgoing_pod-pod-14', 'cpu_pod-pod-14',
       'network_outgoing_pod-pod-15', 'network_outgoing_pod-pod-16',
       'network_outgoing_pod-p