Task 4.4 Supervised Learning - Classification and hyperparameter tuning

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [None]:
#1. Lade den Datensatz
df = pd.read_csv("C:\\Users\\sonja\\Downloads\\cox2.csv", index_col=0)

In [None]:
# übersicht
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 462 entries, 1 to 0
Columns: 256 entries, QikProp_.amine to cox2Class
dtypes: float64(212), int64(43), object(1)
memory usage: 927.6+ KB
None
                QikProp_.amine  QikProp_.acid  QikProp_.rotor  \
QikProp_.stars                                                  
1                            0              0               1   
0                            0              0               2   
0                            0              0               3   
1                            0              0               2   
1                            0              0               2   

                QikProp_.rctvFG  QikProp_CNS  QikProp_MW  QikProp_dipole  \
QikProp_.stars                                                             
1                             0            0     358.882           7.645   
0                             0           -1     359.870           7.959   
0                             0           -1     355.

In [None]:
# 2. Zielspalte: cox2Class (ist vom Typ "object")
print("Zielklassen:", df["cox2Class"].unique())  # z. B. ['Inactive', 'Active']

Zielklassen: ['Inactive' 'Active']


In [None]:
# 3. Features und Target definieren
X = df.drop(columns="cox2Class")
y = df["cox2Class"]


In [7]:
# 4. Daten splitten
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

In [None]:
# 5. Pipeline mit Skalierung und Klassifikator
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(random_state=42))
])


In [9]:
# 6. Hyperparameter-Raster
param_grid = {
    "clf__n_estimators": [100, 200],
    "clf__max_depth": [None, 10, 20],
    "clf__max_features": ["sqrt", "log2"]
}


In [11]:
# 7. GridSearch mit 10-facher Kreuzvalidierung
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring="accuracy", n_jobs=-1)
grid_search.fit(X_train, y_train)

In [12]:
# 8. Beste Parameter und Genauigkeit
print("Beste Parameter:", grid_search.best_params_)
print("Beste Trainingsgenauigkeit (CV):", grid_search.best_score_)

Beste Parameter: {'clf__max_depth': None, 'clf__max_features': 'sqrt', 'clf__n_estimators': 100}
Beste Trainingsgenauigkeit (CV): 0.9710084033613446


In [13]:
# 9. Vorhersage auf Testdaten
y_pred = grid_search.predict(X_test)

In [14]:
# 10. Bewertung
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Test Accuracy: 0.9827586206896551

Confusion Matrix:
 [[22  1]
 [ 1 92]]

Classification Report:
               precision    recall  f1-score   support

      Active       0.96      0.96      0.96        23
    Inactive       0.99      0.99      0.99        93

    accuracy                           0.98       116
   macro avg       0.97      0.97      0.97       116
weighted avg       0.98      0.98      0.98       116



In [None]:
# 11. Feature Importances
best_model = grid_search.best_estimator_.named_steps["clf"]
importances = pd.Series(best_model.feature_importances_, index=X.columns)
print("Top 10 wichtigste Merkmale:\n", importances.sort_values(ascending=False).head(10))

Top 10 wichtigste Merkmale:
 IC50                 0.303814
moe2D_logS           0.025518
moe2D_SMR_VSA2       0.015870
QikProp_QPlogKhsa    0.012968
moe2D_bpol           0.012697
QikProp_QPlogS       0.012642
QikProp_accptHB      0.011703
QikProp_QPlogPo.w    0.010462
moe2D_GCUT_SMR_1     0.010280
QikProp_QPPMDCK      0.009088
dtype: float64
