Task 4.3 Supervised Learning - Regression and hyperparameter tuning

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# 1. Datensatz laden
df = pd.read_csv("C:\\Users\\sonja\\Downloads\\BloodBrain.csv", index_col=0)

In [None]:
#Überblick
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 208 entries, 12.03 to 64.43
Columns: 134 entries, nbasic to logBBB
dtypes: float64(118), int64(16)
memory usage: 219.4 KB
None
           nbasic  negative    vsa_hyd  a_aro   weight  peoe_vsa.0  \
tpsa                                                                 
12.030000       1         0  167.06700      0  156.293    76.94749   
49.330002       0         0   92.64243      6  151.165    38.24339   
50.529999       1         0  295.16700     15  366.485    58.05473   
37.389999       0         0  319.11220     15  382.552    62.23933   
37.389999       1         0  299.65800     12  326.464    74.80064   

           peoe_vsa.1  peoe_vsa.2  peoe_vsa.3  peoe_vsa.4  ...  ctdh  ctaa  \
tpsa                                                       ...               
12.030000    43.44619     0.00000    0.000000     0.00000  ...     1     1   
49.330002    25.52006     0.00000    8.619013    23.27370  ...     2     2   
50.529999   124.74020    

In [None]:
# 2. Define features and target
X = df.drop(columns="logBBB")
y = df["logBBB"]

In [26]:
# 3. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print(X_train)
print("-------------------------------------------------------------------------")
print(X_test)

           nbasic  negative    vsa_hyd  a_aro   weight  peoe_vsa.0  \
tpsa                                                                 
41.490002       1         0  205.30960     12  282.363    36.50217   
87.089996       1         0  243.77400     17  338.391    77.85693   
40.619999       1         0  318.79040     12  403.591    93.87292   
72.500000       0         0  270.90000     11  373.416   103.27910   
93.029999       0         0  124.66460      5  236.231    17.06154   
...           ...       ...        ...    ...      ...         ...   
46.330002       0         0  162.50200     12  236.274     0.00000   
83.849998       0         0  209.53080     11  357.232   129.13190   
69.260002       0         0  232.50950     11  344.374    44.00552   
57.529999       0         0   65.44252      6  136.106     0.00000   
74.790001       0         0  118.79620     11  219.292    16.43950   

           peoe_vsa.1  peoe_vsa.2  peoe_vsa.3  peoe_vsa.4  ...   scaa3  ctdh  \
tpsa     

In [None]:
# 4. Pipeline with preprocessing and model
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("rf", RandomForestRegressor(random_state=42))
])

In [None]:
# 5. Define tuning grid
param_grid = {
    "rf__n_estimators": [50, 100],
    "rf__max_depth": [None, 10, 20],
    "rf__max_features": ["sqrt", "log2"]
}

In [None]:
# 6. 10-fold cross-validation with GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring="r2", n_jobs=-1)
grid_search.fit(X_train, y_train)

In [None]:
# 7. Evaluate on test set
y_pred = grid_search.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Beste Parameter:", grid_search.best_params_)
print("Test-MSE:", mse)
print("Test-R²:", r2)

Beste Parameter: {'rf__max_depth': 10, 'rf__max_features': 'log2', 'rf__n_estimators': 50}
Test-MSE: 0.2746398538155317
Test-R²: 0.3430538086699708


In [None]:
# 8. Feature Importances
best_model = grid_search.best_estimator_.named_steps["rf"]
importances = pd.Series(best_model.feature_importances_, index=X.columns)
print("Top 10 wichtige Merkmale:")
print(importances.sort_values(ascending=False).head(10))

Top 10 wichtige Merkmale:
tcnp                    0.052176
prx                     0.043097
polar_area              0.033898
most_positive_charge    0.031687
rpcg                    0.029715
clogp                   0.026683
psa_npsa                0.025603
pnsa3                   0.024780
fnsa3                   0.024613
vsa_other               0.021702
dtype: float64
