In [3]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, make_scorer
import pandas as pd
from skopt import BayesSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFE

In [4]:
features_df = pd.read_csv('ctd_cadiovascular_20240223_scores.tsv', sep='\t')
ground_truth_df = pd.read_csv('merged_cardio.tsv', sep='\t')

print(features_df)
print(ground_truth_df)

           drugA    drugB     sAB  opAB  meanspAB  medianspAB  minspAB  \
0        DB06709  DB08506  0.3427     0    0.6150       0.630     0.46   
1        DB06709  DB02376  0.4188     0    0.6850       0.685     0.56   
2        DB06709  DB04282  0.4188     0    0.6850       0.685     0.56   
3        DB06709  DB04564  0.4188     0    0.6850       0.685     0.56   
4        DB06709  DB04659  0.4188     0    0.6850       0.685     0.56   
...          ...      ...     ...   ...       ...         ...      ...   
9730661  DB08873  DB07191  0.1295     0    0.6093       0.625     0.44   
9730662  DB08873  DB08846  0.1319     0    0.4865       0.460     0.20   
9730663  DB08873  DB08865  0.1585     0    0.4858       0.480     0.26   
9730664  DB08873  DB05891  0.1004     0    0.8725       0.870     0.65   
9730665  DB08873  DB05508  0.2084     0    0.5350       0.530     0.41   

         maxspAB    zTDA    zTDB  ...  opAD  opBD  meanspAD  meanspBD  \
0           0.74  0.5097  1.3545  ... 

In [5]:
merged_df = pd.merge(features_df, ground_truth_df, on=['drugA', 'drugB'])

merged_df.set_index(['drugA', 'drugB'], inplace=True)

X = merged_df.drop(['adv/app','drugcomb','sA','sB','opA','opB'], axis=1)                     
Y = merged_df['adv/app']


In [6]:
#run this: oversampled
#from imblearn.over_sampling import SMOTE
#from sklearn.model_selection import train_test_split

#X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

#sm = SMOTE(random_state=42)

#X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [7]:
#run this: normal 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify=Y, test_size=0.2,random_state=42)

In [None]:
# Step 1: Define the search space for Bayesian Optimization
search_spaces = {
    'max_depth': (1, 20),
    'min_samples_leaf': (1, 20),
    'min_samples_split': (2, 20),
    'class_weight': ['balanced', None]
}

# Step 2: Define a custom scorer for accuracy to use with BayesSearchCV
accuracy_scorer = make_scorer(accuracy_score)

# Step 3: Instantiate SMOTETomek
optimizer = SMOTE(random_state=42)

# Step 4: Apply Oversampling
X_train_resampled, Y_train_resampled = optimizer.fit_resample(X_train, Y_train)

# Step 5: Recursive Feature Elimination
dt_classifier = DecisionTreeClassifier(random_state=42)
rfe = RFE(estimator=dt_classifier, n_features_to_select=10, step=1)
X_train_resampled_rfe = rfe.fit_transform(X_train_resampled, Y_train_resampled)
X_test_rfe = rfe.transform(X_test)

# Step 6: Bayesian Optimization
opt = BayesSearchCV(
    estimator=DecisionTreeClassifier(),
    search_spaces=search_spaces,
    n_iter=10,
    scoring=accuracy_scorer,
    cv=5
)
opt.fit(X_train_resampled_rfe, Y_train_resampled)

In [8]:
# Definiere das Grid der Hyperparameter, die du testen möchtest
param_grid = {
    'n_estimators': [100, 200, 300],  # Anzahl der Bäume
    'max_depth': [None, 10, 20, 30],  # Maximale Tiefe der Bäume
    'min_samples_split': [2, 5, 10],  # Minimale Anzahl von Samples, um einen Knoten zu teilen
    'min_samples_leaf': [1, 2, 4],  # Minimale Anzahl von Samples in einem Blatt
    'bootstrap': [True, False]  # Methode für das Sampling der Datensätze
}


In [9]:
# Initialisiere den RandomForestClassifier
rf = RandomForestClassifier()

# Initialisiere den GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='f1_weighted')

# Führe den GridSearch aus
grid_search.fit(X_train_resampled_rfe, Y_train_resampled)


# Zeige die besten Parameter und den besten Score an
print("Beste Parameter:", grid_search.best_params_)
print("Bester Score:", grid_search.best_score_)

Beste Parameter: {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Bester Score: 0.9931439988969387


In [18]:
# Initialisiere und trainiere das Modell mit den besten Parametern
model = RandomForestClassifier( random_state=42)
model.fit(X_train_resampled_rfe, Y_train_resampled)

# Mache Vorhersagen auf dem Testset
y_pred = model.predict(X_test_rfe)

# Berechne und drucke verschiedene Fehlermetriken aus
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred, average='binary')
recall = recall_score(Y_test, y_pred, average='binary')
f1 = f1_score(Y_test, y_pred, average='weighted')
mcc = matthews_corrcoef(Y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"Matthews Correlation Coefficient: {mcc}")

'''
with tuned hyperparameters: bootstrap= False , max_depth= 20, min_samples_leaf=1,min_samples_split=2, n_estimators=100, random_state=42
Accuracy: 0.9941598360655738
Precision: 0.35
Recall: 0.13725490196078433
F1-Score: 0.9928895250541236
Matthews Correlation Coefficient: 0.21669468006516002

max_depth= 20, min_samples_leaf=1,min_samples_split=2, n_estimators=100, random_state=42

Accuracy: 0.9942622950819672
Precision: 0.3684210526315789
Recall: 0.13725490196078433
F1-Score: 0.9929555396350459
Matthews Correlation Coefficient: 0.2224811074144708


'''

Accuracy: 0.9942622950819672
Precision: 0.3684210526315789
Recall: 0.13725490196078433
F1-Score: 0.9929555396350459
Matthews Correlation Coefficient: 0.2224811074144708


'\nwith tuned hyperparameters: bootstrap= False , max_depth= 20, min_samples_leaf=1,min_samples_split=2, n_estimators=100, random_state=42\nAccuracy: 0.9941598360655738\nPrecision: 0.35\nRecall: 0.13725490196078433\nF1-Score: 0.9928895250541236\nMatthews Correlation Coefficient: 0.21669468006516002\n\nmax_depth= 20, min_samples_leaf=1,min_samples_split=2, n_estimators=100, random_state=42\n\nAccuracy: 0.9942622950819672\nPrecision: 0.3684210526315789\nRecall: 0.13725490196078433\nF1-Score: 0.9929555396350459\nMatthews Correlation Coefficient: 0.2224811074144708\n\n\n'