In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('tested_molecules_with_descriptors.csv')

X = data.drop(['SMILES', 'PKM2_inhibition', 'ERK2_inhibition'], axis=1)
y_PKM2 = data['PKM2_inhibition']
y_ERK2 = data['ERK2_inhibition']

# Remove constant features
constant_filter = VarianceThreshold(threshold=0.0)
X_constant_removed = constant_filter.fit_transform(X)
columns_kept = X.columns[constant_filter.get_support()]

X_filtered = pd.DataFrame(X_constant_removed, columns=columns_kept)

# Compute correlation matrix and select upper triangle of correlation matrix
corr_matrix = X_filtered.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
# Drop highly correlated features
X_filtered = X_filtered.drop(columns=to_drop)

# Univariate feature selection for PKM2
selector_PKM2 = SelectKBest(score_func=f_classif, k=50)
X_best_PKM2 = selector_PKM2.fit_transform(X_filtered, y_PKM2)
# Univariate feature selection for ERK2
selector_ERK2 = SelectKBest(score_func=f_classif, k=50)
X_best_ERK2 = selector_ERK2.fit_transform(X_filtered, y_ERK2)

# Split into training and testing sets for PKM2
X_train_PKM2, X_test_PKM2, y_train_PKM2, y_test_PKM2 = train_test_split(X_best_PKM2, y_PKM2, test_size=0.2, random_state=42)
# Split into training and testing sets for ERK2
X_train_ERK2, X_test_ERK2, y_train_ERK2, y_test_ERK2 = train_test_split(X_best_ERK2, y_ERK2, test_size=0.2, random_state=42)

# Model training with Random Forest for PKM2
model_PKM2 = RandomForestClassifier(random_state=42)
model_PKM2.fit(X_train_PKM2, y_train_PKM2)
# Model training with Random Forest for ERK2
model_ERK2 = RandomForestClassifier(random_state=42)
model_ERK2.fit(X_train_ERK2, y_train_ERK2)

# Cross-validation for PKM2
scores_PKM2 = cross_val_score(model_PKM2, X_train_PKM2, y_train_PKM2, cv=5)
# Cross-validation for ERK2
scores_ERK2 = cross_val_score(model_ERK2, X_train_ERK2, y_train_ERK2, cv=5)

# Print cross-validation scores
print("Cross-validation scores for PKM2:", scores_PKM2)
print("Cross-validation scores for ERK2:", scores_ERK2)

# Feature importance analysis
feature_importances_PKM2 = model_PKM2.feature_importances_
feature_importances_ERK2 = model_ERK2.feature_importances_

important_features_PKM2 = pd.DataFrame({'Feature': X_filtered.columns[selector_PKM2.get_support()], 'Importance': feature_importances_PKM2})
important_features_ERK2 = pd.DataFrame({'Feature': X_filtered.columns[selector_ERK2.get_support()], 'Importance': feature_importances_ERK2})

important_features_PKM2.sort_values(by='Importance', ascending=False, inplace=True)
important_features_ERK2.sort_values(by='Importance', ascending=False, inplace=True)

# Print the top 10 descriptors for PKM2 and ERK2
print("Top 10 descriptors for PKM2 inhibition:")
print(important_features_PKM2.head(10))

print("\nTop 10 descriptors for ERK2 inhibition:")
print(important_features_ERK2.head(10))


Cross-validation scores for PKM2: [0.97206704 0.97206704 0.97752809 0.97752809 0.97752809]
Cross-validation scores for ERK2: [0.96089385 0.95530726 0.96067416 0.96067416 0.96067416]
Top 10 descriptors for PKM2 inhibition:
         Feature  Importance
1            qed    0.045424
8       BalabanJ    0.043209
4   BCUT2D_MWLOW    0.041838
7         AvgIpc    0.038109
14     PEOE_VSA5    0.037145
29   VSA_EState1    0.033196
9        BertzCT    0.032307
2          MolWt    0.031480
18      SMR_VSA7    0.030700
28   EState_VSA6    0.030388

Top 10 descriptors for ERK2 inhibition:
              Feature  Importance
29        VSA_EState1    0.042834
1   MinAbsEStateIndex    0.040711
25               TPSA    0.036650
37            MolLogP    0.035639
27        EState_VSA4    0.034507
7        BCUT2D_MRLOW    0.033667
10            BertzCT    0.033185
30        VSA_EState2    0.032459
8              AvgIpc    0.031496
3    MinPartialCharge    0.031138
