In [1]:
import os

if "run_molecular_property_based_filtering.py" not in os.listdir():
    os.chdir("../src")

os.listdir()

['database',
 'utils',
 'screening',
 'run_molecular_property_based_filtering.py',
 '__init__.py']

---

In [2]:
import pandas as pd
from rdkit import Chem
from screening import MolecularPropertyCalculator

In [3]:
df_wrong_Lipinski_Rule_of_5_and_Drug_like_Filter = pd.read_csv(
    "../data/chembl_34_initial_screening_results.csv", sep=",", index_col=0
)
df_correct_ADMET_Scores_Filter = pd.read_csv("../data/admet.csv", sep=",", index_col=0)
df_correct_Drug_like_Filter = pd.read_csv(
    "../data/corrected_lipinski5_and_druglikeness_actually_just_drug_likeness.csv",
    sep=",",
    index_col=0,
)
df_correct_Lipinski_Rule_of_5_and = pd.read_csv(
    "../data/finally_corrected_lipinski5.csv", sep=",", index_col=0
)

df_combined = (
    df_wrong_Lipinski_Rule_of_5_and_Drug_like_Filter[
        [
            "Ghose_Filter",
            "Veber_Filter",
            "REOS_Filter",
            "QED_Filter",
            "SA_Score_Filter",
            "NP_Score_Filter",
        ]
    ]
    .join(
        df_correct_ADMET_Scores_Filter[["ADMET_Scores_Filter"]],
        lsuffix="_df",
        rsuffix="_admet",
    )
    .join(
        df_correct_Drug_like_Filter[["Drug_like_Filter"]],
        lsuffix="_df",
        rsuffix="_lipinski_druglike",
    )
    .join(
        df_correct_Lipinski_Rule_of_5_and[["Lipinski_Rule_of_5"]],
        lsuffix="_lipinski_druglike",
        rsuffix="_lipinski",
    )
)

df_combined

Unnamed: 0_level_0,Ghose_Filter,Veber_Filter,REOS_Filter,QED_Filter,SA_Score_Filter,NP_Score_Filter,ADMET_Scores_Filter,Drug_like_Filter,Lipinski_Rule_of_5
ChemblID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CHEMBL153534,False,True,True,0.608781,3.048474,-1.552303,1.138080,True,
CHEMBL440060,False,False,False,0.016357,9.084590,-0.037746,2570.754431,False,
CHEMBL440245,False,False,False,0.010798,9.752879,0.219818,3230.738108,False,
CHEMBL440249,False,False,False,0.016726,9.146251,0.632360,2274.126532,False,
CHEMBL405398,True,True,True,0.623114,2.595185,-1.628093,98.871021,True,
...,...,...,...,...,...,...,...,...,...
CHEMBL4297438,True,True,False,0.432167,2.609728,-0.750485,77.401459,False,
CHEMBL4298636,False,False,False,0.040094,7.525295,0.447542,1095.926541,False,
CHEMBL4296948,False,True,True,0.484395,3.201334,-1.142129,0.671320,True,
CHEMBL4296947,False,True,True,0.671745,2.972796,-1.148292,0.026100,True,


In [4]:
def get_molecule_filter(
    Lipinski_Rule_of_5: bool = True,
    Ghose_Filter: bool = True,
    REOS_Filter: bool = True,
    Veber_Filter: bool = True,
    Drug_like_Filter: bool = True,
    QED_Filter_threshold: None | float = None,
    SA_Score_Filter_threshold: None | float = None,
    NP_Score_Filter_threshold: None | float = None,
    ADMET_Scores_Filter: None | float = None,
) -> callable:

    def molecule_filter(row):
        return (
            (row["Lipinski_Rule_of_5"] or not Lipinski_Rule_of_5)
            and (row["Ghose_Filter"] or not Ghose_Filter)
            and (row["REOS_Filter"] or not REOS_Filter)
            and (row["Veber_Filter"] or not Veber_Filter)
            and (row["Drug_like_Filter"] or not Drug_like_Filter)
            and (
                QED_Filter_threshold is None
                or row["QED_Filter"] >= QED_Filter_threshold
            )
            and (
                SA_Score_Filter_threshold is None
                or row["SA_Score_Filter"] <= SA_Score_Filter_threshold
            )
            and (
                NP_Score_Filter_threshold is None
                or row["NP_Score_Filter"] >= NP_Score_Filter_threshold
            )
            and (
                ADMET_Scores_Filter is None
                or row["ADMET_Scores_Filter"] <= ADMET_Scores_Filter
            )
        )

    return molecule_filter

In [5]:
filter_fn = get_molecule_filter(
    Lipinski_Rule_of_5=True,
    Ghose_Filter=True,
    REOS_Filter=True,
    Veber_Filter=True,
    Drug_like_Filter=True,
    QED_Filter_threshold=0.5,  # 0.7,
    SA_Score_Filter_threshold=4,  # 3,
    NP_Score_Filter_threshold=0.2,
    ADMET_Scores_Filter=None,  # 2.5,
)

---

In [6]:
df_combined_filtered = df_combined[df_combined.apply(filter_fn, axis=1)]

In [7]:
df_combined_filtered

Unnamed: 0_level_0,Ghose_Filter,Veber_Filter,REOS_Filter,QED_Filter,SA_Score_Filter,NP_Score_Filter,ADMET_Scores_Filter,Drug_like_Filter,Lipinski_Rule_of_5
ChemblID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
CHEMBL503634,True,True,True,0.553150,2.658629,0.996593,0.371830,True,
CHEMBL444368,True,True,True,0.844289,3.029199,1.461576,0.136600,True,
CHEMBL444434,True,True,True,0.609140,3.479000,2.172817,16.105353,True,
CHEMBL500790,True,True,True,0.773286,2.457191,0.911889,1.749700,True,
CHEMBL503603,True,True,True,0.615338,2.946122,0.985238,57.117686,True,
...,...,...,...,...,...,...,...,...,...
CHEMBL4296991,True,True,True,0.758058,2.918915,1.059271,21.142631,True,
CHEMBL4296988,True,True,True,0.772957,3.874596,1.167360,24.341443,True,
CHEMBL4296899,True,True,True,0.628762,2.863348,1.401968,10.650672,True,
CHEMBL4297606,True,True,True,0.660871,2.249330,0.407477,56.549368,True,


In [17]:
known_inhibitors_df = pd.read_csv("../data/SMILES_inh.csv", sep=",", index_col=0)

known_inhibitors_mol = {}

for row in known_inhibitors_df.iterrows():
    mol = Chem.MolFromSmiles(row[1]["smiles"])
    # print(
    #     f"molecular_weight: {Chem.Descriptors.MolWt(mol)} | logp: {Chem.Crippen.MolLogP(mol)} | h_bond_donor: {Chem.rdMolDescriptors.CalcNumHBD(mol)} | h_bond_acceptors: {Chem.rdMolDescriptors.CalcNumHBA(mol)} | rotatable_bonds: {Chem.rdMolDescriptors.CalcNumRotatableBonds(mol)}"
    # )
    # display(mol)
    known_inhibitors_mol[row[0]] = mol

mpc = MolecularPropertyCalculator()

known_inhibitors_results_df = pd.DataFrame(
    columns=["inhibitor_id", *mpc.get_result_keys()]
)

for key, value in known_inhibitors_mol.items():
    basic_filters = mpc.calculate_basic_filters(value)
    known_inhibitors_results_df = pd.concat(
        [
            known_inhibitors_results_df,
            pd.DataFrame([{**{"inhibitor_id": key}, **basic_filters}]),
        ],
        ignore_index=True,
    )
    print(basic_filters)
known_inhibitors_results_df.set_index("inhibitor_id", inplace=True)

reading NP model ...
model in
  known_inhibitors_results_df = pd.concat(
reading NP model ...


{'Lipinski_Rule_of_5': None, 'Ghose_Filter': None, 'Veber_Filter': None, 'Rule_of_3_Filter': None, 'REOS_Filter': None, 'Drug_like_Filter': None, 'QED_Filter': 0.3743876950069787, 'TPSA_Filter': 136.60999999999999, 'SA_Score_Filter': 3.339818647924842, 'NP_Score_Filter': -1.0793809024132779, 'ADMET_Scores_Filter': 181.35974332800006, 'Errors': ['LipinskiRuleOf5.apply() takes 5 positional arguments but 6 were given']}
{'Lipinski_Rule_of_5': None, 'Ghose_Filter': None, 'Veber_Filter': None, 'Rule_of_3_Filter': None, 'REOS_Filter': None, 'Drug_like_Filter': None, 'QED_Filter': 0.5383840929600127, 'TPSA_Filter': 84.57, 'SA_Score_Filter': 3.0519247847307094, 'NP_Score_Filter': -1.181657016061194, 'ADMET_Scores_Filter': 151.24914526400008, 'Errors': ['LipinskiRuleOf5.apply() takes 5 positional arguments but 6 were given']}


model in
reading NP model ...
model in
reading NP model ...


{'Lipinski_Rule_of_5': None, 'Ghose_Filter': None, 'Veber_Filter': None, 'Rule_of_3_Filter': None, 'REOS_Filter': None, 'Drug_like_Filter': None, 'QED_Filter': 0.5160541200481502, 'TPSA_Filter': 103.03, 'SA_Score_Filter': 3.147471553961478, 'NP_Score_Filter': -1.2945987904295735, 'ADMET_Scores_Filter': 155.478274376, 'Errors': ['LipinskiRuleOf5.apply() takes 5 positional arguments but 6 were given']}
{'Lipinski_Rule_of_5': None, 'Ghose_Filter': None, 'Veber_Filter': None, 'Rule_of_3_Filter': None, 'REOS_Filter': None, 'Drug_like_Filter': None, 'QED_Filter': 0.6703189198745959, 'TPSA_Filter': 78.09, 'SA_Score_Filter': 2.4672436266777993, 'NP_Score_Filter': -1.7741784152680113, 'ADMET_Scores_Filter': 40.402746944000015, 'Errors': ['LipinskiRuleOf5.apply() takes 5 positional arguments but 6 were given']}


model in
reading NP model ...
model in
reading NP model ...


{'Lipinski_Rule_of_5': None, 'Ghose_Filter': None, 'Veber_Filter': None, 'Rule_of_3_Filter': None, 'REOS_Filter': None, 'Drug_like_Filter': None, 'QED_Filter': 0.5993233904365186, 'TPSA_Filter': 78.09, 'SA_Score_Filter': 2.589869788706359, 'NP_Score_Filter': -1.7018624492824264, 'ADMET_Scores_Filter': 95.76249713599996, 'Errors': ['LipinskiRuleOf5.apply() takes 5 positional arguments but 6 were given']}
{'Lipinski_Rule_of_5': None, 'Ghose_Filter': None, 'Veber_Filter': None, 'Rule_of_3_Filter': None, 'REOS_Filter': None, 'Drug_like_Filter': None, 'QED_Filter': 0.44214882322171206, 'TPSA_Filter': 78.09, 'SA_Score_Filter': 2.422425242063934, 'NP_Score_Filter': -1.5362564457085903, 'ADMET_Scores_Filter': 103.59869700800002, 'Errors': ['LipinskiRuleOf5.apply() takes 5 positional arguments but 6 were given']}


model in
reading NP model ...
model in
reading NP model ...


{'Lipinski_Rule_of_5': None, 'Ghose_Filter': None, 'Veber_Filter': None, 'Rule_of_3_Filter': None, 'REOS_Filter': None, 'Drug_like_Filter': None, 'QED_Filter': 0.441458695232176, 'TPSA_Filter': 86.88, 'SA_Score_Filter': 2.3202299893945284, 'NP_Score_Filter': -1.3002808880486378, 'ADMET_Scores_Filter': 74.83509688000002, 'Errors': ['LipinskiRuleOf5.apply() takes 5 positional arguments but 6 were given']}
{'Lipinski_Rule_of_5': None, 'Ghose_Filter': None, 'Veber_Filter': None, 'Rule_of_3_Filter': None, 'REOS_Filter': None, 'Drug_like_Filter': None, 'QED_Filter': 0.3755305381328262, 'TPSA_Filter': 86.88, 'SA_Score_Filter': 2.4642740327381905, 'NP_Score_Filter': -1.08077130892823, 'ADMET_Scores_Filter': 98.07364720000001, 'Errors': ['LipinskiRuleOf5.apply() takes 5 positional arguments but 6 were given']}


model in
reading NP model ...
model in


{'Lipinski_Rule_of_5': None, 'Ghose_Filter': None, 'Veber_Filter': None, 'Rule_of_3_Filter': None, 'REOS_Filter': None, 'Drug_like_Filter': None, 'QED_Filter': 0.5963983399699746, 'TPSA_Filter': 86.88, 'SA_Score_Filter': 2.460566268865289, 'NP_Score_Filter': -1.216696622219725, 'ADMET_Scores_Filter': 66.99889700800001, 'Errors': ['LipinskiRuleOf5.apply() takes 5 positional arguments but 6 were given']}


reading NP model ...
model in
reading NP model ...
model in
reading NP model ...


{'Lipinski_Rule_of_5': None, 'Ghose_Filter': None, 'Veber_Filter': None, 'Rule_of_3_Filter': None, 'REOS_Filter': None, 'Drug_like_Filter': None, 'QED_Filter': 0.25830813881593306, 'TPSA_Filter': 78.09, 'SA_Score_Filter': 2.512693248309688, 'NP_Score_Filter': -1.2858858809970657, 'ADMET_Scores_Filter': 161.22169726400003, 'Errors': ['LipinskiRuleOf5.apply() takes 5 positional arguments but 6 were given']}
{'Lipinski_Rule_of_5': None, 'Ghose_Filter': None, 'Veber_Filter': None, 'Rule_of_3_Filter': None, 'REOS_Filter': None, 'Drug_like_Filter': None, 'QED_Filter': 0.35242263279998387, 'TPSA_Filter': 78.09, 'SA_Score_Filter': 2.5640985316935954, 'NP_Score_Filter': -1.4460985136921656, 'ADMET_Scores_Filter': 126.83724732800002, 'Errors': ['LipinskiRuleOf5.apply() takes 5 positional arguments but 6 were given']}


model in
reading NP model ...
model in
reading NP model ...


{'Lipinski_Rule_of_5': None, 'Ghose_Filter': None, 'Veber_Filter': None, 'Rule_of_3_Filter': None, 'REOS_Filter': None, 'Drug_like_Filter': None, 'QED_Filter': 0.1934804210654068, 'TPSA_Filter': 78.09, 'SA_Score_Filter': 2.559709765132517, 'NP_Score_Filter': -1.2163785360783055, 'ADMET_Scores_Filter': 190.03319739200006, 'Errors': ['LipinskiRuleOf5.apply() takes 5 positional arguments but 6 were given']}
{'Lipinski_Rule_of_5': None, 'Ghose_Filter': None, 'Veber_Filter': None, 'Rule_of_3_Filter': None, 'REOS_Filter': None, 'Drug_like_Filter': None, 'QED_Filter': 0.5322699415050953, 'TPSA_Filter': 86.88, 'SA_Score_Filter': 2.4173396433098144, 'NP_Score_Filter': -1.3644123033388422, 'ADMET_Scores_Filter': 40.45064694400001, 'Errors': ['LipinskiRuleOf5.apply() takes 5 positional arguments but 6 were given']}


model in
reading NP model ...
model in
reading NP model ...


{'Lipinski_Rule_of_5': None, 'Ghose_Filter': None, 'Veber_Filter': None, 'Rule_of_3_Filter': None, 'REOS_Filter': None, 'Drug_like_Filter': None, 'QED_Filter': 0.49458985063423866, 'TPSA_Filter': 86.88, 'SA_Score_Filter': 2.421107618642756, 'NP_Score_Filter': -1.2403538501247, 'ADMET_Scores_Filter': 54.856397008000016, 'Errors': ['LipinskiRuleOf5.apply() takes 5 positional arguments but 6 were given']}
{'Lipinski_Rule_of_5': None, 'Ghose_Filter': None, 'Veber_Filter': None, 'Rule_of_3_Filter': None, 'REOS_Filter': None, 'Drug_like_Filter': None, 'QED_Filter': 0.2942323158662758, 'TPSA_Filter': 86.88, 'SA_Score_Filter': 2.499418318078318, 'NP_Score_Filter': -1.0132231021202156, 'ADMET_Scores_Filter': 126.88514732800002, 'Errors': ['LipinskiRuleOf5.apply() takes 5 positional arguments but 6 were given']}


model in
reading NP model ...
model in
reading NP model ...


{'Lipinski_Rule_of_5': None, 'Ghose_Filter': None, 'Veber_Filter': None, 'Rule_of_3_Filter': None, 'REOS_Filter': None, 'Drug_like_Filter': None, 'QED_Filter': 0.22721592636934843, 'TPSA_Filter': 128.37, 'SA_Score_Filter': 2.850344591850895, 'NP_Score_Filter': -1.144330817925431, 'ADMET_Scores_Filter': 89.12937521600007, 'Errors': ['LipinskiRuleOf5.apply() takes 5 positional arguments but 6 were given']}
{'Lipinski_Rule_of_5': None, 'Ghose_Filter': None, 'Veber_Filter': None, 'Rule_of_3_Filter': None, 'REOS_Filter': None, 'Drug_like_Filter': None, 'QED_Filter': 0.2329744600738254, 'TPSA_Filter': 117.37, 'SA_Score_Filter': 2.862005242063937, 'NP_Score_Filter': -1.3284223957601384, 'ADMET_Scores_Filter': 116.11647534400008, 'Errors': ['LipinskiRuleOf5.apply() takes 5 positional arguments but 6 were given']}
{'Lipinski_Rule_of_5': None, 'Ghose_Filter': None, 'Veber_Filter': None, 'Rule_of_3_Filter': None, 'REOS_Filter': None, 'Drug_like_Filter': None, 'QED_Filter': 0.16233835908301472, 'T

model in
reading NP model ...
model in


In [9]:
known_inhibitors_results_df

Unnamed: 0_level_0,Lipinski_Rule_of_5,Ghose_Filter,Veber_Filter,Rule_of_3_Filter,REOS_Filter,Drug_like_Filter,QED_Filter,TPSA_Filter,SA_Score_Filter,NP_Score_Filter,ADMET_Scores_Filter,Errors
inhibitor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,,,,,,,0.374388,136.61,3.339819,-1.079381,181.359743,[LipinskiRuleOf5.apply() takes 5 positional ar...
1,,,,,,,0.538384,84.57,3.051925,-1.181657,151.249145,[LipinskiRuleOf5.apply() takes 5 positional ar...
2,,,,,,,0.516054,103.03,3.147472,-1.294599,155.478274,[LipinskiRuleOf5.apply() takes 5 positional ar...
3,,,,,,,0.670319,78.09,2.467244,-1.774178,40.402747,[LipinskiRuleOf5.apply() takes 5 positional ar...
4,,,,,,,0.599323,78.09,2.58987,-1.701862,95.762497,[LipinskiRuleOf5.apply() takes 5 positional ar...
5,,,,,,,0.442149,78.09,2.422425,-1.536256,103.598697,[LipinskiRuleOf5.apply() takes 5 positional ar...
6,,,,,,,0.441459,86.88,2.32023,-1.300281,74.835097,[LipinskiRuleOf5.apply() takes 5 positional ar...
7,,,,,,,0.375531,86.88,2.464274,-1.080771,98.073647,[LipinskiRuleOf5.apply() takes 5 positional ar...
8,,,,,,,0.596398,86.88,2.460566,-1.216697,66.998897,[LipinskiRuleOf5.apply() takes 5 positional ar...
9,,,,,,,0.258308,78.09,2.512693,-1.285886,161.221697,[LipinskiRuleOf5.apply() takes 5 positional ar...


In [10]:
known_inhibitors_results_df.apply(filter_fn, axis=1)

inhibitor_id
0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
dtype: bool

---

In [11]:
known_inhibitors_results_df.keys()

Index(['Lipinski_Rule_of_5', 'Ghose_Filter', 'Veber_Filter',
       'Rule_of_3_Filter', 'REOS_Filter', 'Drug_like_Filter', 'QED_Filter',
       'TPSA_Filter', 'SA_Score_Filter', 'NP_Score_Filter',
       'ADMET_Scores_Filter', 'Errors'],
      dtype='object')

In [12]:
known_inhibitors_results_df[
    [
        "Ghose_Filter",
        "Veber_Filter",
        "REOS_Filter",
        "QED_Filter",
        "SA_Score_Filter",
        "NP_Score_Filter",
    ]
]

Unnamed: 0_level_0,Ghose_Filter,Veber_Filter,REOS_Filter,QED_Filter,SA_Score_Filter,NP_Score_Filter
inhibitor_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,,,,0.374388,3.339819,-1.079381
1,,,,0.538384,3.051925,-1.181657
2,,,,0.516054,3.147472,-1.294599
3,,,,0.670319,2.467244,-1.774178
4,,,,0.599323,2.58987,-1.701862
5,,,,0.442149,2.422425,-1.536256
6,,,,0.441459,2.32023,-1.300281
7,,,,0.375531,2.464274,-1.080771
8,,,,0.596398,2.460566,-1.216697
9,,,,0.258308,2.512693,-1.285886


---

In [13]:
# import requests


# def get_smiles_from_chembl(chembl_id):
#     url = f"https://www.ebi.ac.uk/chembl/api/data/molecule/{chembl_id}"
#     response = requests.get(url)
#     if response.status_code == 200:
#         try:
#             data = response.json()
#             return data.get("molecule_structures", {}).get("canonical_smiles", None)
#         except requests.JSONDecodeError:
#             print(f"Failed to decode JSON for {chembl_id}")
#             return None
#     else:
#         print(
#             f"Failed to retrieve data for {chembl_id}, status code: {response.status_code}"
#         )
#         return None


# # Example usage
# chembl_id = "CHEMBL503634"
# smiles = get_smiles_from_chembl(chembl_id)
# if smiles:
#     print(f"{chembl_id}: {smiles}")
# else:
#     print(f"SMILES not found for {chembl_id}")

In [14]:
# get_smiles_from_chembl("CHEMBL503634")