In [30]:
from pathlib import Path

from chembl_webresource_client.new_client import new_client
from chembl_webresource_client.query_set import QuerySet
import pandas as pd

In [43]:
def QuerySetMWRangeFilter(less_limit: float = 0, greater_limit: float = 1_254_642) -> QuerySet:
  """
  QuerySet Molecular Weight Range Filter - select ChEMBL molecules whose
  free-base molecular weight lies in a range (both bounds inclusive).

  Args:
      less_limit (float): lower bound of the range
      greater_limit (float): upper bound of the range; the default is the same
          value as the original literal (1254642)
          # NOTE(review): presumably chosen to exceed every weight in ChEMBL - confirm

  Raises:
      ValueError: if either bound is negative (or NaN)
      ValueError: if the upper bound is smaller than the lower bound

  Returns:
      QuerySet: the result of ``new_client.molecule.filter(...)`` restricted to
      ``less_limit <= mw_freebase <= greater_limit``
  """

  # Written as "not (x >= 0)" rather than "x < 0" so that NaN, which compares
  # False against everything, is rejected instead of being sent to the API.
  if not (less_limit >= 0 and greater_limit >= 0):
    raise ValueError("QuerySetMWRangeFilter: limits should be non-negative")

  # Equal bounds are allowed: they select a single exact weight.
  if not (greater_limit >= less_limit):
    raise ValueError("QuerySetMWRangeFilter: greater_limit should be greater than or equal to less_limit")

  return new_client.molecule.filter(  # type: ignore
    molecule_properties__mw_freebase__lte=greater_limit,
    molecule_properties__mw_freebase__gte=less_limit,
  )

In [180]:
# Nested source columns to flatten, in output-column order.
# Each entry is (source column, container type held in each cell, keys to extract).
# 'molfile' from molecule_structures is deliberately not extracted.
_NESTED_COLUMNS = (
  ('cross_references', list, ('xref_id', 'xref_name', 'xref_src')),
  ('molecule_hierarchy', dict, ('active_chembl_id', 'molecule_chembl_id', 'parent_chembl_id')),
  ('molecule_properties', dict, (
    'alogp', 'aromatic_rings', 'cx_logd', 'cx_logp', 'cx_most_apka', 'cx_most_bpka',
    'full_molformula', 'full_mwt', 'hba', 'hba_lipinski', 'hbd', 'hbd_lipinski',
    'heavy_atoms', 'molecular_species', 'mw_freebase', 'mw_monoisotopic',
    'np_likeness_score', 'num_lipinski_ro5_violations', 'num_ro5_violations',
    'psa', 'qed_weighted', 'ro3_pass', 'rtb',
  )),
  ('molecule_structures', dict, ('canonical_smiles', 'standard_inchi', 'standard_inchi_key')),
  ('molecule_synonyms', list, ('molecule_synonym', 'syn_type', 'synonyms')),
)


def _values_from_dicts(entries, key):
  """Return ``entry[key]`` for each dict in a list cell; ``[]`` if the cell is not a list/tuple."""
  # A missing cell arrives as None or NaN. NaN is truthy, so a plain truthiness
  # check would try to iterate a float; test the container type instead.
  if not isinstance(entries, (list, tuple)):
    return []
  return [entry[key] for entry in entries]


def ExpandedFLDF(data: pd.DataFrame) -> pd.DataFrame:
  """
  ExpandedFromListsDataFrame - move the values held in dict cells and
  list-of-dict cells of the table into separate columns.

  Dict-valued columns (molecule_hierarchy, molecule_properties,
  molecule_structures) yield one scalar column per extracted key; a cell that
  is not a dict yields None. List-valued columns (cross_references,
  molecule_synonyms) yield one column per key whose cells are lists with one
  value per dict; a cell that is not a list/tuple (None, NaN) yields [].

  Args:
      data (pd.DataFrame): original table; must contain the five nested
          columns named in ``_NESTED_COLUMNS``

  Raises:
      KeyError: if a nested column is missing, or a dict lacks an extracted key

  Returns:
      pd.DataFrame: the "expanded" table - the input without the five nested
      columns, followed by the extracted columns
  """

  exposed_columns = {}
  for source, container, keys in _NESTED_COLUMNS:
    cells = data[source]
    for key in keys:
      if container is list:
        # Kept as a Series so the new frame inherits the index of `data`.
        exposed_columns[key] = cells.apply(_values_from_dicts, args=(key,))
      else:
        exposed_columns[key] = [cell[key] if isinstance(cell, dict) else None for cell in cells]

  exposed_data = pd.DataFrame(exposed_columns)

  # NOTE(review): if `data` already has a top-level 'molecule_chembl_id' column,
  # the result will carry that label twice - confirm whether that is intended.
  remaining = data.drop(columns=[source for source, _, _ in _NESTED_COLUMNS])

  return pd.concat([remaining, exposed_data], axis=1)

  

In [182]:
# Molecules whose free-base molecular weight lies in [0, 100] (both bounds inclusive).
under_100_mw_mols: QuerySet = QuerySetMWRangeFilter(0, 100)

# A bare expression in the middle of a cell is discarded, so print the count
# the same way the other range cells do.
print(len(under_100_mw_mols))  # type: ignore

# Materialise the QuerySet as a table, then flatten its nested dict/list columns.
data = pd.DataFrame(under_100_mw_mols)  # type: ignore
data = ExpandedFLDF(data)

# to_csv does not create missing directories, so make sure the target exists.
results_dir = Path("results")
results_dir.mkdir(parents=True, exist_ok=True)
data.to_csv(results_dir / "under_100_mw_mols_output.csv", index=False)

# Last expression of the cell: rendered as the cell output.
data

Unnamed: 0,atc_classifications,availability_type,biotherapeutic,black_box_warning,chebi_par_id,chemical_probe,chirality,dosed_ingredient,first_approval,first_in_class,...,psa,qed_weighted,ro3_pass,rtb,canonical_smiles,standard_inchi,standard_inchi_key,molecule_synonym,syn_type,synonyms
0,[],-1.0,,0,,0,-1,False,,-1,...,35.88,0.46,Y,0.0,N=C1CCCCN1,"InChI=1S/C5H10N2/c6-5-3-1-2-4-7-5/h1-4H2,(H2,6,7)",DHGUMNJVFYRSIG-UHFFFAOYSA-N,"[Piperidin-(2E)-ylideneamine, Piperidin-(2Z)-y...","[OTHER, OTHER]","[Piperidin-(2E)-ylideneamine, Piperidin-(2Z)-y..."
1,[],-1.0,,0,,0,-1,False,,-1,...,35.25,0.41,Y,3.0,CCCCON,"InChI=1S/C4H11NO/c1-2-3-4-6-5/h2-5H2,1H3",WCVVIGQKJZLJDB-UHFFFAOYSA-N,[],[],[]
2,[],-1.0,,0,,0,-1,False,,-1,...,27.03,0.39,Y,0.0,N#CN1CCC1,InChI=1S/C4H6N2/c5-4-6-2-1-3-6/h1-3H2,VEYKJLZUWWNWAL-UHFFFAOYSA-N,[Azetidine-1-Carbonitrile],[OTHER],[Azetidine-1-Carbonitrile]
3,[],-1.0,,0,104120.0,0,-1,False,,-1,...,26.30,0.33,Y,0.0,C=C1CCOC1=O,InChI=1S/C5H6O2/c1-4-2-3-7-5(4)6/h1-3H2,GSLDEZOOOSBFGP-UHFFFAOYSA-N,[],[],[]
4,[],-1.0,,0,,0,-1,False,,-1,...,27.03,0.38,Y,2.0,CCN(C#N)CC,"InChI=1S/C5H10N2/c1-3-7(4-2)5-6/h3-4H2,1-2H3",ZZTSQZQUWBFTAT-UHFFFAOYSA-N,[Diethyl-Cyanamide],[OTHER],[Diethyl-Cyanamide]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1037,[],-1.0,,0,,0,-1,False,,-1,...,26.02,0.45,Y,0.0,NC12CC(C1)C2,"InChI=1S/C5H9N/c6-5-1-4(2-5)3-5/h4H,1-3,6H2",UZDGSLINNQQTJM-UHFFFAOYSA-N,[],[],[]
1038,[],-1.0,,0,,0,-1,False,,-1,...,18.46,0.43,Y,0.0,C1COCOC1,InChI=1S/C4H8O2/c1-2-5-4-6-3-1/h1-4H2,VDFVNEFVBPFDSB-UHFFFAOYSA-N,[],[],[]
1039,[],,,0,,0,2,False,,0,...,,,,,[Ca+2].[Na+].[O-2].[OH-],InChI=1S/Ca.Na.H2O.O/h;;1H2;/q+2;+1;;-2/p-1,GNRFGTFXHPHBGK-UHFFFAOYSA-M,"[Carbolime, Draegersorb 800, Soda lime, Soda l...","[OTHER, OTHER, NATIONAL_FORMULARY, OTHER, OTHER]","[CARBOLIME, DRAEGERSORB 800, SODA LIME, SODA L..."
1040,[],,,0,,0,2,False,,0,...,,,,,[Ca+2].[O-2],InChI=1S/Ca.O/q+2;-2,BRPQOXSCLDDYGP-UHFFFAOYSA-N,"[Burnt lime, Calcium oxide, Calcium oxide, Cal...","[OTHER, JAN, MERCK_INDEX, OTHER, RESEARCH_CODE...","[BURNT LIME, CALCIUM OXIDE, CALCIUM OXIDE, CAL..."


In [45]:
# Molecules whose free-base molecular weight lies in [100, 200].
# NOTE(review): both bounds are inclusive, so a weight of exactly 100 or 200 is
# also matched by the neighbouring range cell - confirm whether that is intended.
under_200_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=100, greater_limit=200)

# Show how many molecules matched.
print(len(under_200_mw_mols))  # type: ignore

43591


In [46]:
# Molecules whose free-base molecular weight lies in [200, 300].
# NOTE(review): both bounds are inclusive, so a weight of exactly 200 or 300 is
# also matched by the neighbouring range cell - confirm whether that is intended.
under_300_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=200, greater_limit=300)

# Show how many molecules matched.
print(len(under_300_mw_mols))  # type: ignore

382494


In [47]:
# Molecules whose free-base molecular weight lies in [300, 400].
# NOTE(review): both bounds are inclusive, so a weight of exactly 300 or 400 is
# also matched by the neighbouring range cell - confirm whether that is intended.
under_400_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=300, greater_limit=400)

# Show how many molecules matched.
print(len(under_400_mw_mols))  # type: ignore

862780


In [48]:
# Molecules whose free-base molecular weight lies in [400, 500].
# NOTE(review): both bounds are inclusive, so a weight of exactly 400 or 500 is
# also matched by the neighbouring range cell - confirm whether that is intended.
under_500_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=400, greater_limit=500)

# Show how many molecules matched.
print(len(under_500_mw_mols))  # type: ignore

662709


In [49]:
# Molecules whose free-base molecular weight lies in [500, 600].
# NOTE(review): both bounds are inclusive, so a weight of exactly 500 or 600 is
# also matched by the neighbouring range cell - confirm whether that is intended.
under_600_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=500, greater_limit=600)

# Show how many molecules matched.
print(len(under_600_mw_mols))  # type: ignore

256208


In [50]:
# Molecules whose free-base molecular weight lies in [600, 700].
# NOTE(review): both bounds are inclusive, so a weight of exactly 600 or 700 is
# also matched by the neighbouring range cell - confirm whether that is intended.
under_700_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=600, greater_limit=700)

# Show how many molecules matched.
print(len(under_700_mw_mols))  # type: ignore

85072


In [51]:
# Molecules whose free-base molecular weight lies in [700, 800].
# NOTE(review): both bounds are inclusive, so a weight of exactly 700 or 800 is
# also matched by the neighbouring range cell - confirm whether that is intended.
under_800_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=700, greater_limit=800)

# Show how many molecules matched.
print(len(under_800_mw_mols))  # type: ignore

36280


In [52]:
# Molecules whose free-base molecular weight lies in [800, 900].
# NOTE(review): both bounds are inclusive, so a weight of exactly 800 or 900 is
# also matched by the neighbouring range cell - confirm whether that is intended.
under_900_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=800, greater_limit=900)

# Show how many molecules matched.
print(len(under_900_mw_mols))  # type: ignore

20907


In [54]:
# Molecules whose free-base molecular weight lies in [900, 1000].
# NOTE(review): both bounds are inclusive, so a weight of exactly 900 or 1000 is
# also matched by the neighbouring range cell - confirm whether that is intended.
under_1000_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=900, greater_limit=1000)

# Show how many molecules matched.
print(len(under_1000_mw_mols))  # type: ignore

# Materialise every matched record as a table and display it.
# NOTE(review): the recorded run of this cell printed the count and then ended
# in KeyboardInterrupt, presumably while this step was still running. It also
# rebinds the global `data` that the under-100 cell assigns.
data = pd.DataFrame(under_1000_mw_mols)  # type: ignore
data

12910


KeyboardInterrupt: 

In [53]:
# Molecules whose free-base molecular weight is at least 1000; the upper bound
# is left at QuerySetMWRangeFilter's default greater_limit.
# NOTE(review): the lower bound is inclusive, so a weight of exactly 1000 is
# also matched by the 900-1000 range cell - confirm whether that is intended.
above_1000_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=1000)

# Show how many molecules matched.
print(len(above_1000_mw_mols))  # type: ignore

49007
