In [2]:
from chembl_webresource_client.new_client import new_client
from chembl_webresource_client.query_set import QuerySet
import pandas as pd
from os import mkdir

# Make sure the output directory used by the export cells exists.
try:
    mkdir("results")
except FileExistsError:
    # The directory is already there (e.g. the cell was re-run) - nothing to do.
    # Any other failure (permissions, invalid path) is allowed to surface here
    # instead of reappearing later as a confusing error when a CSV is written.
    pass

In [3]:
# Load the previously combined ChEMBL export.
# NOTE(review): absolute, machine-specific path - this cell only runs where that file exists.
df = pd.read_csv(
    'C:/UL/Programs/C#/drugdesign_parsing/results/combined_data_from_ChEMBL.csv'
)
print("read: success")

# Work with the identifiers as text so the membership test below compares strings with strings.
df['molecule_chembl_id'] = df['molecule_chembl_id'].astype(str)
print("convert: success")

# ChEMBL identifiers to look for in the table.
chembl_id_list: list[str] = [
    "CHEMBL220", "CHEMBL251", "CHEMBL229", "CHEMBL1867",
    "CHEMBL213", "CHEMBL210", "CHEMBL1871", "CHEMBL216",
    "CHEMBL211", "CHEMBL245", "CHEMBL218", "CHEMBL253",
    "CHEMBL2056", "CHEMBL217", "CHEMBL252", "CHEMBL231",
    "CHEMBL214", "CHEMBL1898", "CHEMBL224", "CHEMBL1833",
    "CHEMBL240", "CHEMBL258", "CHEMBL1951", "CHEMBL4777",
    "CHEMBL2034", "CHEMBL236", "CHEMBL233", "CHEMBL222",
    "CHEMBL228",
]

# Keep only the rows whose identifier is in the list; the bare name displays them as the cell result.
filtered_df = df.loc[df['molecule_chembl_id'].isin(chembl_id_list)]
filtered_df

  df = pd.read_csv('C:/UL/Programs/C#/drugdesign_parsing/results/combined_data_from_ChEMBL.csv')


read: success
convert: success


Unnamed: 0,atc_classifications,availability_type,biotherapeutic,black_box_warning,chebi_par_id,chemical_probe,chirality,dosed_ingredient,first_approval,first_in_class,...,psa,qed_weighted,ro3_pass,rtb,canonical_smiles,standard_inchi,standard_inchi_key,molecule_synonym,syn_type,synonyms
44324,[],-1.0,,0,,0,-1,False,,-1,...,,,,,Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)(O)O)[C@H](...,InChI=1S/C40H51N20O28P5/c41-29-17-33(49-5-45-2...,ARWCLLJIZYWWEN-ALXODACISA-N,[],[],[]


In [3]:
def QuerySetMWRangeFilter(less_limit: float = 0, greater_limit: float = 1_254_642) -> QuerySet:
    """
    QuerySet Molecular Weight Range Filter - query ChEMBL molecules whose
    free-base molecular weight lies in a half-open range.

    The filter sent to the client is ``less_limit <= mw_freebase < greater_limit``:
    the lower bound is inclusive (``__gte``) and the upper bound is exclusive
    (``__lt``), so adjacent ranges such as ``(100, 200)`` and ``(200, 300)``
    never select the same molecule twice.

    Args:
        less_limit: lower bound of the molecular weight, inclusive.
        greater_limit: upper bound of the molecular weight, exclusive.

    Raises:
        ValueError: if either bound is negative.
        ValueError: if greater_limit is smaller than less_limit.

    Returns:
        QuerySet: the result of ``new_client.molecule.filter`` for the range.
    """

    # Zero is accepted, so the requirement is "non-negative", not "greater than zero".
    if greater_limit < 0 or less_limit < 0:
        raise ValueError(
            "QuerySetMWRangeFilter: limits should be non-negative")

    # Equal bounds are accepted (they describe an empty half-open range).
    if greater_limit < less_limit:
        raise ValueError(
            "QuerySetMWRangeFilter: greater_limit should not be less than less_limit")

    return new_client.molecule.filter(  # type: ignore
        molecule_properties__mw_freebase__gte=less_limit,
        molecule_properties__mw_freebase__lt=greater_limit,
    )


In [4]:
def _fields_from_record_list(cell, key):
    """Collect ``key`` from every dict in a list-valued cell; a cell that is not a list/tuple gives []."""
    if not isinstance(cell, (list, tuple)):
        # Covers None and NaN (a missing cell), which must not be iterated.
        return []
    return [record.get(key) if isinstance(record, dict) else None for record in cell]


def _field_from_record(cell, key):
    """Return the value stored under ``key`` in a dict-valued cell, or None for a non-dict cell or a missing key."""
    return cell.get(key) if isinstance(cell, dict) else None


def ExpandedFLDF(data: pd.DataFrame) -> pd.DataFrame:
    """
    ExpandedFromListsDataFrame - unpack the nested dict / list-of-dict columns
    of the table into separate flat columns.

    The columns 'cross_references' and 'molecule_synonyms' are treated as lists
    of dicts: each extracted key becomes a column holding a list with one value
    per dict (an empty list when the cell is empty or missing).
    The columns 'molecule_hierarchy', 'molecule_properties' and
    'molecule_structures' are treated as dicts: each extracted key becomes a
    column holding that value (None when the cell is not a dict or lacks the key).
    The five source columns are dropped and the new columns are appended on the right.

    Args:
        data (pd.DataFrame): the original table; it must contain the five
            nested columns named above.

    Returns:
        pd.DataFrame: the "expanded" table.
    """

    # (source column, cell is a list of dicts?, keys to extract) - in output column order.
    # 'molfile' from 'molecule_structures' is deliberately not extracted
    # (the original author noted trouble with it involving RDKit).
    layout = (
        ('cross_references', True, (
            'xref_id', 'xref_name', 'xref_src',
        )),
        ('molecule_hierarchy', False, (
            'active_chembl_id', 'molecule_chembl_id', 'parent_chembl_id',
        )),
        ('molecule_properties', False, (
            'alogp', 'aromatic_rings', 'cx_logd', 'cx_logp',
            'cx_most_apka', 'cx_most_bpka', 'full_molformula', 'full_mwt',
            'hba', 'hba_lipinski', 'hbd', 'hbd_lipinski', 'heavy_atoms',
            'molecular_species', 'mw_freebase', 'mw_monoisotopic',
            'np_likeness_score', 'num_lipinski_ro5_violations',
            'num_ro5_violations', 'psa', 'qed_weighted', 'ro3_pass', 'rtb',
        )),
        ('molecule_structures', False, (
            'canonical_smiles', 'standard_inchi', 'standard_inchi_key',
        )),
        ('molecule_synonyms', True, (
            'molecule_synonym', 'syn_type', 'synonyms',
        )),
    )

    extracted = {}
    for source, is_record_list, keys in layout:
        column = data[source]
        for key in keys:
            if is_record_list:
                # Kept as a Series so the new frame inherits the index of `data`.
                extracted[key] = column.apply(
                    lambda cell, key=key: _fields_from_record_list(cell, key))
            else:
                extracted[key] = [_field_from_record(cell, key) for cell in column]

    exposed_data = pd.DataFrame(extracted)

    data = data.drop([source for source, _, _ in layout], axis=1)

    # NOTE(review): if `data` already has a top-level 'molecule_chembl_id' column, the
    # result contains two columns with that name - confirm this is intended.
    return pd.concat([data, exposed_data], axis=1)


In [10]:
def _is_filled(value) -> bool:
    """Tell whether a cell holds data: not None/NaN and, for strings and containers, not empty."""
    if value is None:
        return False
    if isinstance(value, (str, bytes, list, tuple, dict, set)):
        return len(value) > 0
    try:
        # NaN is truthy in Python, so it has to be detected explicitly.
        return not bool(pd.isna(value))
    except (TypeError, ValueError):
        # pd.isna gave a non-scalar answer (array-like cell): treat the cell as filled.
        return True


def DataAnalysisByColumns(data: pd.DataFrame) -> None:
    """
    Print a short per-column report of the table.

    For every column the report shows its name, its dtype, the number of
    filled cells, the most frequent value, and the maximum and minimum values.
    A cell counts as filled when it is not None/NaN and, for strings and
    containers, not empty; numeric zeros and False count as filled.
    When pandas cannot compare the values of a column (TypeError), the
    max/min are taken as the longest and the shortest str/list value.
    A failure in one section is printed as an "EXCEPTION" line and does not
    stop the report.

    Args:
        data (pd.DataFrame): the table to describe.

    Returns:
        None: the report is written to standard output.
    """
    for column in data.columns:
        print(f"Column                     : {column}")

        # Data type
        try:
            data_type = data[column].dtype
            print(f"Type of data               : {data_type}")

        except Exception as exception:
            print(f"Type of data:EXCEPTION     : {exception}")

        # Number of filled cells
        non_null_count = sum(1 for value in data[column] if _is_filled(value))
        print(f"Non-empty strings          : {non_null_count}")

        # Most frequent value
        try:
            mode_values = data[column].mode()
            if len(mode_values) > 0:
                common_value = mode_values[0]
            else:
                common_value = "nan"

            print(f"Common value               : {common_value}")

        except Exception as exception:
            print(f"Common value:EXCEPTION     : {exception}")

        # Maximum and minimum values
        try:
            try:
                max_value = data[column].max()
                min_value = data[column].min()

            except TypeError:
                # Values are not mutually comparable: fall back to comparing
                # str/list values by length; every other value is skipped.
                max_value = None
                min_value = None

                for value in data[column]:
                    if value is None:
                        continue

                    elif isinstance(value, (list, str)):
                        if max_value is None or len(value) > len(max_value):
                            max_value = value
                        if min_value is None or len(value) < len(min_value):
                            min_value = value

            print(f"Max value                  : {max_value}")
            print(f"Min value                  : {min_value}")

        except Exception as exception:
            print(f"Max value:EXCEPTION        : {exception}")
            print(f"Min value:EXCEPTION        : {exception}")

        print("-" * 50)
  

In [16]:
# Count the molecules matched by the 868-1200 molecular weight range.
# NOTE(review): the variable is named "under_100" but the range queried is 868-1200 - confirm which is intended.
under_100_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=868, greater_limit=1200)
# 12_546_42

print(len(under_100_mw_mols))  # type: ignore

# i = 868
# j = i + 8


# while(j <= 1200):
#   print(f"now: i={i}, j={j}", end="; ")
#   mw_mols = QuerySetMWRangeFilter(i, j)
#   l = len(mw_mols) # type: ignore
#   print(f"len={l}")
#   if l > 27_000:
#     if l > 40_000:
#       print("уменьши шаг") 
#       break

#     print(f"[{i}, {j}], ", end="")
#     i += j - i
#     j = i + 9
#     continue
#   j += 9


# Pandas
# data = pd.DataFrame(under_100_mw_mols)  # type: ignore
# data = ExpandedFLDF(data)
# data.to_csv("results/under_100_mw_mols_output.csv", index=False)
# data

# DataAnalysisByColumns(data)


33533


In [4]:
# Query the 100-200 molecular weight range and report how many molecules matched.
under_200_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=100, greater_limit=200)
print(len(under_200_mw_mols))  # type: ignore

# Load the records into a table, flatten the nested columns and save the result as CSV.
data = ExpandedFLDF(pd.DataFrame(under_200_mw_mols))  # type: ignore
data.to_csv("results/under_200_mw_mols_output.csv", index=False)
# data

43591


In [46]:
# Query the 200-300 molecular weight range and report how many molecules matched.
under_300_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=200, greater_limit=300)
print(len(under_300_mw_mols))  # type: ignore

# Load the records into a table, flatten the nested columns and save the result as CSV.
data = ExpandedFLDF(pd.DataFrame(under_300_mw_mols))  # type: ignore
data.to_csv("results/under_300_mw_mols_output.csv", index=False)

382494


In [7]:
# Query the 300-400 molecular weight range and report how many molecules matched.
under_400_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=300, greater_limit=400)
print(len(under_400_mw_mols))  # type: ignore

# Load the records into a table, flatten the nested columns and save the result as CSV.
data = ExpandedFLDF(pd.DataFrame(under_400_mw_mols))  # type: ignore
data.to_csv("results/under_400_mw_mols_output.csv", index=False)
# data

862780


KeyboardInterrupt: 

In [5]:
# Query the 400-500 molecular weight range and report how many molecules matched.
under_500_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=400, greater_limit=500)
print(len(under_500_mw_mols))  # type: ignore

# Load the records into a table, flatten the nested columns and save the result as CSV.
data = ExpandedFLDF(pd.DataFrame(under_500_mw_mols))  # type: ignore
data.to_csv("results/under_500_mw_mols_output.csv", index=False)
# data

662709


KeyboardInterrupt: 

In [49]:
# Query the 500-600 molecular weight range and report how many molecules matched.
under_600_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=500, greater_limit=600)
print(len(under_600_mw_mols))  # type: ignore

# Load the records into a table, flatten the nested columns and save the result as CSV.
data = ExpandedFLDF(pd.DataFrame(under_600_mw_mols))  # type: ignore
data.to_csv("results/under_600_mw_mols_output.csv", index=False)
# data

256208


In [6]:
# Query the 600-700 molecular weight range and report how many molecules matched.
under_700_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=600, greater_limit=700)
print(len(under_700_mw_mols))  # type: ignore

# Load the records into a table, flatten the nested columns and save the result as CSV.
data = ExpandedFLDF(pd.DataFrame(under_700_mw_mols))  # type: ignore
data.to_csv("results/under_700_mw_mols_output.csv", index=False)
# data

85072


In [12]:
# Query the 700-800 molecular weight range and report how many molecules matched.
under_800_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=700, greater_limit=800)
print(len(under_800_mw_mols))  # type: ignore

# Load the records into a table, flatten the nested columns and save the result as CSV.
data = ExpandedFLDF(pd.DataFrame(under_800_mw_mols))  # type: ignore
data.to_csv("results/under_800_mw_mols_output.csv", index=False)
# data

36280


In [11]:
# Query the 800-900 molecular weight range and report how many molecules matched.
under_900_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=800, greater_limit=900)
print(len(under_900_mw_mols))  # type: ignore

# Load the records into a table, flatten the nested columns and save the result as CSV.
data = ExpandedFLDF(pd.DataFrame.from_records(under_900_mw_mols))  # type: ignore
data.to_csv("results/under_900_mw_mols_output.csv", index=False)
# data

20907


In [6]:
# Query the 900-1000 molecular weight range and report how many molecules matched.
under_1000_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=900, greater_limit=1000)
print(len(under_1000_mw_mols))  # type: ignore

# Load the records into a table, flatten the nested columns and save the result as CSV.
data = ExpandedFLDF(pd.DataFrame.from_records(under_1000_mw_mols))  # type: ignore
data.to_csv("results/under_1000_mw_mols_output.csv", index=False)
# data

12910


In [9]:
# Query everything from molecular weight 1000 up to the function's default upper limit
# and report how many molecules matched.
above_1000_mw_mols: QuerySet = QuerySetMWRangeFilter(less_limit=1000)
print(len(above_1000_mw_mols))  # type: ignore

# Load the records into a table, flatten the nested columns and save the result as CSV.
data = ExpandedFLDF(pd.DataFrame(above_1000_mw_mols))  # type: ignore
data.to_csv("results/above_1000_mw_mols_output.csv", index=False)
# data

49007
