In [62]:
from chembl_webresource_client.new_client import new_client
from chembl_webresource_client.query_set import QuerySet
import pandas as pd
from os import mkdir
from loguru import logger

# Create the output directory for this notebook's results.
try:
    mkdir("results")
except FileExistsError:
    # Already created by a previous run; any other failure (permissions,
    # missing parent, ...) is a real problem and is allowed to surface.
    pass

In [63]:
def QuerySetActivitiesByIC50(target_id: str) -> QuerySet:
    """
    Select the IC50 activities recorded for a ChEMBL target.

    Args:
        target_id (str): ChEMBL identifier of the target

    Returns:
        QuerySet: activities filtered by target_chembl_id == target_id
            and standard_type == "IC50"
    """

    target_activities = new_client.activity.filter(target_chembl_id=target_id)
    ic50_activities = target_activities.filter(standard_type="IC50")
    return ic50_activities


def QuerySetActivitiesByKi(target_id: str) -> QuerySet:
    """
    Select the Ki activities recorded for a ChEMBL target.

    Args:
        target_id (str): ChEMBL identifier of the target

    Returns:
        QuerySet: activities filtered by target_chembl_id == target_id
            and standard_type == "Ki"
    """

    target_activities = new_client.activity.filter(target_chembl_id=target_id)
    ki_activities = target_activities.filter(standard_type="Ki")
    return ki_activities



In [64]:
# type: ignore

def MedianDedupedDF(df: pd.DataFrame, id_column_name: str, median_column_name: str) -> pd.DataFrame:
    """
    Collapse rows that share an identifier into one row per identifier.

    For every distinct value of ``id_column_name`` (in order of first
    appearance) the result holds:

    * ``median_column_name``: the median of the group's values, computed
      after converting the column to float;
    * every other column: the group's non-missing values, de-duplicated in
      first-seen order. One remaining value is stored as that value itself,
      several values are stored as a list, and no remaining value is stored
      as None. Unhashable values (e.g. dicts) cannot be de-duplicated and
      are kept as they are.

    The input frame is not modified. Rows whose identifier is missing are
    skipped (pandas ``groupby`` drops NaN keys by default).

    Args:
        df (pd.DataFrame): source DataFrame
        id_column_name (str): name of the column holding the identifiers
        median_column_name (str): name of the column to take the median of

    Returns:
        pd.DataFrame: one row per identifier, columns ordered as
            identifier, median column, then the remaining source columns.
    """

    def is_missing(value) -> bool:
        """Tell whether a cell is None or a scalar NaN/NA; containers never are."""
        return value is None or (pd.api.types.is_scalar(value) and bool(pd.isna(value)))

    def collapse(values: list):
        """Reduce a group's cell values to None, a single value, or a list."""
        present = [value for value in values if not is_missing(value)]
        try:
            # dict.fromkeys de-duplicates like set() but keeps first-seen
            # order, so the output does not depend on hash randomisation.
            present = list(dict.fromkeys(present))
        except TypeError:
            # Unhashable cells (dicts, lists): keep them all, only missing
            # values have been removed.
            pass

        if not present:
            return None
        if len(present) == 1:
            return present[0]
        return present

    other_columns = [col for col in df.columns
                     if col != median_column_name and col != id_column_name]

    # Work on a converted copy so the caller's frame keeps its original dtype.
    converted = df.assign(**{median_column_name: df[median_column_name].astype(float)})

    records = []
    for name, group in converted.groupby(id_column_name, sort=False):
        record = {id_column_name: name,
                  median_column_name: group[median_column_name].median()}
        for col in other_columns:
            record[col] = collapse(group[col].tolist())
        records.append(record)

    # Explicit columns keep the layout stable even when there are no rows.
    return pd.DataFrame(records, columns=[id_column_name, median_column_name, *other_columns])

In [65]:
def CleanedActivitiesDF(data: pd.DataFrame, target_id: str, activities_type: str,
                        print_to_console: bool = False) -> pd.DataFrame:
    """
    Clean a raw ChEMBL activities table for one target.

    Steps, in order:
      1. drop the columns that are not needed (absent ones are ignored);
      2. keep only rows with relation "=", standard_units "nM",
         target_organism "Homo sapiens" and standard_type "IC50" or "Ki";
      3. convert standard_value to float and keep values <= 1e9 (a missing
         standard_value fails the comparison and is dropped as well);
      4. replace the "Not Determined" activity_comment with None;
      5. collapse rows sharing a molecule_chembl_id with MedianDedupedDF,
         taking the median of standard_value;
      6. reindex the columns into a fixed order.

    Any exception raised by these steps is logged and swallowed. In that case
    the frame is returned as it was after the last step that completed, so it
    may be only partially cleaned.

    Args:
        data (pd.DataFrame): raw activities table
        target_id (str): target identifier; used only in log messages
        activities_type (str): "IC50" or "Ki"; used only in log messages
        print_to_console (bool): emit progress messages through loguru

    Returns:
        pd.DataFrame: the cleaned table (see the note on errors above)
    """

    def log(level: str, message: str) -> None:
        """Emit a fixed-width progress message when print_to_console is set."""
        if print_to_console:
            # depth=1 attributes the record to CleanedActivitiesDF rather
            # than to this helper.
            logger.opt(depth=1).log(level, message.ljust(77))

    useless_columns = ['activity_id', 'activity_properties',
                       'document_journal', 'document_year',
                       'molecule_pref_name', 'pchembl_value',
                       'potential_duplicate', 'qudt_units',
                       'record_id', 'src_id', 'standard_flag',
                       'standard_text_value', 'standard_upper_value',
                       'target_chembl_id', 'target_pref_name',
                       'target_tax_id', 'text_value', 'toid',
                       'type', 'units', 'uo_units', 'upper_value',
                       'value']

    ordered_columns = ["molecule_chembl_id", "parent_molecule_chembl_id",
                       "canonical_smiles", "document_chembl_id", "standard_relation",
                       "relation", "standard_value", "standard_units", 'assay_chembl_id',
                       'assay_description', 'assay_type', 'assay_variant_accession',
                       'assay_variant_mutation', 'action_type', 'activity_comment',
                       'data_validity_comment', 'data_validity_description',
                       'bao_endpoint', 'bao_format', 'bao_label', 'ligand_efficiency']

    # Upper bound for standard_value; the rows kept at this point are in nM.
    max_standard_value = 1_000_000_000

    log("INFO", f"Start cleaning {activities_type} activities DataFrame from {target_id}...")
    log("INFO", "Deleting useless columns...")

    try:
        data = data.drop(columns=useless_columns, errors="ignore")
        log("SUCCESS", "Deleting useless columns: SUCCESS")

        log("INFO", "Deleting inappropriate elements...")
        keep = (
            (data['relation'] == '=')
            & (data['standard_units'] == 'nM')
            & (data['target_organism'] == "Homo sapiens")
            & data['standard_type'].isin(['IC50', 'Ki'])
        )
        data = data.loc[keep]

        # assign() builds a new frame, so no write goes through a filtered
        # view (avoids SettingWithCopyWarning).
        data = data.assign(standard_value=data['standard_value'].astype(float))
        data = data.loc[data['standard_value'] <= max_standard_value]

        # The dict form replaces with None on every pandas version; the
        # positional (value, None) form was treated as a pad-fill by older ones.
        data = data.assign(
            activity_comment=data['activity_comment'].replace({"Not Determined": None}))

        data = data.drop(columns=['target_organism', 'standard_type'])
        log("SUCCESS", "Deleting inappropriate elements: SUCCESS")

        log("INFO", "Calculating median for 'standard value'...")
        data = MedianDedupedDF(data, "molecule_chembl_id", "standard_value")
        log("SUCCESS", "Calculating median for 'standard value': SUCCESS")

        log("INFO", "Reindexing columns in logical order...")
        data = data.reindex(columns=ordered_columns)
        log("SUCCESS", "Reindexing columns in logical order: SUCCESS")

        log("SUCCESS", f"End cleaning activities DataFrame from {target_id}")

    except Exception as exception:
        # Deliberate best-effort: report the failure and hand back whatever
        # state the frame reached. Logged even when print_to_console is False.
        logger.error(f"{exception}".ljust(77))

    # Blank, full-width separator line after the run.
    log("INFO", "")

    return data

In [66]:
# type: ignore
target_id = "CHEMBL229"

# Fetch and clean the Ki activities of the target. The variables are named
# after what they hold: both the query and the cleaning call use Ki.
activities_ki: QuerySet = QuerySetActivitiesByKi(target_id)
data_frame_ki = CleanedActivitiesDF(
    pd.DataFrame(activities_ki), target_id=target_id,
    activities_type="Ki", print_to_console=True)

# Aliases for the earlier, misleading names, kept so that any cell still
# referring to them keeps working.
activities_ic50 = activities_ki
data_frame_ic50 = data_frame_ki

df = data_frame_ki
df

[32m2024-10-13 20:26:17.604[0m | [1mINFO    [0m | [36m__main__[0m:[36mCleanedActivitiesDF[0m:[36m16[0m - [1mStart cleaning Ki activities DataFrame from CHEMBL229...                     [0m
[32m2024-10-13 20:26:17.604[0m | [1mINFO    [0m | [36m__main__[0m:[36mCleanedActivitiesDF[0m:[36m19[0m - [1mDeleting useless columns...                                                  [0m
[32m2024-10-13 20:26:17.604[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mCleanedActivitiesDF[0m:[36m34[0m - [32m[1mDeleting useless columns: SUCCESS                                            [0m
[32m2024-10-13 20:26:17.604[0m | [1mINFO    [0m | [36m__main__[0m:[36mCleanedActivitiesDF[0m:[36m36[0m - [1mDeleting inappropriate elements...                                           [0m
[32m2024-10-13 20:26:17.617[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mCleanedActivitiesDF[0m:[36m52[0m - [32m[1mDeleting inappropriate elements: SUCCESS                 

Unnamed: 0,molecule_chembl_id,parent_molecule_chembl_id,canonical_smiles,document_chembl_id,standard_relation,relation,standard_value,standard_units,assay_chembl_id,assay_description,...,assay_variant_accession,assay_variant_mutation,action_type,activity_comment,data_validity_comment,data_validity_description,bao_endpoint,bao_format,bao_label,ligand_efficiency
0,CHEMBL836,CHEMBL836,CCOc1ccccc1OCCN[C@H](C)Cc1ccc(OC)c(S(N)(=O)=O)c1,"[CHEMBL1129998, CHEMBL1133729, CHEMBL1144284, ...",=,=,0.190,nM,"[CHEMBL652814, CHEMBL894516, CHEMBL947045, CHE...",[Binding affinity towards cloned human alpha-1...,...,,,,,,,BAO_0000192,"[BAO_0000221, BAO_0000219, BAO_0000357]","[single protein format, tissue-based format, c...","[{'bei': '25.79', 'le': '0.51', 'lle': '8.20',..."
1,CHEMBL101739,CHEMBL101739,c1ccc2c(c1)CCC1(CCN(CCc3cccc4ccccc34)CC1)O2,CHEMBL1132031,=,=,56.000,nM,CHEMBL646166,Displacement of [125I]HEAT from human Alpha-1C...,...,,,,,,,BAO_0000192,BAO_0000219,cell-based format,"{'bei': '20.28', 'le': '0.37', 'lle': '2.01', ..."
2,CHEMBL101579,CHEMBL101579,Cn1cnc(S(=O)(=O)Nc2ccc3c(c2)C(=O)CC2(CCN(CCc4c...,CHEMBL1132031,=,=,8.000,nM,CHEMBL646166,Displacement of [125I]HEAT from human Alpha-1C...,...,,,,,,,BAO_0000192,BAO_0000219,cell-based format,"{'bei': '15.26', 'le': '0.29', 'lle': '3.68', ..."
3,CHEMBL101610,CHEMBL101610,Cc1noc(C)c1S(=O)(=O)Nc1ccc2c(c1)C(=O)CC1(CCN(C...,CHEMBL1132031,=,=,5.345,nM,"[CHEMBL652816, CHEMBL652810]",[Ability to displace [125I]HEAT from cloned hu...,...,,,,,,,BAO_0000192,"[BAO_0000219, BAO_0000357]","[single protein format, cell-based format]","[{'bei': '16.79', 'le': '0.32', 'lle': '3.87',..."
4,CHEMBL419164,CHEMBL419164,CS(=O)(=O)Nc1ccc2c(c1)OC1(CCN(CCc3ccccn3)CC1)C...,CHEMBL1132031,=,=,1670.000,nM,CHEMBL646166,Displacement of [125I]HEAT from human Alpha-1C...,...,,,,,,,BAO_0000192,BAO_0000219,cell-based format,"{'bei': '13.90', 'le': '0.27', 'lle': '3.28', ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1372,CHEMBL5291469,CHEMBL5315702,O=C(NCCN1CCC[C@@H]1Cn1nc(Cc2ccc(Cl)cc2)c2ccccc...,CHEMBL5260808,=,=,199.530,nM,CHEMBL5261563,Antagonist activity against recombinant human ...,...,,,"{'action_type': 'ANTAGONIST', 'description': '...",,,,BAO_0000192,BAO_0000219,cell-based format,"{'bei': '13.16', 'le': '0.25', 'lle': '3.05', ..."
1373,CHEMBL5284242,CHEMBL5315342,COCCCC(=O)N(C)CCN1CCC[C@@H]1Cn1nc(Cc2ccc(Cl)cc...,CHEMBL5260808,=,=,31.620,nM,CHEMBL5261563,Antagonist activity against recombinant human ...,...,,,"{'action_type': 'ANTAGONIST', 'description': '...",,,,BAO_0000192,BAO_0000219,cell-based format,"{'bei': '14.68', 'le': '0.28', 'lle': '3.51', ..."
1374,CHEMBL5272464,CHEMBL5315453,CCCNC(=O)CCCN1CCC[C@@H]1Cn1nc(Cc2ccc(Cl)cc2)c2...,CHEMBL5260808,=,=,10.000,nM,CHEMBL5261563,Antagonist activity against recombinant human ...,...,,,"{'action_type': 'ANTAGONIST', 'description': '...",,,,BAO_0000192,BAO_0000219,cell-based format,"{'bei': '16.63', 'le': '0.32', 'lle': '3.59', ..."
1375,CHEMBL5272034,CHEMBL5315277,COCCNC(=O)CCCN1CCC[C@@H]1Cn1nc(Cc2ccc(Cl)cc2)c...,CHEMBL5260808,=,=,6.310,nM,CHEMBL5261563,Antagonist activity against recombinant human ...,...,,,"{'action_type': 'ANTAGONIST', 'description': '...",,,,BAO_0000192,BAO_0000219,cell-based format,"{'bei': '16.50', 'le': '0.32', 'lle': '4.55', ..."


In [67]:
# Show the column labels of the frame returned by CleanedActivitiesDF.
df.columns

Index(['molecule_chembl_id', 'parent_molecule_chembl_id', 'canonical_smiles',
       'document_chembl_id', 'standard_relation', 'relation', 'standard_value',
       'standard_units', 'assay_chembl_id', 'assay_description', 'assay_type',
       'assay_variant_accession', 'assay_variant_mutation', 'action_type',
       'activity_comment', 'data_validity_comment',
       'data_validity_description', 'bao_endpoint', 'bao_format', 'bao_label',
       'ligand_efficiency'],
      dtype='object')