In [19]:
from chembl_webresource_client.new_client import new_client
from chembl_webresource_client.query_set import QuerySet
import pandas as pd
from os import mkdir

# Make sure the output directory used by the CSV exports exists.
# Only the "already exists" case is tolerated; any other failure
# (e.g. missing permissions) is allowed to surface instead of being hidden.
try:
  mkdir("results")
except FileExistsError:
  pass

In [20]:
def QuerySetTargets() -> QuerySet:
    """
    QuerySetTargets - query over the target resource with no filter arguments.

    Returns:
        QuerySet: whatever ``new_client.target.filter()`` returns when called
        without arguments (presumably all targets - TODO confirm against the
        client documentation).
    """

    target_resource = new_client.target
    unfiltered_targets = target_resource.filter()  # type: ignore
    return unfiltered_targets



Связь целей и активностей (?): берём первую цель и запрашиваем активности по её `target_chembl_id`.

In [27]:
# Unfiltered target query; only its size is printed below.
targs: QuerySet = QuerySetTargets()

# NOTE: despite the plural name, `targs_ids` holds a single record - the first
# element of the target query restricted to the 'target_chembl_id' field.
targs_ids = new_client.target.filter().only('target_chembl_id')[0]  # type: ignore
first_target_chembl_id = targs_ids['target_chembl_id']

# Activities recorded for that one target. The trailing no-argument .filter()
# is kept from the original; presumably it does not change the result - TODO confirm.
activities = new_client.activity.filter(target_chembl_id=first_target_chembl_id).filter()  # type: ignore

# Sizes: all targets first, then the activities of the first target.
for query in (targs, activities):
    print(len(query))  # type: ignore

# Optional CSV export via pandas (disabled). It relies on ExpandedFLDF,
# which is defined in a later cell of this notebook.
# data = pd.DataFrame(targs) # type: ignore
# data = ExpandedFLDF(data)
# data.to_csv("results/targs_output.csv", index=False)

15598
654


Цели, молекулы и активности — это разные ресурсы ChEMBL: количество записей в каждом из них своё.

In [24]:
# Print the size of the unfiltered query for each resource,
# in the order: targets, molecules, activities.
for resource in (new_client.target, new_client.molecule, new_client.activity):
    print(len(resource.filter()))  # type: ignore

15598
2431025
20772701


In [66]:
def ExpandedFLDF(data: pd.DataFrame) -> pd.DataFrame:
    """
    ExpandedFromListsDataFrame - expand the dicts and lists of dicts stored in
    table cells into separate columns.

    The input frame is NOT modified; a new frame is returned.

    Processing:
      * 'cross_references' (list of dicts per row) becomes the list-valued
        columns 'xref_id', 'xref_name', 'xref_src'.
      * 'target_components' (list of dicts per row): only the FIRST component
        of each row is kept and flattened into columns. Rows with an empty or
        missing list get NaN in those columns.
      * The nested 'target_component_synonyms' / 'target_component_xrefs'
        lists of that first component become list-valued columns.

    Args:
        data (pd.DataFrame): source table; must contain the columns
            'cross_references' and 'target_components'.

    Returns:
        pd.DataFrame: the "expanded" table, carrying the same index as `data`.

    Raises:
        KeyError: if 'cross_references' or 'target_components' is missing.
    """

    def ExtractedValuesFromColumn(df: pd.DataFrame, column_name: str, key: str) -> pd.Series:
        # For every row, collect d[key] from each dict in the cell's list.
        # A missing column or a non-list cell (None, NaN) yields an empty list.
        if column_name not in df.columns:
            return pd.Series([[] for _ in range(len(df))], index=df.index, dtype=object)
        return df[column_name].apply(
            lambda cell: [d[key] for d in cell] if isinstance(cell, (list, tuple)) else []
        )

    required_columns = ('cross_references', 'target_components')
    missing_columns = [name for name in required_columns if name not in data.columns]
    if missing_columns:
        raise KeyError(f"ExpandedFLDF: missing required column(s): {missing_columns}")

    exposed_data = pd.DataFrame({
        #! cross_references
        'xref_id':                                ExtractedValuesFromColumn(data, 'cross_references', 'xref_id'),
        'xref_name':                              ExtractedValuesFromColumn(data, 'cross_references', 'xref_name'),
        'xref_src':                               ExtractedValuesFromColumn(data, 'cross_references', 'xref_src'),
    })

    # Keep only the first component dict of every row. This is computed into a
    # local list, so the caller's 'target_components' column is left untouched
    # and the function can safely be re-run on the same frame.
    first_components = [
        cell[0] if isinstance(cell, (list, tuple)) and len(cell) > 0 else {}
        for cell in data['target_components']
    ]

    # Reuse the caller's index: the axis=1 concat below aligns rows by index
    # label, so a fresh RangeIndex would misalign rows of a re-indexed frame.
    narrowed_data = pd.DataFrame(first_components, index=data.index)

    exposed_narrowed_data = pd.DataFrame({
        #! target_component_synonyms
        'component_synonym':                      ExtractedValuesFromColumn(narrowed_data, 'target_component_synonyms', 'component_synonym'),
        'syn_type':                               ExtractedValuesFromColumn(narrowed_data, 'target_component_synonyms', 'syn_type'),
        #! target_component_xrefs
        'xref_id_target_component_xrefs':         ExtractedValuesFromColumn(narrowed_data, 'target_component_xrefs', 'xref_id'),
        'xref_name_target_component_xrefs':       ExtractedValuesFromColumn(narrowed_data, 'target_component_xrefs', 'xref_name'),
        'xref_src_db_target_component_xrefs':     ExtractedValuesFromColumn(narrowed_data, 'target_component_xrefs', 'xref_src_db'),
    })

    # errors='ignore': the nested columns are absent when no row has a component.
    narrowed_data = narrowed_data.drop(
        ['target_component_synonyms', 'target_component_xrefs'], axis=1, errors='ignore'
    )
    narrowed_data = pd.concat([narrowed_data, exposed_narrowed_data], axis=1)

    remaining_data = data.drop(['cross_references', 'target_components'], axis=1)
    return pd.concat([remaining_data, exposed_data, narrowed_data], axis=1)


Тестовая проверка на небольшом количестве целей

In [67]:
# Vascular endothelial growth factor receptor 3 (UniProt accession P35916)
uniprot_id = 'P35916'

# Targets having a component with this accession.
res = new_client.target.filter(target_components__accession=uniprot_id)  # type: ignore

# Flatten the nested fields, save the table to CSV, then display it.
res_frame = pd.DataFrame(res)
data = ExpandedFLDF(res_frame)
data.to_csv("results/res_by_uniprot_id.csv", index=False)

data

Unnamed: 0,organism,pref_name,species_group_flag,target_chembl_id,target_type,tax_id,xref_id,xref_name,xref_src,accession,component_description,component_id,component_type,relationship,component_synonym,syn_type,xref_id_target_component_xrefs,xref_name_target_component_xrefs,xref_src_db_target_component_xrefs
0,Homo sapiens,Vascular endothelial growth factor receptor 3,False,CHEMBL1955,SINGLE PROTEIN,9606,[FLT4],[None],[Wikipedia],P35916,Vascular endothelial growth factor receptor 3,279,PROTEIN,SINGLE PROTEIN,"[2.7.10.1, FLT4, FLT-4, Fms-like tyrosine kina...","[EC_NUMBER, GENE_SYMBOL, UNIPROT, UNIPROT, UNI...","[FLT4, P35916, GO:0005576, GO:0005634, GO:0005...","[Lymphedema, hereditary I (Milory disease), No...","[CGD, ExpressionAtlas, GoComponent, GoComponen..."
1,Homo sapiens,Vascular endothelial growth factor receptor,False,CHEMBL2095227,PROTEIN FAMILY,9606,[],[],[],P17948,Vascular endothelial growth factor receptor 1,181,PROTEIN,GROUP MEMBER,"[2.7.10.1, FLT, FLT, FLT1, FLT-1, Fms-like tyr...","[EC_NUMBER, GENE_SYMBOL_OTHER, UNIPROT, GENE_S...","[P17948, GO:0005576, GO:0005615, GO:0005737, G...","[None, extracellular region, extracellular spa...","[ExpressionAtlas, GoComponent, GoComponent, Go..."
2,Homo sapiens,Vascular endothelial growth factor receptor 2 ...,False,CHEMBL2111409,SELECTIVITY GROUP,9606,[],[],[],P35916,Vascular endothelial growth factor receptor 3,279,PROTEIN,COMPARATIVE PROTEIN,"[2.7.10.1, FLT4, FLT-4, Fms-like tyrosine kina...","[EC_NUMBER, GENE_SYMBOL, UNIPROT, UNIPROT, UNI...","[FLT4, P35916, GO:0005576, GO:0005634, GO:0005...","[Lymphedema, hereditary I (Milory disease), No...","[CGD, ExpressionAtlas, GoComponent, GoComponen..."
3,Homo sapiens,Focal adhesion kinase 1/vascular endothelial g...,False,CHEMBL3301389,PROTEIN-PROTEIN INTERACTION,9606,[],[],[],P35916,Vascular endothelial growth factor receptor 3,279,PROTEIN,INTERACTING PROTEIN,"[2.7.10.1, FLT4, FLT-4, Fms-like tyrosine kina...","[EC_NUMBER, GENE_SYMBOL, UNIPROT, UNIPROT, UNI...","[FLT4, P35916, GO:0005576, GO:0005634, GO:0005...","[Lymphedema, hereditary I (Milory disease), No...","[CGD, ExpressionAtlas, GoComponent, GoComponen..."
