In [4]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

# Search ChEMBL for targets matching the query string "FABP4".
target = new_client.target
target_query = target.search('FABP4')
targets = pd.DataFrame.from_dict(target_query)
if targets.empty:
    # Without this check the column selection below fails with an opaque KeyError.
    raise ValueError("ChEMBL target search for 'FABP4' returned no results")

# Display available targets
print("Available FABP4 targets:")
print(targets[['target_chembl_id', 'target_type', 'organism', 'pref_name']])

# Select the human target explicitly instead of trusting that the first search
# hit is human: the search also returns rat and mouse orthologs, and the
# result order is not guaranteed. Among human hits, the search ranking is kept.
human_targets = targets[targets['organism'] == 'Homo sapiens']
if human_targets.empty:
    raise ValueError("No Homo sapiens target found in the FABP4 search results")
selected_target = human_targets['target_chembl_id'].iloc[0]
print(f"\nSelected target ID: {selected_target}")

# Query the activity endpoint for the selected target, restricted to records
# whose standard_type is "IC50", and load the result into a DataFrame.
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target)
res = res.filter(standard_type="IC50")
df = pd.DataFrame.from_dict(res)

# Keep an untouched copy of the download on disk before any cleaning.
raw_csv_path = 'FABP4_01_bioactivity_data_raw.csv'
df.to_csv(raw_csv_path, index=False)
raw_entry_count = len(df)
print(f"\nRaw data saved. Number of entries: {raw_entry_count}")

# Handle missing data: keep only rows that have both a measured value and a
# structure. A single dropna derives the mask from the frame being filtered;
# indexing an already-filtered frame with a mask built from the unfiltered
# `df` makes pandas emit a "Boolean Series key will be reindexed" UserWarning.
df2 = df.dropna(subset=['standard_value', 'canonical_smiles'])
print(f"Entries after removing missing values: {len(df2)}")

# Remove duplicates: one row per unique SMILES string (the first occurrence
# is the one that is kept).
df2_nr = df2.drop_duplicates(subset=['canonical_smiles'])
print(f"Entries after removing duplicates: {len(df2_nr)}")

# Keep only the identifier, structure and measured-value columns.
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df3 = df2_nr.loc[:, selection]

# Write the preprocessed table to disk (index column omitted).
preprocessed_csv_path = 'FABP4_02_bioactivity_data_preprocessed.csv'
df3.to_csv(preprocessed_csv_path, index=False)

# Reload the preprocessed table from disk; the classes are derived from it.
df4 = pd.read_csv('FABP4_02_bioactivity_data_preprocessed.csv')


def _classify_standard_value(value):
    """Return the activity label for one numeric standard_value.

    Values >= 10000 are "inactive", values <= 1000 are "active", and
    everything else is "intermediate".
    NOTE(review): the thresholds presumably assume values in nM -- the
    units column is not checked here; confirm against the raw data.
    """
    if value >= 10000:
        return "inactive"
    if value <= 1000:
        return "active"
    return "intermediate"


# One label per row, in row order.
bioactivity_threshold = [
    _classify_standard_value(float(raw_value))
    for raw_value in df4.standard_value
]

# Attach the labels as a new 'class' column next to the original columns.
bioactivity_class = pd.Series(bioactivity_threshold, name='class')
df5 = pd.concat([df4, bioactivity_class], axis=1)

# Report how many compounds fall into each class.
class_counts = df5['class'].value_counts()
print("\nDistribution of bioactivity classes:")
print(class_counts)

# Write the labelled table to disk (index column omitted).
df5.to_csv('FABP4_03_bioactivity_data_curated.csv', index=False)

# Create zip file containing all CSVs
! zip FABP4_bioactivity.zip FABP4_*.csv

print("\nProcess completed! Files created:")
print("1. FABP4_01_bioactivity_data_raw.csv - Raw data from ChEMBL")
print("2. FABP4_02_bioactivity_data_preprocessed.csv - Cleaned data")
print("3. FABP4_03_bioactivity_data_curated.csv - Final data with activity classes")
print("4. FABP4_bioactivity.zip - All files zipped")

Available FABP4 targets:
  target_chembl_id     target_type           organism  \
0       CHEMBL2083  SINGLE PROTEIN       Homo sapiens   
1    CHEMBL2021755  SINGLE PROTEIN  Rattus norvegicus   
2    CHEMBL1075118  SINGLE PROTEIN       Mus musculus   

                               pref_name  
0   Fatty acid binding protein adipocyte  
1  Fatty acid-binding protein, adipocyte  
2  Fatty acid-binding protein, adipocyte  

Selected target ID: CHEMBL2083

Raw data saved. Number of entries: 364
Entries after removing missing values: 342
Entries after removing duplicates: 268

Distribution of bioactivity classes:
class
intermediate    105
active           96
inactive         67
Name: count, dtype: int64
  adding: FABP4_01_bioactivity_data_raw.csv (deflated 93%)
  adding: FABP4_02_bioactivity_data_preprocessed.csv (deflated 80%)
  adding: FABP4_03_bioactivity_data_curated.csv (deflated 81%)

Process completed! Files created:
1. FABP4_01_bioactivity_data_raw.csv - Raw data from ChEMBL
2. FA

  df2 = df2[df.canonical_smiles.notna()]
