In [2]:
import numpy as np
import pandas as pd
import os
from tdc.multi_pred import DrugRes

In [3]:
# Load the GDSC dataset from TDC using a valid name
data = DrugRes(name = 'gdsc1')

# Get the entire dataset as a pandas dataframe
gdsc1_df = data.get_data()

# Display the dataframe to see its structure
print("Dataset loaded successfully!")
gdsc1_df.head()

Found local copy...
Loading...
Done!


Dataset loaded successfully!


Unnamed: 0,Drug_ID,Drug,Cell Line_ID,Cell Line,Y
0,Erlotinib,COCCOC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=CC(=C3)C#C...,MC-CAR,"[3.23827250519154, 2.98225419469807, 10.235490...",2.395685
1,Erlotinib,COCCOC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=CC(=C3)C#C...,ES3,"[8.690197905033282, 3.0914731119366, 9.9924871...",3.140923
2,Erlotinib,COCCOC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=CC(=C3)C#C...,ES5,"[8.233101127037282, 2.82468731112752, 10.01588...",3.968757
3,Erlotinib,COCCOC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=CC(=C3)C#C...,ES7,"[8.33346622426757, 3.9667571228514302, 9.79399...",2.692768
4,Erlotinib,COCCOC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=CC(=C3)C#C...,EW-11,"[8.39134072442845, 2.9683601858810698, 10.2606...",2.478678


In [4]:
# Method 1: Check what attributes are available
print("Available attributes:")
print([attr for attr in dir(data) if not attr.startswith('_')])

# Method 2: Try to get gene names from the label_name attribute
try:
    gene_labels = data.label_name
    print(f"\nGene labels from label_name: {gene_labels[:10]}")
except:
    print("\nlabel_name not available")

# Method 3: Get gene names from the auxiliary info
try:
    gene_info = data.get_gene_names()
    print(f"\nGene names: {gene_info[:10]}")
except:
    print("\nget_gene_names() not available")

# Method 4: Check the shape and see if we can infer gene count
# Extract one cell line's gene expression to see the structure
sample_expression = gdsc1_df['Y'].iloc[0]
if isinstance(sample_expression, (list, np.ndarray)):
    print(f"\nNumber of features in each cell line: {len(sample_expression)}")
    print(f"First 5 values: {sample_expression[:5]}")

Available attributes:
['aux_column', 'aux_column_val', 'balanced', 'binarize', 'convert_from_log', 'convert_to_log', 'entity1', 'entity1_idx', 'entity1_name', 'entity2', 'entity2_idx', 'entity2_name', 'file_format', 'get_data', 'get_gene_symbols', 'get_label_meaning', 'get_split', 'label_distribution', 'label_name', 'log_flag', 'name', 'neg_sample', 'path', 'print_stats', 'raw_y', 'to_graph', 'two_types', 'y']

label_name not available

get_gene_names() not available


In [5]:
# Get the gene symbols
gene_labels = data.get_gene_symbols()
print(f"Total number of genes: {len(gene_labels)}")
print(f"\nFirst 20 gene symbols:")
print(gene_labels[:20])

Found local copy...
Loading...


Total number of genes: 17737

First 20 gene symbols:
['TSPAN6' 'TNMD' 'DPM1' 'SCYL3' 'C1orf112' 'FGR' 'CFH' 'FUCA2' 'GCLC'
 'NFYA' 'STPG1' 'NIPAL3' 'LAS1L' 'ENPP4' 'SEMA3F' 'CFTR' 'ANKIB1' 'KRIT1'
 'RAD52' 'BAD']


In [6]:
type(gene_labels)

numpy.ndarray

In [7]:
gene_labels.shape

(17737,)

In [9]:
# Check the dataframe columns to find where gene expression is stored
print(f"\nDataFrame columns: {gdsc1_df.columns.tolist()}")
print(f"\nDataFrame shape: {gdsc1_df.shape}")
print(f"\nFirst row:")
print(gdsc1_df.iloc[0])

# The gene expression should be in the 'Cell Line' column based on your screenshot
sample_expression = gdsc1_df['Cell Line'].iloc[0]
print(f"\nType of Cell Line data: {type(sample_expression)}")

if isinstance(sample_expression, (list, np.ndarray)):
    print(f"Number of features in each cell line: {len(sample_expression)}")
    print(f"Number of gene labels: {len(gene_labels)}")
    print(f"Do dimensions match? {len(sample_expression) == len(gene_labels)}")
    
    # Show example
    print(f"\nExample - First cell line ({gdsc1_df['Cell Line_ID'].iloc[0]}):")
    print(f"Drug: {gdsc1_df['Drug_ID'].iloc[0]}")
    print(f"IC50 (target): {gdsc1_df['Y'].iloc[0]}")
    
    # Create a mapping of gene to expression value
    gene_expression_dict = dict(zip(gene_labels, sample_expression))
    print(f"\nFirst 5 gene expression values with labels:")
    for i, (gene, value) in enumerate(list(gene_expression_dict.items())[:5]):
        print(f"  {gene}: {value:.4f}")
else:
    print(f"Sample expression value: {sample_expression}")
    print("\nLet me check if it's stored differently...")



DataFrame columns: ['Drug_ID', 'Drug', 'Cell Line_ID', 'Cell Line', 'Y']

DataFrame shape: (177310, 5)

First row:
Drug_ID                                                 Erlotinib
Drug            COCCOC1=C(C=C2C(=C1)C(=NC=N2)NC3=CC=CC(=C3)C#C...
Cell Line_ID                                               MC-CAR
Cell Line       [3.23827250519154, 2.98225419469807, 10.235490...
Y                                                        2.395685
Name: 0, dtype: object

Type of Cell Line data: <class 'numpy.ndarray'>
Number of features in each cell line: 17737
Number of gene labels: 17737
Do dimensions match? True

Example - First cell line (MC-CAR):
Drug: Erlotinib
IC50 (target): 2.395685

First 5 gene expression values with labels:
  TSPAN6: 3.2383
  TNMD: 2.9823
  DPM1: 10.2355
  SCYL3: 4.8561
  C1orf112: 4.0789


In [10]:
type(gdsc1_df)

pandas.core.frame.DataFrame

In [11]:
gdsc1_df.columns.to_list()

['Drug_ID', 'Drug', 'Cell Line_ID', 'Cell Line', 'Y']

In [None]:
gene_expression_df = pd.DataFrame(
    gdsc1_df['Cell Line'].to_list(), 
    columns=gene_labels,
    dtype=np.float32  # Use 32-bit floats instead of 64-bit
)

In [None]:
gene_expression_df.head()

In [12]:
gdsc1_df.shape

(177310, 5)

In [None]:
# Extract the metadata columns from the original dataframe
gdsc1_meta_df = gdsc1_df[['Drug_ID', 'Cell Line_ID', 'Y']]

# Concatenate the metadata with the new gene expression dataframe
gdsc1_processed_df = pd.concat([gdsc1_meta_df, gene_expression_df], axis=1)

print(f"Shape of the gdsc1_processed_df dataframe: {gdsc1_processed_df.shape}")
gdsc1_processed_df.head()