## Explore NCI-60 data

There are two screens:

- Primary screen
- Secondary screen

In [1]:
import pathlib
import sys

import pandas as pd

sys.path.append("../")
from utils import load_utils

In [2]:
# Load NCI60 data (primary screen)
# Both locations are passed to the loader exactly as written here.
top_dir = ".."
data_dir = "nci60/data"

# Two frames come back: the screening measurements and the treatment table
# (presumably the second one is returned because load_treatment_info=True;
# confirm in utils/load_utils).
nci60_df, nci60_trt_df = load_utils.load_nci60(
    top_dir=top_dir,
    data_dir=data_dir,
    load_treatment_info=True,
)

# Report (rows, columns), then preview the first three measurement rows
print(nci60_df.shape)
nci60_df.head(n=3)

(23485461, 18)


Unnamed: 0,RELEASE_DATE,EXPID,PREFIX,NSC,CONCENTRATION_UNIT,LOG_HI_CONCENTRATION,CONCENTRATION,PANEL_NUMBER,CELL_NUMBER,PANEL_NAME,CELL_NAME,PANEL_CODE,COUNT_GIPRCNT,AVERAGE_GIPRCNT,STDDEV_GIPRCNT,COUNT_PTC,AVERAGE_PTC,STDDEV_PTC
0,20210223,0001MD02,S,123127,M,-4.6021,-4.6021,1,29,Non-Small Cell Lung Cancer,HOP-92,LNS,1,-45.9064,0.0,1,22.4242,0.0
1,20210223,0001MD02,S,123127,M,-4.6021,-4.6021,12,5,CNS Cancer,SNB-75,CNS,1,-65.7534,0.0,1,14.5068,0.0
2,20210223,0001MD02,S,123127,M,-4.6021,-4.6021,4,1,Colon Cancer,HT29,COL,1,-68.2635,0.0,1,6.8123,0.0


In [3]:
# Size and first three rows of the treatment table loaded with the screen data
print(nci60_trt_df.shape)
nci60_trt_df.head(n=3)

(251887, 3)


Unnamed: 0,nsc_number,cpd_name,cpd_name_type
0,1,Tolylquinone,Chemical Name
1,1,p-Toluquinone,Chemical Name
2,1,Methylquinone,Chemical Name


In [4]:
# How many unique cell lines?
# Distinct-value count for every column; the CELL_NAME entry answers the question
print(nci60_df.nunique())

# The 20 cell lines with the most rows (value_counts sorts by descending count)
nci60_df["CELL_NAME"].value_counts().head(20)

RELEASE_DATE                 10
EXPID                      5118
PREFIX                        1
NSC                       57229
CONCENTRATION_UNIT            3
LOG_HI_CONCENTRATION        323
CONCENTRATION              1204
PANEL_NUMBER                 15
CELL_NUMBER                  40
PANEL_NAME                   15
CELL_NAME                   163
PANEL_CODE                   15
COUNT_GIPRCNT                 2
AVERAGE_GIPRCNT         1781296
STDDEV_GIPRCNT              526
COUNT_PTC                     3
AVERAGE_PTC             1144707
STDDEV_PTC                  813
dtype: int64


CELL_NAME
A549/ATCC    412043
OVCAR-8      410328
SW-620       410009
U251         409809
NCI-H23      407139
KM12         405924
SN12C        405864
HCT-15       405298
HCT-116      405071
UO-31        405039
SNB-19       404504
SF-268       404072
SK-MEL-28    403912
IGROV1       403438
OVCAR-5      403338
SF-295       403187
HT29         403169
COLO 205     402827
MOLT-4       402171
NCI-H460     399936
Name: count, dtype: int64

In [5]:
# How many unique compounds?
# Counts distinct NSC numbers in the treatment (name annotation) table,
# not in the screening measurements table
print(nci60_trt_df["nsc_number"].nunique())

67534


In [6]:
# Each row of the treatment table tags a compound name with a name type.
# Tally how many rows fall under each name type, most frequent first.
nci60_trt_df["cpd_name_type"].value_counts()

cpd_name_type
Chemical Name      196330
9th C.I.            28024
8th C.I.            21247
VAN                  2001
USAN                 1811
German Name           635
Czech Name            503
French Name           325
Italian Name          275
Dutch Name            231
DOT Name              163
Brand Name            118
Polish Name           104
Russian Name           41
7th C.I.               23
INN:BAN                18
6th C.I.               13
Spanish Name           11
Roumanian Name          4
Hindustan Name          4
Swedish Name            2
Portuguese Name         1
Belgian Name            1
Persian Name            1
Japanese Name           1
Name: count, dtype: int64

In [7]:
# How many doses?
# NOTE(review): the CONCENTRATION values are negative (e.g. -6.0), so they look
# like log10 of the concentration rather than -log10 -- confirm against the
# NCI-60 data dictionary (CONCENTRATION_UNIT takes more than one value).
nci60_df["CONCENTRATION"].value_counts()

CONCENTRATION
-6.0000     3894859
-7.0000     3893495
-8.0000     3873435
-5.0000     3862337
-4.0000     3728824
             ...   
-12.4881          4
-1.1487           4
-0.1487           4
-2.1487           4
-10.4881          3
Name: count, Length: 1204, dtype: int64