## Explore NCI-60 data

There are two screens:

- Primary screen
- Secondary screen

In [1]:
import pathlib
import sys

import pandas as pd

sys.path.append("../")
from utils import load_utils

In [2]:
# Read the NCI-60 primary screen and, because load_treatment_info=True,
# the accompanying treatment (compound name) table as well.
top_dir = ".."
data_dir = "nci60/data"

nci60_frames = load_utils.load_nci60(
    top_dir=top_dir,
    data_dir=data_dir,
    load_treatment_info=True,
)
nci60_df, nci60_trt_df = nci60_frames

# Show the (rows, columns) of the screen table, then preview its first rows
print(nci60_df.shape)
nci60_df.head(n=3)

(23410399, 18)


Unnamed: 0,RELEASE_DATE,EXPID,PREFIX,NSC,CONCENTRATION_UNIT,LOG_HI_CONCENTRATION,CONCENTRATION,PANEL_NUMBER,CELL_NUMBER,PANEL_NAME,CELL_NAME,PANEL_CODE,COUNT_GIPRCNT,AVERAGE_GIPRCNT,STDDEV_GIPRCNT,COUNT_PTC,AVERAGE_PTC,STDDEV_PTC
0,20210223,0001MD02,S,123127,M,-4.6021,-4.6021,10,8,Melanoma,SK-MEL-28,MEL,1,-92.4658,0.0,1,2.8109,0.0
1,20210223,0001MD02,S,123127,M,-4.6021,-4.6021,4,10,Colon Cancer,COLO 205,COL,1,-94.3548,0.0,1,1.634,0.0
2,20210223,0001MD02,S,123127,M,-4.6021,-4.6021,4,15,Colon Cancer,HCT-15,COL,1,-25.8621,0.0,1,12.6347,0.0


In [3]:
# Size and first rows of the treatment table (one row per NSC number / compound name pair)
print(nci60_trt_df.shape)
nci60_trt_df.head(n=3)

(251887, 3)


Unnamed: 0,nsc_number,cpd_name,cpd_name_type
0,1,Tolylquinone,Chemical Name
1,1,p-Toluquinone,Chemical Name
2,1,Methylquinone,Chemical Name


In [4]:
# Distinct-value count for every column; the CELL_NAME entry is the
# number of unique cell lines in the screen.
unique_per_column = nci60_df.nunique()
print(unique_per_column)

# The 20 cell lines that appear in the most rows
nci60_df["CELL_NAME"].value_counts().head(20)

RELEASE_DATE                  9
EXPID                      5108
PREFIX                        1
NSC                       57053
CONCENTRATION_UNIT            3
LOG_HI_CONCENTRATION        322
CONCENTRATION              1203
PANEL_NUMBER                 15
CELL_NUMBER                  40
PANEL_NAME                   15
CELL_NAME                   163
PANEL_CODE                   15
COUNT_GIPRCNT                 2
AVERAGE_GIPRCNT         1777446
STDDEV_GIPRCNT              526
COUNT_PTC                     3
AVERAGE_PTC             1143744
STDDEV_PTC                  813
dtype: int64


A549/ATCC    410783
OVCAR-8      409068
SW-620       408749
U251         408549
NCI-H23      405879
KM12         404664
SN12C        404609
HCT-15       404038
HCT-116      403811
UO-31        403779
SNB-19       403244
SF-268       402812
SK-MEL-28    402652
IGROV1       402178
OVCAR-5      402078
SF-295       401927
HT29         401909
COLO 205     401567
MOLT-4       400931
NCI-H460     398676
Name: CELL_NAME, dtype: int64

In [5]:
# Number of distinct NSC identifiers in the treatment annotation table
# (this is counted on nci60_trt_df, not on the screen table nci60_df)
n_annotated_compounds = nci60_trt_df["nsc_number"].nunique()
print(n_annotated_compounds)

67534


In [6]:
# Each compound name is tagged with a name type (e.g. "Chemical Name", "USAN").
# Tally how many annotation rows fall under each name type.
nci60_trt_df["cpd_name_type"].value_counts()

Chemical Name      196330
9th C.I.            28024
8th C.I.            21247
VAN                  2001
USAN                 1811
German Name           635
Czech Name            503
French Name           325
Italian Name          275
Dutch Name            231
DOT Name              163
Brand Name            118
Polish Name           104
Russian Name           41
7th C.I.               23
INN:BAN                18
6th C.I.               13
Spanish Name           11
Roumanian Name          4
Hindustan Name          4
Swedish Name            2
Portuguese Name         1
Belgian Name            1
Persian Name            1
Japanese Name           1
Name: cpd_name_type, dtype: int64

In [7]:
# How many distinct doses are there, and how many rows were measured at each?
# NOTE(review): the values are negative (e.g. -6.0, -7.0), which looks like
# log10 of the concentration rather than -log10 — confirm against the
# NCI-60 data documentation and the CONCENTRATION_UNIT column.
nci60_df.CONCENTRATION.value_counts()

-6.0000     3882236
-7.0000     3880872
-8.0000     3860813
-5.0000     3849714
-4.0000     3716201
             ...   
-3.1487           4
-0.1487           4
-2.1487           4
-11.4881          4
-10.4881          3
Name: CONCENTRATION, Length: 1203, dtype: int64