# 00_download.ipynb

This notebook fetches the **core GDSC data files** needed for the capstone project.

It will create the folder structure `data/raw/` relative to the repository root and download:

- Drug‐response curves (IC50 / AUC)
- Compound annotations (targets, SMILES)
- Cell‑line metadata (tissue, sample provenance)

❗ **Large omics matrices** (gene expression, DNA‑methylation, somatic variants) are big downloads — the methylation
matrix alone is a multi‑GB file.  Sections 1.2 and 1.3 fetch them from NCBI GEO and the Sanger FTP mirror; any file
that is already present locally is skipped, so re-running the notebook does not download it again.


# 0: Load modules, define functions, and set up script configuration variables

In [1]:
import os

from pathlib import Path
import subprocess, pathlib, textwrap
from pathlib import Path
import urllib.request
import ssl
from ftplib import FTP

import pandas as pd
import polars as pl, pyarrow # required for polars file reading

import warnings

warnings.filterwarnings('ignore')

### Define helper functions

In [None]:
def download(url: str, dest: Path):
    """Download ``url`` to ``dest`` unless ``dest`` already exists.

    The payload is first written to a sibling ``<name>.part`` file and only
    renamed to ``dest`` once the transfer has finished. This way an interrupted
    download never leaves a truncated file under the final name, which the
    "already exists" check would otherwise skip on every later run.

    Parameters
    ----------
    url : str
        Source URL, passed unchanged to ``urllib.request.urlretrieve``.
    dest : Path
        Target file path (a plain string is accepted too). Missing parent
        directories are created.

    Raises
    ------
    Whatever ``urllib.request.urlretrieve`` raises; in that case neither
    ``dest`` nor the ``.part`` file is left behind.
    """
    dest = Path(dest)
    if dest.exists():
        print(f'{dest.name} already exists, skipping.')
        return
    print(f'Downloading {dest.name} …')
    dest.parent.mkdir(parents=True, exist_ok=True)

    partial = dest.with_name(dest.name + '.part')
    try:
        urllib.request.urlretrieve(url, partial)
        # Rename within the same directory, so the final name appears only when complete.
        partial.replace(dest)
    finally:
        # After a successful rename there is nothing to remove; after a failure
        # this discards the incomplete payload.
        if partial.exists():
            partial.unlink()
    print('✔ Done')

def read_parque_to_pd_df(file_path):
    """Read a parquet file with polars and return it as a pandas DataFrame.

    The file is loaded with ``pl.read_parquet`` and then converted with
    ``to_pandas()`` (the original author chose polars for read speed).

    Parameters
    ----------
    file_path : str or Path
        Location of the parquet file.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` (after printing a hint) when the file
        does not exist. Callers must handle the ``None`` case.
    """
    # Accept plain strings as well as Path objects.
    file_path = Path(file_path)
    if not file_path.exists():
        print(f"File not found: {file_path}\nPlease run the methylation ingestion and transpose step first.")
        return None

    dl_out = pl.read_parquet(file_path)
    print("Loaded dataframe shape:",dl_out.shape)
    return dl_out.to_pandas()
    


## Setup working directories

In [None]:
# NOTE(review): this swaps urllib's default HTTPS context factory for the ssl
# module's *unverified* one, so certificate checks are skipped for HTTPS
# requests made after this point. It is not a TLS-version setting.
ssl._create_default_https_context = ssl._create_unverified_context

# Treat the current working directory as the repository root, unless the
# notebook was started from inside a `notebooks` folder, in which case the
# root is one level up.
cwd = Path.cwd().resolve()
ROOT = cwd.parent if cwd.name == 'notebooks' else cwd

# All raw downloads live under <ROOT>/data/raw (created on first run).
DATA_RAW = ROOT.joinpath('data', 'raw')
DATA_RAW.mkdir(parents=True, exist_ok=True)

print(f'Download directory: {DATA_RAW}')

Download directory: /Users/brianr/repos/UCBAIML-GDSC/data/raw


## Setup configs for the script

In [None]:
# Output location of the methylation table once COSMIC_ID has been attached
# (written by the final save step of this notebook).
meth_df_cosmicid_path = DATA_RAW / "methylation" / "meth_df_cosmic_ids.parquet"

# Switch for the expensive step: when True, the raw GEO methylation matrix is
# parsed and transposed into a data frame that does not yet carry COSMIC_ID.
run_meth_ingest_and_transpose = True

# 1: Download Files

## 1.1: Download GDSC2 (ic50), cell line details, drug data files


In [5]:
# (local file name, source URL) pairs for the core GDSC release 8.5 tables.
# NOTE(review): the compounds entry is stored under an `.xlsx` name although
# its URL points at a `.csv` file — confirm how downstream readers open it
# before renaming.
files = dict([
    ('GDSC2_fitted_dose_response_27Oct23.xlsx',
     'https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.5/GDSC2_fitted_dose_response_27Oct23.xlsx'),
    ('Compounds_Annotation.xlsx',
     'https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.5/screened_compounds_rel_8.5.csv'),
    ('Cell_Lines_Details.xlsx',
     'https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.5/Cell_Lines_Details.xlsx'),
])

files

{'GDSC2_fitted_dose_response_27Oct23.xlsx': 'https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.5/GDSC2_fitted_dose_response_27Oct23.xlsx',
 'Compounds_Annotation.xlsx': 'https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.5/screened_compounds_rel_8.5.csv',
 'Cell_Lines_Details.xlsx': 'https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.5/Cell_Lines_Details.xlsx'}

In [None]:
# Fetch every core table into DATA_RAW; download() skips files that already exist.
for fname in files:
    download(files[fname], DATA_RAW / fname)

GDSC2_fitted_dose_response_27Oct23.xlsx already exists, skipping.
Compounds_Annotation.xlsx already exists, skipping.
Cell_Lines_Details.xlsx already exists, skipping.


## 1.2: Downloading (GSE) large omics matrices


In [7]:
## Download methylation data
# Store the file under DATA_RAW (not a cwd-relative path), so the location
# matches where the ingestion step reads it from, even when the notebook is
# started from the `notebooks/` folder.
dest = DATA_RAW / "methylation"
dest.mkdir(parents=True, exist_ok=True)

meth_url = "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE68nnn/GSE68379/suppl/GSE68379_Matrix.processed.txt.gz"
local = dest / 'GSE68379_Matrix.processed.txt.gz'

# wget writes into a staging folder first. The file is moved to its final name
# only after wget exits successfully, so an interrupted transfer is never
# mistaken for a complete file. `-c` lets a re-run resume the staged partial.
staging = dest / ".partial"

# Argument list is built directly (no string splitting), so paths that
# contain spaces are passed to wget intact.
cmd = ["wget", "-c", meth_url, "-P", str(staging)]

if not local.exists():
    staging.mkdir(parents=True, exist_ok=True)
    subprocess.run(cmd, check=True)  # raises CalledProcessError on a non-zero exit
    (staging / local.name).replace(local)
    if not any(staging.iterdir()):
        staging.rmdir()
else:
    print('File already exists!')

File already exists!


## 1.3: Download (FTP) Variant and Gene Expression data sets

In [8]:
import time

# FTP download of variant and gene expression data sets
#-- NOTE - if this errors, it is likely due to a poor connection, so try again.

TARGETS = [
    # (remote path, local folder relative to the repository root)
    ('/pub/project/cancerrxgene/releases/release-7.0/sanger1018_brainarray_ensemblgene_rma.txt.gz',
     'data/raw/gene_expression'),
    ('/pub/project/cancerrxgene/releases/release-7.0/WES_variants.xlsx', 
     'data/raw/variants'),
]

# Maximum number of retries for FTP connection (sometimes the Sanger site is down)
max_retries = 5

for attempt in range(1, max_retries + 1):
    try:
        with FTP('ftp.sanger.ac.uk') as ftp:
            ftp.login()
            for remote, local_dir in TARGETS:
                print(local_dir)
                # Anchor at ROOT so the files land next to the other raw data,
                # regardless of the directory the kernel was started in.
                local = ROOT / local_dir / Path(remote).name
                local.parent.mkdir(parents=True, exist_ok=True)
                if local.exists():
                    print(f"{local.name} already present")
                    continue
                # Write to a `.part` file and rename only after the transfer
                # completes. Otherwise a dropped connection leaves a truncated
                # file that the next attempt reports as "already present".
                partial = local.with_name(local.name + '.part')
                try:
                    with open(partial, 'wb') as fh:
                        print(f"⬇ {remote}")
                        ftp.retrbinary(f'RETR {remote}', fh.write)
                    partial.replace(local)
                finally:
                    if partial.exists():
                        partial.unlink()
        break  # Success, exit loop
    except Exception as e:
        print(f"Sanger site not accessible, try again! Attempt {attempt} of {max_retries}")
        if attempt < max_retries:
            time.sleep(3)
        else:
            # Best effort: report and carry on; the rest of this notebook does
            # not read these two files.
            print(f"Failed after {max_retries} attempts:", e)

data/raw/gene_expression
sanger1018_brainarray_ensemblgene_rma.txt.gz already present
data/raw/variants
WES_variants.xlsx already present


# 2.0: Clean up the GEO Methylation data and add COSMIC_ID for joining data sets
- Remove the `_AVG.Beta` suffix from each cell-line column name and drop the `_Detection.PVal` columns
- Transpose the data frame so that each row is a cell line and each column is a cg probe ID


For the `GSE68379_Matrix.processed.txt.gz` file, the `meth_raw_df` data frame read from it has shape (485512, 2057), with column names in the following format:

[0] = `Row.names`, containing the probe ID (`cg########`)

[1:1028] = `[cell-line id]_AVG.Beta`, containing numeric data

[1029:2056] = `[cell-line id]_Detection.PVal`, containing numeric data

Need to transform the data frame to get each cell line and the associated data on a row.

## 2.1: Read methylation data in and build meth_df dataframe with cell id and Average beta data for each probe id

1. Read data in using Chunks

In [9]:
# The file size is approximately 4 GB. A chunk size of 50,000 rows is a reasonable starting point
# to avoid memory issues while processing the file in chunks.
# THIS WILL TAKE A FEW MINUTES

# Cache of the transposed beta matrix (rows = cell lines), written by the next step.
meth_df_processed_path = DATA_RAW / "methylation/meth_df_cellID_only.parquet"

if run_meth_ingest_and_transpose:
    chunk_size = 50000
    print(f"Chosen chunk size: {chunk_size}")

    # Column suffix that marks the average-beta value of a cell line.
    beta_suffix = '_AVG.Beta'

    # Open the gzipped, tab-separated matrix as a lazy chunk iterator;
    # nothing is actually parsed until the loop below pulls chunks from it.
    methylation_chunks = pd.read_csv(
        DATA_RAW / "methylation/GSE68379_Matrix.processed.txt.gz",
        sep="\t",
        chunksize=chunk_size,
        low_memory=False
    )
    print('File read.')

    print('Process Chunks')

    processed_chunks = []  # one transposed (cell line × probe) frame per chunk

    for chunk in methylation_chunks:
        # Keep only the beta columns; every other column except 'Row.names' is dropped.
        beta_cols = [col for col in chunk.columns if col.endswith(beta_suffix)]

        # Extract 'Row.names' and beta columns
        beta_chunk = chunk[['Row.names'] + beta_cols].copy()

        # Strip the suffix so each column is named after its cell line only.
        beta_chunk.columns = ['Row.names'] + [col[:-len(beta_suffix)] for col in beta_cols]

        # Probe IDs become the index
        beta_chunk.set_index('Row.names', inplace=True)

        # Normalise cell-line names (space/dot → dash, upper-case) so that
        # replicate columns of the same cell line end up with the same label.
        beta_chunk.columns = (
            beta_chunk.columns.str.replace(r"[ .]", "-", regex=True)
            .str.upper()
            .str.strip()
        )

        # Transpose (rows = cell lines, columns = probes), then average rows
        # that share a cell-line label. Same result as the former
        # `groupby(columns, axis=1).mean().T`, without the `axis=1` grouping
        # that pandas deprecated in 2.1.
        beta_chunk = beta_chunk.T.groupby(level=0).mean()

        processed_chunks.append(beta_chunk)

    print(f"Processed {len(processed_chunks)} chunks.")
    if processed_chunks:
        print('Chunking Done!\n')

Chosen chunk size: 50000
File read.
Process Chunks
Processed 10 chunks.
Chunking Done!



2. Build meth_df from chunks

In [10]:
## Build methylation data frame meth_df from chunks

if run_meth_ingest_and_transpose:
    # Fail with a clear message instead of an IndexError on processed_chunks[0].
    if not processed_chunks:
        raise ValueError("No methylation chunks were processed; cannot build meth_df.")

    # Every chunk must carry the same cell lines in the same order as the first one.
    template_index = processed_chunks[0].index

    dfs = []
    for i, df in enumerate(processed_chunks):
        if not df.index.equals(template_index):
            # ① same cells, different order → align to the template order
            if set(df.index) == set(template_index):
                df = df.reindex(template_index)
            # ② cells missing or extra → fail fast
            else:
                raise ValueError(f"Chunk {i} has mismatching cell lines.")
        dfs.append(df.astype("float32"))     # cast early to save RAM

    # Verify that probe columns are unique across chunks.
    # Flattened in a single pass (the former `sum(lists, [])` re-copied the
    # growing list once per chunk).
    all_cols = [col for d in dfs for col in d.columns]
    col_series = pd.Series(all_cols)
    duplicates = col_series.duplicated()

    if duplicates.any():
        dup_names = col_series[duplicates].unique()
        raise ValueError(f"Probe IDs repeated across chunks: {dup_names[:5]}…")

    # Column-wise concatenation (rows = cells, columns = probes)
    meth_df = pd.concat(
        dfs,
        axis=1,               # ← side-by-side
        join="inner",         # identical index already guaranteed
        verify_integrity=True # double-checks no duplicate columns slipped in
    )

    print("Final β-matrix shape:", meth_df.shape)   # (n_cells × n_probes)

    # Save meth_df to a compressed parquet file
    meth_df.to_parquet(meth_df_processed_path, compression="gzip")
    print(f"Saved meth_df_processed to {meth_df_processed_path}")



Final β-matrix shape: (1028, 485512)
Saved meth_df_processed to /Users/brianr/repos/UCBAIML-GDSC/data/raw/methylation/meth_df_cellID_only.parquet


In [None]:
# Reuse the matrix built earlier in this session when there is one; otherwise
# load the cached parquet written by the build step.
# A separate local flag is used so the user-facing config switch
# `run_meth_ingest_and_transpose` is not overwritten here.
# The `is not None` check covers a re-run after an earlier failed load, where
# `meth_df` was left as None.
meth_df_in_memory = (
    'meth_df' in globals()
    and meth_df is not None
    and not meth_df.empty
)

if meth_df_in_memory:
    print("meth_df already exists, skipping ingestion and transposition.")
else:
    # Read the processed file back into a DataFrame
    print("Read meth_df parquet file")
    meth_df = read_parque_to_pd_df(meth_df_processed_path)
    if meth_df is None:
        # Stop here instead of letting the next cell crash on a None object.
        raise FileNotFoundError(
            f"{meth_df_processed_path} is missing; set run_meth_ingest_and_transpose = True "
            "and re-run the ingestion cells."
        )
  

meth_df already exists, skipping ingestion and transposition.


## 2.3: Add COSMIC_ID to meth_df


In [None]:
# Add COSMIC_ID to meth_df

# 1 ▸  build a lookup   {CELL_LINE_NAME → COSMIC_ID}
# Read from DATA_RAW (where the download step stored it), not from a
# cwd-relative path. The file comes from the GDSC release 8.5 URL.
cl_meta = pd.read_excel(DATA_RAW / "Cell_Lines_Details.xlsx")
name_to_cosmic = (
    cl_meta.assign(NAME=lambda d: d["Sample Name"].str.upper())
           .set_index("NAME")["COSMIC identifier"]
           .to_dict()
)

# Ensure meth_df has a 'cell_id' column; if not, use the index as cell_id
if 'cell_id' not in meth_df.columns:
    meth_df['cell_id'] = meth_df.index

# 2 ▸  standardise the names for matching. Kept as a standalone Series, so no
# helper column has to be inserted into and dropped from the very wide frame.
# NOTE(review): the ingestion step also replaced spaces/dots with dashes in
# these names, whereas the lookup keys are only upper-cased — confirm that
# this cannot cause misses.
cell_id_upper = meth_df['cell_id'].str.upper().str.strip()

# 3 ▸  map to COSMIC IDs and fail with the offending names, rather than with
# a bare NaN-to-int cast error, when some cell lines have no match.
cosmic_ids = cell_id_upper.map(name_to_cosmic)
unmatched = cell_id_upper[cosmic_ids.isna()]
if not unmatched.empty:
    raise ValueError(
        f"{len(unmatched)} cell line(s) have no COSMIC identifier in Cell_Lines_Details.xlsx, "
        f"e.g. {unmatched.unique()[:5].tolist()}"
    )

meth_df['COSMIC_ID'] = cosmic_ids.astype(int)

## 2.4 Save meth_df for use in EDA

In [None]:
# Put the identifier columns ('COSMIC_ID', then 'cell_id') in front of the
# probe columns; identifiers that are absent are simply left out.
cols = list(meth_df.columns)
first_cols = []
for key in ('COSMIC_ID', 'cell_id'):
    if key in cols:
        first_cols.append(key)
other_cols = [name for name in cols if name not in first_cols]
meth_df = meth_df[first_cols + other_cols]

# Persist the reordered table as a gzip-compressed parquet file.
meth_df.to_parquet(meth_df_cosmicid_path, compression="gzip")
print(f"Saved meth_df_processed to {meth_df_cosmicid_path}")

Saved meth_df_processed to /Users/brianr/repos/UCBAIML-GDSC/data/raw/methylation/meth_df_cosmic_ids.parquet
