In [1]:
# Renamed GSM6668947_P18_T.enes.tsv.gz to GSM6668947_P18_T.genes.tsv.gz
# Renamed GSM6668932_P5_TN.features.tsv.gz to GSM6668932_P5_TN.genes.tsv.gz

In [2]:
import scanpy as sc
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
from copy import deepcopy
import os

# Files and prefixes

In [3]:
base_path = "../../original_datasets/ICI/GSE216329_RAW"

In [4]:
files = os.listdir(base_path)
files = [x for x in files if '.gz' in x]
len(files)

144

In [5]:
prefixes = np.unique([x.split('.')[0] for x in files]).tolist()
len(prefixes)

48

# Construct adata per file

In [6]:
# Build a mapping from gene symbol -> Ensembl ID using the files that contain both.
# (Needed because some samples provide only symbols, while others provide only Ensembl IDs.)
_sym_to_ens = {}
_ens_to_sym = {}

for prefix in prefixes:
    gene_file = f'{base_path}/{prefix}.genes.tsv.gz'
    gr = pd.read_csv(gene_file, sep='\t', header=None)
    if gr.shape[1] == 3:
        gene_id = gr.iloc[:, 0].astype(str)
        gene_sym = gr.iloc[:, 1].astype(str)
        for e, s in zip(gene_id, gene_sym):
            if e.startswith('ENSG') and (not s.startswith('ENSG')):
                _sym_to_ens.setdefault(s, set()).add(e)
                _ens_to_sym.setdefault(e, set()).add(s)

sym2ens = {s: next(iter(v)) for s, v in _sym_to_ens.items() if len(v) == 1}
ens2sym = {e: next(iter(v)) for e, v in _ens_to_sym.items() if len(v) == 1}

adatas = []

for prefix in tqdm(prefixes):

    # File names
    gene_file = f'{base_path}/{prefix}.genes.tsv.gz'
    barcode_file = f'{base_path}/{prefix}.barcodes.tsv.gz'
    mtx_file = f'{base_path}/{prefix}.matrix.mtx.gz'

    # Genes
    genes_raw = pd.read_csv(gene_file, sep='\t', header=None)

    if genes_raw.shape[1] == 3:
        genes = genes_raw.copy()
        genes.columns = ['gene_name', 'gene_symbol', 'data']

    elif genes_raw.shape[1] == 2:
        c0 = genes_raw.iloc[:, 0].astype(str)
        c1 = genes_raw.iloc[:, 1].astype(str)

        # Case A: Ensembl present (sometimes mixed with non-gene features in the same file)
        # We keep only Ensembl IDs later via `valid_gene`.
        if (c0 == c1).all() and c0.str.startswith('ENSG').any():
            gene_name = c0
            gene_symbol = c0.map(lambda e: ens2sym.get(e, e))

        # Case B: Symbol-only (both columns are symbols)
        elif (not c0.str.startswith('ENSG').any()) and (c0 == c1).all():
            gene_symbol = c0
            # normalize RP11. -> RP11- only if present
            if gene_symbol.str.contains(r'^RP\d+\.', regex=True).any():
                gene_symbol = gene_symbol.str.replace(r'^(RP\d+)\.', r'\1-', regex=True)
            gene_name = gene_symbol.map(lambda s: sym2ens.get(s, pd.NA))

        # Case C: Ensembl + symbol (2 columns)
        elif c0.str.startswith('ENSG').all() and (not c1.str.startswith('ENSG').all()):
            gene_name = c0
            gene_symbol = c1

        else:
            raise ValueError(f"Unexpected 2-col genes format for {prefix}: first rows={genes_raw.head(3).values.tolist()}")

        genes = pd.DataFrame({'gene_name': gene_name, 'gene_symbol': gene_symbol, 'data': pd.NA})

    else:
        raise ValueError(f"Unexpected gene file format: {gene_file} shape={genes_raw.shape}")

    genes['gene_name'] = genes['gene_name'].astype(str)
    genes['gene_symbol'] = genes['gene_symbol'].astype(str)

    # Drop genes that we could not map to Ensembl (only relevant for symbol-only files)
    valid_gene = genes['gene_name'].notna() & (genes['gene_name'].str.lower() != 'nan') & (genes['gene_name'] != '') & (genes['gene_name'].str.startswith('ENSG'))

    # Barcodes
    barcodes = pd.read_csv(
        barcode_file,
        sep='\t',
        header=None,
        names=['sample_id']
    )

    # Matrix
    mtx = sc.read_mtx(mtx_file).T

    # If feature_type is present, keep only Gene Expression rows.
    if genes['data'].notna().any() and (genes['data'] == 'Gene Expression').any():
        keep = (genes['data'] == 'Gene Expression').values
        genes = genes.loc[keep].reset_index(drop=True)
        mtx = mtx[:, keep].copy()
        valid_gene = valid_gene.loc[keep].reset_index(drop=True)

    # Apply mapping filter (and keep matrix aligned)
    if not bool(valid_gene.all()):
        genes = genes.loc[valid_gene].reset_index(drop=True)
        mtx = mtx[:, valid_gene.values].copy()

    # Assertions
    assert barcodes.shape[1] == 1
    assert (len(barcodes), len(genes)) == mtx.shape
    assert len(genes['gene_name'].unique()) == len(genes)

    # Set data: align across samples by Ensembl ID
    mtx.var = deepcopy(genes)
    mtx.var_names = mtx.var['gene_name'].copy()

    # Make obs_names unique across samples
    mtx.obs_names = [f"{x}-{prefix}" for x in barcodes['sample_id'].astype(str).tolist()]
    mtx.obs['sample_id'] = barcodes['sample_id'].astype(str).tolist()

    # Metadata
    geo, patient_id, t = prefix.split('_')
    mtx.obs['prefix'] = prefix
    mtx.obs['GEO'] = geo
    mtx.obs['patient_id'] = patient_id
    mtx.obs['T'] = t

    adatas.append(mtx)

100%|███████████████████████████████████████████| 48/48 [00:27<00:00,  1.74it/s]


In [7]:
intersection_genes = set(adatas[0].var_names)
for a in adatas[1:]:
    intersection_genes &= set(a.var_names)

In [8]:
common_genes = list(intersection_genes)
len(common_genes)

20341

In [9]:
# Example (after the fix above, var_names are gene symbols):
# i = 0
# adatas[i][:, ['RP11-34P13.3', 'FAM138A']]

In [10]:
for i, _adata in enumerate(adatas):
    adatas[i] = adatas[i][:, common_genes]

# Assertions

In [11]:
gene_names = adatas[0].var_names.tolist()
for i, _adata in enumerate(adatas):
    assert len(_adata.var_names.tolist()) == len(gene_names)

In [12]:
len(_adata.var_names)

20341

In [13]:
len(gene_names)

20341

In [14]:
_adata.var_names

Index(['ENSG00000111671', 'ENSG00000183260', 'ENSG00000173826',
       'ENSG00000169393', 'ENSG00000253159', 'ENSG00000229563',
       'ENSG00000136297', 'ENSG00000052344', 'ENSG00000115474',
       'ENSG00000138092',
       ...
       'ENSG00000215374', 'ENSG00000166126', 'ENSG00000141579',
       'ENSG00000136936', 'ENSG00000224511', 'ENSG00000112893',
       'ENSG00000117305', 'ENSG00000239590', 'ENSG00000166016',
       'ENSG00000179796'],
      dtype='object', name='gene_name', length=20341)

# Concatenate

In [15]:
import anndata as ad

In [16]:
adata = ad.concat(adatas)

  utils.warn_names_duplicates("obs")


# Add Metadata

In [17]:
import io
import tarfile
import urllib.request
import xml.etree.ElementTree as ET

gse = 'GSE216329'
url = f'https://ftp.ncbi.nlm.nih.gov/geo/series/GSE216nnn/{gse}/miniml/{gse}_family.xml.tgz'

with urllib.request.urlopen(url) as r:
    data = r.read()

with tarfile.open(fileobj=io.BytesIO(data), mode='r:gz') as tf:
    xml_member = [m for m in tf.getmembers() if m.name.endswith('.xml')][0]
    xml_bytes = tf.extractfile(xml_member).read()

root = ET.fromstring(xml_bytes)

strip_ns = lambda tag: tag.split('}', 1)[-1] if '}' in tag else tag

rows = []
for sample in root.iter():
    if strip_ns(sample.tag) != 'Sample':
        continue

    rec = {}
    for child in sample:
        if strip_ns(child.tag) == 'Accession':
            rec['GEO'] = (child.text or '').strip()
            break
    if not rec.get('GEO', '').startswith('GSM'):
        continue

    for child in sample.iter():
        t = strip_ns(child.tag)
        if t in {'Title', 'Source', 'Organism', 'Library-Strategy', 'Library-Source', 'Library-Selection'} and child.text:
            rec[t.lower().replace('-', '_')] = child.text.strip()
        if t == 'Characteristics':
            k = child.attrib.get('tag', None)
            v = (child.text or '').strip()
            if k:
                rec[f'ch1__{k}'] = v

    rows.append(rec)

sample_meta = pd.DataFrame(rows).drop_duplicates(subset=['GEO']).set_index('GEO')
adata.obs = adata.obs.join(sample_meta, on='GEO')

adata.obs.columns.tolist()

['sample_id',
 'prefix',
 'GEO',
 'patient_id',
 'T',
 'title',
 'source',
 'organism',
 'ch1__cell type',
 'ch1__tissue',
 'ch1__disease',
 'ch1__condition',
 'library_strategy',
 'library_source',
 'library_selection']

# Rename

In [18]:
adata.obs_names_make_unique()
adata.obs.rename(
    columns={
        'ch1__cell_type': 'cell_type',
        'ch1__disease': 'disease',
        'ch1__condition':'condition',
        'ch1__tissue': 'tissue'
    }, 
    inplace=True
)

In [19]:
adata

AnnData object with n_obs × n_vars = 222678 × 20341
    obs: 'sample_id', 'prefix', 'GEO', 'patient_id', 'T', 'title', 'source', 'organism', 'ch1__cell type', 'tissue', 'disease', 'condition', 'library_strategy', 'library_source', 'library_selection'

In [20]:
pd.crosstab(adata.obs['disease'], adata.obs['patient_id']).to_csv('patient_disease_crosstab.csv')

In [21]:
adata.obs['condition']

AAACCTGAGCTTATCG-1-GSM6668925_P1_T      On treatment
AAACCTGAGGATGCGT-1-GSM6668925_P1_T      On treatment
AAACCTGAGGTGATAT-1-GSM6668925_P1_T      On treatment
AAACCTGCACATAACC-1-GSM6668925_P1_T      On treatment
AAACCTGCACCTCGTT-1-GSM6668925_P1_T      On treatment
                                            ...     
TTTATGCCATGCCCGA.1-GSM6668972_P24_TN        Baseline
TTTATGCGTCTTCTCG.1-GSM6668972_P24_TN        Baseline
TTTCCTCTCGGATGTT.1-GSM6668972_P24_TN        Baseline
TTTGTCAAGGAGTTTA.1-GSM6668972_P24_TN        Baseline
TTTGTCAGTGGTCTCG.1-GSM6668972_P24_TN        Baseline
Name: condition, Length: 222678, dtype: object

In [22]:
adata.obs['disease']

AAACCTGAGCTTATCG-1-GSM6668925_P1_T      Arthritis
AAACCTGAGGATGCGT-1-GSM6668925_P1_T      Arthritis
AAACCTGAGGTGATAT-1-GSM6668925_P1_T      Arthritis
AAACCTGCACATAACC-1-GSM6668925_P1_T      Arthritis
AAACCTGCACCTCGTT-1-GSM6668925_P1_T      Arthritis
                                          ...    
TTTATGCCATGCCCGA.1-GSM6668972_P24_TN      No irAE
TTTATGCGTCTTCTCG.1-GSM6668972_P24_TN      No irAE
TTTCCTCTCGGATGTT.1-GSM6668972_P24_TN      No irAE
TTTGTCAAGGAGTTTA.1-GSM6668972_P24_TN      No irAE
TTTGTCAGTGGTCTCG.1-GSM6668972_P24_TN      No irAE
Name: disease, Length: 222678, dtype: object

In [23]:
import pandas as pd

ens = pd.Series(adata.var_names.astype(str), index=adata.var.index)
ens_nover = ens.str.replace(r"\.\d+$", "", regex=True)

sym = ens_nover.map(lambda e: ens2sym.get(e, pd.NA))

adata.var["gene_symbol"] = sym

adata.var["gene_id_ensembl"] = ens_nover

In [24]:
adata.var

Unnamed: 0_level_0,gene_symbol,gene_id_ensembl
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000111671,SPSB2,ENSG00000111671
ENSG00000183260,ABHD16B,ENSG00000183260
ENSG00000173826,KCNH6,ENSG00000173826
ENSG00000169393,ELSPBP1,ENSG00000169393
ENSG00000253159,PCDHGA12,ENSG00000253159
...,...,...
ENSG00000112893,MAN2A1,ENSG00000112893
ENSG00000117305,HMGCL,ENSG00000117305
ENSG00000239590,OR1J4,ENSG00000239590
ENSG00000166016,ABTB2,ENSG00000166016


In [25]:
adata.X.max(), adata.X.min()

(16688.0, 0.0)

In [26]:
adata.write_h5ad('../../preprocessed_datasets/ICI_original_concat.h5ad')

In [28]:
pd.crosstab(adata.obs['disease'], adata.obs['patient_id']).T

disease,Arthritis,Colitis/ Neurotoxicity,No irAE,Pneumonitis,Thyroiditis,Thyroiditis/ Nephritis,Thyroiditis/ Neurotoxicity / Nephritis,Thyroiditis/ Neurotoxicity/ Nephritis
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
P1,16346,0,0,0,0,0,0,0
P2,8936,0,0,0,0,0,0,0
P3,11053,0,0,0,0,0,0,0
P4,0,0,0,0,21941,0,0,0
P5,0,0,0,0,22389,0,0,0
P6,0,0,0,0,0,778,0,0
P7,8320,0,0,0,0,0,0,0
P8,0,0,0,12150,0,0,0,0
P9,0,0,12842,0,0,0,0,0
P10,0,0,0,13894,0,0,0,0
