**1. Download single cell metadata**

 Single-cell data and metadata are distributed in several file formats, including .mtx, .csv and .tsv, so a single integrated download workflow makes further research more efficient. The steps below briefly describe how to download each format.

 For .mtx data (mostly datasets from 2020 or later), follow A.
 For .tsv, follow B-1 through B-3. For .csv, follow B-1 and B-3 (skip B-2). Concatenation takes considerable storage: if storage is short, skip B-4; otherwise, follow B-4 to merge the data into one file.

 WARNING: Always set RAW_DIR under /content/, not in your Drive. Keeping raw files in Drive may cause undesirable storage overflow.

In [None]:
import torch

# Single Cell Libraries
import scvi
import scanpy as sc
import anndata as ad

# Data Processing and Plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import igraph
import leidenalg

# File grab
import os
import tempfile
import pooch
import shutil, subprocess, glob
import gzip

# Environment report, one value per line: torch version, scvi version,
# and whether torch can see a CUDA device.
print(torch.__version__, scvi.__version__, torch.cuda.is_available(), sep="\n")

In [None]:
# Shell magic: recursively deletes the Drive folder for this dataset.
# The path is the same one DATA_INFO["OUT_DIR"] resolves to, so any
# previously saved outputs there are removed.
# NOTE(review): presumably meant to start each run from a clean output folder — confirm before running.
!rm -rf /content/drive/MyDrive/datas/epilepsy_microglia/raw/GSE201048_raw

In [None]:
# Set data information
# FNAME names the archive and the per-dataset folders; DATA_INFO bundles the
# download link, the label used as output-file prefix, and the two working
# directories that the download cells read back.
FNAME = "GSE201048_raw"
DATA_INFO = dict(
    FNAME=FNAME,
    LINK="https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE201048&format=file",
    LABEL="kumar",
    RAW_DIR=f"/content/data/{FNAME}",
    OUT_DIR=f"/content/drive/MyDrive/datas/epilepsy_microglia/raw/{FNAME}",
)

# Make sure both working directories exist before anything is downloaded.
for _dir_key in ("RAW_DIR", "OUT_DIR"):
    os.makedirs(DATA_INFO[_dir_key], exist_ok=True)

In [None]:
# @title
# Clinical Metadata (batch key)

# One row per GEO sample, in this column order:
# (sample_id, patient_id, sex, age, dx, dx_subtype, region, hemisphere, procedure)
# NOTE(review): GSM6049641, GSM6049642 and GSM6049643 all carry patient_id
# "Kumar06B" — confirm against the GEO sample sheet that this is intended.
_SAMPLE_ROWS = (
    ("GSM6049632", "Kumar01A", "F", "9", "OLE", "OLE", "Occipital", "R", "lobectomy"),
    ("GSM6049633", "Kumar01B", "F", "9", "OLE", "OLE", "Occipital", "R", "lobectomy"),
    ("GSM6049634", "Kumar02", "F", "4", "FCD", "FCD_IIb", "Frontal", "R", "lesionectomy"),
    ("GSM6049635", "Kumar03A", "M", "18", "FCD", "FCD_IIb", "Frontal", "L", "lesionectomy"),
    ("GSM6049636", "Kumar03B", "M", "18", "FCD", "FCD_IIb", "Frontal", "L", "lesionectomy"),
    ("GSM6049637", "Kumar04", "F", "3", "TLE", "TLE", "Temporal", "L", "lobectomy"),
    # clinically temporo-occipital, sequenced sample = temporal
    ("GSM6049638", "Kumar05A", "F", "22", "SWS", "SWS", "Temporal", "R", "lobectomy"),
    ("GSM6049639", "Kumar05B", "F", "22", "SWS", "SWS", "Temporal", "R", "lobectomy"),
    # MRI also showed parietal involvement
    ("GSM6049640", "Kumar06A", "F", "4", "SRFSE", "SRFSE", "Temporal", "R", "lobectomy"),
    ("GSM6049641", "Kumar06B", "F", "4", "SRFSE", "SRFSE", "Temporal", "R", "lobectomy"),
    ("GSM6049642", "Kumar06B", "F", "4", "SRFSE", "SRFSE", "Temporal", "R", "lobectomy"),
    ("GSM6049643", "Kumar06B", "F", "4", "SRFSE", "SRFSE", "Temporal", "R", "lobectomy"),
)

# Expand the rows into {sample_id: {field: value}}. "dataset" and "protocol"
# are identical for every sample, so they are filled in here rather than
# repeated in each row. All values, including age, are kept as strings.
sample_meta = {
    gsm: {
        "patient_id": patient,
        "sample_id": gsm,
        "dataset": "GSE201048",
        "sex": sex,
        "age": age,
        "dx": dx,
        "dx_subtype": dx_subtype,
        "region": region,
        "hemisphere": hemisphere,
        "procedure": procedure,
        "protocol": "CITE-seq",
    }
    for gsm, patient, sex, age, dx, dx_subtype, region, hemisphere, procedure in _SAMPLE_ROWS
}


In [None]:
# A. downloading mtx type data
#
# Downloads the GEO archive, regroups the extracted files into one folder per
# sample (prefix before the first "_"), reads each folder with
# sc.read_10x_mtx, attaches the clinical metadata from `sample_meta`, and
# writes one "<LABEL>_<sample_id>.h5ad" per sample into OUT_DIR.

LINK = DATA_INFO["LINK"]
FNAME = DATA_INFO["FNAME"]
RAW_DIR = DATA_INFO["RAW_DIR"]
OUT_DIR = DATA_INFO["OUT_DIR"]
LABEL = DATA_INFO["LABEL"]


def _canonical_10x_name(fname_only):
    """Return the bare 10x file name for a sample-prefixed file name.

    Names containing "matrix.mtx", "barcodes.tsv", or "genes.tsv" /
    "features.tsv" map to "matrix.mtx", "barcodes.tsv" and "features.tsv"
    respectively, keeping a trailing ".gz" when present. Any other name is
    returned unchanged.
    """
    gz = ".gz" if fname_only.endswith(".gz") else ""
    if "matrix.mtx" in fname_only:
        return f"matrix.mtx{gz}"
    if "barcodes.tsv" in fname_only:
        return f"barcodes.tsv{gz}"
    if "genes.tsv" in fname_only or "features.tsv" in fname_only:
        # NOTE(review): legacy "genes.tsv" files are renamed to "features.tsv";
        # confirm sc.read_10x_mtx accepts their column layout.
        return f"features.tsv{gz}"
    return fname_only


# download + unpack
tar_path = f"{FNAME}.tar"
subprocess.run(["curl", "-L", LINK, "-o", tar_path], check=True)
try:
    subprocess.run(["tar", "-xf", tar_path, "-C", RAW_DIR], check=True)
finally:
    # the archive is only a container; drop it even when extraction fails
    os.remove(tar_path)

# normalize + regroup
# Per-sample folders are staged under RAW_DIR (local disk) rather than
# OUT_DIR, so raw matrices never land on Drive and the cleanup below can only
# delete folders this cell created.
stage_dir = os.path.join(RAW_DIR, "_by_sample")
groups = {}
for path in glob.glob(os.path.join(RAW_DIR, "*")):
    if not os.path.isfile(path):
        continue  # skip the staging folder itself and any other directory
    prefix = os.path.basename(path).split("_")[0]
    groups.setdefault(prefix, []).append(path)

for prefix, flist in groups.items():
    sample_dir = os.path.join(stage_dir, prefix)
    os.makedirs(sample_dir, exist_ok=True)
    for path in flist:
        dest = os.path.join(sample_dir, _canonical_10x_name(os.path.basename(path)))
        shutil.move(path, dest)

# build AnnData per sample and save separately
for folder in sorted(glob.glob(os.path.join(stage_dir, "*"))):
    if not os.path.isdir(folder):
        continue
    sample_id = os.path.basename(folder)
    print(f"Reading {folder}")
    # named `adata`, not `ad`, so the `import anndata as ad` alias stays intact
    adata = sc.read_10x_mtx(folder, var_names="gene_symbols", make_unique=True)

    # the clinical metadata dict is defined as `sample_meta`
    meta = sample_meta.get(sample_id)
    if meta is None:
        print(f"Warning: No metadata found for sample {sample_id}")
    else:
        # each scalar is broadcast to every cell of this sample
        for key, value in meta.items():
            adata.obs[key] = value

    # save
    out_file = os.path.join(OUT_DIR, f"{LABEL}_{sample_id}.h5ad")
    adata.write(out_file)
    print(f"Saved {out_file}")
    del adata
    shutil.rmtree(folder)

In [None]:
# B-1. downloading csv/tsv type data
#
# Fetches the archive named in DATA_INFO, unpacks it into RAW_DIR, and
# gunzips every extracted "*.gz" file in place.

LINK, FNAME, RAW_DIR, OUT_DIR, LABEL = (
    DATA_INFO[key] for key in ("LINK", "FNAME", "RAW_DIR", "OUT_DIR", "LABEL")
)

archive = f"{FNAME}.tar"
subprocess.run(["curl", "-L", LINK, "-o", archive], check=True)
subprocess.run(["tar", "-xf", archive, "-C", RAW_DIR], check=True)
os.remove(archive)

# Inflate each compressed file next to itself, then delete the .gz copy.
for gz_path in glob.glob(os.path.join(RAW_DIR, "*.gz")):
    plain_name = os.path.splitext(os.path.basename(gz_path))[0]  # drop ".gz"
    plain_path = os.path.join(RAW_DIR, plain_name)
    with gzip.open(gz_path, "rb") as packed, open(plain_path, "wb") as unpacked:
        shutil.copyfileobj(packed, unpacked)
    os.remove(gz_path)

In [None]:
# B-2. transforming tsv to csv
#
# Rewrites every "*.tsv" in RAW_DIR as a comma-separated file with the same
# base name and deletes the original. Failures are reported per file and do
# not stop the remaining conversions.

for tsvpath in glob.glob(os.path.join(RAW_DIR, "*.tsv")):
    fname_only = os.path.splitext(os.path.basename(tsvpath))[0]
    # Write next to the source in RAW_DIR: B-3 globs RAW_DIR for "*.csv", so a
    # file written to OUT_DIR would never be converted to .h5ad.
    csvpath = os.path.join(RAW_DIR, f"{fname_only}.csv")
    try:
        df = pd.read_csv(tsvpath, sep="\t")
        df.to_csv(csvpath, index=False)
        os.remove(tsvpath)  # delete original.tsv
        print(f"Converted {tsvpath} -> {csvpath} (and deleted original)")
    except Exception as e:
        # deliberate best effort: report and continue with the next file
        print(f"Error processing {tsvpath}: {e}")

In [None]:
# B-3. Transforming csv into h5ad

# Check cell-gene orientation
def _looks_like_gene(label):
    """Return True when *label* resembles a gene identifier.

    A label is gene-like when it is a string that starts with "ENSG" or is
    purely alphabetic. Non-string labels (e.g. an integer index) are never
    gene-like. Purely alphabetic labels of 10 or more characters drawn only
    from A/C/G/T/N are treated as cell barcodes, not genes.
    """
    if not isinstance(label, str):
        return False
    if label.startswith("ENSG"):
        return True
    if not label.isalpha():
        return False
    is_barcode = len(label) >= 10 and set(label) <= set("ACGTN")
    return not is_barcode


def check_csv_orientation(path, n_check=5):
    """Guess whether genes are the rows or the columns of a CSV matrix.

    Reads at most ``n_check`` data rows, using the first column as the index,
    and inspects the first row label and then the first column label.

    Args:
        path: CSV file to inspect.
        n_check: Maximum number of data rows to read.

    Returns:
        "genes_as_rows" if the first row label looks like a gene,
        "genes_as_columns" if the first column label does, otherwise
        "unknown". A file with no data rows or no data columns is handled
        without raising: the missing axis simply cannot match.
    """
    df = pd.read_csv(path, index_col=0, nrows=n_check)
    n_rows, n_cols = df.shape

    # guard on the counts so a header-only or single-column file cannot
    # raise IndexError
    if n_rows and _looks_like_gene(df.index[0]):
        orientation = "genes_as_rows"
    elif n_cols and _looks_like_gene(df.columns[0]):
        orientation = "genes_as_columns"
    else:
        orientation = "unknown"

    print(f"{path}: {orientation} ({n_rows}X{n_cols})")
    return orientation

# Paths of the per-file .h5ad outputs, collected for the merge step.
intermediate_files = []

# Read .csv into .h5ad
# Each CSV in RAW_DIR is loaded as cells x genes (transposed when genes are
# the rows), tagged with its file stem in obs["sample"], written to OUT_DIR,
# and then the CSV is deleted.
for csv_path in glob.glob(os.path.join(RAW_DIR, "*.csv")):
    sample_name = os.path.splitext(os.path.basename(csv_path))[0]

    layout = check_csv_orientation(csv_path)
    if layout not in ("genes_as_rows", "genes_as_columns"):
        raise ValueError(f"Unknown orientation for {csv_path}")

    adata = sc.read_csv(csv_path, first_column_names=True)
    if layout == "genes_as_rows":
        adata = adata.T

    adata.obs["sample"] = sample_name
    h5ad_path = os.path.join(OUT_DIR, f"{sample_name}.h5ad")
    adata.write(h5ad_path)
    intermediate_files.append(h5ad_path)

    print(f"Converted {csv_path} -> {h5ad_path}")
    del adata  # release the matrix before loading the next file
    os.remove(csv_path)

In [None]:
# B-4. Merge adatas
#
# Loads every .h5ad listed in `intermediate_files`, concatenates them with an
# outer join, records each file's stem in obs["sample"], and writes the result
# to "<LABEL>_<FNAME>_merged.h5ad" in OUT_DIR.

sample_keys = []
adatas = []
for h5ad_path in intermediate_files:
    sample_keys.append(os.path.basename(h5ad_path).replace(".h5ad", ""))
    adatas.append(sc.read_h5ad(h5ad_path))

adata_merged = sc.concat(adatas, join="outer", label="sample", keys=sample_keys)

merged_path = os.path.join(OUT_DIR, f"{LABEL}_{FNAME}_merged.h5ad")
adata_merged.write(merged_path)