In [None]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/repos/Epilepsy_Microglia
%pip install -q -r requirements.txt

In [None]:
# Single Cell Libraries
import scanpy as sc
import anndata as ad

# Data Processing and Plotting
import numpy as np
import pandas as pd

# File grab
import os
import tempfile
import pooch
import shutil, subprocess, glob
import gzip
from pathlib import Path

In [1]:
# project name
project = "Epilepsy_Microglia"
dataset = "thrupp"
gse_accession = "GSE153807"
# Build the GEO bulk-download URL from the accession above so the two can never
# point at different series (the URL previously hard-coded a different accession).
link = f"https://www.ncbi.nlm.nih.gov/geo/download/?acc={gse_accession}&format=file"

# environment setting
# env_utils is a project-local module; it is imported here rather than in the
# imports cell. NOTE(review): presumably because it is only importable once the
# working directory is the repository root — confirm.
from env_utils import detect_env, get_paths
env = detect_env()
paths = get_paths(project)
print(env)
print(paths["base"])

code-server
/home/neuro_demo_research/data_from_drive/Epilepsy_Microglia


In [None]:
# Pick the scratch download location for the current environment.
# Requires `Path` from pathlib (imported in the imports cell); without that import
# this cell raises NameError on a fresh kernel.
_download_roots = {
    "colab": Path("/content/downloads"),
    "code-server": Path("/home/neuro_demo_research/downloads"),
}
# Any other environment falls back to a per-user location under the home directory.
_default_root = Path("~/neuro_demo_research/downloads").expanduser()
download_dir = _download_roots.get(env, _default_root) / dataset

# Raw files for this dataset live under the project's "raw" path.
raw_dir = paths["raw"] / dataset

os.makedirs(download_dir, exist_ok=True)
os.makedirs(raw_dir, exist_ok=True)

In [None]:
# A. downloading mtx type data
#
# Downloads a tar archive, unpacks it into RAW_DIR, regroups the extracted files
# into one folder per sample (prefix before the first "_"), renames the 10x files
# to the names scanpy expects, and writes one .h5ad per sample into OUT_DIR.
#
# NOTE(review): DATA_INFO and sample_meta are not defined in any cell visible in
# this notebook section; they must exist in the kernel before this cell runs.

LINK = DATA_INFO["LINK"]
FNAME = DATA_INFO["FNAME"]
RAW_DIR = DATA_INFO["RAW_DIR"]
OUT_DIR = DATA_INFO["OUT_DIR"]
LABEL = DATA_INFO["LABEL"]


def _canonical_10x_name(fname):
    """Map a downloaded file name to its canonical 10x name.

    Files containing "matrix.mtx", "barcodes.tsv", or "genes.tsv"/"features.tsv"
    are renamed to "matrix.mtx", "barcodes.tsv" and "features.tsv" respectively,
    keeping a trailing ".gz" if present. Any other name is returned unchanged.
    """
    # NOTE(review): a legacy "genes.tsv" is stored under the "features.tsv" name;
    # verify that scanpy's legacy-format detection still behaves as intended for
    # uncompressed inputs.
    markers = (
        ("matrix.mtx", "matrix.mtx"),
        ("barcodes.tsv", "barcodes.tsv"),
        ("genes.tsv", "features.tsv"),
        ("features.tsv", "features.tsv"),
    )
    for marker, canonical in markers:
        if marker in fname:
            return canonical + (".gz" if fname.endswith(".gz") else "")
    return fname


# `tar -C` fails when the target directory does not exist, so create both up front.
os.makedirs(RAW_DIR, exist_ok=True)
os.makedirs(OUT_DIR, exist_ok=True)

# download + unpack
archive = f"{FNAME}.tar"
try:
    # --fail makes curl exit non-zero on HTTP errors instead of saving the error page.
    subprocess.run(["curl", "--fail", "-L", LINK, "-o", archive], check=True)
    subprocess.run(["tar", "-xf", archive, "-C", RAW_DIR], check=True)
finally:
    # Remove the archive even when the download or extraction failed.
    if os.path.exists(archive):
        os.remove(archive)

# normalize + regroup (regular files only; directories in RAW_DIR are left alone)
groups = {}
for f in glob.glob(os.path.join(RAW_DIR, "*")):
    if not os.path.isfile(f):
        continue
    prefix = os.path.basename(f).split("_")[0]
    groups.setdefault(prefix, []).append(f)

sample_dirs = []
for prefix, flist in groups.items():
    sample_dir = os.path.join(OUT_DIR, prefix)
    os.makedirs(sample_dir, exist_ok=True)
    sample_dirs.append(sample_dir)
    for f in flist:
        dest = os.path.join(sample_dir, _canonical_10x_name(os.path.basename(f)))
        shutil.move(f, dest)

# build AnnData per sample and save separately
# Only folders created above are read and deleted, so unrelated directories that
# already exist in OUT_DIR are never touched by shutil.rmtree.
for folder in sorted(sample_dirs):
    sample_id = os.path.basename(folder)

    # A prefix group without a matrix file is not a 10x sample (e.g. a stray
    # series-level file); keep the folder and move on instead of crashing.
    if not glob.glob(os.path.join(folder, "matrix.mtx*")):
        print(f"Warning: no matrix.mtx file in {folder}; skipping")
        continue

    print(f"Reading {folder}")
    # Named `adata` (not `ad`) so the `anndata as ad` module alias is not overwritten.
    adata = sc.read_10x_mtx(folder, var_names="gene_symbols", make_unique=True)
    if sample_id in sample_meta:
        for key, value in sample_meta[sample_id].items():
            adata.obs[key] = value
    else:
        print(f"Warning: No metadata found for sample {sample_id}")

    # save
    out_file = os.path.join(OUT_DIR, f"{LABEL}_{sample_id}.h5ad")
    adata.write(out_file)
    print(f"Saved {out_file}")
    shutil.rmtree(folder)

In [None]:
# B-1. downloading csv/tsv type data
#
# Downloads a tar archive, unpacks it into RAW_DIR, then decompresses every
# "*.gz" file in RAW_DIR next to the original and deletes the compressed copy.
# OUT_DIR, LABEL and FNAME are also read by the B-2 .. B-4 cells.
#
# NOTE(review): DATA_INFO is not defined in any cell visible in this notebook
# section; it must exist in the kernel before this cell runs.

LINK = DATA_INFO["LINK"]
FNAME = DATA_INFO["FNAME"]
RAW_DIR = DATA_INFO["RAW_DIR"]
OUT_DIR = DATA_INFO["OUT_DIR"]
LABEL = DATA_INFO["LABEL"]

# `tar -C` fails when the target directory does not exist.
os.makedirs(RAW_DIR, exist_ok=True)

archive = f"{FNAME}.tar"
try:
    # --fail makes curl exit non-zero on HTTP errors instead of saving the error page.
    subprocess.run(["curl", "--fail", "-L", LINK, "-o", archive], check=True)
    subprocess.run(["tar", "-xf", archive, "-C", RAW_DIR], check=True)
finally:
    # Remove the archive even when the download or extraction failed.
    if os.path.exists(archive):
        os.remove(archive)

for gz_path in glob.glob(os.path.join(RAW_DIR, "*.gz")):
    gz_name = os.path.basename(gz_path)
    outpath = os.path.join(RAW_DIR, gz_name[:-3])  # remove ".gz"
    with gzip.open(gz_path, 'rb') as f_in, open(outpath, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.remove(gz_path)

In [None]:
# B-2. transforming tsv to csv
#
# Each "*.tsv" in RAW_DIR is rewritten as a ".csv" in the same directory and the
# original is deleted. The CSV stays in RAW_DIR because the next step (B-3) looks
# for "*.csv" files there; writing to OUT_DIR would leave them unprocessed
# whenever RAW_DIR and OUT_DIR differ.

for tsvpath in glob.glob(os.path.join(RAW_DIR, "*.tsv")):
    stem = os.path.splitext(os.path.basename(tsvpath))[0]
    csvpath = os.path.join(RAW_DIR, f"{stem}.csv")
    try:
        df = pd.read_csv(tsvpath, sep="\t")
        # When the header has one field fewer than the data rows, pandas uses the
        # first column as the index; write it out so those row labels are not lost.
        # A default RangeIndex carries no data and is still omitted.
        keep_index = not isinstance(df.index, pd.RangeIndex)
        df.to_csv(csvpath, index=keep_index)
        os.remove(tsvpath)  # delete original.tsv
        print(f"Converted {tsvpath} -> {csvpath} (and deleted original)")
    except Exception as e:
        # Best effort: report the failing file and continue with the rest.
        print(f"Error processing {tsvpath}: {e}")

In [None]:
# B-3. Transforming csv into h5ad

# Check cell-gene orientation
def check_csv_orientation(path, n_check=5):
    """Guess whether genes are the rows or the columns of an expression CSV.

    Reads only the first ``n_check`` data rows (first column as index) and looks
    at up to ``n_check`` row labels and column labels. This is a heuristic: a
    label counts as gene-like when it starts with an Ensembl gene prefix or is
    purely alphabetic, except that long strings made only of A/C/G/T/N are
    treated as cell barcodes.

    Parameters
    ----------
    path : str or path-like
        CSV file to inspect.
    n_check : int, default 5
        Number of rows to read and of labels to inspect on each axis.

    Returns
    -------
    str
        "genes_as_rows", "genes_as_columns", or "unknown". Rows are tested first.
    """
    df = pd.read_csv(path, index_col=0, nrows=n_check)
    n_rows, n_cols = df.shape

    def _is_gene_like(label):
        # Numeric or missing labels (e.g. an integer index) are never gene names.
        if not isinstance(label, str):
            return False
        if label.startswith(("ENSG", "ENSMUSG")):
            return True
        # Pure nucleotide strings are barcodes even though str.isalpha() is True.
        if len(label) >= 8 and set(label.upper()) <= set("ACGTN"):
            return False
        return label.isalpha()

    # Inspect several labels: the first gene symbol often contains digits
    # (e.g. "A1BG"), so looking at a single label is not reliable.
    row_labels = list(df.index[:n_check])
    col_labels = list(df.columns[:n_check])

    if any(_is_gene_like(label) for label in row_labels):
        orientation = "genes_as_rows"
    elif any(_is_gene_like(label) for label in col_labels):
        orientation = "genes_as_columns"
    else:
        orientation = "unknown"

    print(f"{path}: {orientation} ({n_rows}X{n_cols})")
    return orientation

# Paths of the per-file .h5ad outputs written below; consumed by the merge step.
intermediate_files = []

# Read .csv into .h5ad
for csv_path in glob.glob(os.path.join(RAW_DIR, "*.csv")):
    sample_name = os.path.splitext(os.path.basename(csv_path))[0]

    layout = check_csv_orientation(csv_path)
    # Refuse to guess: an unrecognised layout aborts before anything is read.
    if layout not in ("genes_as_rows", "genes_as_columns"):
        raise ValueError(f"Unknown orientation for {csv_path}")

    adata = sc.read_csv(csv_path, first_column_names=True)
    if layout == "genes_as_rows":
        # Transpose so that genes end up on the column (var) axis.
        adata = adata.T

    # Tag every observation with the name of the file it came from.
    adata.obs["sample"] = sample_name
    h5ad_path = os.path.join(OUT_DIR, f"{sample_name}.h5ad")
    adata.write(h5ad_path)
    intermediate_files.append(h5ad_path)

    print(f"Converted {csv_path} -> {h5ad_path}")
    # Release the matrix and remove the source CSV once the .h5ad is on disk.
    del adata
    os.remove(csv_path)

In [None]:
# B-4. Merge adatas
#
# Loads every intermediate .h5ad written by B-3, concatenates them along the
# observation axis, and writes one merged file into OUT_DIR.

# B-3 deletes its input CSVs, so re-running it leaves this list empty; fail with
# a clear message instead of an opaque error from the concatenation.
if not intermediate_files:
    raise ValueError("No intermediate .h5ad files to merge; run step B-3 first.")

# One key per input file (file name without ".h5ad"), stored in obs["sample"].
sample_keys = [os.path.basename(f).replace(".h5ad", "") for f in intermediate_files]
adatas = [sc.read_h5ad(f) for f in intermediate_files]

# join="outer" keeps the union of variables across inputs.
# NOTE(review): for dense matrices an outer join presumably pads variables missing
# from an input with NaN rather than 0 — confirm, and pass fill_value=0 if zeros
# are what downstream steps expect.
adata_merged = sc.concat(adatas, join="outer", label="sample", keys=sample_keys)

merged_path = os.path.join(OUT_DIR, f"{LABEL}_{FNAME}_merged.h5ad")
adata_merged.write(merged_path)