In [1]:
import os
import re

import numpy as np
import pandas as pd
from gtfparse import read_gtf
from rnanorm import FPKM, TPM
from sklearn.model_selection import train_test_split

In [2]:
# Parse the GTF annotation (the file name indicates GENCODE v46, basic set);
# its gene_id column is used below to decide which genes to keep.
gtf_path = "../datasets/gencode.v46.basic.annotation.gtf"
df_gtf = read_gtf(gtf_path)

# Remove .1, .2, etc. from gene_id
def remove_version(s):
    """Return ``s`` without a trailing ``.<digits>`` version suffix.

    Only a suffix at the very end of the string is removed; a string with no
    such suffix is returned unchanged.
    """
    version_suffix = re.compile(r'\.\d+$')
    return version_suffix.sub('', s)

# Distinct, version-less gene IDs found in the annotation.
# Built through a set, so the order of the resulting list is arbitrary.
gene_ids = list({remove_version(raw_id) for raw_id in df_gtf['gene_id'].to_list()})

INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'level', 'tag', 'transcript_id', 'transcript_type', 'transcript_name', 'transcript_support_level', 'havana_transcript', 'exon_number', 'exon_id', 'hgnc_id', 'havana_gene', 'ont', 'protein_id', 'ccdsid', 'artif_dupl']


In [3]:
def read_datasets(data_path: str, meta_path: str, gene_ids: list):
    """Load one dataset's metadata sheet and its gene-filtered count table.

    The count file is read as tab-separated text and transposed, so the
    file's columns become the rows of the returned table. After the
    transpose, only columns whose label appears in ``gene_ids`` are kept.

    Parameters
    ----------
    data_path : str
        Path to the tab-separated count file.
    meta_path : str
        Path to the Excel metadata file.
    gene_ids : list
        Column labels to retain in the transposed count table.

    Returns
    -------
    tuple
        ``(data, meta_data)``: the filtered, transposed count table and the
        metadata frame exactly as read from the Excel file.
    """
    meta_data = pd.read_excel(meta_path)
    counts = pd.read_csv(data_path, sep='\t').T
    keep = counts.columns.isin(gene_ids)
    return counts.iloc[:, keep], meta_data

In [4]:
# Burgos (dbGaP): raw inputs, then the FPKM / TPM output files.
data_burgos_path = "../datasets/datasets_raw/burgos_dbgap/burgos_dbgap_counts.txt"
meta_burgos_path = "../datasets/datasets_raw/burgos_dbgap/burgos_dbgap_metadata.xlsx"
data_burgos_fpkm_path = "../datasets_processed/fpkm/burgos_dbgap_fpkm.txt"
data_burgos_tpm_path = "../datasets_processed/tpm/burgos_dbgap_tpm.txt"

# Silver-seq: raw inputs, then the FPKM / TPM output files.
data_silver_path = "../datasets/datasets_raw/silver_seq/silver_seq_counts.txt"
meta_silver_path = "../datasets/datasets_raw/silver_seq/silver_seq_metadata.xlsx"
data_silver_fpkm_path = "../datasets_processed/fpkm/silver_seq_fpkm.txt"
data_silver_tpm_path = "../datasets_processed/tpm/silver_seq_tpm.txt"

# Toden: raw inputs, then the FPKM / TPM output files.
data_toden_path = "../datasets/datasets_raw/toden/toden_counts.txt"
meta_toden_path = "../datasets/datasets_raw/toden/toden_metadata.xlsx"
data_toden_fpkm_path = "../datasets_processed/fpkm/toden_fpkm.txt"
data_toden_tpm_path = "../datasets_processed/tpm/toden_tpm.txt"

# Load each dataset's counts (restricted to the annotated gene IDs) and metadata.
data_burgos, meta_burgos = read_datasets(
    data_path=data_burgos_path, meta_path=meta_burgos_path, gene_ids=gene_ids
)
data_silver, meta_silver = read_datasets(
    data_path=data_silver_path, meta_path=meta_silver_path, gene_ids=gene_ids
)
data_toden, meta_toden = read_datasets(
    data_path=data_toden_path, meta_path=meta_toden_path, gene_ids=gene_ids
)

In [5]:
def normalize(data: pd.DataFrame, method: str, save_path: str|None=None):
    if method == "fpkm":
        fpkm = FPKM(gtf="../datasets/gencode.v46.basic.annotation.gtf").set_output(transform="pandas")
        df_transformed = fpkm.fit_transform(data)
    elif method == "tpm":
        tpm = TPM(gtf="../datasets/gencode.v46.basic.annotation.gtf").set_output(transform="pandas")
        df_transformed = tpm.fit_transform(data)
    else:
        raise ValueError("Invalid method")
    
    if save_path is not None:
        df_transformed.to_csv(save_path, sep='\t')

    return df_transformed

In [14]:
# Burgos: FPKM and TPM tables, each also written to its tab-separated file.
df_transformed_burgos_fpkm = normalize(
    data=data_burgos, method="fpkm", save_path=data_burgos_fpkm_path
)
df_transformed_burgos_tpm = normalize(
    data=data_burgos, method="tpm", save_path=data_burgos_tpm_path
)

# Silver-seq: same two normalizations.
df_transformed_silver_fpkm = normalize(
    data=data_silver, method="fpkm", save_path=data_silver_fpkm_path
)
df_transformed_silver_tpm = normalize(
    data=data_silver, method="tpm", save_path=data_silver_tpm_path
)

# Toden: same two normalizations.
df_transformed_toden_fpkm = normalize(
    data=data_toden, method="fpkm", save_path=data_toden_fpkm_path
)
df_transformed_toden_tpm = normalize(
    data=data_toden, method="tpm", save_path=data_toden_tpm_path
)

In [16]:
import scanpy as sc

In [34]:
def transform_h5ad(data: pd.DataFrame, meta: pd.DataFrame, save_path: str):
    """Wrap a table and its metadata in an AnnData object and write it to disk.

    ``meta`` is matched to ``data`` by row position: row *i* of ``meta``
    annotates row *i* of ``data``. The caller is responsible for having the
    two in the same order.

    Parameters
    ----------
    data : pd.DataFrame
        Table passed to ``sc.AnnData``; its row labels become the
        observation names.
    meta : pd.DataFrame
        Per-row annotations with exactly one row per row of ``data``. It is
        copied, so the caller's frame is left untouched.
    save_path : str
        Destination passed to ``adata.write``.

    Returns
    -------
    The AnnData object that was written.

    Raises
    ------
    ValueError
        If ``meta`` does not have the same number of rows as ``data``.
    """
    adata = sc.AnnData(data)

    # Assigning a frame to ``.obs`` makes that frame's index the observation
    # names and stores the frame itself, not a copy. Passing ``meta`` directly
    # would therefore replace the sample IDs with meta's own index and share
    # one frame between every AnnData built from it. Attach a private copy
    # labelled with the sample IDs instead.
    obs = meta.copy()
    obs.index = adata.obs_names  # raises ValueError on a row-count mismatch
    adata.obs = obs

    adata.write(save_path)
    return adata

In [43]:
# Sanity check before attaching metadata: each metadata sheet must list its
# samples in the same order as the rows of the normalized table. Only the TPM
# tables are checked here.
assert (
    df_transformed_burgos_tpm.index
    == meta_burgos['biospecimen_repository_sample_id'].to_list()
).all()
assert (
    df_transformed_silver_tpm.index
    == meta_silver['sample_id_alias'].to_list()
).all()
assert (
    df_transformed_toden_tpm.index
    == meta_toden['Run'].to_list()
).all()

# Burgos: write the FPKM and TPM AnnData files.
adata_burgos_fpkm = transform_h5ad(
    data=df_transformed_burgos_fpkm,
    meta=meta_burgos,
    save_path="../datasets_h5ad/fpkm/burgos_dbgap_fpkm.h5ad",
)
adata_burgos_tpm = transform_h5ad(
    data=df_transformed_burgos_tpm,
    meta=meta_burgos,
    save_path="../datasets_h5ad/tpm/burgos_dbgap_tpm.h5ad",
)

# Silver-seq: write the FPKM and TPM AnnData files.
adata_silver_fpkm = transform_h5ad(
    data=df_transformed_silver_fpkm,
    meta=meta_silver,
    save_path="../datasets_h5ad/fpkm/silver_seq_fpkm.h5ad",
)
adata_silver_tpm = transform_h5ad(
    data=df_transformed_silver_tpm,
    meta=meta_silver,
    save_path="../datasets_h5ad/tpm/silver_seq_tpm.h5ad",
)

# Toden: write the FPKM and TPM AnnData files.
adata_toden_fpkm = transform_h5ad(
    data=df_transformed_toden_fpkm,
    meta=meta_toden,
    save_path="../datasets_h5ad/fpkm/toden_fpkm.h5ad",
)
adata_toden_tpm = transform_h5ad(
    data=df_transformed_toden_tpm,
    meta=meta_toden,
    save_path="../datasets_h5ad/tpm/toden_tpm.h5ad",
)

... storing 'submitted_subject_id' as categorical
... storing 'ApoE' as categorical
... storing 'ClinicalDXSummary' as categorical
... storing 'Control' as categorical
... storing 'AD' as categorical
... storing 'PD' as categorical
... storing 'DLB' as categorical
... storing 'VAD' as categorical
... storing 'PSP' as categorical
... storing 'Plaque.density' as categorical
... storing 'Braak.score' as categorical
... storing 'NIA.R' as categorical
... storing 'LB.Stage' as categorical
... storing 'sn_depigmentation' as categorical
... storing 'apoe_carrier' as categorical
... storing 'apoe_dose' as categorical
... storing 'donor_id_alias' as categorical
... storing 'donor_status_score' as categorical
... storing 'donor_group' as categorical
... storing 'apoe_carrier' as categorical
... storing 'apoe_dose' as categorical
... storing 'Tissue' as categorical
... storing 'Ethnicity' as categorical
... storing 'Gender' as categorical
... storing 'Center' as categorical
... storing 'Disease' 

## TODO: Make a combined h5ad for all the datasets

In [None]:
# TODO: Concatenate the dataframes, then concatenate the metadata.