In [2]:
import numpy as np
import h5py
import scanpy as sc
import os
import pandas as pd

In [89]:
f["var"]["gene_ids"]

<HDF5 dataset "gene_ids": shape (36601,), type "|O">

In [91]:
f["obs"]["APOE4 Status"]

<HDF5 dataset "APOE4 Status": shape (1957283,), type "|i1">

<h2>Data Structure

![alt text](anndata.svg "First look at Data-Structure")

<h3>Load data

In [3]:
f = h5py.File("data/SEAAD_MTG_RNAseq_all-nuclei.2022-08-18.h5ad")

In [4]:
f["X"]["indptr"]

<HDF5 dataset "indptr": shape (1957284,), type "<i8">

In [4]:
f["X"]["indices"]

<HDF5 dataset "indices": shape (9579432766,), type "<i8">

In [3]:
f["obs"]['APOE4 Status']

<HDF5 dataset "APOE4 Status": shape (1957283,), type "|i1">

<h3>List important markers

In [12]:
gene_ids = list(map(str.lower, np.array(f["var"]["gene_ids"]).astype('U13')))
gene_ids = list(np.array(f["var"]["gene_ids"]).astype('U13'))

<h4>CSF</h4>
Cerebrospinal fluid (CSF) biomarkers: Certain proteins found in cerebrospinal fluid, such as amyloid beta 42 (Aβ42), total tau (T-tau), and phosphorylated tau (P-tau), can serve as biomarkers for Alzheimer's disease. Decreased levels of Aβ42 and increased levels of T-tau and P-tau are associated with the presence of amyloid plaques and neurodegeneration.

In [13]:
search_marker = "CSF"
res = [i for i in gene_ids if search_marker.lower() in i.lower()]
res

['CSF3R',
 'CSF1',
 'CSF2',
 'CSF1R',
 'ACSF3',
 'CSF3',
 'ACSF2',
 'CSF2RB',
 'CSF2RA']

<h4>PET</h4>
Amyloid PET imaging: Positron Emission Tomography (PET) scans using specific radiotracers can detect and measure the accumulation of beta amyloid plaques in the brain. These scans can provide valuable information about the presence and extent of amyloid pathology.

In [14]:
search_marker = "PET"
res = [i for i in gene_ids if search_marker.lower() in i.lower()]
res

['PET100', 'PET117']

<h4>APOE</h4>
APOE ε4 allele: The APOE gene has different alleles, and the APOE ε4 allele is the most well-known genetic risk factor for late-onset Alzheimer's disease. Carrying one or two copies of the APOE ε4 allele increases the risk of developing the disease. APOE ε4 is associated with increased deposition of beta amyloid in the brain.

In [15]:
search_marker = "APOE"
res = [i for i in gene_ids if search_marker.lower() in i.lower()]
res

['APOE']

<h4>APP</h4>
Amyloid Precursor Protein (APP): APP is the precursor protein from which alpha and beta amyloid are derived. Mutations in the APP gene can lead to an increased production of beta amyloid and are associated with early-onset familial Alzheimer's disease.

In [16]:
search_marker = "APP"
res = [i for i in gene_ids if search_marker.lower() in i.lower()]
res

['TRAPPC3',
 'PAPPA2',
 'TRAPPC12',
 'TRAPPC12-AS1',
 'APPL1',
 'DAPP1',
 'TRAPPC11',
 'TRAPPC13',
 'TRAPPC3L',
 'TRAPPC9',
 'PAPPA',
 'PAPPA-AS2',
 'PAPPA-AS1',
 'TRAPPC4',
 'IAPP',
 'APPL2',
 'EAPP',
 'TRAPPC6B',
 'TRAPPC2L',
 'TRAPPC1',
 'APPBP2',
 'TRAPPC8',
 'TRAPPC5',
 'TRAPPC6A',
 'TRAPPC2B',
 'APP',
 'TRAPPC10',
 'TRAPPC2']

so app is also contained

Inflammatory markers: Chronic inflammation has been implicated in Alzheimer's disease. Biomarkers such as C-reactive protein (CRP), interleukin-6 (IL-6), and tumor necrosis factor-alpha (TNF-α) have been associated with inflammation and neurodegeneration in AD.

In [17]:
search_marker = "TNF"
res = [i for i in gene_ids if search_marker.lower() in i.lower()]
res

['TNFRSF18',
 'TNFRSF4',
 'C1QTNF12',
 'TNFRSF14-AS1',
 'TNFRSF14',
 'TNFRSF25',
 'TNFRSF9',
 'TNFRSF8',
 'TNFRSF1B',
 'TNFAIP8L2',
 'TNFSF18',
 'TNFSF4',
 'TNFAIP6',
 'TNFSF10',
 'C1QTNF7',
 'C1QTNF3',
 'TNFAIP8',
 'C1QTNF2',
 'TNF',
 'TNFRSF21',
 'TNFAIP3',
 'TNFRSF10B',
 'TNFRSF10C',
 'TNFRSF10D',
 'TNFRSF10A-AS1',
 'TNFRSF10A',
 'TNFRSF11B',
 'TNFSF15',
 'TNFSF8',
 'C1QTNF4',
 'C1QTNF5',
 'TNFRSF1A',
 'TNFRSF19',
 'C1QTNF9B',
 'C1QTNF9',
 'C1QTNF9-AS1',
 'TNFSF11',
 'TNFSF13B',
 'TNFAIP2',
 'TNFAIP8L3',
 'C1QTNF8',
 'TNFRSF12A',
 'TNFRSF17',
 'TNFSF12',
 'TNFSF13',
 'TNFRSF13B',
 'TNFAIP1',
 'C1QTNF1-AS1',
 'C1QTNF1',
 'TNFRSF11A',
 'TNFAIP8L1',
 'TNFSF9',
 'TNFSF14',
 'TNFRSF6B',
 'C1QTNF6',
 'TNFRSF13C']

In [92]:
search_marker = "KCNB1"
res = [i for i in gene_ids if search_marker.lower() in i.lower()]
res

['KCNB1']

In [36]:
for i in f["obs"].keys():
    print(i)

APOE4 Status
ATAC_Confidently_mapped_read_pairs
ATAC_Fraction_of_genome_in_peaks
ATAC_Fraction_of_high_quality_fragments_in_cells
ATAC_Fraction_of_high_quality_fragments_overlapping_TSS
ATAC_Fraction_of_high_quality_fragments_overlapping_peaks
ATAC_Fraction_of_transposition_events_in_peaks_in_cells
ATAC_Mean_raw_read_pairs_per_cell
ATAC_Median_high_quality_fragments_per_cell
ATAC_Non-nuclear_read_pairs
ATAC_Number_of_peaks
ATAC_Percent_duplicates
ATAC_Q30_bases_in_barcode
ATAC_Q30_bases_in_read_1
ATAC_Q30_bases_in_read_2
ATAC_Q30_bases_in_sample_index_i1
ATAC_Sequenced_read_pairs
ATAC_TSS_enrichment_score
ATAC_Unmapped_read_pairs
ATAC_Valid_barcodes
Age at Death
Arteriolosclerosis
Atherosclerosis
Braak
Brain Region
Brain pH
CERAD score
Class
Class confidence
Cognitive Status
Donor ID
Doublet score
Fraction mitochondrial UMIs
Fresh Brain Weight
GEX_Estimated_number_of_cells
GEX_Fraction_of_transcriptomic_reads_in_cells
GEX_Mean_raw_reads_per_cell
GEX_Median_UMI_counts_per_cell
GEX_Media

In [55]:
cog_status = np.array(f["obs"]["Cognitive Status"])
print(len(cog_status[cog_status == 0]))
print(len(cog_status[cog_status == 1]))#reference
print(len(cog_status[cog_status == 2]))

141703
1004945
810635


In [57]:
np.unique(f["obs"]["Donor ID"])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88], dtype=int8)

In [60]:
len(f["obs"]["Supertype"])

1957283

In [72]:
df = pd.DataFrame(columns = f["obs"].keys())

In [76]:
np.array(f["obs"]["Hispanic"])

array(['Latino'], dtype='<U6')

In [81]:
df = pd.DataFrame()
for i in f["obs"].keys():
    print(i)
    if i not in ["Hispanic", "Race (choice=American Indian", "Race (choice=Black", "__categories"]:
        df[i] = np.array(f["obs"][i])

APOE4 Status
ATAC_Confidently_mapped_read_pairs
ATAC_Fraction_of_genome_in_peaks
ATAC_Fraction_of_high_quality_fragments_in_cells
ATAC_Fraction_of_high_quality_fragments_overlapping_TSS
ATAC_Fraction_of_high_quality_fragments_overlapping_peaks
ATAC_Fraction_of_transposition_events_in_peaks_in_cells
ATAC_Mean_raw_read_pairs_per_cell
ATAC_Median_high_quality_fragments_per_cell
ATAC_Non-nuclear_read_pairs
ATAC_Number_of_peaks
ATAC_Percent_duplicates
ATAC_Q30_bases_in_barcode
ATAC_Q30_bases_in_read_1
ATAC_Q30_bases_in_read_2
ATAC_Q30_bases_in_sample_index_i1
ATAC_Sequenced_read_pairs
ATAC_TSS_enrichment_score
ATAC_Unmapped_read_pairs
ATAC_Valid_barcodes
Age at Death
Arteriolosclerosis
Atherosclerosis
Braak
Brain Region
Brain pH
CERAD score
Class
Class confidence
Cognitive Status
Donor ID
Doublet score
Fraction mitochondrial UMIs
Fresh Brain Weight
GEX_Estimated_number_of_cells
GEX_Fraction_of_transcriptomic_reads_in_cells
GEX_Mean_raw_reads_per_cell
GEX_Median_UMI_counts_per_cell
GEX_Media

  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])


ar_id
avg_size_bp


  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])


batch_vendor_name
bc
cell_prep_type
exp_component_name


  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])


exp_component_vendor_name
expc_cell_capture
experiment_component_failed
facs_population_plan
library_input_ng
library_prep
library_prep_pass_fail
load_name


  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])


method
pcr_cycles
percent_cdna_longer_than_400bp
quantification_fmol
r1_index
rna_amplification
rna_amplification_pass_fail
sample_id
sample_name
sample_quantity_count
specify other race


  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])
  df[i] = np.array(f["obs"][i])


In [9]:
np.unique(f["obs"]["Supertype"])

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147], dtype=int16)

In [86]:
#df.to_csv("obs_all-nuclei.2022-08-18.csv")