# General Information

In [1]:
# Third-party dependencies used throughout the notebook.
import anndata as ad
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, issparse

# Record the installed anndata version next to the results.
print(ad.__version__)

0.11.4


In [2]:
# Location of the .h5ad dataset. NOTE(review): this is an absolute, machine-specific
# path; it must be adjusted before the notebook can run anywhere else.
h5ad_path = "C:/Users/Tycho/Desktop/SchoolTU/year3/q4_RP/data/0fce5dd5-bcec-4288-90b3-19a16b45ad16.h5ad"

# Open in backed, read-only mode (the printed summary reports "backed at ...").
adata = ad.read_h5ad(h5ad_path, backed='r')

# Show the object summary: dimensions plus the available annotation columns.
print(adata)

AnnData object with n_obs × n_vars = 1058909 × 36169 backed at 'C:\\Users\\Tycho\\Desktop\\SchoolTU\\year3\\q4_RP\\data\\0fce5dd5-bcec-4288-90b3-19a16b45ad16.h5ad'
    obs: 'reference_genome', 'gene_annotation_version', 'alignment_software', 'intronic_reads_counted', 'library_id', 'assay_ontology_term_id', 'sequenced_fragment', 'cell_number_loaded', 'institute', 'is_primary_data', 'cell_type_ontology_term_id', 'author_cell_type', 'sample_id', 'sample_preservation_method', 'tissue_ontology_term_id', 'development_stage_ontology_term_id', 'sample_collection_method', 'donor_BMI_at_collection', 'tissue_type', 'suspension_derivation_process', 'suspension_enriched_cell_types', 'cell_viability_percentage', 'suspension_uuid', 'suspension_type', 'donor_id', 'self_reported_ethnicity_ontology_term_id', 'donor_living_at_sample_collection', 'organism_ontology_term_id', 'disease_ontology_term_id', 'sex_ontology_term_id', 'Country', 'nCount_RNA', 'nFeature_RNA', 'TCR_VDJdb', 'TCRa_V_gene', 'TCRa_D_gen

In [3]:
# Headline dimensions of the dataset.
print(f"AnnData object with {adata.n_obs} cells and {adata.n_vars} genes\n")

# --- 1. Cell-type composition: one row per label with its number of cells ---
print("=== Cell Types ===")
cell_type_counts = adata.obs['cell_type'].value_counts()
print(cell_type_counts.to_string(), "\n")

# --- 2. Seeded sample of 10 cells, restricted to a few metadata columns ---
print("=== 10 Cells (Observations) ===")
obs_preview_columns = [
    'cell_type',
    'development_stage',
    'tissue',
    'sex',
    'donor_id',
    'nCount_RNA',
    'nFeature_RNA',
]
obs_preview = adata.obs[obs_preview_columns].sample(10, random_state=42)
print(obs_preview.to_string(), "\n")

# --- 3. Seeded sample of 10 genes: first a compact view, then an extended one ---
# Both views are drawn with random_state=42.
print("=== 10 Genes (Variables) ===")
var_compact_columns = ['feature_name', 'feature_biotype', 'feature_length']
var_extended_columns = [
    'feature_is_filtered',
    'feature_name',
    'feature_reference',
    'feature_biotype',
    'feature_length',
    'feature_type',
]
for var_columns in (var_compact_columns, var_extended_columns):
    var_preview = adata.var[var_columns].sample(10, random_state=42)
    print(var_preview.to_string(), "\n")

# --- 4. A few dataset-level summary figures ---
print("=== Key Features ===")
print(f"Number of unique tissues: {adata.obs['tissue'].nunique()}")
print(f"Number of unique donors: {adata.obs['donor_id'].nunique()}")
print(f"Available embeddings: {list(adata.obsm.keys())}")
print(f"Size factors (nCount_RNA) range: {adata.obs['nCount_RNA'].min():.1f}-{adata.obs['nCount_RNA'].max():.1f}")
print(f"Genes detected (nFeature_RNA) range: {adata.obs['nFeature_RNA'].min()}-{adata.obs['nFeature_RNA'].max()}")

AnnData object with 1058909 cells and 36169 genes

=== Cell Types ===
CD14-positive monocyte                                   177896
naive thymus-derived CD4-positive, alpha-beta T cell     143515
CD16-positive, CD56-dim natural killer cell, human       135960
central memory CD4-positive, alpha-beta T cell           111905
naive thymus-derived CD8-positive, alpha-beta T cell      72925
CD8-positive, alpha-beta cytotoxic T cell                 63787
CD14-low, CD16-positive monocyte                          41913
CD8-positive, alpha-beta memory T cell                    39218
naive B cell                                              32515
memory B cell                                             30326
gamma-delta T cell                                        25474
effector memory CD4-positive, alpha-beta T cell           25184
mucosal invariant T cell                                  19837
CD4-positive, alpha-beta T cell                           19329
T cell                            

In [4]:
def _print_unique(heading, column):
    """Print `heading`, then the unique values of `adata.var[column]`.

    Returns the unique values so the caller can reuse them without
    recomputing.
    """
    print(heading)
    values = adata.var[column].unique()
    print(values, "\n")
    return values


# Unique feature biotypes.
_print_unique("=== Unique Feature Biotypes ===", 'feature_biotype')

# Unique feature types: the summary repr abbreviates the list, so each
# value is also printed on its own line.
unique_feature_types = _print_unique("=== Unique Feature Types ===", 'feature_type')
for feature_type in unique_feature_types:
    print(feature_type)
print("\n")

# Unique feature references.
_print_unique("=== Unique Feature References ===", 'feature_reference')

# Unique feature names.
_print_unique("=== Unique Feature Names ===", 'feature_name')

# Unique feature lengths.
_print_unique("=== Unique Feature Lengths ===", 'feature_length')

=== Unique Feature Biotypes ===
['gene']
Categories (1, object): ['gene'] 

=== Unique Feature Types ===
['lncRNA', 'protein_coding', 'transcribed_unprocessed_pseudogene', 'unprocessed_pseudogene', 'processed_pseudogene', ..., 'TR_D_gene', 'unitary_pseudogene', 'TEC', 'IG_D_gene', 'artifact']
Length: 18
Categories (18, object): ['IG_C_gene', 'IG_D_gene', 'IG_J_gene', 'IG_V_gene', ..., 'transcribed_unitary_pseudogene', 'transcribed_unprocessed_pseudogene', 'unitary_pseudogene', 'unprocessed_pseudogene'] 

lncRNA
protein_coding
transcribed_unprocessed_pseudogene
unprocessed_pseudogene
processed_pseudogene
IG_V_gene
IG_C_gene
IG_J_gene
transcribed_unitary_pseudogene
TR_C_gene
TR_J_gene
TR_V_gene
transcribed_processed_pseudogene
TR_D_gene
unitary_pseudogene
TEC
IG_D_gene
artifact


=== Unique Feature References ===
['NCBITaxon:9606']
Categories (1, object): ['NCBITaxon:9606'] 

=== Unique Feature Names ===
['MIR1302-2HG', 'FAM138A', 'OR4F5', 'ENSG00000238009.6', 'ENSG00000239945.1', ..., '

In [5]:
# Per-gene expression frequency: the fraction of cells in which a gene has a
# value > 0 in X. X is read in row chunks so that only `chunk_size` cells are
# held in memory at a time.

# Running count, per gene, of cells with a value > 0.
gene_counts = np.zeros(adata.n_vars)

# Rows (cells) per chunk; adjust based on available RAM.
chunk_size = 10000
n_chunks = int(np.ceil(adata.n_obs / chunk_size))

for i in range(n_chunks):
    start = i * chunk_size
    end = min((i + 1) * chunk_size, adata.n_obs)  # last chunk may be shorter
    chunk_X = adata[start:end].X  # Loads only this chunk into memory

    # `.sum(axis=0)` returns an np.matrix for scipy sparse matrices, but a plain
    # ndarray for scipy sparse arrays and for dense input. `.A1` only exists on
    # np.matrix, so normalise through np.asarray(...).ravel(), which yields a
    # 1-D vector of length n_vars in all three cases.
    gene_counts += np.asarray((chunk_X > 0).sum(axis=0)).ravel()

# Fraction of all cells expressing each gene.
gene_expression_freq = gene_counts / adata.n_obs

# One row per gene, most frequently expressed first. The frame takes its index
# from adata.var['feature_name']; the frequency array is aligned positionally.
gene_stats = pd.DataFrame({
    'gene': adata.var['feature_name'],
    'expression_frequency': gene_expression_freq
}).sort_values('expression_frequency', ascending=False)

In [6]:
# gene_stats is sorted by expression_frequency in descending order, so the
# head holds the most widely expressed genes and the tail the rarest.
print("=== Top 10 Expressed Genes ===")
most_expressed = gene_stats.head(10)
print(most_expressed.to_string(index=False))

print("\n=== Bottom 10 Rare Genes ===")
least_expressed = gene_stats.tail(10)
print(least_expressed.to_string(index=False))

# First 10 genes (in sorted order) expressed in fewer than half of the cells.
print("\n=== 10 Genes with lower than 0.5 Expression ===")
below_half_mask = gene_stats['expression_frequency'] < 0.5
below_half = gene_stats.loc[below_half_mask].head(10)
print(below_half.to_string(index=False))

=== Top 10 Expressed Genes ===
                  gene  expression_frequency
   B2M_ENSG00000166710              0.999775
  ACTB_ENSG00000075624              0.999460
MT-CO1_ENSG00000198804              0.999438
MT-CO2_ENSG00000198712              0.999215
TMSB4X_ENSG00000205542              0.999213
 HLA-B_ENSG00000234745              0.998507
MT-CO3_ENSG00000198938              0.998420
  PTMA_ENSG00000187514              0.997119
                 HLA-C              0.996514
  FTH1_ENSG00000167996              0.996419

=== Bottom 10 Rare Genes ===
             gene  expression_frequency
ENSG00000260959.1                   0.0
ENSG00000277210.3                   0.0
ENSG00000248627.1                   0.0
ENSG00000226218.1                   0.0
ENSG00000250908.1                   0.0
ENSG00000235435.1                   0.0
ENSG00000235279.1                   0.0
          FAM138F                   0.0
ENSG00000244125.1                   0.0
ENSG00000278915.1                   0.0

===