In [2]:
# Load libraries used throughout the notebook
import anndata as ad
import numpy as np
import pandas as pd
import umap
from sklearn.decomposition import PCA
# NOTE(review): numpy is already imported above; this second import is redundant but harmless.
import numpy as np
import matplotlib.pyplot as plt

# Random seed; passed as `random_state` to PCA further down in the notebook.
seed = 33


  from .autonotebook import tqdm as notebook_tqdm


In [18]:
# Preprocessing: load the AnnData object that the rest of the notebook works on
adata = ad.read_h5ad('../data/raw/processed_data.h5ad')


# Completeness of the raw 'age' annotation before any parsing
n_samples_total = len(adata.obs)
n_age_present = adata.obs['age'].notna().sum()
age_present_fraction = (n_age_present / n_samples_total).round(2)

print(f"complete age percentage: {age_present_fraction}")
print(f"complete ages {n_age_present} / {n_samples_total}")


# Convert the free-text 'age' annotation to a numeric age in years

# Ages outside [0, MAX_PLAUSIBLE_AGE_YEARS] are treated as unrealistic and dropped.
MAX_PLAUSIBLE_AGE_YEARS = 120

# Substrings marking an entry as carrying no usable age information.
INVALID_AGE_MARKERS = ('not', 'unspecified', 'restricted', 'unknown')

# Unit keyword -> number of such units per year. Checked in this order, so an
# entry mentioning several units is interpreted using the first match.
AGE_UNITS_PER_YEAR = {'week': 52, 'day': 365, 'month': 12, 'year': 1}


def parse_age_years(age_val):
    """Convert one raw age annotation to an age in years.

    The value is stringified, stripped and lower-cased, then interpreted as:

    * missing value, or text containing one of ``INVALID_AGE_MARKERS`` -> NaN
    * text starting with ``-`` (negative age) -> NaN
    * ``>=N`` -> parsed as ``N`` (the stated lower bound), e.g. ``>=100`` -> 100
    * ``A-B`` -> midpoint of the two numbers
    * ``N weeks`` / ``N days`` / ``N months`` / ``N years`` -> converted to years
    * bare number -> used as is

    Parameters
    ----------
    age_val : object
        A single entry of the raw ``age`` column (string, number or missing).

    Returns
    -------
    float
        Age in years, or ``np.nan`` when the entry cannot be parsed or the
        result lies outside ``[0, MAX_PLAUSIBLE_AGE_YEARS]``.
    """
    if pd.isna(age_val):
        return np.nan

    age_str = str(age_val).strip().lower()

    # Skip entries that explicitly carry no age information
    if any(marker in age_str for marker in INVALID_AGE_MARKERS):
        return np.nan

    # Negative ages are invalid
    if age_str.startswith('-'):
        return np.nan

    # Open-ended bound such as '>=100': use the stated bound itself instead of
    # a hard-coded 100, and let the remainder go through the normal parsing
    # and plausibility check.
    if age_str.startswith('>='):
        return parse_age_years(age_str[2:])

    try:
        if '-' in age_str and age_str[0].isdigit():
            # Range 'A-B' -> midpoint
            parts = age_str.split('-')
            years = (float(parts[0]) + float(parts[1])) / 2
        else:
            for unit, units_per_year in AGE_UNITS_PER_YEAR.items():
                if unit in age_str:
                    # e.g. '52 weeks' -> 1 year, '29 days' -> 0.08 years
                    years = float(age_str.split()[0]) / units_per_year
                    break
            else:
                # No unit keyword found: plain number of years
                years = float(age_str)
    except ValueError:
        # Only conversion failures mean "unparseable"; anything else should
        # surface instead of being silently turned into NaN.
        return np.nan

    # Filter unrealistic ages, whichever format they were written in
    if not 0 <= years <= MAX_PLAUSIBLE_AGE_YEARS:
        return np.nan
    return years


age_numeric = [parse_age_years(age_val) for age_val in adata.obs['age']]

# Save to adata
adata.obs['age_numeric'] = age_numeric

# Bucket the numeric ages into labelled age groups
def _age_group(age):
    """Return the age-group label for an age in years ('Unknown' if missing)."""
    if pd.isna(age):
        return 'Unknown'
    # (exclusive upper bound in years, label), in ascending order
    thresholds = ((1, '0-1'), (18, '1-18'), (30, '18-30'), (50, '30-50'), (70, '50-70'))
    for upper_bound, label in thresholds:
        if age < upper_bound:
            return label
    return '70+'


age_category = [_age_group(age) for age in adata.obs['age_numeric']]

adata.obs['age_category'] = age_category

# Report how many ages could be parsed and how they are distributed
n_parsed = adata.obs['age_numeric'].notna().sum()
n_total = len(adata.obs)
print(f"Successfully parsed: {n_parsed} / {n_total} samples")

age_stats = adata.obs['age_numeric'].describe()
print("\nAge statistics:")
print(age_stats)

category_counts = adata.obs['age_category'].value_counts().sort_index()
print("\nAge category distribution:")
print(category_counts)


# Write the annotated AnnData object to disk
adata.write_h5ad('../data/processed/adjusted_data.h5ad')


complete age percentage: 0.27
complete ages 45924 / 168464
Successfully parsed: 38369 / 168464 samples

Age statistics:
count    38369.000000
mean        40.026807
std         25.784215
min          0.000000
25%         20.000000
50%         41.000000
75%         61.000000
max        120.000000
Name: age_numeric, dtype: float64

Age category distribution:
age_category
0-1          2452
1-18         6456
18-30        5858
30-50        7494
50-70       11168
70+          4941
Unknown    130095
Name: count, dtype: int64


In [None]:
# Overview / exploration of the processed data

print("--- AnnData Summary ---")
print(adata)
print(f"Shape {adata.shape}")


# Value counts for every column of the observation table
obs_table = adata.obs
for column_name in obs_table.columns:
    counts = obs_table[column_name].value_counts()
    print(f"\n{column_name} Variable Summary:")
    print(counts)




--- AnnData Summary ---
AnnData object with n_obs × n_vars = 168464 × 4680
    obs: 'srs', 'project', 'srr', 'library_strategy', 'library_source', 'pubdate', 'total_bases', 'instrument', 'geo_loc_name', 'iso', 'region', 'sex', 'age', 'bmi_cat', 'IBD', 'diabetes', 'age_numeric', 'age_category'
    var: 'Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus'
Shape (168464, 4680)

srs Variable Summary:
srs
SRS584960    1
SRS586060    1
SRS585875    1
SRS587237    1
SRS587306    1
            ..
DRS193781    1
DRS193782    1
DRS193783    1
DRS193784    1
DRS193785    1
Name: count, Length: 168464, dtype: int64

project Variable Summary:
project
PRJEB11419     4850
PRJNA729511    3941
PRJNA545312    3690
PRJEB27068     3546
PRJNA607574    3524
               ... 
PRJNA630848      50
PRJEB27978       50
PRJNA659245      50
PRJNA380011      50
PRJEB5714        50
Name: count, Length: 482, dtype: int64

srr Variable Summary:
srr
SRR1211167    1
SRR1212801    1
SRR1212600    1
SRR1214300    1
S

In [None]:
# PCA on the densified feature matrix

# PCA is fitted on a dense array, so adata.X is converted with .toarray() first
dense_data = adata.X.toarray()

# Fit a 50-component PCA, project the samples and store the embedding on adata
pca = PCA(n_components=50, random_state=seed)
X_pca = pca.fit_transform(dense_data)
adata.obsm['X_pca'] = X_pca


# Explained variance: first five components, then the cumulative curve
variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(variance_ratio)
print(f"Explained variance ratio: {variance_ratio[:5]}")
print(f"Cumulative variance: {cumulative_variance}")

Explained variance ratio: [0.28446061 0.09184872 0.07719532 0.0608394  0.04618895]
Cumulative variance: [0.28446061 0.37630933 0.45350465 0.51434406 0.56053301 0.60297684
 0.64078244 0.67073922 0.69317221 0.71195218 0.72788526 0.74320878
 0.75604712 0.76795629 0.77887877 0.78882801 0.7984284  0.80761483
 0.81634343 0.82353444 0.830041   0.83618516 0.84209281 0.84788756
 0.85355757 0.85874066 0.86375531 0.8685564  0.87268212 0.87671812
 0.88058243 0.88426171 0.88767076 0.89087131 0.89404628 0.89690902
 0.89951329 0.9020901  0.90464123 0.90706247 0.90936417 0.91158995
 0.91377027 0.91588775 0.91795245 0.91995784 0.92179692 0.92361043
 0.92531644 0.92700038]
