# Housekeeping

## Library imports

In [None]:
# NOTE(review): `iter_rows` is not referenced anywhere in the visible notebook;
# this looks like an accidental IDE auto-import — confirm before removing.
from click.formatting import iter_rows

# Disabled by default: change `False` to `True` to install requirements.txt
# with the same interpreter that is running this notebook (sys.executable).
if False:
    import sys
    !{sys.executable} -m pip install -r requirements.txt

In [None]:
import pandas as pd

import sklearn
from sklearn import tree
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, classification_report, roc_auc_score, RocCurveDisplay, precision_recall_curve, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

from colorama import Fore, Back, Style

import warnings

import matplotlib.pyplot as plt

## Settings

In [None]:
# Suppress every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")
# Ask scikit-learn transformers to return pandas DataFrames from transform / fit_transform.
sklearn.set_config(transform_output="pandas")
# Print colorama's reset code so no colour/style state stays active in the output.
print(Style.RESET_ALL)

## Data imports
The raw `mpa411.txt` file is tab-separated (TSV) and was manually converted to CSV, because `pd.read_csv` with its default comma separator loaded it as a single column (reading the original file with `sep="\t"` would be an alternative). The first row, containing only "#mpa_vJun23_CHOCOPhlAnSGB_202403", was removed.

In [None]:
# Load the clade-abundance table and the per-sample metadata from the hand-converted CSV files.
data = pd.read_csv('../data/raw/MAI3004_lucki_mpa411.csv')
metadata = pd.read_csv('../data/raw/MAI3004_lucki_metadata_safe.csv')
print(f"Data successfully imported. \n shape of data: {data.shape} \n Shape of metadata: {metadata.shape}")

# Fail fast if either file was parsed with unexpected dimensions
# (e.g. a badly formatted CSV that collapses into a single column).
assert data.shape == (6903, 932), "Data has the wrong shape. Check the CSV formatting."
assert metadata.shape == (930, 6), "Metadata has the wrong shape. Check the CSV formatting."

## Function definitions
| Function Name | Description | Parameters |
|---------------|-------------|------------|


# Data preprocessing

## Merge data and metadata

In [None]:
# Reshape the abundance table to one row per sample and join it with the metadata.

# Sample columns in `data` are the ones prefixed with "mpa411_".
sample_cols = [col for col in data.columns if col.startswith("mpa411_")]

# `data` has one row per clade and one column per sample; transposing yields
# one row per sample with one column per clade. The former column labels
# (the sample ids) end up in the index, which is turned into a 'sample_id' column.
sample_abundances = (
    data[['clade_name'] + sample_cols]
    .set_index('clade_name')
    .transpose()
    .rename_axis('sample_id')
    .reset_index()
)

# Strip the column prefix so the ids can be matched against metadata['sample_id'].
sample_abundances['sample_id'] = sample_abundances['sample_id'].str.removeprefix('mpa411_')

# Keep only metadata rows that have an abundance profile, then join on sample_id.
metadata_common = metadata[metadata['sample_id'].isin(sample_abundances['sample_id'])].copy()
merged_samples = metadata_common.merge(sample_abundances, on='sample_id', how='inner')

merged_samples.drop(columns=['year_of_birth', 'body_product'], inplace=True)
# YOB and body_product are useless to us, as we do not know the date of sample, and all samples were fecal
# TODO: should we be accounting for sex? Do statistical analysis

print(f"Metadata rows (original): {metadata.shape[0]}")
print(f"Metadata rows with matching samples: {metadata_common.shape[0]}")
# Unmatched rows = all metadata rows minus the matched ones
# (previously this subtracted the matched count from itself and always printed 0).
print(f"Metadata rows without matching samples: {metadata.shape[0] - metadata_common.shape[0]}")
print(f"Merged dataframe shape: {merged_samples.shape}")

In [None]:
# Display the first rows of the merged table as a quick visual check of the join.
merged_samples.head()

## Encoding

## Missing check

In [None]:
# Build a per-column overview of missing values (count and percentage of rows),
# sorted from most to least missing and limited to columns with at least one gap.
n_merged_rows = merged_samples.shape[0]
na_counts = merged_samples.isna().sum()

missing_table = na_counts.to_frame(name="missing_count")
missing_table["missing_percent"] = (
    missing_table["missing_count"] / n_merged_rows * 100
).round(2)

# Move the column labels out of the index into a regular 'column' field.
missing_table = missing_table.reset_index().rename(columns={"index": "column"})
missing_table = missing_table.sort_values("missing_count", ascending=False)
missing_table = missing_table.query("missing_count != 0")

missing_table


## Outlier check

## EDA

In [None]:
# Print the (rows, columns) of the merged table and display its first rows.
print(merged_samples.shape)
merged_samples.head()

In [None]:
# Dataset overview: number of samples, number of distinct family_id values
# (reported as unique children), and total column count.
print("Number of samples:", len(merged_samples))
print("Number of unique children (family_id):", merged_samples["family_id"].nunique())
print("Number of columns (metadata + features):", merged_samples.shape[1])

In [None]:
# Samples per child: count how many rows share each family_id,
# then summarise that distribution (count, mean, quartiles, ...).
samples_per_child = merged_samples["family_id"].value_counts()
samples_per_child.describe()

In [None]:
# Histogram of how many samples each family_id contributes.
samples_per_child.hist(bins=20)
plt.xlabel("Number of samples per child")
plt.ylabel("Number of children")
plt.title("Distribution of samples per child")
plt.show()

In [None]:
# Distribution of age groups; dropna=False keeps missing age groups as their own row.
merged_samples["age_group_at_sample"].value_counts(dropna=False)

In [None]:
# Bar chart of samples per age group. value_counts() is called with its defaults here,
# so missing age groups are excluded and bars are ordered by frequency, not by age.
merged_samples["age_group_at_sample"].value_counts().plot(kind="bar")
plt.title("Distribution of age groups")
plt.ylabel("Number of samples")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Dimensionality and sparsity of the microbiome feature matrix.
# Every column that is not a known metadata field is treated as a feature.
metadata_cols = [
    "sample_id",
    "family_id",
    "sex",
    "body_product",
    "age_group_at_sample",
    "year_of_birth",
]
feature_cols = [name for name in merged_samples.columns if name not in metadata_cols]

X = merged_samples.loc[:, feature_cols]

print("Feature matrix shape:", X.shape)
# Mean of the per-column zero fractions; all columns have the same number of rows,
# so this equals the fraction of zeros over the whole matrix.
zero_fraction = X.eq(0).mean().mean()
print("Overall fraction of zeros:", zero_fraction)

In [None]:
# Number of observed taxa per sample: count the strictly positive entries
# in each row of X, then summarise the distribution.
nonzero_per_sample = (X > 0).sum(axis=1)
nonzero_per_sample.describe()

In [None]:
# Histogram of the per-sample count of non-zero taxa.
nonzero_per_sample.hist(bins=50)
plt.xlabel("Number of non-zero taxa per sample")
plt.ylabel("Number of samples")
plt.title("Non-zero taxa per sample")
plt.show()

In [None]:
# Total abundance per sample (sanity check): sum every feature column
# within each row and summarise the resulting totals.
total_abundance = X.sum(axis=1)
total_abundance.describe()

In [None]:
# Histogram of the per-sample abundance totals.
total_abundance.hist(bins=50)
plt.xlabel("Total abundance per sample")
plt.ylabel("Number of samples")
plt.title("Total microbial abundance per sample")
plt.show()

In [None]:
# Distribution of feature prevalence: for each taxon (column), count the samples
# in which its value is strictly positive, then summarise those counts.
feature_prevalence = (X > 0).sum(axis=0)
feature_prevalence.describe()

In [None]:
# Histogram of taxon prevalence (number of samples in which each taxon is present).
feature_prevalence.hist(bins=50)
plt.xlabel("Number of samples in which taxon is present")
plt.ylabel("Number of taxa")
plt.title("Feature prevalence distribution")
plt.show()

In [None]:
# Distribution of non-zero abundances on a log10 scale.
import numpy as np

# Pull the matrix out once, then keep only strictly positive entries
# (log10 is undefined for zeros).
abundance_matrix = X.to_numpy()
nonzero_values = abundance_matrix[abundance_matrix > 0]
log_abundances = np.log10(nonzero_values)

plt.hist(log_abundances, bins=50)
plt.title("Distribution of non-zero abundances (log10 scale)")
plt.xlabel("log10(abundance)")
plt.ylabel("Frequency")
plt.show()

In [None]:
# PCA visualization: project the samples onto two principal components
# and colour them by age group.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Use a subset of features for speed: the 500 taxa present in the most samples.
prevalence = (X > 0).sum(axis=0)
top_features = prevalence.sort_values(ascending=False).head(500).index

X_sub = X[top_features]

# Scale features to zero mean and unit variance before PCA.
X_scaled = StandardScaler().fit_transform(X_sub)

# PCA
pca = PCA(n_components=2)
# The notebook sets sklearn.set_config(transform_output="pandas"), so fit_transform
# returns a DataFrame here. Positional slicing like X_pca[:, 0] only works on a
# NumPy array, so convert explicitly (a no-op if the output is already an array).
X_pca = np.asarray(pca.fit_transform(X_scaled))

# Plot
age = merged_samples["age_group_at_sample"]
# NOTE(review): pd.factorize assigns integer codes in order of first appearance
# (missing values become -1), so the colour scale is not necessarily in
# chronological age order — confirm whether an explicit ordering is wanted.
age_codes = pd.factorize(age)[0]

plt.figure(figsize=(8,6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=age_codes, cmap="viridis", alpha=0.6)
plt.colorbar(label="Age group (encoded)")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("PCA of samples colored by age group")
plt.show()

print("Explained variance ratio:", pca.explained_variance_ratio_)

### Summary of EDA
The dataset consists of 930 longitudinal samples from 139 children and contains approximately 6900 microbiome features, making it a high-dimensional and highly sparse dataset. Each sample contains on average around 300 detected taxa, while the total abundance per sample is relatively stable, indicating proper normalization. Most taxa are rare and occur only in a small number of samples, while a small subset is highly prevalent. The distribution of non-zero abundances follows a log-normal pattern, which is typical for microbiome data (Lutz et al., 2022). A PCA projection reveals a clear age-related gradient, indicating that microbiome composition changes gradually with age and that age-related structure is strongly embedded in the data.