<a href="https://colab.research.google.com/github/Sebukpor/Microbiome-Cytokine-Interactions/blob/main/cytokine_microbial_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install scikit-bio shap umap-learn --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.2/58.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
"""
Comprehensive analysis pipeline for microbiome-cytokine datasets, focusing on multiple sample types: stool, gut, mouth, nasal.
Includes diversity calculations, PERMANOVA, correlation analysis, differential abundance,
machine learning, and network analysis.
"""
import os
os.environ["SCIPY_ARRAY_API"] = "1" # Enable SciPy array API for SMOTE compatibility
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr, mannwhitneyu
from scipy.spatial.distance import pdist, squareform
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.manifold import TSNE
from imblearn.over_sampling import SMOTE
import networkx as nx
from statsmodels.stats.multitest import multipletests
from skbio.stats.distance import permanova, permdisp
from skbio import DistanceMatrix
from skbio.stats.ordination import pcoa
from skbio.stats.composition import clr, multi_replace
import warnings
from datetime import datetime, timezone
import json
warnings.filterwarnings('ignore') # Suppress warnings for cleaner output
# ----------------------- CONFIG -----------------------
# Input table read by pd.read_csv below (Colab path; note the spaces in the file name).
DATA_PATH = '/content/merged well cytokine-microbiome.csv'
# Column holding the per-sample identifier; used for de-duplication and as the DataFrame index.
ID_COL = 'SampleID'
# Root folder for every artefact written by the pipeline. Created eagerly so
# later savefig/to_csv/open calls cannot fail on a missing directory.
OUTPUT_DIR = '/content/analysis_outputs'
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Single seed passed explicitly to SMOTE, train_test_split, RandomForestClassifier and TSNE.
RANDOM_STATE = 42
# Also seed NumPy's global RNG.
# NOTE(review): presumably this is what makes the skbio permutation tests repeatable - confirm.
np.random.seed(RANDOM_STATE)

In [None]:
# Logging function
def log(msg):
    """Print *msg* to stdout, prefixed with the current UTC time in ISO-8601 form."""
    stamp = datetime.now(timezone.utc).isoformat()
    print(f"[{stamp}] {msg}")

# Load the dataset
log("Loading data...")
df = pd.read_csv(DATA_PATH)

# Exclude summary rows (e.g., geomean) BEFORE de-duplicating: otherwise a
# summary row that shares a SampleID with a real sample could be the "first"
# occurrence, survive de-duplication and then take the real sample with it.
df = df[df['Plate'] != 'geomean']

# Drop duplicates based on SampleID, and only claim duplicates when there are any
duplicate_mask = df.duplicated(subset=ID_COL, keep='first')
if duplicate_mask.any():
    log(f"Duplicate SampleIDs found ({int(duplicate_mask.sum())} rows). Dropping duplicate rows, keeping first occurrence.")
    df = df[~duplicate_mask]
else:
    log("No duplicate SampleIDs found.")
# Work on an independent copy so the column assignments below never write into a view
df = df.copy()
log(f"Data shape: {df.shape}")

# Identify columns. The layout is positional: microbes run from 'Abyssalbus'
# up to (not including) 'Plate', cytokines from 'IL17F' up to (not including)
# 'CollectionDate', and everything from 'CollectionDate' onwards is metadata.
columns = df.columns.tolist()
microbe_start = columns.index('Abyssalbus')
microbe_end = columns.index('Plate')
microbe_cols = columns[microbe_start:microbe_end]
cytokine_start = columns.index('IL17F')
cytokine_end = columns.index('CollectionDate')
cytokine_cols = columns[cytokine_start:cytokine_end]
metadata_cols = columns[cytokine_end:]
log(f"Detected microbe columns: {len(microbe_cols)}; cytokines: {len(cytokine_cols)}; metadata: {len(metadata_cols)}")

# Convert to numeric and handle NaNs (unparseable or missing values become 0)
df[microbe_cols] = df[microbe_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
df[cytokine_cols] = df[cytokine_cols].apply(pd.to_numeric, errors='coerce').fillna(0)

# Set index
df = df.set_index(ID_COL)

# Samples without any microbial signal have no composition: their relative
# abundances would be an all-zero row, for which CLR is undefined.
# NOTE(review): skbio's closure (called by multi_replace) appears to reject
# all-zero rows - confirm against the installed version. Dropping them here
# keeps the run alive instead of aborting on the transform.
microbe_totals = df[microbe_cols].sum(axis=1)
has_counts = (microbe_totals > 0).values
if not has_counts.all():
    log(f"Dropping {int((~has_counts).sum())} samples with no microbial counts (CLR undefined).")
    df = df[has_counts].copy()
    microbe_totals = microbe_totals[has_counts]

# Compute relative abundances and CLR transform for all
log("Performing relative abundance and CLR transforms (with multi_replace) on full dataset.")
rel_microbe_df = df[microbe_cols].div(microbe_totals, axis=0).replace([np.inf, -np.inf], 0).fillna(0)
clr_microbe_df = pd.DataFrame(
    clr(multi_replace(rel_microbe_df.values)),
    index=rel_microbe_df.index,
    columns=rel_microbe_df.columns
)

# Get unique sample types (a missing SampleType is not a type of its own)
sample_types = df['SampleType'].dropna().unique()
log(f"Detected sample types: {sample_types}")

# Alpha diversity functions
def shannon_diversity(row):
    """Return the Shannon index (natural log) of one sample's abundance profile.

    *row* is a pandas Series of abundances. Only strictly positive entries
    contribute; they are re-normalised to sum to 1 first. A profile with no
    positive entries scores 0.
    """
    positive = row[row > 0]
    total = positive.sum()
    if positive.empty or total == 0:
        return 0
    proportions = positive / total
    return -(proportions * np.log(proportions)).sum()
def simpson_diversity(row):
    """Return the Simpson index (1 - sum of squared proportions) of one sample.

    *row* is a pandas Series of abundances. Only strictly positive entries
    contribute; they are re-normalised to sum to 1 first. A profile with no
    positive entries scores 0.
    """
    positive = row[row > 0]
    total = positive.sum()
    if positive.empty or total == 0:
        return 0
    proportions = positive / total
    return 1 - proportions.pow(2).sum()

def _fdr_bh(pvalues):
    """Benjamini-Hochberg adjust *pvalues*, tolerating empty and NaN input.

    Returns a float array of the same length. Only finite p-values take part
    in the correction; non-finite inputs (e.g. Spearman on a constant column)
    stay NaN. Feeding NaNs or an empty list straight to ``multipletests`` is
    avoided because a NaN can contaminate the whole adjusted vector and an
    empty family is not a valid input.
    """
    pvalues = np.asarray(pvalues, dtype=float)
    adjusted = np.full(pvalues.shape, np.nan)
    testable = np.isfinite(pvalues)
    if testable.any():
        adjusted[testable] = multipletests(pvalues[testable], method='fdr_bh')[1]
    return adjusted


# Loop over each sample type. Every iteration writes its artefacts into
# OUTPUT_DIR/<sample type, lower-cased>/.
for sample_type in sample_types:
    log(f"Processing sample type: {sample_type}")
    type_df = df[df['SampleType'] == sample_type].copy()
    if type_df.empty:
        log(f"No samples for {sample_type}. Skipping.")
        continue
    rel_type_microbe = rel_microbe_df.loc[type_df.index].copy()
    clr_type_microbe = clr_microbe_df.loc[type_df.index].copy()

    # Alpha diversity calculations
    log(f"Computing alpha diversity for {sample_type}...")
    diversity_df = pd.DataFrame({
        'Shannon_Diversity': rel_type_microbe.apply(shannon_diversity, axis=1),
        'Simpson_Diversity': rel_type_microbe.apply(simpson_diversity, axis=1)
    }, index=type_df.index)
    type_df = pd.concat([type_df, diversity_df], axis=1)

    # Create type-specific output dir
    type_output_dir = os.path.join(OUTPUT_DIR, str(sample_type).lower())
    os.makedirs(type_output_dir, exist_ok=True)

    # Visualize diversity by health status
    log(f"Visualizing Shannon diversity by health status for {sample_type}...")
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='CL4', y='Shannon_Diversity', data=type_df)
    plt.title(f'Shannon Diversity by Health Status ({sample_type} Samples)')
    plt.xticks(rotation=45)
    plt.savefig(os.path.join(type_output_dir, 'shannon_by_status.png'))
    plt.close()

    # Visualize class distribution
    log(f"Visualizing class distribution for {sample_type}...")
    plt.figure(figsize=(10, 6))
    class_counts = type_df['CL4'].value_counts()
    class_counts.plot(kind='bar')
    plt.title(f'Class Distribution in CL4 ({sample_type} Samples)')
    plt.xlabel('Health Status')
    plt.ylabel('Sample Count')
    plt.xticks(rotation=45)
    plt.savefig(os.path.join(type_output_dir, 'class_distribution.png'))
    plt.close()

    # PERMANOVA and PERMDISP with Aitchison distance
    # (Aitchison distance == Euclidean distance between CLR-transformed profiles)
    log(f"Distance metric: aitchison for {sample_type}")
    log(f"Computing distance matrix for {sample_type}...")
    dist_array = pdist(clr_type_microbe.values, metric='euclidean')
    dm = DistanceMatrix(squareform(dist_array), ids=clr_type_microbe.index.tolist())
    log(f"Using group_col for PERMANOVA: CL4 for {sample_type}")
    log(f"Running PERMANOVA for {sample_type}...")
    common_ids = [id_ for id_ in dm.ids if id_ in type_df.index]
    if not common_ids:
        log(f"No common IDs for {sample_type}. Skipping PERMANOVA and PERMDISP.")
        permanova_result = {'test statistic': np.nan, 'p-value': np.nan}
        permdisp_result = {'test statistic': np.nan, 'p-value': np.nan}
    else:
        dm_filtered = dm.filter(common_ids)
        grouping = type_df.loc[common_ids, 'CL4'].astype(str)
        permanova_ran = len(grouping) >= 2 and len(grouping.unique()) >= 2
        if permanova_ran:
            permanova_result = permanova(dm_filtered, grouping=grouping, permutations=999)
        else:
            log(f"Insufficient samples or groups for PERMANOVA in {sample_type}. Skipping.")
            permanova_result = {'test statistic': np.nan, 'p-value': np.nan}
        # The JSON is written either way (NaN placeholders when skipped) so
        # every sample-type folder has the same set of files.
        with open(os.path.join(type_output_dir, 'permanova_result.json'), 'w') as f:
            json.dump({
                'test_statistic': float(permanova_result['test statistic']),
                'p_value': float(permanova_result['p-value']),
                'permutations': 999
            }, f, indent=2)
        if permanova_ran:
            # Only announce completion when the test really ran
            log(f"PERMANOVA complete for {sample_type}. Result saved.")
        log(f"Running PERMDISP for {sample_type}...")
        pcoa_result = pcoa(dm_filtered)
        coords = pcoa_result.samples
        coords['CL4'] = type_df.loc[coords.index, 'CL4']  # Ensure index alignment
        centroid_dfs = []
        for g, sub in coords.groupby('CL4'):
            if sub.shape[0] < 2:  # Skip groups with insufficient samples
                continue
            # 'CL4' was appended last, so [:, :-1] selects the ordination axes only
            arr = sub.iloc[:, :-1].values
            center = arr.mean(axis=0)
            dists = np.linalg.norm(arr - center[np.newaxis, :], axis=1)
            centroid_dfs.append(pd.DataFrame({'group': [g] * len(dists), 'distance': dists}))
        if not centroid_dfs:
            log(f"No valid groups for PERMDISP in {sample_type}. Skipping.")
            centroid_df = pd.DataFrame(columns=['group', 'distance'])
            permdisp_result = {'test statistic': np.nan, 'p-value': np.nan}
        else:
            centroid_df = pd.concat(centroid_dfs, ignore_index=True)
            if len(grouping.unique()) < 2:
                log(f"Insufficient groups for PERMDISP in {sample_type}. Skipping.")
                permdisp_result = {'test statistic': np.nan, 'p-value': np.nan}
            else:
                permdisp_result = permdisp(dm_filtered, grouping=grouping, permutations=999)
        centroid_df.to_csv(os.path.join(type_output_dir, 'permdisp_distances.csv'), index=False)
        with open(os.path.join(type_output_dir, 'permdisp_result.json'), 'w') as f:
            json.dump({
                'test_statistic': float(permdisp_result['test statistic']),
                'p_value': float(permdisp_result['p-value']),
                'permutations': 999
            }, f, indent=2)
        log(f"PERMDISP results saved for {sample_type}.")

    # Correlation analysis (Spearman, CLR abundance vs cytokine level)
    log(f"Performing correlation analysis for {sample_type}...")
    mean_abundance = rel_type_microbe[microbe_cols].mean()
    abundant_microbes = mean_abundance.index[mean_abundance > 0.01].tolist()
    # The filter is on mean relative abundance, not on prevalence
    log(f"Keeping {len(abundant_microbes)} taxa (mean relative abundance > 0.01) out of {len(microbe_cols)} for {sample_type}")
    corr_records = []
    for m in abundant_microbes:
        microbe_values = clr_type_microbe[m]
        for c in cytokine_cols:
            r, p = spearmanr(microbe_values, type_df[c], nan_policy='omit')
            corr_records.append((m, c, r, p))
    corr_df = pd.DataFrame(corr_records, columns=['Microbe', 'Cytokine', 'Correlation', 'P-value'])
    # NaN-safe / empty-safe FDR: untestable pairs stay NaN and are never "significant"
    corr_df['Corrected_P-value'] = _fdr_bh(corr_df['P-value'])
    significant_corrs = corr_df[corr_df['Corrected_P-value'] < 0.05]
    significant_corrs.to_csv(os.path.join(type_output_dir, 'significant_correlations.csv'), index=False)
    log(f"Significant correlations saved for {sample_type}.")

    # Heatmap of significant correlations
    if not significant_corrs.empty:
        log(f"Generating correlation heatmap for {sample_type}...")
        pivot_corr = significant_corrs.pivot(index='Microbe', columns='Cytokine', values='Correlation').fillna(0)
        plt.figure(figsize=(12, 10))
        sns.heatmap(pivot_corr, cmap='coolwarm', annot=True, fmt='.2f')
        plt.title(f'Significant Spearman Correlations between Microbes and Cytokines ({sample_type})')
        plt.savefig(os.path.join(type_output_dir, 'corr_heatmap.png'))
        plt.close()

    # Differential abundance analysis (Healthy vs Infection)
    # NOTE(review): the test runs on the microbe columns as stored in df (after
    # numeric coercion), not on relative or CLR abundances - confirm that is intended.
    log(f"Performing differential abundance analysis for {sample_type}...")
    healthy_type = type_df[type_df['CL4'] == 'Healthy']
    infection_type = type_df[type_df['CL4'] == 'Infection']
    diff_records = []
    if healthy_type.empty or infection_type.empty:
        # Mann-Whitney U needs observations in both groups
        log(f"Healthy and/or Infection group missing for {sample_type}. No differential abundance tests run.")
    else:
        for m in abundant_microbes:
            healthy_values = healthy_type[m]
            infection_values = infection_type[m]
            if healthy_values.sum() > 0 or infection_values.sum() > 0:
                _, p = mannwhitneyu(healthy_values, infection_values, alternative='two-sided')
                diff_records.append((m, healthy_values.mean() - infection_values.mean(), p))
    diff_df = pd.DataFrame(diff_records, columns=['Microbe', 'Mean_Diff', 'P-value'])
    diff_df['Corrected_P-value'] = _fdr_bh(diff_df['P-value'])
    significant_diff = diff_df[diff_df['Corrected_P-value'] < 0.05].sort_values('Corrected_P-value')
    significant_diff.to_csv(os.path.join(type_output_dir, 'differential_abundance.csv'), index=False)
    log(f"Differential abundance results saved for {sample_type}.")

    # Visualize top differentially abundant microbes
    if not significant_diff.empty:
        log(f"Visualizing top differentially abundant microbes for {sample_type}...")
        top_diff = significant_diff.head(10)
        plt.figure(figsize=(10, 6))
        sns.barplot(x='Mean_Diff', y='Microbe', data=top_diff)
        plt.title(f'Top Differentially Abundant Microbes (Healthy vs Infection) - {sample_type}')
        plt.xlabel('Mean Difference (Healthy - Infection)')
        plt.savefig(os.path.join(type_output_dir, 'diff_abundance.png'))
        plt.close()

    # Machine Learning: Predict health status
    log(f"Training Random Forest classifier for {sample_type}...")
    features = pd.concat([clr_type_microbe, type_df[cytokine_cols]], axis=1)
    target = type_df['CL4']

    # Filter classes with <5 samples
    class_counts = target.value_counts()
    valid_classes = class_counts[class_counts >= 5].index
    keep_rows = target.isin(valid_classes)
    features = features.loc[keep_rows].copy()
    target = target.loc[keep_rows]
    log(f"Classes after filtering (minimum 5 samples) for {sample_type}: {valid_classes.tolist()}")

    # Too little data only disables the classifier and its embeddings; the
    # network and temporal analyses further down do not depend on it and
    # must still run, so this is an if/else rather than a `continue`.
    if len(target.unique()) < 2 or len(target) < 10:
        log(f"Insufficient classes or samples for ML in {sample_type}. Skipping.")
    else:
        # Encode target
        le = LabelEncoder()
        target_enc = le.fit_transform(target)
        class_ids = np.arange(len(le.classes_))
        class_names = [str(name) for name in le.classes_]

        # Hold out real samples FIRST. Scaling and SMOTE are then fitted on
        # the training portion only, so no synthetic or test-derived
        # information leaks into the evaluation set.
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            features.values, target_enc, test_size=0.2,
            random_state=RANDOM_STATE, stratify=target_enc)
        ml_scaler = StandardScaler()
        X_train = ml_scaler.fit_transform(X_train_raw)
        X_test = ml_scaler.transform(X_test_raw)

        # SMOTE needs k_neighbors < size of the smallest training class
        smallest_train_class = int(np.unique(y_train, return_counts=True)[1].min())
        k_neighbors = max(1, min(5, smallest_train_class - 1))
        smote = SMOTE(random_state=RANDOM_STATE, k_neighbors=k_neighbors)
        X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

        # Random Forest with hyperparameter tuning
        # NOTE: the inner CV folds still share SMOTE-generated neighbours, so
        # the CV score used for model selection is optimistic; the held-out
        # test score below is not affected.
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 5]
        }
        # Never ask for more folds than the smallest (balanced) class can fill
        smallest_balanced_class = int(np.unique(y_train_bal, return_counts=True)[1].min())
        cv_folds = max(2, min(5, smallest_balanced_class))
        rf = RandomForestClassifier(random_state=RANDOM_STATE, class_weight='balanced')
        grid_search = GridSearchCV(rf, param_grid, cv=cv_folds, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train_bal, y_train_bal)
        best_rf = grid_search.best_estimator_
        log(f"Best Random Forest Parameters for {sample_type}: {grid_search.best_params_}")

        # Predict and evaluate on untouched, real samples
        y_pred = best_rf.predict(X_test)
        log(f"Random Forest Accuracy for {sample_type}: {accuracy_score(y_test, y_pred):.3f}")
        with open(os.path.join(type_output_dir, 'rf_classification_report.txt'), 'w') as f:
            # Explicit labels keep target_names aligned even when a class is
            # absent from y_test or only appears in the predictions.
            f.write(classification_report(y_test, y_pred, labels=class_ids,
                                          target_names=class_names, zero_division=0))

        # Feature importances
        importances = pd.DataFrame({
            'Feature': features.columns,
            'Importance': best_rf.feature_importances_
        }).sort_values('Importance', ascending=False).head(20)
        importances.to_csv(os.path.join(type_output_dir, 'feature_importances.csv'), index=False)
        log(f"Feature importances saved for {sample_type}.")
        plt.figure(figsize=(10, 8))
        sns.barplot(x='Importance', y='Feature', data=importances)
        plt.title(f'Top 20 Feature Importances for Predicting Health Status ({sample_type})')
        plt.savefig(os.path.join(type_output_dir, 'feature_importances.png'))
        plt.close()

        # Dimensionality reduction for visualization (descriptive only, so all
        # retained samples are scaled together here)
        log(f"Performing PCA and t-SNE for visualization in {sample_type}...")
        features_scaled = StandardScaler().fit_transform(features)
        pca = PCA(n_components=2)
        pca_features = pca.fit_transform(features_scaled)
        plt.figure(figsize=(10, 6))
        scatter = plt.scatter(pca_features[:, 0], pca_features[:, 1], c=target_enc, cmap='viridis')
        plt.colorbar(scatter, ticks=range(len(le.classes_)), label='Status', format=lambda i, _: le.classes_[int(i)])
        plt.title(f'PCA of Microbiome and Cytokine Data ({sample_type} Samples)')
        plt.savefig(os.path.join(type_output_dir, 'pca_plot.png'))
        plt.close()
        # t-SNE requires perplexity < n_samples; the default (30) would fail
        # for small sample types, so cap it.
        tsne = TSNE(n_components=2, random_state=RANDOM_STATE,
                    perplexity=min(30.0, len(features_scaled) - 1))
        tsne_features = tsne.fit_transform(features_scaled)
        plt.figure(figsize=(10, 6))
        scatter = plt.scatter(tsne_features[:, 0], tsne_features[:, 1], c=target_enc, cmap='viridis')
        plt.colorbar(scatter, ticks=range(len(le.classes_)), label='Status', format=lambda i, _: le.classes_[int(i)])
        plt.title(f't-SNE of Microbiome and Cytokine Data ({sample_type} Samples)')
        plt.savefig(os.path.join(type_output_dir, 'tsne_plot.png'))
        plt.close()

    # Network analysis: one edge per significant pair with |r| > 0.3
    log(f"Building microbe-cytokine correlation network for {sample_type}...")
    G = nx.Graph()
    G.add_nodes_from(abundant_microbes, type='microbe')
    G.add_nodes_from(cytokine_cols, type='cytokine')
    for _, row in significant_corrs.iterrows():
        if abs(row['Correlation']) > 0.3:
            G.add_edge(row['Microbe'], row['Cytokine'], weight=row['Correlation'])
    plt.figure(figsize=(12, 10))
    pos = nx.spring_layout(G)
    node_colors = ['blue' if G.nodes[n]['type'] == 'microbe' else 'red' for n in G.nodes]
    nx.draw(G, pos, with_labels=True, node_color=node_colors, edge_color='gray', node_size=500, font_size=8)
    plt.title(f'Microbe-Cytokine Correlation Network (|r| > 0.3, FDR p < 0.05) - {sample_type}')
    plt.savefig(os.path.join(type_output_dir, 'network.png'))
    plt.close()
    nx.write_gml(G, os.path.join(type_output_dir, 'correlation_network.gml'))
    log(f"Correlation network saved for {sample_type}.")

    # Temporal analysis: yearly means per health status
    log(f"Performing temporal analysis for {sample_type}...")
    type_df['CollectionDate'] = pd.to_datetime(type_df['CollectionDate'], errors='coerce')
    # After reset_index the 'CollectionDate' column holds the year
    time_analysis = type_df.groupby([type_df['CollectionDate'].dt.year, 'CL4'])[['Shannon_Diversity'] + cytokine_cols].mean().reset_index()
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=time_analysis, x='CollectionDate', y='Shannon_Diversity', hue='CL4')
    plt.title(f'Shannon Diversity Over Time by Health Status ({sample_type})')
    plt.xticks(rotation=45)
    plt.savefig(os.path.join(type_output_dir, 'temporal_diversity.png'))
    plt.close()

# Combined analyses across sample types
log("Performing combined analyses across all sample types...")
# Combined t-SNE colored by SampleType and CL4
features_all = pd.concat([clr_microbe_df, df[cytokine_cols]], axis=1)
# Cast labels to str so a missing value becomes its own 'nan' class instead of
# making LabelEncoder choke on mixed str/float input.
target_all = df['CL4'].astype(str)
sample_type_all = df['SampleType'].astype(str)
le_cl4 = LabelEncoder()
target_enc_all = le_cl4.fit_transform(target_all)
le_type = LabelEncoder()
type_enc_all = le_type.fit_transform(sample_type_all)
scaler_all = StandardScaler()
features_scaled_all = scaler_all.fit_transform(features_all)
# t-SNE requires perplexity < n_samples; cap the default of 30 for tiny datasets
tsne_all = TSNE(n_components=2, random_state=RANDOM_STATE,
                perplexity=min(30.0, len(features_scaled_all) - 1))
tsne_features_all = tsne_all.fit_transform(features_scaled_all)
# The same embedding is drawn twice, once per colouring
plt.figure(figsize=(10, 6))
scatter = plt.scatter(tsne_features_all[:, 0], tsne_features_all[:, 1], c=type_enc_all, cmap='viridis')
plt.colorbar(scatter, ticks=range(len(le_type.classes_)), label='SampleType', format=lambda i, _: le_type.classes_[int(i)])
plt.title('t-SNE of Microbiome and Cytokine Data (All Samples, Colored by SampleType)')
plt.savefig(os.path.join(OUTPUT_DIR, 'combined_tsne_by_type.png'))
plt.close()
plt.figure(figsize=(10, 6))
scatter = plt.scatter(tsne_features_all[:, 0], tsne_features_all[:, 1], c=target_enc_all, cmap='viridis')
plt.colorbar(scatter, ticks=range(len(le_cl4.classes_)), label='Status', format=lambda i, _: le_cl4.classes_[int(i)])
plt.title('t-SNE of Microbiome and Cytokine Data (All Samples, Colored by CL4)')
plt.savefig(os.path.join(OUTPUT_DIR, 'combined_tsne_by_cl4.png'))
plt.close()

# Combined PERMANOVA with SampleType as group
log("Running combined PERMANOVA with SampleType as grouping...")
permanova_all = None  # stays None when the test cannot be run
dist_array_all = pdist(clr_microbe_df.values, metric='euclidean')
dm_all = DistanceMatrix(squareform(dist_array_all), ids=clr_microbe_df.index.tolist())
common_ids_all = [id_ for id_ in dm_all.ids if id_ in df.index]
dm_filtered_all = dm_all.filter(common_ids_all)
grouping_all = df.loc[common_ids_all, 'SampleType'].astype(str)
if len(grouping_all.unique()) >= 2:
    permanova_all = permanova(dm_filtered_all, grouping=grouping_all, permutations=999)
    with open(os.path.join(OUTPUT_DIR, 'combined_permanova_by_type.json'), 'w') as f:
        json.dump({
            'test_statistic': float(permanova_all['test statistic']),
            'p_value': float(permanova_all['p-value']),
            'permutations': 999
        }, f, indent=2)
else:
    log("Fewer than two sample types. Skipping combined PERMANOVA.")

# Summary report
log("Writing summary report...")
# Report the sample types that were actually detected (as strings, without
# missing values) instead of a hard-coded list that can drift from the data.
type_labels = [str(s) for s in sample_types if pd.notna(s)]
report_lines = [
    f"# Analysis Summary\nGenerated: {datetime.now(timezone.utc).isoformat()} UTC\n",
    "## Data Overview",
    f"- Samples: {df.shape[0]}",
    f"- Sample Types: {', '.join(type_labels)}",
    f"- Microbe columns: {len(microbe_cols)}, Cytokines: {len(cytokine_cols)}, Metadata: {len(metadata_cols)}",
    "\n## Output Structure",
    # Per-type folders are named after the lower-cased sample type
    f"- Site-specific outputs in subfolders: {', '.join(label.lower() for label in type_labels)}",
    "- Combined outputs in root OUTPUT_DIR",
    "\n## Key Combined Results",
]
if permanova_all is not None:
    report_lines.append(f"- Combined PERMANOVA (by SampleType): Pseudo-F={permanova_all['test statistic']:.4f}, p={permanova_all['p-value']:.4f}")
report_text = "\n".join(report_lines)
with open(os.path.join(OUTPUT_DIR, 'summary_report.md'), 'w') as f:
    f.write(report_text)
log("Summary report saved.")
log("Analysis complete. Results saved in OUTPUT_DIR.")

[2025-10-05T11:44:41.405640+00:00] Loading data...
[2025-10-05T11:44:41.513672+00:00] Duplicate SampleIDs found. Dropping duplicate rows, keeping first occurrence.
[2025-10-05T11:44:41.514071+00:00] Data shape: (670, 308)
[2025-10-05T11:44:41.516942+00:00] Detected microbe columns: 234; cytokines: 66; metadata: 5
[2025-10-05T11:44:41.592107+00:00] Performing relative abundance and CLR transforms (with multi_replace) on full dataset.
[2025-10-05T11:44:41.613910+00:00] Detected sample types: ['Stool' 'Mouth' 'Nasal' 'Skin']
[2025-10-05T11:44:41.619789+00:00] Processing sample type: Stool
[2025-10-05T11:44:41.623994+00:00] Computing alpha diversity for Stool...
[2025-10-05T11:44:41.815962+00:00] Visualizing Shannon diversity by health status for Stool...
[2025-10-05T11:44:42.170745+00:00] Visualizing class distribution for Stool...
[2025-10-05T11:44:42.420453+00:00] Distance metric: aitchison for Stool
[2025-10-05T11:44:42.420559+00:00] Computing distance matrix for Stool...
[2025-10-05T1

In [None]:
import shutil
# Bundle the whole output folder into '/content/analysis_outputs.zip'. The
# archive is created next to the folder, not inside it.
# NOTE(review): the path repeats the value of OUTPUT_DIR as a literal; keep the two in sync.
shutil.make_archive('/content/analysis_outputs', 'zip', '/content/analysis_outputs')

'/content/analysis_outputs.zip'