In [3]:
import os
import pandas as pd # type: ignore
import numpy as np # type: ignore
import plotly.express as px
import plotly.graph_objects as go

from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri, FloatVector

# Globally enable the pandas <-> R conversion so that DataFrames passed to R
# functions (and R results coming back) are converted automatically.
# NOTE(review): recent rpy2 releases appear to favour local converters over
# global activation — confirm against the installed rpy2 version.
pandas2ri.activate()

# Switch to the project root. Later cells read and write relative paths such
# as "data/..." and "plots/...", so they depend on this working directory.
# A missing directory only produces a warning here; the notebook keeps running
# from whatever the current working directory is.
try:
    os.chdir('/container/mount/point')
except FileNotFoundError:
    print("Warning: Directory '/container/mount/point' does not exist.")

# Project-local helper, imported after the chdir above.
# NOTE(review): presumably this import resolves against the working directory
# set above — verify how `utils` is made importable.
from utils.preprocessing import filter_and_process_asv_table

In [4]:
# Load the R packages used by this notebook through rpy2.
# NOTE: `utils` below is the handle for the R package named "utils"; it is a
# different thing from the Python `utils.preprocessing` module imported in the
# setup cell. It is not referenced in the cells visible here.
utils = importr('utils')
# DivNet: its divnet() function is called in the diversity-estimation cell.
divnet = importr('DivNet')
# breakaway: its betta() function is called in the regression cell.
breakaway = importr('breakaway')

### KORA Dataset

In [5]:
# --- Inputs: matched KORA sample metadata and the raw ASV count table -------
kora_matched_df = pd.read_csv("data/sample_df_ige_KORA.csv", index_col=0)
asv = pd.read_csv("data/feature_table.tsv", sep='\t', index_col=0)
print(f"ASV table shape (features, samples): {asv.shape}")

# Sample IDs are compared as strings on both sides (metadata rows vs. ASV
# columns) so that differing dtypes cannot prevent a match.
kora_ids = kora_matched_df.index.astype(str)
asv_ids = asv.columns.astype(str)

# Restrict the ASV table to samples that also appear in the metadata.
common_ids = kora_ids.intersection(asv_ids)
asv_matched = asv.loc[:, asv_ids.isin(common_ids)]
print(f"Matched ASV table shape: {asv_matched.shape}")

# Project helper that filters the matched table; it returns the filtered table
# and a second value holding sample IDs.
# NOTE(review): freq_threshold=0.01 presumably is the feature frequency cutoff
# — confirm against utils.preprocessing.
asv_top99_samples, asv_samples_ids = filter_and_process_asv_table(
    asv_matched,
    freq_threshold=0.01,
)
print(f"Filtered ASV table shape: {asv_top99_samples.shape}")

# The helper may drop samples, so keep only the metadata rows that survived.
filtered_ids = asv_top99_samples.columns.astype(str)
filtered_kora_matched_df = kora_matched_df.loc[kora_ids.isin(filtered_ids)]
print(f"Filtered kora_matched_df shape: {filtered_kora_matched_df.shape}")

# Reorder the ASV columns so that column i corresponds to metadata row i.
ordered_sample_ids = filtered_kora_matched_df.index.astype(str).tolist()
asv_top99_samples = asv_top99_samples.loc[:, ordered_sample_ids]

# Persist both aligned tables (the index is written by default).
filtered_kora_matched_df.to_csv("data/filtered_kora_matched_df.csv")
asv_top99_samples.to_csv("data/filtered_kora_asv_top99_samples.csv")

ASV table shape (features, samples): (15170, 2034)
Matched ASV table shape: (15170, 445)
These columns have not variance and will be dropped: Index(['33231', '50139'], dtype='object')
Filtered ASV table shape: (1461, 443)
Filtered kora_matched_df shape: (443, 80)


In [13]:
# Taxonomic ranks from broadest to most specific; used to order the
# aggregation below.
taxonomy_levels = ['domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']

# Load taxonomy file (indexed by the same feature IDs as the ASV table)
taxonomy_df = pd.read_csv("data/taxonomy_clean.csv", index_col=0)

# Give the ASV table's index the same name as the taxonomy index.
# Note: this modifies asv_top99_samples in place.
asv_top99_samples.index.name = taxonomy_df.index.name

# ASV table joined with the full taxonomy / with the 'name' column only.
# NOTE(review): neither table is used in the cells visible here; both are kept
# because later cells may rely on them.
join_asv_taxa = asv_top99_samples.join(taxonomy_df)
ASV_table_with_taxon = asv_top99_samples.join(taxonomy_df['name'])

# Aggregate in rank order (domain -> species) instead of the alphabetical
# order that Index.difference() returns. Any taxonomy column that is not listed
# in taxonomy_levels is still aggregated, after the known ranks.
rank_columns = taxonomy_df.columns.difference(['name'])
levels_to_aggregate = [rank for rank in taxonomy_levels if rank in rank_columns]
levels_to_aggregate += [col for col in rank_columns if col not in taxonomy_levels]

# taxa_dict[level]: count table with one row per taxon at that level.
# freq_dict[level]: per-taxon number of samples with a non-zero count,
#                   sorted from most to least prevalent.
taxa_dict = {}
freq_dict = {}

for level in levels_to_aggregate:
    # Attach the taxon label of this level to every ASV and sum counts per
    # taxon. ASVs without a label at this level (NaN) are dropped by groupby.
    df_level = asv_top99_samples.join(taxonomy_df[level]).groupby(level).sum()

    # Count the samples in which each taxon is present (non-zero count)
    non_zero_counts = (df_level != 0).sum(axis=1)
    non_zero_counts_sorted = non_zero_counts.sort_values(ascending=False).reset_index()
    non_zero_counts_sorted.columns = [level, 'count']

    freq_dict[level] = non_zero_counts_sorted
    taxa_dict[level] = df_level

# The un-aggregated table is exposed under the pseudo-level "ASV".
taxa_dict["ASV"] = asv_top99_samples

for level in taxa_dict.keys():
    print(f"{level} count table shape: {taxa_dict[level].shape}")

# Show full taxon strings when DataFrames are displayed from here on.
pd.set_option('display.max_colwidth', None)

class count table shape: (16, 443)
domain count table shape: (2, 443)
family count table shape: (80, 443)
genus count table shape: (397, 443)
order count table shape: (41, 443)
phylum count table shape: (10, 443)
species count table shape: (1349, 443)
ASV count table shape: (1461, 443)


In [None]:
# Taxonomic level to analyse and the dataset tag used in output file names.
lvl = "family"
dataset = "kora"

# Preview the ten most prevalent, non-'unknown' taxa at this level; useful for
# picking the base taxon below.
level_freq = freq_dict[lvl]
print(level_freq[~level_freq[lvl].str.contains('unknown')].head(10))

# Base (reference) taxon handed to DivNet. It must be a taxon of level `lvl`,
# so it has to be updated together with `lvl`.
base_taxon = 'f__Lachnospiraceae;'

# Use the table of the selected level (previously hard-coded to 'family',
# which silently ignored `lvl`), and fail fast before the expensive DivNet
# call if the base taxon does not exist at that level.
level_counts = taxa_dict[lvl]
if base_taxon not in level_counts.index:
    raise ValueError(
        f"Base taxon {base_taxon!r} is not present in the '{lvl}' count table."
    )

# Run divnet (samples as rows, taxa as columns)
divnet_stat = divnet.divnet(level_counts.T, base=base_taxon, ncores=4)

# First element of the DivNet result.
# NOTE(review): assumed to be the Shannon estimates, one entry per sample with
# the estimate at position 0 and its standard error at position 1 — confirm
# against the DivNet documentation.
shannon = divnet_stat[0]

# Collect (estimate, error) per sample, rounded to 4 decimals.
div_net_dict = {}

for i in range(len(shannon)):
    sample_id = shannon.names[i]
    estimate = round(float(shannon[i][0]), 4)
    error = round(float(shannon[i][1]), 4)
    div_net_dict[sample_id] = (estimate, error)

div_net_pd = pd.DataFrame.from_dict(div_net_dict, orient='index', columns=['estimate', 'error'])

# Persist the estimates, then reload them so that div_net_pd holds exactly
# what is stored on disk (index dtype as parsed by read_csv).
div_net_pd.to_csv(f"data/divnet_{dataset}.csv")

div_net_pd = pd.read_csv(f"data/divnet_{dataset}.csv", index_col=0)

                                      family  count
0                        f__Lachnospiraceae;    443
1                       f__Oscillospiraceae;    443
2                        f__Ruminococcaceae;    442
3                         f__Bacteroidaceae;    441
4                          f__Rikenellaceae;    429
5                         f__Sutterellaceae;    415
6                         f__Tannerellaceae;    411
7  f__[Eubacterium]_coprostanoligenes_group;    403
8              f__Erysipelatoclostridiaceae;    388
9                        f__Eggerthellaceae;    380
  |                                                                      |   0%

"Ecologists who are interested in the way species richness varies with covariate information often run a regression-type analysis on the observed diversity using their covariate information as predictors. However, in many settings (especially microbial), rare and unobserved taxa play a hugely important role in explaining the subtleties of the ecosystem, however, a regression analysis on the observed diversity level fails to account for these unobserved taxa. By predicting the total level of diversity (for example, via breakaway) and estimating the standard error in the estimate, one can take account of these unobserved, but important, taxa. In order to account for the estimated nature of the response, a mixed model approach is taken, whereby the varying levels of confidence in the estimates contributes to a diagonal but heteroscedastic covariance matrix. Given covariates constitute the fixed effects in the mixed model, and significance of the random effect term sigsq_u reflects heterogeneity in the sample, that is, variability that cannot be explained by only the covariates. The authors believe this to be the first attempt at modelling total diversity in a way that accounts for its estimated nature".

The `betta` function from the `breakaway` R package (called in the next cell) tests for heterogeneity of total diversity (observed plus unobserved) across multiple sites. It can account for, or test for, fixed effects that may explain diversity. It returns the significance of the covariates in explaining diversity and a hypothesis test for heterogeneity.

In [None]:
# Align the covariate W to the diversity estimates by sample ID instead of
# relying on both tables happening to be in the same row order. IDs are
# compared as strings, like in the loading cell; the rows of div_net_pd are
# expected to be labelled with the sample IDs of filtered_kora_matched_df.
sample_ids = div_net_pd.index.astype(str)
W_by_sample = filtered_kora_matched_df['W'].copy()
W_by_sample.index = W_by_sample.index.astype(str)

missing_ids = sample_ids.difference(W_by_sample.index)
if len(missing_ids) > 0:
    raise ValueError(
        f"{len(missing_ids)} sample(s) in div_net_pd have no row in "
        f"filtered_kora_matched_df, e.g. {list(missing_ids[:5])}"
    )

# Convert estimates and errors to R vectors (row order of div_net_pd)
estimates = FloatVector(div_net_pd['estimate'])
errors = FloatVector(div_net_pd['error'])

# Prepare design matrix with intercept; rows follow the order of div_net_pd
W = W_by_sample.loc[sample_ids].reset_index(drop=True)
if len(W) != len(div_net_pd):
    # Duplicate sample IDs would expand the selection above and break the
    # one-to-one pairing of estimates and covariates.
    raise ValueError("Duplicate sample IDs: cannot align estimates with covariates.")
intercept = pd.Series(1, index=W.index, name='intercept')
design_matrix = pd.concat([intercept, W], axis=1)

# Run breakaway betta: estimates as response, their standard errors as known
# measurement error, and [intercept, W] as fixed-effect design matrix.
betta_result = breakaway.betta(chats=estimates, ses=errors, X=design_matrix)

# Create results DataFrame from the first element of the betta result.
# NOTE(review): assumed to be the coefficient table with one row per design
# matrix column (row 0 = intercept, row 1 = W) — confirm against breakaway.
betta_table = pd.DataFrame(betta_result[0], columns=['estimate', 'error', 'p_value'])
observed_estimate = betta_table.loc[1, 'estimate']
print(f" Beta table: {betta_table}")

# Save results
betta_table.to_csv(f'data/plugin_alphadiv_{dataset}.csv', index=False)

In [None]:
# Prepare smoking status DataFrame, indexed by the integer sample ID.
# NOTE(review): the join key here is the 'u3_16s_id' column, whereas the
# loading cell matched samples on the metadata index — confirm both carry the
# same IDs.
smoking_df = filtered_kora_matched_df[['u3_16s_id', 'W_str']].set_index('u3_16s_id')
smoking_df.index = smoking_df.index.astype(int)

# Ensure matching index types for join. This updates div_net_pd in place;
# repeating it on an already-integer index changes nothing, so the cell can be
# re-run safely.
div_net_pd.index = div_net_pd.index.astype(int)

# Join alpha diversity with smoking status (new frame, no in-place rename)
alpha_div_df = div_net_pd.join(smoking_df).rename(columns={'W_str': 'Smoker'})

# Create boxplot of the diversity estimate per smoking group.
# NOTE(review): colours are taken from the sequence in order, so which group is
# green and which is red depends on the order of the groups in the data —
# consider an explicit colour mapping.
fig = px.box(
    alpha_div_df,
    x="Smoker",
    y="estimate",
    color="Smoker",
    color_discrete_sequence=["green", "red"],
    width=500,
    height=800
)

# Make sure the output directory exists; writing the image fails otherwise.
plot_path = "plots/png/alpha_diversity_KORA_smoking.png"
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
fig.write_image(plot_path)

fig.show()