In [1]:
import os
import pandas as pd # type: ignore
import numpy as np # type: ignore
import plotly.express as px
import plotly.graph_objects as go

from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri, FloatVector

# Activate rpy2's pandas conversion so pandas.DataFrames passed to R functions
# are converted to R data frames automatically.
pandas2ri.activate()

# Switch the working directory to the container mount point.
# NOTE(review): presumably this is the project root that the relative "data/..."
# paths used later in the notebook are resolved against -- confirm.
try:
    os.chdir('/container/mount/point')
except FileNotFoundError:
    # Best effort: only warn and stay in the current working directory.
    print("Warning: Directory '/container/mount/point' does not exist.")

# NOTE(review): this project-local import is placed after the chdir, presumably
# because the `utils` package is found relative to the working directory -- confirm.
from utils.preprocessing import filter_and_process_asv_table

In [2]:
# Load the R packages used in this notebook through rpy2 (same order as before:
# R 'utils', then DivNet, then breakaway).
utils, divnet, breakaway = map(importr, ('utils', 'DivNet', 'breakaway'))

### KORA Dataset

In [4]:
# Load the matched KORA sample table and the filtered ASV count table.
kora_matched_df = pd.read_csv("data/smoking_KORA_experiment.csv", index_col=0)
asv = pd.read_csv("data/filtered_count_table.csv", index_col=0)

# The matched table is indexed by sample (one row per sample), whereas the ASV
# table has one row per feature and one column per sample, so the two shapes
# are labelled differently.
print(f"KORA matched table shape (samples, columns): {kora_matched_df.shape}")
print(f"ASV table shape (features, samples): {asv.shape}")

KORA matched table shape (features, samples): (436, 80)
ASV table shape (features, samples): (1469, 436)


In [5]:
# Names of the taxonomic ranks.
taxonomy_levels = ['domain', 'phylum', 'class', 'order', 'family', 'genus', 'species']

# Read the taxonomy annotation and give the ASV table the same index name so
# the two tables can be joined on their index.
taxonomy_df = pd.read_csv("data/taxonomy_clean.csv", index_col=0)
asv.index.name = taxonomy_df.index.name

# ASV counts joined with the full taxonomy, and with the taxon 'name' only.
join_asv_taxa = asv.join(taxonomy_df)
ASV_table_with_taxon = asv.join(taxonomy_df['name'])

# taxa_dict: level -> count table summed per taxon.
# freq_dict: level -> table of how many samples each taxon is non-zero in.
taxa_dict = {}
freq_dict = {}

# Every taxonomy column except 'name' is treated as a level
# (Index.difference returns them in sorted order).
rank_columns = taxonomy_df.columns.difference(['name'])
for level in rank_columns:
    # Attach the level label to each ASV, then sum counts per label.
    df_level = asv.join(taxonomy_df[level]).groupby(level).sum()
    taxa_dict[level] = df_level

    # Number of samples with a non-zero count, most prevalent taxon first.
    prevalence = df_level.ne(0).sum(axis=1).sort_values(ascending=False)
    freq_dict[level] = prevalence.reset_index().set_axis([level, 'count'], axis=1)

# The un-aggregated ASV table is stored alongside the aggregated levels.
taxa_dict["ASV"] = asv

for level, table in taxa_dict.items():
    print(f"{level} count table shape: {table.shape}")

# Do not truncate long cell contents (e.g. taxon names) when displaying frames.
pd.set_option('display.max_colwidth', None)

class count table shape: (15, 436)
domain count table shape: (2, 436)
family count table shape: (80, 436)
genus count table shape: (401, 436)
order count table shape: (41, 436)
phylum count table shape: (9, 436)
species count table shape: (1354, 436)
ASV count table shape: (1469, 436)


In [None]:
lvl = "family"
dataset = "kora"

# Show the ten most prevalent taxa at this level (skipping labels containing
# 'unknown') to help pick a base taxon.
print(freq_dict[lvl][~freq_dict[lvl][lvl].str.contains('unknown')].head(10))

# Base taxon handed to DivNet; it must be a taxon of the level chosen in `lvl`.
base_taxon = 'f__Lachnospiraceae;'

# Run divnet (samples as rows, taxa as columns).
# The count table is selected through `lvl` so that the table analysed always
# matches the level inspected above, instead of a hard-coded 'family'.
divnet_stat = divnet.divnet(taxa_dict[lvl].T, base=base_taxon, ncores=4)

# First element of the DivNet result, used here as the Shannon estimates.
shannon = divnet_stat[0]

# Collect (estimate, error) per sample, rounded to 4 decimals.
div_net_dict = {}

for i in range(len(shannon)):
    sample_id = shannon.names[i]
    estimate = round(float(shannon[i][0]), 4)
    error = round(float(shannon[i][1]), 4)
    div_net_dict[sample_id] = (estimate, error)

div_net_pd = pd.DataFrame.from_dict(div_net_dict, orient='index', columns=['estimate', 'error'])

# The betta analysis further down reads these estimates back from
# data/divnet_{dataset}.csv; uncomment to (re)write that file after a new run.
# div_net_pd.to_csv(f"data/divnet_{dataset}.csv")

                                      family  count
0                       f__Oscillospiraceae;    436
1                        f__Lachnospiraceae;    436
2                        f__Ruminococcaceae;    435
3                         f__Bacteroidaceae;    434
4                          f__Rikenellaceae;    424
5                         f__Sutterellaceae;    407
6                         f__Tannerellaceae;    404
7  f__[Eubacterium]_coprostanoligenes_group;    399
8              f__Erysipelatoclostridiaceae;    383
9                        f__Eggerthellaceae;    377


"Ecologists who are interested in the way species richness varies with covariate information often run a regression-type analysis on the observed diversity using their covariate information as predictors. However, in many settings (especially microbial), rare and unobserved taxa play a hugely important role in explaining the subtleties of the ecosystem, however, a regression analysis on the observed diversity level fails to account for these unobserved taxa. By predicting the total level of diversity (for example, via breakaway) and estimating the standard error in the estimate, one can take account of these unobserved, but important, taxa. In order to account for the estimated nature of the response, a mixed model approach is taken, whereby the varying levels of confidence in the estimates contributes to a diagonal but heteroscedastic covariance matrix. Given covariates constitute the fixed effects in the mixed model, and significance of the random effect term sigsq_u reflects heterogeneity in the sample, that is, variability that cannot be explained by only the covariates. The authors believe this to be the first attempt at modelling total diversity in a way that accounts for its estimated nature".

The `betta` function from the breakaway package tests for heterogeneity of total diversity (observed plus unobserved) across multiple sites. It can account for, or test for, fixed effects that may explain diversity. It returns the significance of the covariates in explaining diversity and a hypothesis test for heterogeneity.

In [9]:
dataset = "kora"
# Reload the stored DivNet estimates (indexed by sample ID).
div_net_pd = pd.read_csv(f"data/divnet_{dataset}.csv", index_col=0)

# Convert estimates and errors to R vectors
estimates = FloatVector(div_net_pd['estimate'])
errors = FloatVector(div_net_pd['error'])

# Align the covariate W with the diversity estimates by sample ID rather than
# by row position, so a different row order in the two tables cannot pair a
# sample's estimate with another sample's covariate. Sample IDs are cast to
# int, the same convention used when joining these tables for the box plot.
W_by_sample = pd.Series(
    kora_matched_df['W'].to_numpy(),
    index=kora_matched_df.index.astype(int),
    name='W',
)
missing_samples = div_net_pd.index.difference(W_by_sample.index)
if len(missing_samples) > 0:
    raise ValueError(
        f"{len(missing_samples)} sample(s) in data/divnet_{dataset}.csv "
        f"have no matching row in kora_matched_df: {list(missing_samples[:5])}"
    )

# Prepare design matrix with intercept (rows in the same order as the estimates)
W = W_by_sample.reindex(div_net_pd.index).reset_index(drop=True)
intercept = pd.Series(1, index=W.index, name='intercept')
design_matrix = pd.concat([intercept, W], axis=1)

# Run breakaway betta
betta_result = breakaway.betta(chats=estimates, ses=errors, X=design_matrix)

# Create results DataFrame: row 0 is the intercept, row 1 the W coefficient
# (same order as the design matrix columns).
betta_table = pd.DataFrame(betta_result[0], columns=['estimate', 'error', 'p_value'])
observed_estimate = betta_table.loc[1, 'estimate']
print(f" Beta table: {betta_table}")

# Save results
betta_table.to_csv(f'data/plugin_alphadiv_{dataset}.csv', index=False)

 Beta table:    estimate     error  p_value
0  2.022185  0.014004    0.000
1  0.028263  0.019804    0.154


In [17]:
# Prepare smoking status DataFrame, indexed by integer sample ID so it can be
# joined onto the diversity estimates.
smoking_df = kora_matched_df[['W_str']].copy()
smoking_df.index = kora_matched_df.index.astype(int)

# Join alpha diversity with smoking status (new frame; no in-place mutation, so
# re-running the cell is idempotent).
alpha_div_df = div_net_pd.join(smoking_df).rename(columns={'W_str': 'Smoking'})

# Group sizes (shown above each box) and the highest estimate per group (used
# to place the size label just above the box).
value_counts = alpha_div_df['Smoking'].value_counts()
group_max = alpha_div_df.groupby('Smoking')['estimate'].max()

# Create boxplot
fig = px.box(
    alpha_div_df,
    x="Smoking",
    y="estimate",
    color="Smoking",
    color_discrete_sequence=["green", "red"],
    width=500,
    height=800
)

# One text annotation per group showing its sample count.
annotations = [
    go.layout.Annotation(
        x=group,
        y=group_max[group] + 0.05,
        text=str(count),
        showarrow=False,
        font=dict(size=12),
    )
    for group, count in value_counts.items()
]

fig.update_layout(
    title="Alpha diversity box-plot KORA (smoking)",
    annotations=annotations
)

# Make sure the output directory exists; writing the image fails otherwise.
os.makedirs("plots/png", exist_ok=True)
fig.write_image("plots/png/alpha_diversity_KORA_smoking.png")

fig.show()

![Alpha diversity KORA Smoking](../plots/png/alpha_diversity_KORA_smoking.png)