In [None]:
import os
import pandas as pd # type: ignore
import numpy as np # type: ignore
import plotly.express as px
import plotly.graph_objects as go

from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri, FloatVector

from utils.preprocessing import filter_and_process_asv_table

# Turn on rpy2's implicit pandas <-> R data.frame conversion for the calls below.
pandas2ri.activate()

# The relative "data/..." paths used later are resolved against the current
# working directory, so switch to the container mount when it is available.
# A missing mount is only reported, not treated as fatal.
MOUNT_POINT = '/container/mount/point'
try:
    os.chdir(MOUNT_POINT)
except FileNotFoundError:
    print(f"Warning: Directory '{MOUNT_POINT}' does not exist.")

In [33]:
# Load the R packages used by this notebook, in the same order as before, and
# bind each one to a Python handle.
# NOTE: `utils` below is the handle of the R 'utils' package; it is unrelated
# to the local Python `utils` package that the preprocessing helper is
# imported from.
base, utils, devtools, breakaway = (
    importr(r_package) for r_package in ('base', 'utils', 'devtools', 'breakaway')
)

### KORA Dataset

In [52]:
# --- Load inputs -----------------------------------------------------------
# Matched sample metadata (rows = samples) and the ASV count table
# (rows = features, columns = samples).
kora_matched_df = pd.read_csv("data/sample_df_ige_KORA.csv", index_col=0)
asv = pd.read_csv("data/feature_table.tsv", index_col=0, sep='\t')
print(f"ASV table shape (features, samples): {asv.shape}")

# --- Match samples between the two tables ----------------------------------
# Sample IDs are compared as strings so that integer-typed and string-typed
# IDs from the two files can be matched against each other.
kora_ids = kora_matched_df.index.astype(str)
asv_ids = asv.columns.astype(str)
common_ids = kora_ids.intersection(asv_ids)

# Keep only the ASV columns whose sample ID also occurs in the metadata.
in_both_tables = asv_ids.isin(common_ids)
asv_matched = asv.loc[:, in_both_tables]
print(f"Matched ASV table shape: {asv_matched.shape}")

# --- Feature/sample filtering (project helper) ------------------------------
asv_top99_samples, asv_samples_ids = filter_and_process_asv_table(
    asv_matched,
    freq_threshold=0.01,
)
print(f"Filtered ASV table shape: {asv_top99_samples.shape}")

# --- Restrict the metadata to the samples that survived filtering -----------
filtered_ids = asv_top99_samples.columns.astype(str)
survived_filtering = kora_ids.isin(filtered_ids)
filtered_kora_matched_df = kora_matched_df.loc[survived_filtering]
print(f"Filtered kora_matched_df shape: {filtered_kora_matched_df.shape}")

# --- Persist both filtered tables -------------------------------------------
filtered_kora_matched_df.to_csv("data/filtered_kora_matched_df.csv", index=True)
asv_top99_samples.to_csv("data/filtered_kora_asv_top99_samples.csv", index=True)

ASV table shape (features, samples): (15170, 2034)
Matched ASV table shape: (15170, 445)
These columns have not variance and will be dropped: Index(['33231', '50139'], dtype='object')
Filtered ASV table shape: (1461, 443)
Filtered kora_matched_df shape: (443, 80)


### Richness estimation

`breakaway` presents an estimator of species richness that is well-suited to the high-diversity/microbial setting. However, many microbial datasets display more diversity than the Kemp-type models can permit. In this case, the log-transformed WLRM diversity estimator of Rocchetti et al. (2011) is returned. The authors’ experience suggests that some datasets that require the log-transformed WLRM contain “false” diversity, that is, diversity attributable to sequencing errors (via an inflated singleton count). The authors encourage judicious use of diversity estimators when the dataset may contain these errors, and recommend the use of `breakaway_nof1` as an exploratory tool in this case.

Ecologists who are interested in the way species richness varies with covariate information often run a regression-type analysis on the observed diversity, using their covariate information as predictors. However, in many settings (especially microbial ones), rare and unobserved taxa play a hugely important role in explaining the subtleties of the ecosystem, and a regression analysis on the observed diversity level fails to account for these unobserved taxa. By predicting the total level of diversity (for example, via `breakaway`) and estimating the standard error of the estimate, one can take account of these unobserved, but important, taxa. In order to account for the estimated nature of the response, a mixed-model approach is taken, whereby the varying levels of confidence in the estimates contribute to a diagonal but heteroscedastic covariance matrix. The given covariates constitute the fixed effects in the mixed model, and significance of the random-effect term `sigsq_u` reflects heterogeneity in the sample, that is, variability that cannot be explained by the covariates alone. The authors believe this to be the first attempt at modelling total diversity in a way that accounts for its estimated nature.

In [61]:
# Run breakaway on the filtered ASV table and summarize the result.
ba = breakaway.breakaway(asv_top99_samples)
summary = base.summary(ba)

# Convert the summary into a plain dict: column name -> list of values.
sum_dict = dict(zip(summary.names, map(list, summary)))

# Estimates and standard errors are rounded to 4 decimals before modelling.
estimate = np.round(sum_dict['estimate'], 4)
error = np.round(sum_dict['error'], 4)

# Prepare vectors for betta
es = FloatVector(estimate)
er = FloatVector(error)

# Align the covariate with the estimates BY SAMPLE ID.
# betta pairs chats/ses with the rows of X purely by position, and the row
# order of the metadata table is not guaranteed to match the sample order of
# the breakaway summary. Look W up through the summary's sample names so each
# estimate is paired with the covariate of the same sample.
sample_ids = pd.Index(sum_dict['sample_names']).astype(str)
w_by_sample = filtered_kora_matched_df['W'].copy()
w_by_sample.index = w_by_sample.index.astype(str)

missing_ids = sample_ids.difference(w_by_sample.index)
if len(missing_ids) > 0:
    raise ValueError(
        f"{len(missing_ids)} sample(s) with a richness estimate have no covariate "
        f"row in filtered_kora_matched_df, e.g. {list(missing_ids[:5])}"
    )

W = w_by_sample.loc[sample_ids].reset_index(drop=True)
if len(W) != len(estimate):
    # Only possible when sample IDs are duplicated in the metadata index.
    raise ValueError(
        f"Covariate/estimate length mismatch after alignment "
        f"({len(W)} vs {len(estimate)}); check for duplicated sample IDs."
    )

# Prepare design matrix with intercept
x = pd.DataFrame({'intercept': 1, 'W': W})

# Run betta analysis
betta = breakaway.betta(chats=es, ses=er, X=x)

# Create results table (first element of the betta result) and save
betta_table = pd.DataFrame(betta[0], columns=['est', 'err', 'p-values'], index=x.columns)
print(betta_table)
betta_table.to_csv("data/betta_table.csv")

                  est       err  p-values
intercept  123.480469  1.690058     0.000
W           -7.954298  2.378187     0.001


In [75]:
# Smoking status per sample, keyed by integer sample ID so it can be joined
# onto the richness table.
smoking_df = filtered_kora_matched_df[['W_str']].copy()
smoking_df.index = filtered_kora_matched_df.index.astype(int)

# Richness estimates from the breakaway summary, keyed by integer sample ID.
rich_df = pd.DataFrame(sum_dict)
rich_df.index = rich_df["sample_names"].astype(int)

# Attach the smoking label to every richness estimate.
rich_df = rich_df.join(smoking_df).rename(columns={'W_str': 'Smoking'})

# Number of samples in each smoking group.
value_counts = rich_df['Smoking'].value_counts()

# Highest richness estimate per group, used to anchor the count labels.
group_max_estimate = rich_df.groupby('Smoking')['estimate'].max()

# Create boxplot
fig = px.box(rich_df, x="Smoking", y="estimate", color="Smoking",
             color_discrete_sequence=["green", "red"])

# Label each box with its sample count. The y-axis is the richness estimate,
# so the label is anchored at the group's highest estimate and nudged upwards
# in pixels; the count itself is not a meaningful y-coordinate.
annotations = [
    go.layout.Annotation(
        x=group,
        y=group_max_estimate[group],
        yshift=12,
        text=f"n={count}",
        showarrow=False,
        font=dict(size=12),
    )
    for group, count in value_counts.items()
]

fig.update_layout(
    title="Richness box-plot KORA (smoking)",
    width=500,
    height=800,
    annotations=annotations
)
fig.show()
# fig.write_image("plots/png/richness_KORA_smoking.png")