In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve, auc
from scipy import interp
import matplotlib.pyplot as plt
import numpy as np
import os

def do_classifier1(X, y, title='Disease', plot_shap=False, ax=None):
    """Plot cross-validated ROC curves of a gradient-boosting classifier.

    ``X`` and ``y`` are first restricted to the index labels they share and
    sorted by index so that rows line up positionally.  A
    ``GradientBoostingClassifier`` is then evaluated with 5-fold stratified
    cross-validation (shuffled, ``random_state=104``).  Each fold's ROC curve
    is drawn in gray, and the mean curve with a +/- 1 standard-deviation band
    is drawn on top.

    Args:
        X: pandas DataFrame of features, indexed by sample.
        y: pandas Series of binary labels, indexed by sample.
        title: Condition name inserted into the axes title.
        plot_shap: Currently unused; kept for backward compatibility.
        ax: Matplotlib axes to draw on.  When ``None`` a new 90 mm x 90 mm
            figure (300 dpi) is created.

    Returns:
        Tuple ``(mean_auc, std_auc)``: the AUC of the averaged ROC curve and
        the standard deviation of the per-fold AUCs.

    Note:
        The classifier itself is not seeded, so results can vary slightly
        between runs even though the fold split is fixed.
    """
    # Align features and labels on their common samples.
    y = y[y.index.isin(X.index)].sort_index()
    X = X[X.index.isin(y.index)].sort_index()

    classifier = GradientBoostingClassifier(n_estimators=2500, learning_rate=.01, max_depth=6,
                                            max_features=1, min_samples_leaf=10)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=104)

    if ax is None:
        _, ax = plt.subplots(figsize=(90/25, 90/25), dpi=300)
    ax.set_facecolor('none')
    # Draw a full frame even when the global rcParams hide top/right spines.
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(True)

    # Common FPR grid on which the per-fold curves are averaged.
    mean_fpr = np.linspace(0, 1, 100)
    tprs = []
    aucs = []

    for train, test in cv.split(X, y):
        fitted = classifier.fit(X.iloc[train], y.iloc[train])
        probas_ = fitted.predict_proba(X.iloc[test])
        fpr, tpr, _ = roc_curve(y.iloc[test], probas_[:, 1])

        # np.interp replaces the deprecated ``scipy.interp`` alias, which
        # newer SciPy releases no longer provide.
        fold_tpr = np.interp(mean_fpr, fpr, tpr)
        fold_tpr[0] = 0.0  # force every curve to start at the origin
        tprs.append(fold_tpr)
        aucs.append(auc(fpr, tpr))
        ax.plot(fpr, tpr, lw=1, alpha=0.3, color='gray')

    ax.plot([0, 1], [0, 1], linestyle='--', lw=1.5, color='r', label='Chance', alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0  # force the mean curve to end at (1, 1)
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=f'Mean ROC (AUC = {mean_auc:.2f} ± {std_auc:.2f})',
            lw=2, alpha=.8)

    # Shade +/- 1 std around the mean curve, clipped to the valid [0, 1] range.
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='lightgray', alpha=.2,
                    label=r'± 1 std. dev.')

    ax.set_xlim([-0.05, 1.05])
    ax.set_ylim([-0.05, 1.05])
    ax.set_xlabel('False Positive Rate', fontsize=7)
    ax.set_ylabel('True Positive Rate', fontsize=7)
    ax.set_title(f'Prediction of {title} from Microbiome', fontsize=7)
    ax.legend(loc="lower right", fontsize=6)
    ax.grid(True, linestyle=':', color='gray', alpha=0.5, linewidth=0.5)
    ax.tick_params(axis='both', which='major', labelsize=6)

    # Lay out the figure that owns ``ax`` rather than whichever is "current".
    ax.figure.tight_layout()

    return mean_auc, std_auc
    
import matplotlib.pyplot as plt
from scipy import stats
from matplotlib.ticker import FormatStrFormatter
from matplotlib.ticker import FuncFormatter
def boxplot_with_mannwhitney_final(data1, data2, labels, title, ax=None, save=False, exp_labels=False):
    """Draw a two-group box plot annotated with a Mann-Whitney U significance mark.

    Args:
        data1: Values of the first group (drawn at x = 1).
        data2: Values of the second group (drawn at x = 2).
        labels: Two x tick labels, one per group.
        title: Text used as the y-axis label; its first five characters also
            form the file name when ``save`` is true.
        ax: Matplotlib axes to draw on.  When ``None`` a new figure is created.
        save: When true, write the figure to ``'<title[:5]>.png'`` at 300 dpi.
        exp_labels: When true, y tick values are rendered as powers of ten
            (for data that was log10-transformed by the caller).

    Returns:
        The two-sided Mann-Whitney U p-value.
    """
    _, p_value = stats.mannwhitneyu(data1, data2, alternative='two-sided')

    if ax is None:
        _, ax = plt.subplots(figsize=(180 / 25.4 / 6 * 1.3, 2), dpi=300)
    # All figure-level calls below go through the figure that owns ``ax`` so
    # the function behaves the same whether or not ``ax`` is the current axes.
    fig = ax.figure

    # Tick labels are applied further down via set_xticklabels, so they are
    # not passed to boxplot (its ``labels`` keyword is deprecated in recent
    # matplotlib releases).
    bp = ax.boxplot([data1, data2], showfliers=False, widths=0.6)

    ax.set_ylabel(title, fontsize=6)
    ax.set_xlabel('', fontsize=6)

    for median in bp['medians']:
        median.set(color='black', linewidth=1.5)
    for element in ['whiskers', 'caps']:
        for item in bp[element]:
            item.set(color='black', linewidth=1)

    # Outliers are hidden, so the whisker ends bound everything that is drawn.
    whisker_y = [value for whisker in bp['whiskers'] for value in whisker.get_ydata()]
    y_max = max(whisker_y)
    y_min = min(whisker_y)
    span = abs(y_max - y_min)
    if span == 0:
        # Degenerate case (all whiskers coincide): fall back to a non-zero
        # span so the bracket and the y-limits do not collapse to one value.
        span = abs(y_max) or 1.0

    # Conventional star notation for the p-value.
    if p_value < 0.001:
        annotation = '***'
    elif p_value < 0.01:
        annotation = '**'
    elif p_value < 0.05:
        annotation = '*'
    else:
        annotation = 'n.s.'

    line_height = y_max + span * 0.05        # bracket feet: 5% above the top whisker
    annotation_height = y_max + span * 0.08  # bracket top / text baseline: 8% above

    ax.text(1.5, annotation_height, annotation,
            horizontalalignment='center', fontweight='bold', fontsize=6)

    # Bracket connecting the two boxes.
    x1, x2 = 1, 2
    ax.plot([x1, x1, x2, x2], [line_height, annotation_height, annotation_height, line_height], lw=0.5, c='black')

    # Leave room below the lowest whisker and above the annotation.
    ax.set_ylim(bottom=y_min - span * 0.1,
                top=annotation_height + span * 0.1)

    if exp_labels:
        def exp10_format(y, pos):
            # ``:g`` keeps fractional exponents (e.g. -1.5) instead of
            # truncating them to a misleading integer; ``+ 0.0`` turns a
            # negative zero into a plain zero.
            return f"$10^{{{y + 0.0:g}}}$"
        ax.yaxis.set_major_formatter(FuncFormatter(exp10_format))
    else:
        ax.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))

    # White background with a thin black frame on all four sides.
    ax.set_facecolor('white')
    for spine in ax.spines.values():
        spine.set_visible(True)
        spine.set_color('black')
        spine.set_linewidth(0.5)

    ax.grid(True, axis='y', linestyle=':', color='gray', alpha=0.5, linewidth=0.5)

    ax.tick_params(axis='both', which='major', labelsize=5, length=3, width=0.5)
    ax.set_xticklabels(labels, rotation=45, fontweight='normal', fontsize=6)

    fig.tight_layout()

    if save:
        fig.savefig('{}.png'.format(title[:5]),
                    dpi=300, bbox_inches='tight')

    return p_value


In [None]:
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from skbio.diversity.alpha import shannon
import matplotlib.gridspec as gridspec


# Figure defaults following the Nature Medicine guidelines: start from
# matplotlib's stock style, then apply small fonts, thin lines and an open
# (no top/right spine) frame.
plt.style.use('default')

_font_rc = {
    'font.sans-serif': ['Helvetica'],
    'font.size': 7,
    'axes.labelsize': 7,
    'xtick.labelsize': 6,
    'ytick.labelsize': 6,
    'legend.fontsize': 6,
}
_line_rc = {
    'axes.linewidth': 0.5,
    'xtick.major.size': 3,
    'ytick.major.size': 3,
    'xtick.major.width': 0.5,
    'ytick.major.width': 0.5,
}
_spine_rc = {
    'axes.spines.right': False,
    'axes.spines.top': False,
}
plt.rcParams.update({**_font_rc, **_line_rc, **_spine_rc})

fig = plt.figure(figsize=(180 / 25.4, 3))

# One row, four columns: a wide ROC panel followed by three narrow box-plot panels.
gs = gridspec.GridSpec(1, 4, width_ratios=[4, 1.2, 1.2, 1])

# Microbiome species abundances of breast cancer patients.
set1 = pd.read_csv('cancer_microbiome_abundances.csv', index_col=0)
# Microbiome species abundances of healthy controls.
all_mb = pd.read_csv('all_hc_abundances.csv', index_col=0)
# Covariates: age, gender, bmi.
basic_data = pd.read_csv('basic_data.csv', index_col=0)


from utils import find_match

# Keep only participants that also have covariate data.
cancer_index = [rc for rc in set1.index if rc in basic_data.index]
not_cancer_index = [rc for rc in all_mb.index if rc in basic_data.index]

cancer_data = basic_data.loc[cancer_index].copy()
cancer_data['gender'] = 0  # only females
cancer_data = cancer_data.dropna()
cancer_data['cancer'] = True
not_cancer_data = basic_data.loc[not_cancer_index].dropna()
not_cancer_data['cancer'] = False

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
df = pd.concat([cancer_data, not_cancer_data])
# Matching of 5 controls per case.  NOTE(review): find_match is a project
# helper; it is assumed to leave 'match' as NaN for unmatched rows -- confirm.
matched_df = find_match(df, TREATMENT='cancer', koef=5)
matched_rc = list(matched_df.dropna(subset=['match']).index)
# .copy() so that adding the label column below does not write into a view of all_mb.
set2 = all_mb[all_mb.index.isin(matched_rc)].copy()

set1['cancer'] = 1
set2['cancer'] = 0
# Species missing from either table are filled with a 0.0001 pseudo-abundance.
X = pd.concat([set1, set2]).fillna(0.0001)
y = X.pop('cancer')
X = np.log10(X)

# ROC panel (first, wide column).  Covariate columns that contain any NaN
# after the join are dropped by dropna(axis=1).
ax3 = fig.add_subplot(gs[0, 0])
do_classifier1(X.join(basic_data[['age', 'bmi']]).dropna(axis=1), y, title='Breast Cancer', ax=ax3)

# Untransformed abundances (still carrying the 'cancer' label column) for the box plots.
X1 = pd.concat([set1, set2]).fillna(0.0001)

XX = X1
fecaulabacterium = X1[[col for col in X1.columns if 'Faecalibacterium' in col]].sum(axis=1).apply(lambda x: np.log10(x))
# The label column must not enter the diversity computation: it equals 1 for
# every case and would otherwise be counted as an extra taxon for cases only.
# Values at or below the 0.0001 pseudo-abundance are excluded as well.
diversity = X1.drop(columns=['cancer']).apply(lambda x: shannon(x[x > 0.0001].values), axis=1)

XX['Faecalibacterium|'] = fecaulabacterium
XX['Shannon Diversity| '] = diversity
XX['Fusicatenibacter|Fusicatenibacter saccharivorans|2249.0'] = XX['Fusicatenibacter|Fusicatenibacter saccharivorans|2249.0'].apply(lambda x: np.log10(x))

# Box-plot panels (columns 2-4): BC vs HC for each feature.
for i, col in enumerate(['Faecalibacterium|', 'Shannon Diversity| ', 'Fusicatenibacter|Fusicatenibacter saccharivorans|2249.0']):
    ax = fig.add_subplot(gs[0, i+1])
    # Shannon diversity is on a linear scale; the other two are log10 values.
    exp_labels = col != 'Shannon Diversity| '
    boxplot_with_mannwhitney_final(XX.loc[list(set1.index), col],
        XX.loc[list(set2.index), col], ['BC', 'HC'], col.split('|')[0], ax=ax, exp_labels=exp_labels)


# tight_layout computes all subplot margins itself, so no manual
# subplots_adjust call is needed before it.
fig.tight_layout()

# Save the figure
fig.savefig('10k_predicting_cancer_all.png', dpi=300, bbox_inches='tight')

For endometriosis and IBD, the code is identical.