In [8]:
# Import necessary libraries
import cobra
import glob
import itertools
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd
import random
import seaborn as sns
from cobra.medium import minimal_medium
from scipy.stats import mannwhitneyu
from scipy.stats import ks_2samp

# Set the path for the data
# All locations are absolute paths on the analysis machine; adjust them
# before running the notebook elsewhere.
# Folder holding the per-environment network edge tables (<env>_edges.tsv)
input_path = '/mnt/Local_Disk_1/2_Hospital_Microbiome/Data/Output_data/network_files/'
# Folder holding the SBML (.xml) metabolic models, one file per organism
model_path = '/mnt/Local_Disk_1/2_Hospital_Microbiome/Data/Modeling/Models/'
# Folder receiving model_info.csv, the community table and the smetana run script
output_path = '/mnt/Local_Disk_1/2_Hospital_Microbiome/Data/5_Model_simulations/'
# Folder holding the per-environment abundance tables (<env>_filtered_data.csv)
sample_path = '/mnt/Local_Disk_1/2_Hospital_Microbiome/Data/Output_data/phyloseq_source/'
# Folder for figures (not referenced by the cells in this part of the notebook)
figure_path = '/mnt/Local_Disk_1/2_Hospital_Microbiome/Data/Figures/'

In [9]:
# Check that every model can be loaded and report its predicted growth
# (objective value) together with basic size statistics.
#
# The working directory is still switched to the model folder, exactly as
# before, in case other cells rely on it; the loading below no longer
# depends on it because every file is opened through its full path.
os.chdir(model_path)

# Collect one record per SBML file and build the DataFrame once at the end.
# Growing a DataFrame row by row with .loc is slow and leaves object dtypes.
model_records = []

# os.listdir returns entries in arbitrary order, so sort for a stable table
for model_file in sorted(os.listdir(model_path)):
    if not model_file.endswith(".xml"):
        continue

    model = cobra.io.read_sbml_model(os.path.join(model_path, model_file))
    solution = model.optimize()
    model_records.append({
        "Model": model_file,
        "Predicted growth rate": solution.objective_value,
        "Number of Genes": len(model.genes),
        "Number of Metabolites": len(model.metabolites),
        "Number of Reactions": len(model.reactions),
    })

model_info = pd.DataFrame(
    model_records,
    columns=["Model", "Predicted growth rate", "Number of Genes",
             "Number of Metabolites", "Number of Reactions"],
).set_index("Model")
model_info["Predicted growth rate"] = model_info["Predicted growth rate"].round(3)

# Save the results
model_info.to_csv(os.path.join(output_path, 'model_info.csv'), sep=",", index=True)

# Display the results
model_info

Adding exchange reaction EX_12ppd__R_e with default bounds for boundary metabolite: 12ppd__R_e.
Adding exchange reaction EX_14glucan_e with default bounds for boundary metabolite: 14glucan_e.
Adding exchange reaction EX_26dap__M_e with default bounds for boundary metabolite: 26dap__M_e.
Adding exchange reaction EX_2mpa_e with default bounds for boundary metabolite: 2mpa_e.
Adding exchange reaction EX_3amp_e with default bounds for boundary metabolite: 3amp_e.
Adding exchange reaction EX_3cmp_e with default bounds for boundary metabolite: 3cmp_e.
Adding exchange reaction EX_3mba_e with default bounds for boundary metabolite: 3mba_e.
Adding exchange reaction EX_3ump_e with default bounds for boundary metabolite: 3ump_e.
Adding exchange reaction EX_4abut_e with default bounds for boundary metabolite: 4abut_e.
Adding exchange reaction EX_4hpro_LT_e with default bounds for boundary metabolite: 4hpro_LT_e.
Adding exchange reaction EX_LalaDgluMdap_e with default bounds for boundary metabolite

Unnamed: 0_level_0,Predicted growth rate,Number of Genes,Number of Metabolites,Number of Reactions
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Abiotrophia_defectiva.xml,45.125,421,820,1154
Achromobacter_xylosoxidans.xml,62.063,1237,1638,2446
Acinetobacter_baumannii.xml,32.602,968,1433,1993
Acinetobacter_johnsonii.xml,37.415,760,1141,1600
Acinetobacter_junii.xml,28.275,732,1098,1560
...,...,...,...,...
Stutzerimonas_stutzeri.xml,36.313,1035,1455,2194
Veillonella_atypica.xml,39.532,525,1066,1546
Veillonella_parvula.xml,39.105,558,1087,1604
Xanthomonas_citri.xml,57.533,835,1529,2237


In [5]:
# Loading the edge and the sample data
environments = ["Hospital", "MetaSUB", "Office"]

# Per environment: the list of network nodes and the raw edge table
nodes_info, edges_info = {}, {}

for env in environments:
    # The files carry a .tsv extension but are parsed as comma-separated,
    # exactly as before (presumably they are comma-delimited; verify if
    # the 'v1'/'v2' columns are ever missing).
    net_edges = pd.read_csv(input_path + env + '_edges.tsv', sep=",")

    # Unique endpoints of all edges. Sorted because the iteration order of a
    # set of strings differs between Python sessions, which would make the
    # node list irreproducible across kernel restarts.
    nodes = sorted(set(net_edges['v1']).union(net_edges['v2']), key=str)

    nodes_info[env] = nodes
    edges_info[env] = net_edges

# Abundance tables per environment, indexed by species name
sample_info = {}

for env in environments:
    sample_info[env] = (
        pd.read_csv(sample_path + env + '_filtered_data.csv', sep=",")
        .set_index('Species')
    )

sample_info['Hospital'].head()

Unnamed: 0_level_0,MEE001,MEE002,MEE003,MEE004,MEE005,MEE006,MEE007,MEE008,MEE009,MEE010,...,WEE376,WEE377,WEE378,WEE379,WEE380,WEE381,WEE382,WEE384,WEE385,WEE386
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abiotrophia defectiva,0.0,0.00317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.17724,0.0,0.0,0.00578,0.0,0.00129,0.00777,0.00518
Achromobacter xylosoxidans,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00215,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Acinetobacter baumannii,0.00258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01955,0.00427,0.01885,0.00785,0.0,0.02886,0.00532,0.00551,0.00291,0.01378
Acinetobacter johnsonii,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.00255,0.0,0.0,0.01293,0.00204,0.0,0.0
Acinetobacter junii,0.00281,0.0104,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00193,0.0,0.00468,0.00409,0.0,0.003,0.00311,0.00556,0.0,0.004


In [6]:
# Build one undirected networkx graph per environment from its edge table.
# Every edge (v1, v2) stores the table's 'Weight' column as the edge
# attribute 'weight'.
network_info = {}

for env in environments:
    G = nx.Graph()
    env_edges = edges_info[env]

    # Bulk insertion instead of a Python-level iterrows() loop; the edges are
    # added in table order, so the node insertion order is unchanged.
    G.add_weighted_edges_from(
        zip(env_edges['v1'], env_edges['v2'], env_edges['Weight'])
    )

    network_info[env] = G

# Network details: one summary row per environment, built in a single step
# rather than by appending rows to an initially empty DataFrame.
network_data = pd.DataFrame(
    [
        {
            "Network": graph_name,
            "Number of Nodes": graph.number_of_nodes(),
            "Number of Edges": graph.number_of_edges(),
        }
        for graph_name, graph in network_info.items()
    ],
    columns=["Network", "Number of Nodes", "Number of Edges"],
).set_index("Network")

network_data

Unnamed: 0_level_0,Number of Nodes,Number of Edges
Network,Unnamed: 1_level_1,Unnamed: 2_level_1
Hospital,83,506
MetaSUB,43,97
Office,14,30


In [7]:
# Overall pool of organisms available for random selection:
# the union of the species indices of every abundance table.
all_orgs = set()
for df in sample_info.values():
    all_orgs.update(df.index)

# Sort the pool. list(set) order for strings changes between Python sessions
# (hash randomization), so without sorting the seeded random.sample draws made
# from this list would differ after every kernel restart.
all_orgs_list = sorted(all_orgs, key=str)

len(all_orgs_list)

114

In [11]:
# Set a seed for reproducibility
random.seed(42)

# Generate environment-specific communities
def generate_community(graph, env="Hospital", num_nodes=5, num_subgraphs=10, sample_info=None, max_attempts=None):
    """Draw random connected sub-networks that co-occur in at least one sample.

    Repeatedly samples ``num_nodes`` nodes from ``graph`` and keeps the draw
    when (a) the induced subgraph is connected and (b) at least one column of
    ``sample_info[env]`` is non-zero for every sampled organism found in that
    table. Draws are not de-duplicated, so the same community may be returned
    more than once.

    Parameters
    ----------
    graph : networkx.Graph
        Co-occurrence network of the environment.
    env : str
        Key into ``sample_info``; also used as prefix of the community id.
    num_nodes : int
        Number of organisms per community.
    num_subgraphs : int
        Number of communities to generate.
    sample_info : dict or None
        Mapping environment -> abundance DataFrame indexed by organism name.
        ``None`` (default) uses the notebook-level ``sample_info`` as it is
        at call time, rather than a copy frozen when the function was defined.
    max_attempts : int or None
        Optional cap on the number of random draws. ``None`` (default) keeps
        the historical behaviour of retrying without limit.

    Returns
    -------
    pandas.DataFrame
        Long table with columns ``"community id"`` and ``"organism id"``;
        ids follow ``{env}_{num_nodes}_{num_subgraphs}_{k}`` with k from 1.

    Raises
    ------
    ValueError
        If the graph has no connected component with at least ``num_nodes``
        nodes, i.e. no connected subgraph of that size can exist.
    RuntimeError
        If ``max_attempts`` is set and exhausted before enough communities
        were found.
    """
    if sample_info is None:
        # Resolve the notebook-level table lazily; the parameter shadows the
        # global name inside this function, hence the explicit lookup.
        sample_info = globals()["sample_info"]

    # The node list does not change between draws, so build it once
    node_list = list(graph.nodes())

    if num_subgraphs > 0:
        # Without a large enough connected component the rejection loop below
        # could never succeed and would spin forever; fail early instead.
        largest_component = max((len(c) for c in nx.connected_components(graph)), default=0)
        if num_nodes > largest_component:
            raise ValueError(
                f"Cannot draw connected communities of {num_nodes} nodes for {env}: "
                f"largest connected component has {largest_component} nodes"
            )

    community_df = {"community id": [], "organism id": []}
    communities_generated = 0
    attempts = 0

    while communities_generated < num_subgraphs:
        if max_attempts is not None and attempts >= max_attempts:
            raise RuntimeError(
                f"Only {communities_generated} of {num_subgraphs} communities found for {env} "
                f"after {max_attempts} attempts"
            )
        attempts += 1

        nodes = random.sample(node_list, num_nodes)
        subgraph = graph.subgraph(nodes)

        if not nx.is_connected(subgraph):
            continue

        organism_ids = list(subgraph.nodes())

        # Rows of the abundance table belonging to the sampled organisms.
        # NOTE: organisms absent from the table contribute no row and are
        # therefore silently ignored by the co-occurrence check below.
        abundance = sample_info[env]
        sample_subset = abundance.loc[abundance.index.isin(organism_ids)]

        # True for every sample (column) in which all selected rows are non-zero
        samples = (sample_subset != 0).all()
        if samples.sum() == 0:
            continue

        # Accept the draw and record one row per organism
        community_id = f"{env}_{num_nodes}_{num_subgraphs}_{communities_generated+1}"
        for org_id in organism_ids:
            community_df["community id"].append(community_id)
            community_df["organism id"].append(org_id)

        communities_generated += 1

    return pd.DataFrame(community_df)

# Generate random communities
def generate_rand_community(num_nodes=5, num_subgraphs=10, all_orgs=None):
    """Draw communities uniformly at random from a pool of organisms.

    Parameters
    ----------
    num_nodes : int
        Number of organisms per community (sampled without replacement).
    num_subgraphs : int
        Number of communities to generate.
    all_orgs : sequence or None
        Pool of organism names to sample from. ``None`` (default) uses the
        notebook-level ``all_orgs_list`` as it is at call time, rather than
        the list that existed when the function was defined.

    Returns
    -------
    pandas.DataFrame
        Long table with columns ``"community id"`` and ``"organism id"``;
        ids follow ``Random_{num_nodes}_{num_subgraphs}_{k}`` with k from 1.

    Raises
    ------
    ValueError
        Propagated from ``random.sample`` when ``num_nodes`` exceeds the
        size of the pool (or is negative).
    """
    if all_orgs is None:
        all_orgs = all_orgs_list

    # random.sample needs a sequence; materialise once for all draws
    pool = list(all_orgs)

    community_df = {"community id": [], "organism id": []}

    for community_number in range(1, num_subgraphs + 1):
        community_id = f"Random_{num_nodes}_{num_subgraphs}_{community_number}"
        for org_id in random.sample(pool, num_nodes):
            community_df["community id"].append(community_id)
            community_df["organism id"].append(org_id)

    return pd.DataFrame(community_df)

In [12]:
# Re-seed so this cell yields the same draws whenever it is re-run on its own
random.seed(42)

# Simulating communities
com_size = [2, 5, 8]   # organisms per community
num_com = [10]         # communities per (environment, size) combination

# Environments plus the random baseline
all_com_env = np.append(environments, ['Random'])

# Collect the per-combination tables and concatenate once at the end
# (pd.concat inside the loop re-copies the growing table on every step).
community_frames = []

# Generate communities
for env in all_com_env:
    for size in com_size:
        for num in num_com:
            if env != 'Random':
                community_df = generate_community(network_info[env], env=env, num_nodes=size, num_subgraphs=num, sample_info=sample_info)
                kind = "communities"
            else:
                community_df = generate_rand_community(num_nodes=size, num_subgraphs=num, all_orgs=all_orgs_list)
                kind = "random communities"

            community_frames.append(community_df)

            # Report communities, not table rows: each community occupies
            # one row per organism, so len(community_df) is size * count.
            n_generated = community_df['community id'].nunique()
            print(f"Number of {kind} generated for {env} with {size} nodes: {n_generated} ({len(community_df)} organism rows)")

# Overall community information
if community_frames:
    community_info = pd.concat(community_frames, ignore_index=True)
else:
    community_info = pd.DataFrame({"community id": [], "organism id": []})

# Model files use underscores instead of spaces. regex=False is explicit
# because the default of str.replace differs between pandas versions and the
# patterns below contain '.', which must be taken literally.
organism_ids = community_info['organism id'].str.replace(' ', '_', regex=False)

# Changing the name according to the models
model_name_map = {
    'Kocuria_sp._UCD-OTCP': 'Kocuria_sp.',
    'Roseomonas_sp._B5': 'Roseomonas_sp.',
    'Dermacoccus_sp._Ellin185': 'Dermacoccus_sp.',
}
for network_name, model_name in model_name_map.items():
    organism_ids = organism_ids.str.replace(network_name, model_name, regex=False)

community_info['organism id'] = organism_ids

# Save the community table: tab-separated, no header, no index
# (presumably the layout expected by smetana's -c option; confirm against
# the smetana documentation).
community_info.to_csv(output_path + 'community_info_2-5-8_10.csv', sep = "\t", index = False, header=False)

community_info

Number of communities generated for Hospital with 2 nodes and 10 communities: 20
Number of communities generated for Hospital with 5 nodes and 10 communities: 50
Number of communities generated for Hospital with 8 nodes and 10 communities: 80
Number of communities generated for MetaSUB with 2 nodes and 10 communities: 20
Number of communities generated for MetaSUB with 5 nodes and 10 communities: 50
Number of communities generated for MetaSUB with 8 nodes and 10 communities: 80
Number of communities generated for Office with 2 nodes and 10 communities: 20
Number of communities generated for Office with 5 nodes and 10 communities: 50
Number of communities generated for Office with 8 nodes and 10 communities: 80
Number of random communities generated for Random with 2 nodes and 10 communities: 20
Number of random communities generated for Random with 5 nodes and 10 communities: 50
Number of random communities generated for Random with 8 nodes and 10 communities: 80


Unnamed: 0,community id,organism id
0,Hospital_2_10_1,Klebsiella_pneumoniae
1,Hospital_2_10_1,Methylobacterium_radiotolerans
2,Hospital_2_10_2,Serratia_marcescens
3,Hospital_2_10_2,Micrococcus_luteus
4,Hospital_2_10_3,Corynebacterium_tuberculostearicum
...,...,...
595,Random_8_10_10,Massilia_timonae
596,Random_8_10_10,Mycolicibacterium_fortuitum
597,Random_8_10_10,Klebsiella_pneumoniae
598,Random_8_10_10,Enterobacter_cloacae


In [13]:
# Check model names
# Model identifiers are the SBML file names without directory and extension.
# os.path.splitext removes only the trailing '.xml', whereas
# str.replace('.xml', '') would also strip the text from inside a name.
model_files = [
    os.path.splitext(os.path.basename(file))[0]
    for file in glob.glob(model_path + '*.xml')
]

# Organisms used in the communities for which no model file exists;
# sorted so the listing is stable between runs. An empty list means
# every community member has a model.
mismatch = sorted(set(community_info['organism id']) - set(model_files))
mismatch

[]

In [17]:
# Write a bash script to run smetana
# The paths are derived from model_path / output_path instead of being typed
# out again, so the script cannot drift from the folders used by the rest of
# the notebook. Both variables end with '/', and the paths are written
# unquoted (the *.xml glob must stay unquoted), so they must not contain
# spaces.
smetana_community_file = output_path + 'community_info_2-5-8_10.csv'
smetana_output_file = output_path + 'Smetana_output_community_info_2-5-8_10.csv'

with open(output_path + 'run_smetana_community_info_2-5-8_10.sh', 'w') as f:
    f.write("#!/bin/bash\n")
    f.write(f"smetana {model_path}*.xml \\\n")
    f.write(f"    -c {smetana_community_file} \\\n")
    f.write(f"    -o {smetana_output_file} \\\n")
    # Terminate the last line with a newline so the file is a well-formed text file
    f.write("    --solver cplex --flavor bigg --molweight\n")

In [18]:
# Run the bash script
# IPython shell escapes: the {...} expression is evaluated in Python and the
# resulting path is substituted into the command line before it is executed.
# First mark the generated script as executable, then run it with bash.
!chmod +x {output_path + 'run_smetana_community_info_2-5-8_10.sh'}
!bash {output_path + 'run_smetana_community_info_2-5-8_10.sh'}

  warn('MRO: Failed to find a valid solution for: ' + org_id)
