In [3]:
# Import necessary packages
import glob
import os
import pandas as pd
import reframed
from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceMatrix
from io import StringIO
from scipy.spatial.distance import pdist, squareform

# Directory paths used throughout the notebook. Each one ends with a trailing
# '/' because file and folder names are appended by plain string concatenation.
# NOTE(review): these are absolute, machine-specific locations; adjust them
# before running the notebook on another system.
figure_path = '/mnt/Local_Disk_1/2_Hospital_Microbiome/Data/Figures/'
input_path = '/mnt/Local_Disk_1/2_Hospital_Microbiome/Data/Modeling/Annotated_genomes/'
output_path = '/mnt/Local_Disk_1/2_Hospital_Microbiome/Data/Output_data/'

In [4]:
# Read the accession table; its first column is used as the row index
# (presumably the organism name -- confirm against accession_details.csv)
genome_to_organism = pd.read_csv(output_path + 'accession_details.csv', index_col=0)

# Forward lookup: index label -> assembly accession
mapping = genome_to_organism['assembly_accession'].to_dict()

# Reverse lookup: assembly accession -> index label
# (if an accession occurs more than once, the last index label wins)
mapping_inv = dict(zip(mapping.values(), mapping.keys()))

In [19]:
# Classify the genomes with COGclassifier (alternatively, run it from the command line)
#os.chdir(input_path)

#for genomes in glob.glob('*.faa'):
#    outfile = output_path + 'COG_classifier/' + genomes[:-4] 
#    cogclassifier.run(genomes, outfile)

In [5]:
# Folder holding one COG classifier result sub-directory per genome
cog_dir = output_path + 'COG_classifier/'

# Change into the result folder (kept from the original cell; every path below
# is built from cog_dir, so nothing in this cell depends on the working directory)
os.chdir(cog_dir)

# A hit counts as "present" only when its IDENTITY value is strictly above this
MIN_IDENTITY = 30

# List the genome sub-directories once, so that loading and zero-filling
# operate on exactly the same set of genomes
all_genomes = [
    entry for entry in os.listdir(cog_dir)
    if os.path.isdir(os.path.join(cog_dir, entry))
]

# Initiate an empty dictionary for the COG data: cog_dict[cog][genome] -> 1 / 0
cog_dict = {}

# Read the COG files
for genome in all_genomes:
    cogfile = pd.read_csv(os.path.join(cog_dir, genome, 'classifier_result.tsv'), sep='\t')

    # Keep the COG IDs of the hits that pass the identity filter
    present_cogs = cogfile.loc[cogfile['IDENTITY'] > MIN_IDENTITY, 'COG_ID']

    # Mark presence (1) of each COG in this genome
    for cog in present_cogs:
        cog_dict.setdefault(cog, {})[genome] = 1

# Mark absence (0) for every genome in which a COG was not found. After this
# step every COG has an entry for every genome, so the table contains no
# missing values and stays integer-typed without a separate fillna().
for genome_calls in cog_dict.values():
    for genome in all_genomes:
        genome_calls.setdefault(genome, 0)

# Convert the dictionary to a pandas DataFrame (rows: COGs, columns: genomes)
cog_df = pd.DataFrame(cog_dict).T

# Sort the DataFrame on both axes
cog_df = cog_df.sort_index(axis=0).sort_index(axis=1)

# Display the DataFrame
cog_df

Unnamed: 0,GCF_000019725.1,GCF_000019945.1,GCF_000023145.1,GCF_000023925.1,GCF_000024945.1,GCF_000069945.1,GCF_000083545.1,GCF_000144405.1,GCF_000152185.1,GCF_000153925.1,...,GCF_900099625.1,GCF_900167455.1,GCF_900169275.1,GCF_900176155.1,GCF_900445095.1,GCF_900447555.1,GCF_900453805.1,GCF_900459355.1,GCF_900637975.1,GCF_901542405.1
COG0001,0,0,1,1,1,1,0,0,1,0,...,0,1,1,1,0,1,1,0,1,1
COG0002,1,1,1,0,0,1,1,0,1,1,...,1,1,1,1,1,1,1,0,1,1
COG0003,0,0,1,1,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
COG0004,1,1,1,0,0,1,1,0,1,1,...,1,1,1,0,0,0,0,0,1,1
COG0005,1,1,0,1,1,1,1,1,1,1,...,0,1,1,0,0,0,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
COG5942,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
COG5943,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
COG5944,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
COG5945,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Jaccard distance between every pair of genomes; cog_df is transposed so that
# each row handed to pdist is one genome's COG presence/absence profile
cog_distance = pdist(cog_df.T, metric='jaccard')

# Expand the condensed distance vector into a square symmetric matrix
cog_matrix = squareform(cog_distance)

# Label both axes with the genome names, order them and round to 4 decimals
cog_distance_df = (
    pd.DataFrame(cog_matrix, index=cog_df.columns, columns=cog_df.columns)
    .sort_index(axis=0)
    .sort_index(axis=1)
    .round(4)
)
# Optional relabelling through mapping_inv (left disabled)
#cog_distance_df = cog_distance_df.rename(columns=mapping_inv, index=mapping_inv)

# Write the distance matrix as a tab-separated file
cog_distance_df.to_csv(output_path + 'distances/1_functional.tsv', sep='\t')

# Show the resulting matrix
cog_distance_df

Unnamed: 0,GCF_000019725.1,GCF_000019945.1,GCF_000023145.1,GCF_000023925.1,GCF_000024945.1,GCF_000069945.1,GCF_000083545.1,GCF_000144405.1,GCF_000152185.1,GCF_000153925.1,...,GCF_900099625.1,GCF_900167455.1,GCF_900169275.1,GCF_900176155.1,GCF_900445095.1,GCF_900447555.1,GCF_900453805.1,GCF_900459355.1,GCF_900637975.1,GCF_901542405.1
GCF_000019725.1,0.0000,0.2078,0.6516,0.6056,0.6612,0.6507,0.2194,0.7032,0.5773,0.6998,...,0.6973,0.3717,0.5696,0.6328,0.7003,0.6495,0.5937,0.7247,0.6286,0.5279
GCF_000019945.1,0.2078,0.0000,0.6450,0.6067,0.6615,0.6473,0.1162,0.6929,0.5821,0.6894,...,0.6910,0.3905,0.5857,0.6212,0.6916,0.6455,0.5860,0.7171,0.6279,0.5404
GCF_000023145.1,0.6516,0.6450,0.0000,0.4675,0.5666,0.3217,0.6406,0.6284,0.4338,0.6265,...,0.5512,0.6361,0.4678,0.3419,0.5607,0.3384,0.6090,0.5931,0.4475,0.4524
GCF_000023925.1,0.6056,0.6067,0.4675,0.0000,0.6163,0.4411,0.5949,0.6506,0.3516,0.6404,...,0.6016,0.5952,0.3768,0.4490,0.5970,0.4601,0.6163,0.6227,0.4489,0.4413
GCF_000024945.1,0.6612,0.6615,0.5666,0.6163,0.0000,0.5866,0.6599,0.5947,0.5929,0.5329,...,0.5543,0.6574,0.6162,0.5703,0.5626,0.5671,0.5945,0.5607,0.6021,0.6036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GCF_900447555.1,0.6495,0.6455,0.3384,0.4601,0.5671,0.3464,0.6358,0.6263,0.4464,0.6162,...,0.5699,0.6439,0.4742,0.3288,0.5839,0.0000,0.6047,0.5993,0.4319,0.4508
GCF_900453805.1,0.5937,0.5860,0.6090,0.6163,0.5945,0.5965,0.5814,0.6051,0.6027,0.6483,...,0.6409,0.5890,0.6155,0.5879,0.6414,0.6047,0.0000,0.6509,0.6032,0.6167
GCF_900459355.1,0.7247,0.7171,0.5931,0.6227,0.5607,0.6112,0.7128,0.6224,0.6231,0.5325,...,0.3593,0.7049,0.6361,0.6157,0.3961,0.5993,0.6509,0.0000,0.5792,0.6444
GCF_900637975.1,0.6286,0.6279,0.4475,0.4489,0.6021,0.4504,0.6217,0.6259,0.4269,0.6130,...,0.5452,0.6199,0.4475,0.4128,0.5771,0.4319,0.6032,0.5792,0.0000,0.4579


In [22]:
# Calculating the mash distance in the command line
# mash sketch -o genome_sketch ./../../Genomes/*.fa
# mash dist genome_sketch.msh genome_sketch.msh > mash_distance.tab

In [7]:
# Load the Mash distance file (tab-separated, no header row)
mash_distance = pd.read_csv(output_path + 'mash_distance/' + 'mash_distance.tab', sep='\t', header=None)
mash_distance.columns = ['Reference', 'Query', 'mash_distance', 'p-value', 'shared_hashes']

# Reshape the long table into a square Reference x Query matrix
mash_matrix = mash_distance.pivot(index='Reference', columns='Query', values='mash_distance')

# Reduce the genome file paths to the bare assembly accession. The dots of the
# './../../Genomes/' prefix are escaped so they only match literal dots, and the
# pattern is defined once so both axes are guaranteed to be treated identically.
genome_path_pattern = r'\./\.\./\.\./Genomes/(GCF_\d+\.\d+)\.fa'
mash_matrix.index = mash_matrix.index.str.replace(genome_path_pattern, r'\1', regex=True)
mash_matrix.columns = mash_matrix.columns.str.replace(genome_path_pattern, r'\1', regex=True)

# Order both axes and round to 4 decimals
mash_matrix = mash_matrix.sort_index(axis=0).sort_index(axis=1)
mash_matrix = mash_matrix.round(4)
#mash_matrix = mash_matrix.rename(columns=mapping_inv, index=mapping_inv)

# NOTE(review): the displayed matrix contains many distances of exactly 1.0 --
# TODO confirm whether the Mash sketch size is large enough to resolve the
# more distantly related genomes.

# Save the distance matrix to a tab-separated file
mash_matrix.to_csv(output_path + 'distances/2_mash.tsv', sep='\t')

# Display the DataFrame
mash_matrix

Query,GCF_000019725.1,GCF_000019945.1,GCF_000023145.1,GCF_000023925.1,GCF_000024945.1,GCF_000069945.1,GCF_000083545.1,GCF_000144405.1,GCF_000152185.1,GCF_000153925.1,...,GCF_900099625.1,GCF_900167455.1,GCF_900169275.1,GCF_900176155.1,GCF_900445095.1,GCF_900447555.1,GCF_900453805.1,GCF_900459355.1,GCF_900637975.1,GCF_901542405.1
Reference,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCF_000019725.1,0.0000,0.1782,1.0,1.000,1.000,1.000,0.1823,1.0,1.000,1.000,...,1.0000,1.000,1.000,1.000,1.000,1.0,1.0,1.000,1.000,1.000
GCF_000019945.1,0.1782,0.0000,1.0,1.000,1.000,1.000,0.1045,1.0,0.296,1.000,...,1.0000,0.263,1.000,1.000,1.000,1.0,1.0,1.000,1.000,1.000
GCF_000023145.1,1.0000,1.0000,0.0,1.000,1.000,0.296,1.0000,1.0,1.000,1.000,...,1.0000,1.000,0.296,0.296,1.000,1.0,1.0,1.000,1.000,1.000
GCF_000023925.1,1.0000,1.0000,1.0,0.000,1.000,0.296,0.2960,1.0,1.000,1.000,...,1.0000,1.000,1.000,0.296,1.000,1.0,1.0,1.000,0.263,0.296
GCF_000024945.1,1.0000,1.0000,1.0,1.000,0.000,1.000,1.0000,1.0,1.000,0.296,...,0.2960,1.000,1.000,1.000,0.296,1.0,1.0,0.263,1.000,0.296
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GCF_900447555.1,1.0000,1.0000,1.0,1.000,1.000,1.000,1.0000,1.0,1.000,1.000,...,1.0000,1.000,1.000,1.000,1.000,0.0,1.0,1.000,1.000,1.000
GCF_900453805.1,1.0000,1.0000,1.0,1.000,1.000,1.000,1.0000,1.0,1.000,1.000,...,1.0000,1.000,1.000,1.000,1.000,1.0,0.0,1.000,1.000,1.000
GCF_900459355.1,1.0000,1.0000,1.0,1.000,0.263,1.000,1.0000,1.0,1.000,0.296,...,0.2301,1.000,1.000,1.000,0.263,1.0,1.0,0.000,1.000,0.296
GCF_900637975.1,1.0000,1.0000,1.0,0.263,1.000,0.296,1.0000,1.0,1.000,1.000,...,1.0000,1.000,1.000,1.000,1.000,1.0,1.0,1.000,0.000,1.000


In [8]:
# Build the phylogenetic tree with GToTree on the command line (the pairwise distances are derived from it in the next cell)
# GToTree -f ./../../Genomes/fasta_files.txt -H Bacteria -t -L Species,Strain -j 10 -o Hosp_microbiome

In [9]:
# Path of the Newick tree file (presumably the GToTree output -- confirm)
# NOTE(review): output_path already ends with '/', so the leading '/' of the
# next component puts a double slash into the path; POSIX systems treat it as
# a single separator, but it is worth tidying.
newick_file = output_path + '/phylogenetic_tree/Hosp_microbiome/' + 'Hosp_microbiome.tre'

# Parse the file as a single tree in Newick format with Bio.Phylo
tree = Phylo.read(newick_file, 'newick')

# Function to calculate distances between all pairs of taxa
def compute_distance_matrix(tree):
    """Return the pairwise distances between all terminal nodes of ``tree``.

    Parameters
    ----------
    tree : object
        Tree exposing ``get_terminals()`` and ``distance(a, b)``, such as the
        object returned by ``Bio.Phylo.read``. Each terminal must have a
        ``name`` attribute.

    Returns
    -------
    pandas.DataFrame
        Square table whose index and columns are the terminal names, in the
        order given by ``get_terminals()``. Entry ``[a, b]`` holds
        ``tree.distance(a, b)``. A tree without terminals gives an empty frame.

    Notes
    -----
    The distance is assumed to be symmetric, so every unordered pair is
    evaluated only once and mirrored across the diagonal. This halves the
    number of ``tree.distance`` calls compared with filling all n * n cells.
    """
    terminals = tree.get_terminals()
    size = len(terminals)

    # Pre-allocate the square matrix; every cell is overwritten below
    matrix = [[0.0] * size for _ in range(size)]
    for i in range(size):
        # Start at j == i so the diagonal is still taken from tree.distance
        for j in range(i, size):
            distance = tree.distance(terminals[i], terminals[j])
            matrix[i][j] = distance
            matrix[j][i] = distance

    # Create a pandas DataFrame labelled with the terminal names
    labels = [term.name for term in terminals]
    return pd.DataFrame(matrix, index=labels, columns=labels)

# Pairwise distances between all tips of the tree, with both axes ordered by
# label and the values rounded to 4 decimals
distance_matrix_df = (
    compute_distance_matrix(tree)
    .sort_index(axis=0)
    .sort_index(axis=1)
    .round(4)
)
# Optional relabelling through mapping_inv (left disabled)
#distance_matrix_df = distance_matrix_df.rename(columns=mapping_inv, index=mapping_inv)

# Write the distance matrix as a tab-separated file
distance_matrix_df.to_csv(output_path + 'distances/3_phylogenetic.tsv', sep='\t')

# Show the resulting matrix
distance_matrix_df


Unnamed: 0,GCF_000019725.1,GCF_000019945.1,GCF_000023145.1,GCF_000023925.1,GCF_000024945.1,GCF_000069945.1,GCF_000083545.1,GCF_000144405.1,GCF_000152185.1,GCF_000153925.1,...,GCF_900099625.1,GCF_900167455.1,GCF_900169275.1,GCF_900176155.1,GCF_900445095.1,GCF_900447555.1,GCF_900453805.1,GCF_900459355.1,GCF_900637975.1,GCF_901542405.1
GCF_000019725.1,0.0000,0.1192,1.5036,1.5570,1.3167,1.5327,0.1208,1.6713,1.5205,1.4402,...,1.5801,0.6085,1.5640,1.5702,1.4881,1.5142,1.1859,1.5812,1.5057,1.4947
GCF_000019945.1,0.1192,0.0000,1.5106,1.5640,1.3237,1.5398,0.0300,1.6784,1.5275,1.4473,...,1.5871,0.6155,1.5710,1.5772,1.4952,1.5212,1.1929,1.5882,1.5127,1.5018
GCF_000023145.1,1.5036,1.5106,0.0000,0.8307,1.2721,0.3165,1.5122,1.7149,0.7942,1.3957,...,1.5355,1.5866,0.8377,0.4146,1.4435,0.3585,1.5165,1.5366,0.7794,0.5168
GCF_000023925.1,1.5570,1.5640,0.8307,0.0000,1.3256,0.8599,1.5656,1.7683,0.4279,1.4491,...,1.5889,1.6400,0.6382,0.8973,1.4970,0.8413,1.5699,1.5900,0.6426,0.8219
GCF_000024945.1,1.3167,1.3237,1.2721,1.3256,0.0000,1.3013,1.3253,1.5280,1.2891,1.0730,...,1.2129,1.3998,1.3326,1.3388,1.1209,1.2827,1.3296,1.2140,1.2743,1.2633
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GCF_900447555.1,1.5142,1.5212,0.3585,0.8413,1.2827,0.3877,1.5228,1.7255,0.8048,1.4063,...,1.5461,1.5972,0.8483,0.3607,1.4541,0.0000,1.5270,1.5472,0.7900,0.5274
GCF_900453805.1,1.1859,1.1929,1.5165,1.5699,1.3296,1.5456,1.1945,1.6842,1.5334,1.4531,...,1.5930,1.2689,1.5769,1.5831,1.5010,1.5270,0.0000,1.5941,1.5186,1.5076
GCF_900459355.1,1.5812,1.5882,1.5366,1.5900,1.2140,1.5658,1.5898,1.7925,1.5536,1.2232,...,0.3762,1.6642,1.5970,1.6033,0.7122,1.5472,1.5941,0.0000,1.5387,1.5278
GCF_900637975.1,1.5057,1.5127,0.7794,0.6426,1.2743,0.8086,1.5143,1.7170,0.6061,1.3978,...,1.5376,1.5888,0.6496,0.8460,1.4457,0.7900,1.5186,1.5387,0.0000,0.7706


In [10]:
# Metabolic distance matrix

# Load the metabolic models and create a presence/absence matrix of their reactions
model_path = '/mnt/Local_Disk_1/2_Hospital_Microbiome/Data/Modeling/Models/'

# reaction_dict[model_id][reaction_id] -> 1 if the model contains the reaction, else 0
reaction_dict = {}

# All model files (*.xml) in the model folder
models = glob.glob(model_path + '*.xml')

for model_file in models:
    # Model identifier = file name without its extension
    model_id = os.path.splitext(os.path.basename(model_file))[0]
    cbmodel = reframed.load_cbmodel(model_file)

    # Iterating over cbmodel.reactions yields the reaction entries that are
    # used as column labels; mark each of them as present in this model
    reaction_dict[model_id] = {reaction_id: 1 for reaction_id in cbmodel.reactions}

# Fill absent reactions with 0 for the models where they are missing.
# The union of all reactions is built once, and distinct loop names are used so
# that no loop variable shadows another one.
all_models = list(reaction_dict)
all_reactions = set().union(*reaction_dict.values())
for model_reactions in reaction_dict.values():
    for reaction_id in all_reactions:
        model_reactions.setdefault(reaction_id, 0)

# Convert the dictionary to a pandas DataFrame (rows: models, columns: reactions)
reaction_df = pd.DataFrame(reaction_dict).T

# Sort the DataFrame on both axes
reaction_df = reaction_df.sort_index(axis=0).sort_index(axis=1)

# Display the DataFrame
reaction_df


Unnamed: 0,Growth,R_12DGR120tipp,R_12DGR140tipp,R_12DGR141tipp,R_12DGR160tipp,R_12DGR161tipp,R_12DGR180tipp,R_12DGR181tipp,R_12PPDRDH,R_12PPDRte,...,R_sink_5drib_c,R_sink_aacald_c,R_sink_amob_c,R_sink_hemeO_c,R_sink_hmfurn_c,R_sink_lipopb_c,R_sink_mobd_c,R_sink_mththf_c,R_sink_sheme_c,R_unkFol
Abiotrophia_defectiva,1,0,0,0,0,0,0,0,1,1,...,1,0,0,0,0,1,0,1,0,0
Achromobacter_xylosoxidans,1,1,1,1,0,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
Acinetobacter_baumannii,1,1,1,1,1,1,1,1,0,0,...,0,0,1,1,0,0,1,0,0,0
Acinetobacter_johnsonii,1,0,1,0,1,0,0,1,0,0,...,0,0,1,1,0,0,1,0,0,0
Acinetobacter_junii,1,0,0,0,0,0,0,1,0,1,...,0,0,1,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Stutzerimonas_stutzeri,1,1,1,1,1,1,1,1,0,0,...,0,0,0,1,0,0,1,0,1,0
Veillonella_atypica,1,1,1,1,1,1,1,1,0,0,...,0,0,1,0,0,0,1,0,0,0
Veillonella_parvula,1,1,1,1,1,1,1,1,1,1,...,0,0,1,0,0,0,1,0,0,0
Xanthomonas_citri,1,1,1,1,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Jaccard distance between every pair of models, computed on the rows
# (one row per model) of the reaction presence/absence table
reaction_distance = pdist(reaction_df, metric='jaccard')

# Expand the condensed distance vector into a square symmetric matrix
reaction_matrix = squareform(reaction_distance)

# Label both axes with the model names, order them and round to 4 decimals
reaction_distance_df = (
    pd.DataFrame(reaction_matrix, index=reaction_df.index, columns=reaction_df.index)
    .sort_index(axis=0)
    .sort_index(axis=1)
    .round(4)
)
# Optional relabelling through mapping_inv (left disabled)
#reaction_distance_df = reaction_distance_df.rename(columns=mapping_inv, index=mapping_inv)

# Write the distance matrix as a tab-separated file
reaction_distance_df.to_csv(output_path + 'distances/4_metabolic.tsv', sep='\t')

# Show the resulting matrix
reaction_distance_df

Unnamed: 0,Abiotrophia_defectiva,Achromobacter_xylosoxidans,Acinetobacter_baumannii,Acinetobacter_johnsonii,Acinetobacter_junii,Acinetobacter_lwoffii,Acinetobacter_parvus,Acinetobacter_schindleri,Acinetobacter_ursingii,Actinomyces_oris,...,Streptococcus_gordonii,Streptococcus_infantis,Streptococcus_parasanguinis,Streptococcus_salivarius,Streptococcus_sanguinis,Stutzerimonas_stutzeri,Veillonella_atypica,Veillonella_parvula,Xanthomonas_citri,[Ruminococcus]_torques
Abiotrophia_defectiva,0.0000,0.7977,0.7599,0.7721,0.7571,0.7639,0.7682,0.7740,0.7679,0.6162,...,0.5128,0.5617,0.5104,0.5270,0.5435,0.7796,0.6856,0.6880,0.7632,0.6255
Achromobacter_xylosoxidans,0.7977,0.0000,0.5430,0.5843,0.5915,0.5706,0.6371,0.5719,0.5713,0.6805,...,0.7856,0.7605,0.7588,0.7723,0.7612,0.4228,0.6589,0.6543,0.4436,0.7056
Acinetobacter_baumannii,0.7599,0.5430,0.0000,0.3904,0.3836,0.3920,0.4471,0.4182,0.3800,0.6695,...,0.7302,0.7120,0.7085,0.7420,0.7242,0.5514,0.5762,0.5741,0.5179,0.6876
Acinetobacter_johnsonii,0.7721,0.5843,0.3904,0.0000,0.2631,0.2653,0.3081,0.2964,0.2798,0.6611,...,0.7263,0.7262,0.7087,0.7077,0.7087,0.5702,0.6305,0.6373,0.5583,0.6997
Acinetobacter_junii,0.7571,0.5915,0.3836,0.2631,0.0000,0.2934,0.3008,0.3275,0.2809,0.6480,...,0.7092,0.7181,0.6980,0.6929,0.7007,0.5564,0.6121,0.6129,0.5664,0.6882
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Stutzerimonas_stutzeri,0.7796,0.4228,0.5514,0.5702,0.5564,0.5548,0.5932,0.5622,0.5438,0.6533,...,0.7679,0.7278,0.7391,0.7472,0.7491,0.0000,0.6504,0.6417,0.4324,0.6850
Veillonella_atypica,0.6856,0.6589,0.5762,0.6305,0.6121,0.6399,0.6313,0.6451,0.6312,0.6618,...,0.6505,0.6311,0.6379,0.6628,0.6565,0.6504,0.0000,0.1684,0.6413,0.5653
Veillonella_parvula,0.6880,0.6543,0.5741,0.6373,0.6129,0.6389,0.6239,0.6532,0.6362,0.6587,...,0.6512,0.6473,0.6434,0.6645,0.6512,0.6417,0.1684,0.0000,0.6337,0.5811
Xanthomonas_citri,0.7632,0.4436,0.5179,0.5583,0.5664,0.5614,0.5941,0.5714,0.5430,0.6373,...,0.7482,0.7355,0.7482,0.7548,0.7421,0.4324,0.6413,0.6337,0.0000,0.7085
