In [None]:
# ------------------------------------------
# Script: 5 (Distances.ipynb)
# ------------------------------------------
# Author: Pratyay Sengupta
# ------------------------------------------

In [2]:
# Import required libraries
import os
import glob
import pandas as pd
import seaborn as sns
from io import StringIO
from Bio import Phylo
from scipy.spatial.distance import pdist, squareform
from reframed import load_cbmodel
from itertools import combinations


# Set data directories
input_dir = '/mnt/Local_Disk_1/2_Hospital_Microbiome/Data/Modeling/Annotated_genomes/'
output_dir = '/mnt/Local_Disk_1/2_Hospital_Microbiome/Data/Output_data/'
model_dir = '/mnt/Local_Disk_1/2_Hospital_Microbiome/Data/Modeling/Models/'
figures_dir = '/mnt/Local_Disk_1/2_Hospital_Microbiome/Data/Figures/'
dist_path = os.path.join(output_dir, 'distances')
os.makedirs(dist_path, exist_ok=True)

In [4]:
# ------------------------------
# Genome → Organism Mapping
# ------------------------------
genome_to_organism = pd.read_csv(os.path.join(output_dir, '3_accession_details.csv'), index_col=0)
mapping = genome_to_organism['assembly_accession'].to_dict()
mapping_inv = {v: k for k, v in mapping.items()}


In [19]:
mapping_inv

{'GCF_037041345.1': 'Abiotrophia defectiva',
 'GCF_022870085.1': 'Achromobacter xylosoxidans',
 'GCF_000369385.1': 'Acinetobacter baumannii',
 'GCF_016027055.1': 'Acinetobacter johnsonii',
 'GCF_000430225.1': 'Acinetobacter junii',
 'GCF_019048305.1': 'Acinetobacter lwoffii',
 'GCF_000248155.1': 'Acinetobacter parvus',
 'GCF_000368625.1': 'Acinetobacter schindleri',
 'GCF_000248135.1': 'Acinetobacter ursingii',
 'GCF_016127955.1': 'Actinomyces oris',
 'GCF_900637975.1': 'Actinomyces viscosus',
 'GCF_900445095.1': 'Aerococcus viridans',
 'GCF_022453665.1': 'Bacteroides thetaiotaomicron',
 'GCF_016117815.1': 'Bacteroides uniformis',
 'GCF_000741415.1': 'Bifidobacterium adolescentis',
 'GCF_000158015.1': 'Bifidobacterium breve',
 'GCF_017132775.1': 'Bifidobacterium longum',
 'GCF_016907455.1': 'Brachybacterium muris',
 'GCF_003994255.1': 'Brachybacterium paraconglomeratum',
 'GCF_900169275.1': 'Brevibacterium casei',
 'GCF_001584405.1': 'Brevibacterium ravenspurgense',
 'GCF_000204035.1':

In [None]:
# Classify the genomes using COG classifier (Alternatively command-line can be used)
#os.chdir(input_path)

#for genomes in glob.glob('*.faa'):
#    outfile = output_path + 'COG_classifier/' + genomes[:-4] 
#    cogclassifier.run(genomes, outfile)

In [11]:
# ------------------------------
# Functional Distance from COG Annotations
# ------------------------------
cog_dir = os.path.join(output_dir, 'COG_classifier')
cog_dict = {}
genomes = [g for g in os.listdir(cog_dir) if os.path.isdir(os.path.join(cog_dir, g))]

for genome in genomes:
    cog_file = os.path.join(cog_dir, genome, 'classifier_result.tsv')
    if os.path.exists(cog_file):
        cog_data = pd.read_csv(cog_file, sep='\t')
        filtered = cog_data[cog_data['IDENTITY'] > 30]['COG_ID']
        for cog in filtered:
            cog_dict.setdefault(cog, {})[genome] = 1

# Fill absent values with 0
for cog in cog_dict:
    for genome in genomes:
        cog_dict[cog].setdefault(genome, 0)

cog_df = pd.DataFrame(cog_dict).T.fillna(0).sort_index(axis=0).sort_index(axis=1)

# Jaccard distance
cog_dist = pdist(cog_df.T, metric='jaccard')
cog_matrix = squareform(cog_dist)
cog_distance_df = pd.DataFrame(cog_matrix, index=cog_df.columns, columns=cog_df.columns).round(4)

cog_distance_df.to_csv(os.path.join(dist_path, '1_functional.tsv'), sep='\t')

# Display the distance matrix
cog_distance_df.iloc[0:10, 0:10]

Unnamed: 0,GCF_000019725.1,GCF_000019945.1,GCF_000023145.1,GCF_000023925.1,GCF_000024945.1,GCF_000069945.1,GCF_000083545.1,GCF_000144405.1,GCF_000152185.1,GCF_000153925.1
GCF_000019725.1,0.0,0.2078,0.6516,0.6056,0.6612,0.6507,0.2194,0.7032,0.5773,0.6998
GCF_000019945.1,0.2078,0.0,0.645,0.6067,0.6615,0.6473,0.1162,0.6929,0.5821,0.6894
GCF_000023145.1,0.6516,0.645,0.0,0.4675,0.5666,0.3217,0.6406,0.6284,0.4338,0.6265
GCF_000023925.1,0.6056,0.6067,0.4675,0.0,0.6163,0.4411,0.5949,0.6506,0.3516,0.6404
GCF_000024945.1,0.6612,0.6615,0.5666,0.6163,0.0,0.5866,0.6599,0.5947,0.5929,0.5329
GCF_000069945.1,0.6507,0.6473,0.3217,0.4411,0.5866,0.0,0.6409,0.6163,0.4297,0.6164
GCF_000083545.1,0.2194,0.1162,0.6406,0.5949,0.6599,0.6409,0.0,0.6888,0.5748,0.6898
GCF_000144405.1,0.7032,0.6929,0.6284,0.6506,0.5947,0.6163,0.6888,0.0,0.6414,0.6144
GCF_000152185.1,0.5773,0.5821,0.4338,0.3516,0.5929,0.4297,0.5748,0.6414,0.0,0.6272
GCF_000153925.1,0.6998,0.6894,0.6265,0.6404,0.5329,0.6164,0.6898,0.6144,0.6272,0.0


In [None]:
# Calculating the mash distance in the command line
# mash sketch -o genome_sketch ./../../Genomes/*.fa
# mash dist genome_sketch.msh genome_sketch.msh > mash_distance.tab

In [13]:
# ------------------------------
# MASH Distance
# ------------------------------
mash_file = os.path.join(output_dir, 'mash_distance', 'mash_distance.tab')
mash_df = pd.read_csv(mash_file, sep='\t', header=None,
                      names=['Reference', 'Query', 'mash_distance', 'p-value', 'shared_hashes'])

# Clean labels
clean_label = lambda x: os.path.basename(x).replace('.fa', '')
mash_df['Reference'] = mash_df['Reference'].apply(clean_label)
mash_df['Query'] = mash_df['Query'].apply(clean_label)

mash_matrix = mash_df.pivot(index='Reference', columns='Query', values='mash_distance').sort_index().sort_index(axis=1).round(4)
mash_matrix.to_csv(os.path.join(dist_path, '2_mash.tsv'), sep='\t')

# Display the distance matrix
mash_matrix.iloc[0:10, 0:10]

Query,GCF_000019725.1,GCF_000019945.1,GCF_000023145.1,GCF_000023925.1,GCF_000024945.1,GCF_000069945.1,GCF_000083545.1,GCF_000144405.1,GCF_000152185.1,GCF_000153925.1
Reference,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
GCF_000019725.1,0.0,0.1782,1.0,1.0,1.0,1.0,0.1823,1.0,1.0,1.0
GCF_000019945.1,0.1782,0.0,1.0,1.0,1.0,1.0,0.1045,1.0,0.296,1.0
GCF_000023145.1,1.0,1.0,0.0,1.0,1.0,0.296,1.0,1.0,1.0,1.0
GCF_000023925.1,1.0,1.0,1.0,0.0,1.0,0.296,0.296,1.0,1.0,1.0
GCF_000024945.1,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.296
GCF_000069945.1,1.0,1.0,0.296,0.296,1.0,0.0,1.0,1.0,1.0,1.0
GCF_000083545.1,0.1823,0.1045,1.0,0.296,1.0,1.0,0.0,1.0,1.0,1.0
GCF_000144405.1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
GCF_000152185.1,1.0,0.296,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0
GCF_000153925.1,1.0,1.0,1.0,1.0,0.296,1.0,1.0,1.0,1.0,0.0


In [None]:
# Calculating the phylogenetic distance and tree using GToTree in the command line
# GToTree -f ./../../Genomes/fasta_files.txt -H Bacteria -t -L Species,Strain -j 10 -o Hosp_microbiome

In [15]:
# ------------------------------
# Phylogenetic Distance from Tree
# ------------------------------
newick_file = os.path.join(output_dir, 'phylogenetic_tree', 'Hosp_microbiome', 'Hosp_microbiome.tre')
tree = Phylo.read(newick_file, 'newick')

def compute_phylo_matrix(tree):
    terms = tree.get_terminals()
    labels = [term.name for term in terms]
    matrix = [[tree.distance(i, j) for j in terms] for i in terms]
    return pd.DataFrame(matrix, index=labels, columns=labels)

phylo_df = compute_phylo_matrix(tree).sort_index().sort_index(axis=1).round(4)
phylo_df.to_csv(os.path.join(dist_path, '3_phylogenetic.tsv'), sep='\t')

# Display the distance matrix
phylo_df.iloc[0:10, 0:10]

Unnamed: 0,GCF_000019725.1,GCF_000019945.1,GCF_000023145.1,GCF_000023925.1,GCF_000024945.1,GCF_000069945.1,GCF_000083545.1,GCF_000144405.1,GCF_000152185.1,GCF_000153925.1
GCF_000019725.1,0.0,0.1192,1.5036,1.557,1.3167,1.5327,0.1208,1.6713,1.5205,1.4402
GCF_000019945.1,0.1192,0.0,1.5106,1.564,1.3237,1.5398,0.03,1.6784,1.5275,1.4473
GCF_000023145.1,1.5036,1.5106,0.0,0.8307,1.2721,0.3165,1.5122,1.7149,0.7942,1.3957
GCF_000023925.1,1.557,1.564,0.8307,0.0,1.3256,0.8599,1.5656,1.7683,0.4279,1.4491
GCF_000024945.1,1.3167,1.3237,1.2721,1.3256,0.0,1.3013,1.3253,1.528,1.2891,1.073
GCF_000069945.1,1.5327,1.5398,0.3165,0.8599,1.3013,0.0,1.5414,1.7441,0.8234,1.4248
GCF_000083545.1,0.1208,0.03,1.5122,1.5656,1.3253,1.5414,0.0,1.68,1.5292,1.4489
GCF_000144405.1,1.6713,1.6784,1.7149,1.7683,1.528,1.7441,1.68,0.0,1.7318,1.6516
GCF_000152185.1,1.5205,1.5275,0.7942,0.4279,1.2891,0.8234,1.5292,1.7318,0.0,1.4126
GCF_000153925.1,1.4402,1.4473,1.3957,1.4491,1.073,1.4248,1.4489,1.6516,1.4126,0.0


In [27]:
# ------------------------------
# Metabolic Distance (Presence/Absence of Reactions)
# ------------------------------
reaction_dict = {}
models = glob.glob(os.path.join(model_dir, '*.xml'))

for path in models:
    model_id = os.path.basename(path).replace('.xml', '')
    model = load_cbmodel(path)
    for rxn in model.reactions:
        reaction_dict.setdefault(model_id, {})[rxn] = 1

# Fill in 0s for absent reactions
all_reactions = set(r for model in reaction_dict.values() for r in model)
for model_id in reaction_dict:
    for rxn in all_reactions:
        reaction_dict[model_id].setdefault(rxn, 0)

reaction_df = pd.DataFrame(reaction_dict).T.fillna(0).sort_index().sort_index(axis=1)

# Jaccard distance
reaction_dist = pdist(reaction_df, metric='jaccard')
reaction_matrix = squareform(reaction_dist)
reaction_distance_df = pd.DataFrame(reaction_matrix, index=reaction_df.index, columns=reaction_df.index).round(4)

# Invert the dictionary: name → accession
name_to_accession = {v.replace(' ', '_'): k for k, v in mapping_inv.items()}

# Apply specific replacements to keys
name_to_accession = {
    key.replace('Kocuria_sp._UCD-OTCP', 'Kocuria_sp.')
        .replace('Roseomonas_sp._B5', 'Roseomonas_sp.')
        .replace('Dermacoccus_sp._Ellin185', 'Dermacoccus_sp.'): val
    for key, val in name_to_accession.items()
}

reaction_distance_df = reaction_distance_df.rename(columns=name_to_accession, index=name_to_accession)

reaction_distance_df.to_csv(os.path.join(dist_path, '4_metabolic.tsv'), sep='\t')

# Display the distance matrix
reaction_distance_df.iloc[0:10, 0:10]

Unnamed: 0,GCF_037041345.1,GCF_022870085.1,GCF_000369385.1,GCF_016027055.1,GCF_000430225.1,GCF_019048305.1,GCF_000248155.1,GCF_000368625.1,GCF_000248135.1,GCF_016127955.1
GCF_037041345.1,0.0,0.7977,0.7599,0.7721,0.7571,0.7639,0.7682,0.774,0.7679,0.6162
GCF_022870085.1,0.7977,0.0,0.543,0.5843,0.5915,0.5706,0.6371,0.5719,0.5713,0.6805
GCF_000369385.1,0.7599,0.543,0.0,0.3904,0.3836,0.392,0.4471,0.4182,0.38,0.6695
GCF_016027055.1,0.7721,0.5843,0.3904,0.0,0.2631,0.2653,0.3081,0.2964,0.2798,0.6611
GCF_000430225.1,0.7571,0.5915,0.3836,0.2631,0.0,0.2934,0.3008,0.3275,0.2809,0.648
GCF_019048305.1,0.7639,0.5706,0.392,0.2653,0.2934,0.0,0.3254,0.2299,0.2946,0.6533
GCF_000248155.1,0.7682,0.6371,0.4471,0.3081,0.3008,0.3254,0.0,0.3524,0.3282,0.6512
GCF_000368625.1,0.774,0.5719,0.4182,0.2964,0.3275,0.2299,0.3524,0.0,0.2741,0.6795
GCF_000248135.1,0.7679,0.5713,0.38,0.2798,0.2809,0.2946,0.3282,0.2741,0.0,0.6595
GCF_016127955.1,0.6162,0.6805,0.6695,0.6611,0.648,0.6533,0.6512,0.6795,0.6595,0.0
