In [3]:
# Import necessary packages
import glob
import os
import shlex

import pandas as pd

# Directory layout: every working folder lives under one shared data root,
# so the root is spelled out once and the sub-folders are appended to it.
data_root = '/mnt/Local_Disk_1/Hospital_Microbiome/Data/'

figure_path = data_root + 'Figures/'                 # rendered figures
input_path = data_root + 'Input_data/'               # input tables read by the notebook
model_path = data_root + 'Modeling/Models/'          # model files and the generation script
output_path = data_root + 'Output_data/'             # tables and lists written by the notebook

In [4]:
# Build `genome_info`: one row per organism with its assembly accession and
# gram-staining label, indexed by a two-word organism name.

# Load the gram staining information file (indexed by organism name)
staining_info = pd.read_csv(input_path + 'gram_staining.csv', sep=',', index_col="organism_name")

# Load the genome information file and keep only the accession column
genome_details = pd.read_csv(output_path + 'genome_details.csv', sep=',', index_col="organism_name")
accession_by_organism = genome_details[["assembly_accession"]].copy()

# Matching index: keep only the first two whitespace-separated words of each
# organism name so it can be matched against the staining table's index.
accession_by_organism.index = accession_by_organism.index.to_series().apply(
    lambda x: ' '.join(x.split()[:2])
)

# The merge below is an inner join, so organisms without a staining entry are
# dropped. Report them instead of losing them silently.
unmatched = accession_by_organism.index.difference(staining_info.index)
if len(unmatched) > 0:
    print(f"WARNING: {len(unmatched)} organism(s) have no gram staining entry and are dropped: {list(unmatched)}")

# Merge the two dataframes
genome_info = accession_by_organism.merge(staining_info, left_index=True, right_index=True)

# Truncating names to two words can make several genomes share one name;
# such rows would later map to the same model file name, so flag them.
duplicated_names = genome_info.index[genome_info.index.duplicated()].unique()
if len(duplicated_names) > 0:
    print(f"WARNING: {len(duplicated_names)} organism name(s) occur more than once: {list(duplicated_names)}")

genome_info

Unnamed: 0_level_0,assembly_accession,gram_staining
organism_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Abiotrophia defectiva,GCF_037041345.1,positive
Achromobacter xylosoxidans,GCF_022870085.1,negative
Acinetobacter baumannii,GCF_000369385.1,negative
Acinetobacter johnsonii,GCF_016027055.1,negative
Acinetobacter junii,GCF_000430225.1,negative
...,...,...
Stutzerimonas stutzeri,GCF_000219605.1,negative
Veillonella atypica,GCF_002959915.1,negative
Veillonella parvula,GCF_000024945.1,negative
Xanthomonas citri,GCF_002018575.1,negative


In [10]:
# Destination of the accession list (one accession id per line)
accession_path = output_path + 'accession.list'

# Collect the accession ids from the merged genome table
accessions = genome_info['assembly_accession'].to_list()

# Emit every id followed by a newline in a single writelines call
with open(accession_path, 'w') as list_file:
    list_file.writelines("%s\n" % accession_id for accession_id in accessions)

In [None]:
# The steps below were performed manually in the terminal (they are not executed by this notebook)
# conda activate bit
# bit-dl-ncbi-assemblies -w accession.list -j 80 -f fasta
# gunzip *.gz

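# Run prokka on the downloaded genomes (also done in the terminal; the exact command is not recorded here)
# NOTE: the carve step later in this notebook reads <assembly_accession>.faa files from
# Modeling/Annotated_genomes/ - presumably the prokka protein output; confirm before re-running.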



In [19]:
# Write a bash script containing one `carve` command per row of genome_info.
# Each command reads <assembly_accession>.faa from the annotated-genomes folder
# and writes <Organism_name>.xml. The output path is relative, so the models
# land in whatever directory the script is run from.
carve_file_path = model_path + 'generate_models.sh'
annotated_file_path = '/mnt/Local_Disk_1/Hospital_Microbiome/Data/Modeling/Annotated_genomes/'
with open(carve_file_path, "w") as f:
    f.write("#!/bin/bash\n")
    for idx, row in genome_info.iterrows():
        # Shell-quote every interpolated value so that names containing
        # spaces or shell metacharacters cannot break (or inject into) the
        # generated script. shlex.quote leaves already-safe strings unchanged.
        faa_file = shlex.quote(f"{annotated_file_path}{row['assembly_accession']}.faa")
        xml_file = shlex.quote(f"{idx.replace(' ', '_')}.xml")
        # Any staining label other than 'positive' falls back to 'gramneg'.
        universe = 'grampos' if row['gram_staining'] == 'positive' else 'gramneg'
        done_message = shlex.quote(f"{idx} model is done")
        failed_message = shlex.quote(f"{idx} model FAILED")
        # Only report success when carve actually exits with status 0;
        # failures are reported on stderr instead of being announced as done.
        f.write(f"if carve {faa_file} -o {xml_file} --solver cplex -u {universe}; then\n")
        f.write(f"    echo {done_message}\n")
        f.write("else\n")
        f.write(f"    echo {failed_message} >&2\n")
        f.write("fi\n")
# Print the completion message
print(f"Generated bash commands written to {carve_file_path}")

Generated bash commands written to /mnt/Local_Disk_1/Hospital_Microbiome/Data/Modeling/Models/generate_models.sh


In [20]:
# Run the bash script
# Switch the working directory to the models folder first: the generated
# carve commands write their .xml output to relative paths, so the files land
# in the directory the script is launched from. Note that os.chdir changes the
# working directory for every later cell run in this kernel as well.
os.chdir(model_path)

# IPython shell escapes: mark the generated script as executable, then run it.
!chmod +x {carve_file_path}
!{carve_file_path}

Abiotrophia defectiva model is done
Achromobacter xylosoxidans model is done
Acinetobacter baumannii model is done
Acinetobacter johnsonii model is done
Acinetobacter junii model is done
Acinetobacter lwoffii model is done
Acinetobacter parvus model is done
Acinetobacter schindleri model is done
Acinetobacter ursingii model is done
Actinomyces oris model is done
Actinomyces viscosus model is done
Aerococcus viridans model is done
Bacteroides thetaiotaomicron model is done
Bacteroides uniformis model is done
Bifidobacterium adolescentis model is done
Bifidobacterium breve model is done
Bifidobacterium longum model is done
Brachybacterium muris model is done
Brachybacterium paraconglomeratum model is done
Brevibacterium casei model is done
Brevibacterium ravenspurgense model is done
Brevundimonas diminuta model is done
Caulobacter vibrioides model is done
Chroococcidiopsis thermalis model is done
Collinsella aerofaciens model is done
Corynebacterium accolens model is done
Corynebacterium

In [21]:
# Compare the .xml files present in model_path with the organisms in
# genome_info and list the organisms for which no model file exists.

# Check all generated models: file names without directory and extension
model_stems = {
    os.path.splitext(os.path.basename(xml_file))[0]
    for xml_file in glob.glob(model_path + '*.xml')
}
# Human-readable organism names recovered from the file names
models = sorted(stem.replace('_', ' ') for stem in model_stems)

# Check non-generated models. Each organism name is converted to its file
# stem the same way the generation script does (spaces -> underscores), which
# avoids the lossy reverse mapping, and genome_info order is kept so the
# result is reproducible between runs.
model_ng = [
    organism
    for organism in genome_info.index.unique()
    if organism.replace(' ', '_') not in model_stems
]

# Print accession for the genomes
missing_genome_info = genome_info.loc[model_ng]
missing_genome_info

Unnamed: 0_level_0,assembly_accession,gram_staining
organism_name,Unnamed: 1_level_1,Unnamed: 2_level_1
