In [2]:
import json
from Bio import SeqIO
from pprint import pprint
import os


In [3]:
# Load the two JSON inputs used by the rest of the notebook:
#   metadata            <- ../data/metadata.json
#   specific_genes_data <- ../large_data/trg.json
# Each file is opened in a `with` block so the handle is closed even when
# json.load raises. If a file is missing, the red message is printed and the
# FileNotFoundError is re-raised, so the cell really stops instead of falling
# through to a confusing NameError further down.
try:
    with open("../data/metadata.json", "r") as metadata_file:
        metadata = json.load(metadata_file)
except FileNotFoundError:
    print("\033[91m\n" + "No metadata file found, aborting" + "\n\033[0m")
    raise

try:
    with open("../large_data/trg.json", "r") as trg_file:
        specific_genes_data = json.load(trg_file)
except FileNotFoundError:
    print("\033[91m\n" + "No trg file found, aborting" + "\n\033[0m")
    raise


In [5]:
# Group genomes by the value at index 5 of their "lineage" list
# (presumably the genus rank - confirm against the metadata schema).
# Each genus maps to a list of [genome_id, set()] pairs; the empty set is
# filled in by a later cell.
genuses = {}
for genome_id, genome_meta in metadata.items():
    genus_name = genome_meta["lineage"][5]
    genuses.setdefault(genus_name, []).append([genome_id, set()])


[['RS_GCF_000715205.1', set()], ['RS_GCF_900166645.1', set()], ['RS_GCF_002153395.1', set()], ['RS_GCF_001584325.1', set()], ['RS_GCF_000691145.1', set()], ['RS_GCF_013155385.1', set()], ['GB_GCA_001938995.1', set()], ['RS_GCF_001584335.1', set()], ['RS_GCF_002744245.1', set()], ['RS_GCF_001969855.1', set()], ['RS_GCF_001461825.1', set()], ['RS_GCF_001307105.1', set()], ['RS_GCF_001042475.2', set()], ['RS_GCF_001592005.1', set()], ['RS_GCF_900186955.1', set()], ['RS_GCF_001278705.1', set()], ['RS_GCF_002027305.1', set()], ['RS_GCF_000011645.1', set()], ['RS_GCF_000300535.1', set()], ['RS_GCF_004116955.1', set()], ['RS_GCF_004124315.1', set()], ['RS_GCF_000196735.1', set()], ['GB_GCA_000332645.1', set()], ['RS_GCF_000691165.1', set()], ['RS_GCF_000507145.1', set()], ['RS_GCF_000262045.1', set()], ['RS_GCF_003431975.1', set()], ['RS_GCF_000009045.1', set()], ['RS_GCF_000245335.1', set()], ['RS_GCF_001969815.1', set()], ['RS_GCF_001042485.2', set()], ['RS_GCF_001517105.1', set()], ['RS_GC

In [33]:
# Create one output directory per genus.
# The path is relative, matching the "../large_data/genuses/..." location the
# FASTA-writing cell opens, instead of a machine-specific absolute home path.
# exist_ok=True makes the cell safe to re-run (Restart & Run All) once the
# directories already exist.
base_path = "../large_data/genuses/"
for genus in genuses:
    os.makedirs(os.path.join(base_path, genus), exist_ok=True)


In [34]:
# For every genome that has an entry in specific_genes_data, add to its set
# the first element of each record whose second element equals 0.
# (Presumably record[0] is a protein ID and record[1] a flag/count - confirm
# against the trg.json schema.)
for genus_members in genuses.values():
    for genome_id, specific_ids in genus_members:
        if genome_id not in specific_genes_data:
            continue
        specific_ids.update(
            entry[0] for entry in specific_genes_data[genome_id] if entry[1] == 0
        )


In [35]:
# Write one FASTA per genus containing the wanted proteins of all its genomes.
# The output is opened in "w" mode: each genus file is opened exactly once per
# run, so truncating keeps the cell idempotent. Append mode duplicated every
# record each time the cell was re-run.
for genus in genuses:
    output_file = f"../large_data/genuses/{genus}/{genus}_specific_genes.fasta"
    with open(output_file, "w") as handle:
        for genome_id, wanted in genuses[genus]:
            if not wanted:
                # Nothing to extract for this genome; skip parsing its
                # (potentially large) proteome file.
                continue

            input_file = f"../large_data/protein_faa_reps/bacteria/{genome_id}_protein.faa"
            # Filter on the original record IDs before they are rewritten below.
            records = [r for r in SeqIO.parse(input_file, "fasta") if r.id in wanted]

            # Prefix each header with the genome accession so proteins stay
            # traceable after being pooled into the per-genus file.
            for record in records:
                record.description = f"{genome_id}|{record.id}"
                record.id = record.description

            count = SeqIO.write(records, handle, "fasta")
            if count < len(wanted):
                print(
                    "Warning %i IDs not found in %s" % (len(wanted) - count, input_file)
                )
