In [8]:
import json
from Bio import SeqIO
from pprint import pprint
import os


In [9]:
# Load the two JSON inputs used by the rest of the notebook:
#   metadata            <- ../data/metadata.json
#   specific_genes_data <- ../large_data/trg.json
# Each file is opened in a `with` block so the handle is closed even if
# json.load raises. A missing file prints the red warning and then re-raises,
# so the notebook really stops instead of failing later with a NameError
# (or silently reusing a stale handle left over in the kernel).
try:
    with open("../data/metadata.json", "r") as metadata_file:
        metadata = json.load(metadata_file)
except FileNotFoundError:
    print("\033[91m\n" + "No metadata file found, aborting" + "\n\033[0m")
    raise

try:
    with open("../large_data/trg.json", "r") as trg_file:
        specific_genes_data = json.load(trg_file)
except FileNotFoundError:
    print("\033[91m\n" + "No trg file found, aborting" + "\n\033[0m")
    raise


In [10]:
# Group genomes by genus.
# Result: genuses[genus] is a list of [genome_id, set()] pairs; the empty set
# is a placeholder that a later cell fills with protein IDs.
genuses = {}
for genome, genome_info in metadata.items():
    # Index 5 of the "lineage" list is used as the genus name
    # (assumes a fixed-rank lineage layout -- TODO confirm against metadata.json).
    genus_name = genome_info["lineage"][5]
    genuses.setdefault(genus_name, []).append([genome, set()])


In [11]:
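# Disabled one-time setup: creates one directory per genus under `base_path`.
# NOTE: `base_path` is a machine-specific absolute path, and os.makedirs without
# exist_ok=True raises FileExistsError if a directory already exists.
# base_path = "/home/wojdob/Desktop/other/masters/large_data/genuses/"
# for genus in genuses:
#     os.makedirs(os.path.join(base_path, genus))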


In [12]:
# Fill each genome's ID set (second element of its [genome_id, set] pair) with
# the protein IDs listed for that genome in specific_genes_data.
for genome_entries in genuses.values():
    for genome_id, protein_ids in genome_entries:
        if genome_id not in specific_genes_data:
            continue
        # Each entry is indexed as (protein_id, flag); only entries whose flag
        # equals 0 are kept. NOTE(review): the meaning of the flag is not
        # visible here -- confirm against the code that produces trg.json.
        protein_ids.update(
            entry[0] for entry in specific_genes_data[genome_id] if entry[1] == 0
        )


In [13]:
# For every genus, write the selected proteins of all its genomes to
# ../large_data/genuses/<genus>/<genus>_specific_genes.fasta.
# Each written record is relabelled "<genome_id>|<original record id>".
for genus in genuses:
    output_file = f"../large_data/genuses/{genus}/{genus}_specific_genes.fasta"
    # Create the per-genus directory if needed so the cell runs on a fresh tree.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    # "w" rather than "a": each genus file is opened exactly once here, so
    # truncating makes re-running the cell idempotent instead of duplicating records.
    with open(output_file, "w") as handle:
        for species_tuple in genuses[genus]:
            genome_id = species_tuple[0]
            wanted = species_tuple[1]
            if not wanted:
                # Nothing to extract for this genome; skip parsing its FASTA file.
                continue
            input_file = f"../large_data/protein_faa_reps/bacteria/{genome_id}_protein.faa"

            # Collect the matching records in a list. The previous version
            # relabelled them by iterating a generator, which exhausted it, so
            # SeqIO.write received nothing and every genome triggered the warning.
            selected = []
            for record in SeqIO.parse(input_file, "fasta"):
                if record.id not in wanted:
                    continue
                record.description = f"{genome_id}|{record.id}"
                record.id = record.description
                selected.append(record)

            count = SeqIO.write(selected, handle, "fasta")
            if count < len(wanted):
                print(
                    "Warning %i IDs not found in %s" % (len(wanted) - count, input_file)
                )


In [15]:
# Write the selected proteins of every genome, across all genera, into a single
# FASTA file: ../large_data/all_genus_specific_proteins.fasta.
# The file is opened once in "w" mode (previously re-opened in append mode for
# every genus), so re-running the cell regenerates it instead of appending
# duplicate records.
# NOTE(review): records keep their original IDs here, whereas the per-genus
# export prefixes them with the genome ID -- confirm this difference is intended.
output_file = "../large_data/all_genus_specific_proteins.fasta"
with open(output_file, "w") as handle:
    for genus in genuses:
        for species_tuple in genuses[genus]:
            wanted = species_tuple[1]
            if not wanted:
                # Nothing to extract for this genome; skip parsing its FASTA file.
                continue
            input_file = f"../large_data/protein_faa_reps/bacteria/{species_tuple[0]}_protein.faa"
            # Single-pass generator: consumed exactly once by SeqIO.write below.
            records = (r for r in SeqIO.parse(input_file, "fasta") if r.id in wanted)

            count = SeqIO.write(records, handle, "fasta")
            if count < len(wanted):
                print(
                    "Warning %i IDs not found in %s" % (len(wanted) - count, input_file)
                )
