In [1]:
from Bio import SeqIO
import sys
import os
import pandas as pd
import numpy as np
from collections import Counter 
import re
import itertools
import fnmatch

In [78]:
### Parse original GBK files ###
# Writes one CSV row per GenBank record (contig) found in the raw *.gbff files.

# input files
path_raw = "/Users/sigalova/Desktop/Chlamydia_pangenome/data/genomes_apr_2019_raw"
# os.listdir() returns entries in arbitrary order; sort so the row order of the output is reproducible
gbk_filenames_raw = sorted(fnmatch.filter(os.listdir(path_raw), '*.gbff'))

# output file
outf_raw = "/Users/sigalova/Desktop/Chlamydia_pangenome/info/genome_info_by_contig_raw_apr2019.csv"
# drop the output of a previous run; only a missing file is an expected, ignorable condition
try:
    os.remove(outf_raw)
except FileNotFoundError:
    pass

# rows are collected in a plain list and converted to a data.frame once at the end
# (assigning genomes_raw.loc[i] row by row re-allocates the frame on every insert)
columns_raw = ["assembly_id", "accession", "id", "DNA_length", "full_name"]
records_raw = []

for fr in gbk_filenames_raw:

    # assembly id = first two "_"-separated fields of the file name
    assembly_id = "_".join(fr.split("_")[0:2])

    # the with-block closes the handle even if parsing fails;
    # a gbk file may hold several records (multiple contigs), so iterate over all of them
    with open(os.path.join(path_raw, fr), "r") as ih_r:
        for gb_record in SeqIO.parse(ih_r, "genbank"):
            records_raw.append([
                assembly_id,
                gb_record.name,
                gb_record.id,
                str(len(gb_record.seq)),  # kept as a string, as before
                gb_record.description,
            ])

# data.frame to write info
genomes_raw = pd.DataFrame(records_raw, columns = columns_raw)
genomes_raw.to_csv(outf_raw, index = False)

In [77]:
### Parse re-annotated GBK files from RAST ###
# Writes one CSV row per GenBank record (contig) found in the D*.gbk files.

# input files
path_annot = "/Users/sigalova/Desktop/Chlamydia_pangenome/data/genomes_combined_filt"
# os.listdir() returns entries in arbitrary order; sort so the row order of the output is reproducible
gbk_filenames_annot = sorted(fnmatch.filter(os.listdir(path_annot), 'D*.gbk'))

# output file
outf_annot = "/Users/sigalova/Desktop/Chlamydia_pangenome/info/genome_info_by_contig_annot_filt_apr2019.csv"
# drop the output of a previous run; only a missing file is an expected, ignorable condition
# (the variable name was previously misspelled here, and the resulting NameError was
#  silently swallowed by a bare except, so the old file was never actually removed)
try:
    os.remove(outf_annot)
except FileNotFoundError:
    pass

# rows are collected in a plain list and converted to a data.frame once at the end
# (assigning genomes_annot.loc[i] row by row re-allocates the frame on every insert)
columns_annot = ["genome_id", "accession"]
records_annot = []

for fa in gbk_filenames_annot:

    # genome id = file name without its 4-character ".gbk" extension
    genome_id = fa[0:-4]

    # the with-block closes the handle even if parsing fails;
    # a gbk file may hold several records (multiple contigs), so iterate over all of them
    with open(os.path.join(path_annot, fa), "r") as ih_r:
        for gb_record in SeqIO.parse(ih_r, "genbank"):
            records_annot.append([genome_id, gb_record.name])

# data.frame to write info
genomes_annot = pd.DataFrame(records_annot, columns = columns_annot)
genomes_annot.to_csv(outf_annot, index = False)