Inputs: GFF3 annotation files and genome FASTA files (one of each for the primary contigs and for the haplotigs)

Outputs: set of fasta files (CDS, protein, gene), for both primary contigs and haplotigs independently and combined. All fasta files will use ID tags as headers (as opposed to locus tags).

In [1]:
import pandas as pd
import os
import re

# Version of the DK_0911 genome being processed; it selects the directory and file prefix.
GENOME_VERSION = 'v04'
GENOME = 'DK_0911_%s' % GENOME_VERSION
GENOME_PATH = '/home/gamran/genome_analysis/Warrior/Richard/output/genome_%s/' % GENOME_VERSION

# Captures the locus_tag value out of a GFF3 attributes string.
LOCUS_SEARCH = re.compile(r'^.*locus_tag=(.*?)(;|$)')

# Annotation files for the primary contigs (p_ctg) and the haplotigs (h_ctg).
P_CTG_GFF_LOC = os.path.join(GENOME_PATH, GENOME + '_p_ctg.anno.gff3')
H_CTG_GFF_LOC = os.path.join(GENOME_PATH, GENOME + '_h_ctg.anno.gff3')

# Column names applied to the header-less GFF3 tables.
GFF_COLUMNS = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']

pgff_df = pd.read_table(P_CTG_GFF_LOC, header=None, names=GFF_COLUMNS)
hgff_df = pd.read_table(H_CTG_GFF_LOC, header=None, names=GFF_COLUMNS)

In [2]:
# Keep only the rows whose feature type is 'gene'.
pgff_genes_df = pgff_df.loc[pgff_df['type'] == 'gene']
hgff_genes_df = hgff_df.loc[hgff_df['type'] == 'gene']

In [3]:
def getBedDf(gff_genes_df):
    '''Return a BED6 DataFrame built from a gene-only GFF DataFrame.

    gff_genes_df: DataFrame with at least the columns seqid, start, end,
        attributes, score and strand.

    Returns a new DataFrame indexed by chrom, with the columns chromStart,
    chromEnd, name, score and strand. chromStart is the GFF start minus one,
    and name is the locus_tag value captured from the attributes column.

    Raises ValueError if a row's attributes contain no locus_tag.
    '''
    def locus_tag(attributes):
        # Fail with a message naming the offending row instead of an opaque
        # "'NoneType' object has no attribute 'group'".
        found = LOCUS_SEARCH.match(attributes) if isinstance(attributes, str) else None
        if found is None:
            raise ValueError('no locus_tag found in GFF attributes: %r' % (attributes,))
        return found.group(1)

    bed_df = gff_genes_df.loc[:, ['seqid', 'start', 'end', 'attributes', 'score', 'strand']]
    bed_df = bed_df.rename(columns={'seqid':'chrom', 'start':'chromStart', 'end':'chromEnd', 'attributes':'name'})

    # BED coordinates must be integers; cast explicitly in case the parent
    # table's coordinate columns were loaded as float (e.g. rows with missing
    # values elsewhere in the file).
    bed_df['chromStart'] = bed_df['chromStart'].astype('int64') - 1
    bed_df['chromEnd'] = bed_df['chromEnd'].astype('int64')
    bed_df['name'] = bed_df['name'].apply(locus_tag)
    bed_df.set_index('chrom', inplace = True)

    return bed_df
# BED tables for the primary-contig genes and the haplotig genes, in that order.
pbed_df, hbed_df = map(getBedDf, (pgff_genes_df, hgff_genes_df))

In [4]:
def writeDfToBed(out_name, bed_df, genome_path = GENOME_PATH):
    '''Save bed_df as a tab-separated BED file called out_name inside genome_path.'''
    bed_path = os.path.join(genome_path, out_name)
    with open(bed_path, 'w') as bed_handle:
        # No header row is written; the index goes out as the first column.
        bed_df.to_csv(bed_handle, sep='\t', header = False)

# Write the primary-contig BED file, then the haplotig one.
for _bed_name, _bed_table in (
    ('%s_p_ctg.gene.bed' % GENOME, pbed_df),
    ('%s_h_ctg.gene.bed' % GENOME, hbed_df),
):
    writeDfToBed(_bed_name, _bed_table)

In [5]:
# Work inside GENOME_PATH so the relative file names passed to bedtools resolve.
os.chdir(GENOME_PATH)
# Extract one FASTA record per BED interval (gene) from each genome FASTA.
# Per the bedtools getfasta documentation, -name labels each record with the
# BED name column and -s makes the extraction strand-aware.
!bedtools getfasta -name -s -fi {GENOME}_h_ctg.fa -bed {GENOME}_h_ctg.gene.bed -fo {GENOME}_h_ctg.gene.long_header.fa
!bedtools getfasta -name -s -fi {GENOME}_p_ctg.fa -bed {GENOME}_p_ctg.gene.bed -fo {GENOME}_p_ctg.gene.long_header.fa 
# Headers in the resulting long_header files look like:
# DK0911_18236::hcontig_000_001:131-2632(-)

In [6]:
from Bio import SeqIO

def changeHeaders(in_file, out_file, genome_path = GENOME_PATH):
    '''Rewrite a FASTA file so that each header is only the locus tag.

    Headers such as DK0911_18236::hcontig_000_001:131-2632(-) in in_file
    become DK0911_18236 in out_file, with an empty description.

    in_file: name of the long_header FASTA file; it is deleted once out_file
        has been written.
    out_file: name of the FASTA file to create.
    genome_path: directory the file names are relative to. The process
        working directory is changed to it.
    '''
    os.chdir(genome_path)

    def short_header_records(handle):
        # Yield records one at a time so the whole FASTA never sits in memory.
        for record in SeqIO.parse(handle, "fasta"):
            # Cut at the '::' separating the name from the coordinates rather
            # than assuming every tag is exactly 12 characters long. Headers
            # without the separator keep the legacy 12-character slice.
            tag, separator, _ = record.id.partition('::')
            record.id = tag if separator else record.id[:12]
            record.description = ''
            yield record

    with open(in_file, "r") as input_handle, open(out_file, "w") as output_handle:
        SeqIO.write(short_header_records(input_handle), output_handle, "fasta")
    os.remove(in_file)

# Shorten the headers of the haplotig gene FASTA, then the primary-contig one.
for _long_fa, _short_fa in (
    ("%s_h_ctg.gene.long_header.fa" % GENOME, "%s_h_ctg.gene.fa" % GENOME),
    ("%s_p_ctg.gene.long_header.fa" % GENOME, "%s_p_ctg.gene.fa" % GENOME),
):
    changeHeaders(_long_fa, _short_fa)

In [7]:
# Generate PROTEIN & CDS fasta files for the primary contigs and haplotigs.
# write_fasta_from_gff.py is an external script that is not part of this notebook.
# NOTE(review): judging by the arguments, -i is the annotation GFF3, -o the
# output FASTA, -t the sequence type and -f the genome FASTA; confirm against
# the script's own help.
os.chdir(GENOME_PATH)
!write_fasta_from_gff.py -i {GENOME}_p_ctg.anno.gff3 -o {GENOME}_p_ctg.protein.fa -t protein -f {GENOME}_p_ctg.fa
!write_fasta_from_gff.py -i {GENOME}_h_ctg.anno.gff3 -o {GENOME}_h_ctg.protein.fa -t protein -f {GENOME}_h_ctg.fa
!write_fasta_from_gff.py -i {GENOME}_h_ctg.anno.gff3 -o {GENOME}_h_ctg.cds.fa -t cds -f {GENOME}_h_ctg.fa
!write_fasta_from_gff.py -i {GENOME}_p_ctg.anno.gff3 -o {GENOME}_p_ctg.cds.fa -t cds -f {GENOME}_p_ctg.fa

In [8]:
# Switch to the scripts directory and execute the companion notebook in this session.
# NOTE(review): presumably this is what defines changeFastaToIdTag, which is
# called later but not defined in this file; confirm.
os.chdir('/home/gamran/genome_analysis/Warrior/Richard/scripts')
%run DK_0911_dictionaries.ipynb

In [9]:
## At this point, all fasta files have 'locus_tag' from the gff3 file 'attributes' column as
## their sequence identifier. This is changed to the id tag using
## functions in the DK_0911_dictionaries.ipynb

# 1) Create combined (primary + haplotig) fasta files, with ID tags as headers
# 2) Re-write independent fasta files (primary / haplotig) with ID tags as headers
#
# NOTE(review): step 1 reads the per-haplotype files before step 2 overwrites
# them in place (same path as input and output). Presumably changeFastaToIdTag
# expects locus-tag headers on input, so the order of the calls below matters;
# confirm against its definition before reordering.

# Per-haplotype inputs: primary contigs (P_*) and haplotigs (H_*).
P_GENE_FA = os.path.join(GENOME_PATH, '%s_p_ctg.gene.fa' % GENOME)
P_PROTEIN_FA = os.path.join(GENOME_PATH, '%s_p_ctg.protein.fa' % GENOME)
P_CDS_FA = os.path.join(GENOME_PATH, '%s_p_ctg.cds.fa' % GENOME)

H_GENE_FA = os.path.join(GENOME_PATH, '%s_h_ctg.gene.fa' % GENOME)
H_PROTEIN_FA = os.path.join(GENOME_PATH, '%s_h_ctg.protein.fa' % GENOME)
H_CDS_FA = os.path.join(GENOME_PATH, '%s_h_ctg.cds.fa' % GENOME)

# Combined (primary + haplotig) outputs.
PH_PROTEIN_FA = os.path.join(GENOME_PATH, '%s_ph_ctg.protein.fa' % GENOME)
PH_GENE_FA = os.path.join(GENOME_PATH, '%s_ph_ctg.gene.fa' % GENOME)
PH_CDS_FA = os.path.join(GENOME_PATH, '%s_ph_ctg.cds.fa' % GENOME)

# Step 1: combined files, built from both per-haplotype inputs.
changeFastaToIdTag([P_GENE_FA, H_GENE_FA], PH_GENE_FA)
changeFastaToIdTag([P_PROTEIN_FA, H_PROTEIN_FA], PH_PROTEIN_FA)
changeFastaToIdTag([P_CDS_FA, H_CDS_FA], PH_CDS_FA)

# Step 2: primary-contig files, each written back to its own path.
changeFastaToIdTag([P_GENE_FA], P_GENE_FA)
changeFastaToIdTag([P_PROTEIN_FA], P_PROTEIN_FA)
changeFastaToIdTag([P_CDS_FA], P_CDS_FA)

# Step 2 (continued): haplotig files, each written back to its own path.
changeFastaToIdTag([H_GENE_FA], H_GENE_FA)
changeFastaToIdTag([H_PROTEIN_FA], H_PROTEIN_FA)
changeFastaToIdTag([H_CDS_FA], H_CDS_FA)

True