In [102]:
# cd /home/gamran/genome_analysis/Warrior/Richard/output/genome_v03
# bedtools getfasta -name -fi DK_0911_v03_h_ctg.fa -bed DK_0911_v03_h_ctg.gene.bed -fo DK_0911_v03_h_ctg.gene.fa
# bedtools getfasta -name -fi DK_0911_v03_p_ctg.fa -bed DK_0911_v03_p_ctg.gene.bed -fo DK_0911_v03_p_ctg.gene.fa 

In [117]:
import pandas as pd
import os
import re

# Captures the value that follows "locus_tag=" in a GFF attributes string,
# up to the next ';' or the end of the string.
LOCUS_SEARCH = re.compile(r'^.*locus_tag=(.*?)(;|$)')

# Directory holding the genome_v03 annotation inputs and the derived outputs.
BASE_DIR = '/home/gamran/genome_analysis/Warrior/Richard/output/genome_v03/'

H_CTG_GFF_LOC = BASE_DIR + 'DK_0911_v03_h_ctg.anno.gff3'
P_CTG_GFF_LOC = BASE_DIR + 'DK_0911_v03_p_ctg.anno.gff3'

# Column labels applied to the header-less annotation tables, shared by both reads.
GFF_COLUMNS = [
    'seqid', 'source', 'type', 'start', 'end',
    'score', 'strand', 'phase', 'attributes',
]

pgff_df = pd.read_table(P_CTG_GFF_LOC, header=None, names=GFF_COLUMNS)
hgff_df = pd.read_table(H_CTG_GFF_LOC, header=None, names=GFF_COLUMNS)

In [119]:
# Keep only the annotation rows whose feature type is exactly 'gene'.
pgff_genes_df = pgff_df.loc[pgff_df['type'].eq('gene')]
hgff_genes_df = hgff_df.loc[hgff_df['type'].eq('gene')]

In [169]:
def getBedDf(gff_genes_df, locus_search=None):
    '''Returns a BED6 DataFrame built from a gene-only GFF DataFrame.

    Parameters:
        gff_genes_df: DataFrame with at least the columns 'seqid', 'start',
            'end', 'attributes', 'score' and 'strand'. It is not modified.
        locus_search: optional compiled regex whose first group captures the
            feature name inside the attributes string. Defaults to the
            module-level LOCUS_SEARCH pattern.

    Returns:
        A new DataFrame indexed by 'chrom' with the columns 'chromStart',
        'chromEnd', 'name', 'score' and 'strand' (in that order).
        'chromStart' is the GFF start minus one; 'chromEnd' is the GFF end
        unchanged.

    Raises:
        ValueError: if an attributes value is not a string or the pattern
            does not match it, so the offending row can be identified.
    '''
    pattern = LOCUS_SEARCH if locus_search is None else locus_search

    def locusTag(attributes):
        '''Extracts the captured name from a single attributes value.'''
        # Missing attributes are read by pandas as NaN (a float), which the
        # regex cannot handle; treat that the same as "no match".
        found = pattern.match(attributes) if isinstance(attributes, str) else None
        if found is None:
            raise ValueError('no locus_tag found in GFF attributes: %r' % (attributes,))
        return found.group(1)

    bed_df = gff_genes_df.loc[:, ['seqid', 'start', 'end', 'attributes', 'score', 'strand']]
    bed_df = bed_df.rename(columns={'seqid':'chrom', 'start':'chromStart', 'end':'chromEnd', 'attributes':'name'})

    # Shift the start coordinate down by one; the end coordinate is kept as is.
    bed_df['chromStart'] = bed_df['chromStart'] - 1
    bed_df['name'] = bed_df['name'].apply(locusTag)

    # The original row labels are discarded: 'chrom' becomes the index so that
    # it is emitted as the first column when the frame is written with its index.
    return bed_df.set_index('chrom')
# Build the BED tables for the p_ctg and h_ctg gene sets, in that order.
pbed_df, hbed_df = map(getBedDf, (pgff_genes_df, hgff_genes_df))

In [176]:
def writeDfToBed(out_name, bed_df, base_dir = BASE_DIR):
    '''Saves bed_df as a tab-separated file named out_name inside base_dir.

    No header row is written; the frame's index is not suppressed, so it is
    emitted along with the columns.
    '''
    bed_path = os.path.join(base_dir, out_name)
    with open(bed_path, 'w') as bed_handle:
        bed_df.to_csv(bed_handle, sep='\t', header=False)

# Write the p_ctg table first, then the h_ctg table, into the default directory.
for bed_name, bed_frame in (('DK_0911_v03_p_ctg.gene.bed', pbed_df),
                            ('DK_0911_v03_h_ctg.gene.bed', hbed_df)):
    writeDfToBed(bed_name, bed_frame)

In [126]:
# Work from inside BASE_DIR so the relative file names passed to bedtools resolve
# (same effect as: cd /home/gamran/genome_analysis/Warrior/Richard/output/genome_v03).
os.chdir(BASE_DIR)
# Extract one FASTA record per interval of each gene BED file into a temporary
# "long_header" FASTA for the h_ctg and p_ctg assemblies.
# NOTE(review): -name / -s presumably take the record name from the BED name column
# and honour the BED strand column -- confirm against the installed bedtools version.
!bedtools getfasta -name -s -fi DK_0911_v03_h_ctg.fa -bed DK_0911_v03_h_ctg.gene.bed -fo DK_0911_v03_h_ctg.gene.long_header.fa
!bedtools getfasta -name -s -fi DK_0911_v03_p_ctg.fa -bed DK_0911_v03_p_ctg.gene.bed -fo DK_0911_v03_p_ctg.gene.long_header.fa 
# The resulting headers look like:
# DK0911_18236::hcontig_000_001:131-2632(-)

In [177]:
from Bio import SeqIO

def changeHeaders(in_file, out_file, base_dir = BASE_DIR):
    '''Rewrites the FASTA headers of in_file so they look like DK0911_00000 in out_file.

    Each record id is cut at the first "::" separator, so the name in front
    of it is kept whatever its length. An id that contains no "::" is cut to
    its first 12 characters instead. The description is emptied so only the
    short id ends up in the header.

    Side effects: changes the working directory to base_dir (in_file and
    out_file are opened relative to it) and deletes in_file (the long_header
    file) once out_file has been written.
    '''
    def shortHeader(record):
        '''Reduces one record's header to its short gene name.'''
        name, separator, _ = record.id.partition('::')
        # A fixed 12-character slice silently truncates longer names and lets
        # part of the separator into shorter ones; only fall back to it when
        # there is no separator to split on.
        record.id = name if separator else record.id[:12]
        record.description = ''
        return record

    os.chdir(base_dir)
    with open(in_file, "r") as input_handle, open(out_file, "w") as output_handle:
        # Records are renamed lazily as SeqIO.write consumes them, so the whole
        # FASTA file never has to be held in memory at once.
        renamed = map(shortHeader, SeqIO.parse(input_handle, "fasta"))
        SeqIO.write(renamed, output_handle, "fasta")
    # Only reached when the rewrite succeeded; an exception above keeps in_file.
    os.remove(in_file)

# Shorten the headers of the h_ctg file first, then the p_ctg file.
for long_header_fa, short_header_fa in (
    ("DK_0911_v03_h_ctg.gene.long_header.fa", "DK_0911_v03_h_ctg.gene.fa"),
    ("DK_0911_v03_p_ctg.gene.long_header.fa", "DK_0911_v03_p_ctg.gene.fa"),
):
    changeHeaders(long_header_fa, short_header_fa)