In [1]:
import pandas as pd
from Bio import SeqIO; from Bio.Seq import Seq; from Bio.SeqRecord import SeqRecord; from Bio.SeqFeature import SeqFeature, FeatureLocation
import os
import re

### Refs
* https://biopython.org/wiki/SeqRecord
* https://ena-docs.readthedocs.io/en/latest/faq/locus_tags.html


In [2]:
# latest paftol export containing ENA sample accessions
pft_file = '../PAFTOL_DB/2022-03-14_paftol_export_stats.csv'
cptree_file = 'cpTree_v7/cpTree_v7_tree_clean.csv'
org_dir = 'C:/Data/PAFTOL/Organelles/'

# DataSource = 'GAP' 
# ENA_project_number = 'PRJEB49212'
DataSource = 'PAFTOL' 
ENA_project_number = 'PRJEB51960' #https://www.ncbi.nlm.nih.gov/bioproject/588607
project_locus_tag = 'PRJEB51960'

In [3]:
## All Validated Samples
R = pd.read_csv(cptree_file)
R = R[R.DataSource==DataSource]
subset = R.Sample_Name.to_list()
print(R.shape[0],subset[:2])

3451 ['PAFTOL_004799', 'PAFTOL_000992']


## Load data and subset

In [4]:
pdir = DataSource + '/'
sdir = 'ENA_submissions/' + pdir
if not os.path.exists(sdir):
    os.makedirs(sdir)

In [5]:
# Load GetOrg result table
Org_df = pd.read_csv(org_dir + DataSource + '/' + DataSource + '_Organelle_Recovery.csv').astype({'idSequencing':'int'})
if DataSource == 'PAFTOL':
    Org_df['SeqID'] = Org_df.Sample_Name.str.replace('PAFTOL_','Pis_')
elif DataSource in ['GAP']:
    Org_df['SeqID'] = Org_df.Sample_Name
elif DataSource in ['SRA']:
    sra2_pt = pd.read_csv('cpSRA_GetOrg/cpSRA_GetOrg_Organelle_Recovery.csv')
    Org_df = pd.concat([Org_df,sra2_pt],ignore_index=True)
    Org_df = Org_df.groupby('Sample_Name').head(1)
    Org_df['SeqID'] = Org_df.Sample_Name
print(Org_df.shape[0])
Org_df[:2]

9715


Unnamed: 0,idPaftol,idSequencing,Sample_Name,Project,order,family,genus,species,Taxonomical_Notes,SumContigLength,...,Coverage_Kmer,Coverage_base,Run_Time,maxK,NRepeat_Pattern,NPath,Redo_FastPlast,error_pt,error_nr,SeqID
0,2,961,PAFTOL_000961,Pilot,Ranunculales,Circaeasteraceae,Circaeaster,agrestis,,138795.0,...,59.4,90.7,329.43,105.0,,,False,,,Pis_000961
1,4,2571,PAFTOL_002571,Connaraceae,Oxalidales,Connaraceae,Rourea,minor,,213954.0,...,18.5,60.2,747.57,105.0,,,False,,,Pis_002571


In [6]:
if subset is not None:
    Org_df = Org_df[Org_df.Sample_Name.isin(subset)]
    print(Org_df.shape[0],Org_df.Sample_Name.nunique())

3451 3451


In [7]:
# Load paftol export and merge ena sample IDs
db = pd.read_csv(pft_file)
db = db[db.idPaftol.notnull()].astype({'idPaftol':'int','idSequencing':'int'})
db = db[['Sample_Name','idPaftol', 'DataSource', 'Project', 'Order', 'Family', 'Genus', 'Species', 'Taxonomical_Notes', 
         'ENASampleNum', 'idSequencing', 'ExternalSequenceID', 'ENAExpNumber',
       'ENARunNumber', 'NumReads', 'NumRecoveredGenes', 'SumContigLength','NCBI_TaxID', 'NCBI_sciname']]
#        'title', 'taxId','id', 'alias', 'firstCreated', 'firstPublic', 'releaseStatus', 'secondaryId', 'scientificName', 'commonName']]
db[:2]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,Sample_Name,idPaftol,DataSource,Project,Order,Family,Genus,Species,Taxonomical_Notes,ENASampleNum,idSequencing,ExternalSequenceID,ENAExpNumber,ENARunNumber,NumReads,NumRecoveredGenes,SumContigLength,NCBI_TaxID,NCBI_sciname
0,PAFTOL_000564,1,PAFTOL,Asteraceae,Asterales,Asteraceae,Gymnolaena,sp.,Initially recorded as Gymnolaena litoralis but...,,564,,,,,,,,
1,PAFTOL_000961,2,PAFTOL,Pilot,Ranunculales,Circaeasteraceae,Circaeaster,agrestis,,ERS5501508,961,,ERX4839538,ERR5033191,625228.0,332.0,138795.0,39288.0,Circaeaster agrestis


In [8]:
Org_df = pd.merge(Org_df,
         db[['Sample_Name','DataSource','ENASampleNum','ENAExpNumber','ENARunNumber','NCBI_TaxID','NCBI_sciname']],
        how='left',on='Sample_Name').groupby('Sample_Name').head(1)
print(Org_df.isna().sum().to_dict())
print(Org_df.shape[0],Org_df.Sample_Name.nunique())

3451 3451


In [9]:
geseq_dir = sdir + 'fasta/'
if not os.path.exists(geseq_dir):
    os.makedirs(geseq_dir)

# Whole plastomes
### Annotation

In [None]:
whole_pt = Org_df[(Org_df.Assembly.isin(['circular genome','circular genome with gaps']) & (Org_df.Sum_len_pt>80000))]
print(whole_pt.shape[0],'samples',whole_pt.genus.nunique(),'genera')
whole_pt[:2]

In [None]:
# Output fasta files 
for idx, row in whole_pt.iterrows():
    seqs = list(SeqIO.parse(org_dir + DataSource + '/' + 'fasta_pt/' + row.Sample_Name + '_pt.fasta',format='fasta'))
    # Verif only 1 sequence
    if len(seqs) > 1:
        print('error, more than 1 sequence')
        pass
    seq = seqs[0]
    print(row.Sample_Name, row.SeqID, row.family, row.sci_name,len(seq),seq.seq.count('N'))
    seq.id = row.SeqID
    seq.description = ''
    SeqIO.write(seq, geseq_dir + row.SeqID + '-circular.fasta',format='fasta')

Submit files to GeSeq https://chlorobox.mpimp-golm.mpg.de/geseq.html and extract zip file in wdir/GeSeq_output

In [None]:
whole_pt.to_csv(sdir + DataSource + '_wholePT.csv',index=False)

# Contigs

In [None]:
references = pd.read_csv('References/NCBI_plastid_ref/NCBI_plastid_ref_list_SELECTED.csv')
print(references.shape)
references.head(2)

In [None]:
incomplete_pt = Org_df[(Org_df.Assembly.isin(['circular genome','circular genome with gaps'])==False) &
                   (Org_df.pt_recovered==True)]
print(incomplete_pt.shape[0],'samples',incomplete_pt.genus.nunique(),'genera')
incomplete_pt = incomplete_pt[incomplete_pt.Sum_len_pt<200000].set_index('Sample_Name')
print(incomplete_pt.shape[0],'samples',incomplete_pt.genus.nunique(),'genera')
### Random subset for test 
incomplete_pt = incomplete_pt.sample(n=100)
print(incomplete_pt.shape[0],'samples',incomplete_pt.genus.nunique(),'genera')
incomplete_pt[:2]

In [None]:
references[references.order=='Sapindales']

In [None]:
order_dc = {'Gunnerales':'Dilleniales','Picramniales':'Sapindales'}

In [None]:
commands = []
for Sample_Name, row in incomplete_pt.iterrows():
    print(Sample_Name,row.order,row.family)
    ### Identify best reference
    # check if reference in same family
    best_ref = references[references.family==row.family]
    if best_ref.shape[0]<1:
        # otherwise order
        best_ref = references[references.order==row.order]
        if best_ref.shape[0]<1:
            #Otherwise use closest order
            best_ref = references[references.order==order_dc[row.order]]
    print('\t Ref',best_ref.shape, end = ': ')
    best_ref = best_ref.iloc[0]
    print(best_ref.SeqID,best_ref.order,best_ref.family)
    incomplete_pt.loc[Sample_Name,'Ref-SeqID'] = best_ref.SeqID
    incomplete_pt.loc[Sample_Name,'Ref-order'] = best_ref.order
    incomplete_pt.loc[Sample_Name,'Ref-family'] = best_ref.family
    
    ### launch blastn
    # program_path = 'C:/Program Files/NCBI/blast-BLAST_VERSION+/bin/blastn.exe'
    program_path = 'blastn'
    ref_path = 'References/NCBI_plastid_ref/' + best_ref['SeqID'] + '.fasta'
    contigs_path = '../Organelles/' + DataSource + '/fasta_pt/' + Sample_Name + '_pt.fasta'
    out_path = 'ENA_submissions/' + DataSource + '/out_blast/'+ Sample_Name + '.blast'
    command = program_path + ' -query ' + contigs_path + ' -subject ' + ref_path + \
                ' -outfmt "6 qseqid sseqid pident length slen qlen mismatch gapopen qstart qend sstart send evalue bitscore"' + \
                ' -out ' + out_path
    commands.append(command)
    os.system(command)
    
    ### read blast output
    blast_output = pd.read_table(out_path, sep='\t',header=None)
    blast_output.columns = ['qseqid','sseqid','pident','length','slen','qlen','mismatch','gapopen','qstart','qend','sstart','send','evalue','bitscore']
#     print('\t',blast_output.shape[0],'matched for', blast_output.qseqid.nunique(),'qseqid',end='; ')
    incomplete_pt.loc[Sample_Name,'Blast-N_qseqid_match'] = blast_output.qseqid.nunique()
    # Keep best match (by bitscore) per queried contig
    blast_output = blast_output.sort_values('bitscore',ascending=False).groupby('qseqid').head(1)
    blast_output['oriented_sstart'] = blast_output[['sstart','send']].min(axis=1); blast_output['oriented_ssend'] = blast_output[['sstart','send']].max(axis=1)
    blast_output['Need_RevComp'] = blast_output.sstart > blast_output.oriented_sstart
    blast_output = blast_output.sort_values('oriented_sstart').set_index('qseqid')
#     print('Contigs to reverse complement:',blast_output.Need_RevComp.value_counts().to_dict())
    
    
    ### Load contigs and write concatenated fasta
    gapN = 100
    records = SeqIO.to_dict(SeqIO.parse(contigs_path, "fasta"))
    print('\t',blast_output.shape[0],'/',len(records),'contigs mapped to reference')
    concat_seq = ''
    for qseqid, blast_row in blast_output.iterrows():
        if blast_row.Need_RevComp==True:
            concat_seq += records[qseqid].seq.reverse_complement()
        else:
            concat_seq += records[qseqid].seq 
        concat_seq += Seq(gapN * 'N')
    concat_seq = concat_seq[:-gapN] # Remove trailing Ns
    print('\t sum len concat:',blast_output.qlen.sum(),'/',int(row.Sum_len_pt),round(blast_output.qlen.sum()/int(row.Sum_len_pt)*100,1),'%')
    incomplete_pt.loc[Sample_Name,'Concat-len'] = blast_output.qlen.sum()
    incomplete_pt.loc[Sample_Name,'Concat-%_mapped'] = round(blast_output.qlen.sum()/int(row.Sum_len_pt)*100,1)
    incomplete_pt.loc[Sample_Name,'Concat-len_with_Ns'] = len(concat_seq)
    Concat_record = SeqRecord(
        concat_seq,
        id=row.SeqID + '_concat_pt',
        name=Sample_Name + '_concat_pt',
        description=''
    )
    SeqIO.write(Concat_record,'ENA_submissions/' + DataSource + '/fasta/' + row.SeqID + '-concat.fasta','fasta')

In [None]:
incomplete_pt.to_csv(sdir + DataSource + '_concatPT.csv')

### ENA submission
https://ena-docs.readthedocs.io/en/latest/submit/fileprep/assembly.html?highlight=chloroplast#



In [11]:
paper_title = 'Building the largest, curated plastid genome dataset and phylogeny by leveraging nuclear-oriented sequence data'
paper_authors = 'Leempoel K., Zuntini A. R., Bailey P., Eiserhardt W. L., Gallego B., Maurin O., Allnutt T., Bellot S., Clarkson J. J., ' + \
        'Clements M., Cowan R. S., Crayn D. D., de Lírio E., Françoso E., Höwener A., Hu A., Joyce E., Larson D., Li H., Li D., Liu J., Lum M., McGinnie C., ' + \
        'McLay T., Nargar K., Perez Escobar O., Przelomska N., Roy S., Schmidt-Lebuhn A., Shah T., Simpson L., Yang J., Yi T., Forest F., Kersey P. J., ' + \
        'Leitch I. J., Baker W. J.'
paper_journal = 'Unpublished'
paper_id = 'NA'
paper_authors

'Leempoel K., Zuntini A. R., Bailey P., Eiserhardt W. L., Gallego B., Maurin O., Allnutt T., Bellot S., Clarkson J. J., Clements M., Cowan R. S., Crayn D. D., de Lírio E., Françoso E., Höwener A., Hu A., Joyce E., Larson D., Li H., Li D., Liu J., Lum M., McGinnie C., McLay T., Nargar K., Perez Escobar O., Przelomska N., Roy S., Schmidt-Lebuhn A., Shah T., Simpson L., Yang J., Yi T., Forest F., Kersey P. J., Leitch I. J., Baker W. J.'

In [10]:
# submit_pt = pd.concat([pd.read_csv(sdir + DataSource + '_wholePT.csv'), pd.read_csv(sdir + DataSource + '_concatPT.csv')],ignore_index=True)
# submit_pt = pd.read_csv(sdir + DataSource + '_wholePT.csv')
submit_pt = pd.read_csv(sdir + DataSource + '_concatPT.csv')
print(submit_pt.shape[0])
submit_pt = submit_pt[(submit_pt.ENASampleNum.notnull()) & (submit_pt.ENARunNumber.notnull())]
print(submit_pt.shape[0])
submit_pt = submit_pt[submit_pt.NCBI_sciname.notnull()]
print(submit_pt.shape[0],submit_pt.Assembly.value_counts().to_dict())
submit_pt.head(2)

100
97
93 {'scaffold': 93}


Unnamed: 0,Sample_Name,idPaftol,idSequencing,Project,order,family,genus,species,Taxonomical_Notes,SumContigLength,...,ENARunNumber,NCBI_TaxID,NCBI_sciname,Ref-SeqID,Ref-order,Ref-family,Blast-N_qseqid_match,Concat-len,Concat-%_mapped,Concat-len_with_Ns
0,PAFTOL_007905,4838,7905,Celastraceae,Celastrales,Celastraceae,Euonymus,australianus,Euonymus-Celastrus-Haydenoxylon clade,201111.0,...,ERR7619260,1089391.0,Euonymus australianus,NC_045280.1,Celastrales,Celastraceae,30.0,24986.0,98.3,27886.0
1,PAFTOL_014479,8655,14479,Asteraceae,Asterales,Asteraceae,Olgaea,leucophylla,,182634.0,...,ERR7621142,591993.0,Olgaea leucophylla,NC_044728.1,Asterales,Asteraceae,30.0,36976.0,100.0,39876.0


In [12]:
all_seq = pd.read_csv('cpTree_v7/AllSamples_Allgenes_All_SeqTable.csv')
print(all_seq.shape)
genes_82 = pd.read_table('cp_genes_82.txt',header=None).rename(columns={0:'gene'})
all_seq = all_seq[all_seq.gene.isin(genes_82.gene)]
print(all_seq.shape)
all_seq.head(2)

(1678470, 4)
(1172782, 4)


Unnamed: 0,seqid,len,Sample_Name,gene
0,GAP_026547-rrn23,2812,GAP_026547,rrn23
1,GAP_026547-rrn16,1491,GAP_026547,rrn16


In [13]:
def list_genes(record, ls_types = ['gene','CDS','rRNA']):
    # list gene names
    feat_ls = []; idx = 0
    for feature in record.features:
        if feature.type in ls_types:
            feat_ls.append(feature.qualifiers['gene'][0])
        idx += 1
    return list(set(feat_ls))

In [14]:
# Rename and modify genbank files, convert to embl flat file
if not os.path.exists(sdir + '/Genbank_files/'):
    os.makedirs(sdir + '/Genbank_files/')
gb_files = [file for file in os.listdir(sdir + 'geseq/') if file.endswith('.gb')]
print(len(gb_files))
for idx, row in submit_pt.iterrows():
    idx_file = [i for i, x in enumerate([row.SeqID in file for file in gb_files]) if x]
    print(row.Sample_Name,gb_files[idx_file[0]],end=':')
    records = list(SeqIO.parse(sdir + 'geseq/' + gb_files[idx_file[0]], "genbank"))
    record = records[0]
    record_genes = list_genes(record); record_genes = [gene for gene in record_genes if gene in genes_82.gene.to_list()]
    blast_genes = all_seq[all_seq.Sample_Name==row.Sample_Name]
    print(len(records), len(records[0]), len(record_genes),'/',blast_genes.gene.nunique(),
         int(len(record_genes)/blast_genes.gene.nunique()*100),'%',blast_genes[blast_genes.gene.isin(record_genes)==False].gene.unique())
    try:
        if row.Assembly in ['circular genome','circular genome with gaps']:
            record.description = row.ENARunNumber + ' chloroplast, complete genome'
        elif row.Assembly in ['scaffold']:
            record.description = row.ENARunNumber + ' chloroplast, partial genome (concatenated contigs)'
            record.annotations['topology'] = 'linear'
        submit_pt.loc[idx,'ENA_SeqID'] = row.ENARunNumber + '_pt'
        record.id = submit_pt.loc[idx,'ENA_SeqID']
        record.name = submit_pt.loc[idx,'ENA_SeqID']
        record.version = submit_pt.loc[idx,'ENA_SeqID'] + '.1'
        record.annotations['source'] = row.ENASampleNum + ', ' + row.sci_name + ', isolate'
        record.annotations['accessions'] = row.ENASampleNum
        record.annotations['organism'] = row.NCBI_sciname
        record.annotations["data_file_division"]="PLN";
        record.annotations.pop('taxonomy')
        record.annotations['references'][0].title = paper_title    
        record.annotations['references'][0].authors = paper_authors    
        record.annotations['references'][0].journal = paper_journal      
        record.annotations['references'][0].pubmed_id = paper_id    
        record.features[0].qualifiers['organism'] = row.NCBI_sciname
        record.features[0].qualifiers['db_xref'] = ['taxon:' + str(int(row['NCBI_TaxID']))]
        SeqIO.write(record,sdir + 'Genbank_files/' + row.SeqID + '_pt.gb',format='genbank')
    except:
        print('issue with ',gb_files[idx_file[0]])

439
PAFTOL_007905 GeSeqJob-20220305-120228_Pis_007905_concat_pt_GenBank.gb:1 27886 30 / 16 187 % []
PAFTOL_014479 GeSeqJob-20220305-120228_Pis_014479_concat_pt_GenBank.gb:1 39876 29 / 26 111 % []
PAFTOL_006622 GeSeqJob-20220305-120228_Pis_006622_concat_pt_GenBank.gb:1 158632 77 / 77 100 % ['rps8']
PAFTOL_010719 GeSeqJob-20220305-120228_Pis_010719_concat_pt_GenBank.gb:1 56570 32 / 31 103 % ['rpoB' 'rps8']
PAFTOL_012011 GeSeqJob-20220305-120228_Pis_012011_concat_pt_GenBank.gb:1 127768 78 / 77 101 % []
PAFTOL_004680 GeSeqJob-20220305-120228_Pis_004680_concat_pt_GenBank.gb:1 63389 42 / 35 120 % ['matK']
PAFTOL_018841 GeSeqJob-20220305-120228_Pis_018841_concat_pt_GenBank.gb:1 58859 45 / 40 112 % ['rps3']
PAFTOL_015265 GeSeqJob-20220305-120228_Pis_015265_concat_pt_GenBank.gb:1 76256 63 / 55 114 % []
PAFTOL_000975 GeSeqJob-20220305-120228_Pis_000975_concat_pt_GenBank.gb:1 39785 28 / 17 164 % []
PAFTOL_004459 GeSeqJob-20220305-120228_Pis_004459_concat_pt_GenBank.gb:1 89459 64 / 58 110 % ['psbA



1 164213 78 / 78 100 % []
issue with  GeSeqJob-20220305-120228_Pis_012985_concat_pt_GenBank.gb
PAFTOL_023435 GeSeqJob-20220305-120228_Pis_023435_concat_pt_GenBank.gb:1 74401 66 / 37 178 % []
PAFTOL_014813 GeSeqJob-20220305-120228_Pis_014813_concat_pt_GenBank.gb:1 40013 38 / 24 158 % []
PAFTOL_004390 GeSeqJob-20220305-120228_Pis_004390_concat_pt_GenBank.gb:1 70143 68 / 48 141 % []
PAFTOL_008709 GeSeqJob-20220305-120228_Pis_008709_concat_pt_GenBank.gb:1 140960 75 / 76 98 % ['ndhA']
issue with  GeSeqJob-20220305-120228_Pis_008709_concat_pt_GenBank.gb
PAFTOL_010063 GeSeqJob-20220305-120228_Pis_010063_concat_pt_GenBank.gb:1 55109 54 / 43 125 %



 []
PAFTOL_006252 GeSeqJob-20220305-120228_Pis_006252_concat_pt_GenBank.gb:1 46378 26 / 27 96 % ['rpoA' 'rps3' 'atpE' 'rpl22']
PAFTOL_011879 GeSeqJob-20220305-120228_Pis_011879_concat_pt_GenBank.gb:1 107978 75 / 72 104 % []
PAFTOL_006957 GeSeqJob-20220305-120228_Pis_006957_concat_pt_GenBank.gb:1 23966 16 / 11 145 % []
PAFTOL_004696 GeSeqJob-20220305-120228_Pis_004696_concat_pt_GenBank.gb:1 132490 75 / 71 105 % []
PAFTOL_014801 GeSeqJob-20220305-120228_Pis_014801_concat_pt_GenBank.gb:1 36829 20 / 19 105 % ['rps11']
PAFTOL_015539 GeSeqJob-20220305-120228_Pis_015539_concat_pt_GenBank.gb:1 117894 78 / 74 105 % []
PAFTOL_008650 GeSeqJob-20220305-120228_Pis_008650_concat_pt_GenBank.gb:1 42928 32 / 26 123 % []
PAFTOL_007942 GeSeqJob-20220305-120228_Pis_007942_concat_pt_GenBank.gb:1 36090 20 / 19 105 % []
PAFTOL_005730 GeSeqJob-20220305-120228_Pis_005730_concat_pt_GenBank.gb:1 55075 62 / 32 193 % []
PAFTOL_008711 GeSeqJob-20220305-120228_Pis_008711_concat_pt_GenBank.gb:1 40958 37 / 21 176 % []



1 116079 77 / 75 102 % []
PAFTOL_001072 GeSeqJob-20220305-120228_Pis_001072_concat_pt_GenBank.gb:1 55418 37 / 28 132 % []
PAFTOL_006292 GeSeqJob-20220305-120228_Pis_006292_concat_pt_GenBank.gb:1 124260 77 / 74 104 % []
PAFTOL_015271 GeSeqJob-20220305-120228_Pis_015271_concat_pt_GenBank.gb:1 63837 54 / 44 122 % ['cemA']
PAFTOL_006145 GeSeqJob-20220305-120228_Pis_006145_concat_pt_GenBank.gb:1 117306 73 / 70 104 % []
PAFTOL_010261 GeSeqJob-20220305-120228_Pis_010261_concat_pt_GenBank.gb:1 127697 78 / 77 101 % []
PAFTOL_015595 GeSeqJob-20220305-120228_Pis_015595_concat_pt_GenBank.gb:1 127330 75 / 72 104 % []
PAFTOL_005892 GeSeqJob-20220305-120228_Pis_005892_concat_pt_GenBank.gb:1 39093 47 / 28 167 % ['rps19' 'rps15' 'rps16' 'petN']
PAFTOL_006892 GeSeqJob-20220305-120228_Pis_006892_concat_pt_GenBank.gb:1 132519 76 / 72 105 % []
PAFTOL_008832 GeSeqJob-20220305-120228_Pis_008832_concat_pt_GenBank.gb:1 99735 65 / 60 108 % []
PAFTOL_026123 GeSeqJob-20220305-120228_Pis_026123_concat_pt_GenBank.g



1 100706 73 / 65 112 % []
PAFTOL_016943 GeSeqJob-20220305-120228_Pis_016943_concat_pt_GenBank.gb:1 83361 63 / 59 106 % ['matK']
PAFTOL_006738 GeSeqJob-20220305-120228_Pis_006738_concat_pt_GenBank.gb:1 45754 39 / 28 139 % []
PAFTOL_013403 GeSeqJob-20220305-120228_Pis_013403_concat_pt_GenBank.gb:1 114540 63 / 62 101 % []
issue with  GeSeqJob-20220305-120228_Pis_013403_concat_pt_GenBank.gb




In [15]:
def del_features(record, feat_del_ls = ['translation','info','annotator']):
    for feature in record.features:
        for feat_to_del in feat_del_ls:
            if feat_to_del in feature.qualifiers:
                del feature.qualifiers[feat_to_del]
    return record

def del_fragment(record, ls_types = ['gene','CDS','rRNA']):
    # list gene names
    ls_feat = []
    for feature in record.features:
        if feature.type in ls_types:
            ls_feat.append(feature.qualifiers['gene'][0])
    ls_feat = [ifeat for ifeat in ls_feat if 'fragment' not in ifeat]

    # remove fragment feature if full feature is present
    rm_count = 0
    for feature in record.features:
        if feature.type in ls_types:
            if 'fragment' in feature.qualifiers['gene'][0]:
                if feature.qualifiers['gene'][0].split('-')[0] in ls_feat:
                    rm_count += 1
                    record.features.remove(feature)
    print('removed',rm_count,'partial features')
    return record

def del_longfeat(record, ls_types = ['gene','CDS','rRNA'], len_thrs = 5000):
    for feature in record.features:
        if feature.type in ls_types:
            if len(feature)>len_thrs:
                print('removing ',feature.qualifiers['gene'],len(feature))
                record.features.remove(feature)
    return record

def add_gap_features(record, gapN = 100):
    # find gaps in record
    gaps_idx = [m.start() for m in re.finditer(gapN * 'N', str(record.seq))]
    print(len(gaps_idx),'gaps')
    if len(gaps_idx)>0:
        ## for each gap, add a sequence feature
        for gap_pos in gaps_idx:
            # find index of gap in features
            features_start = [feat.location.start for feat in record.features]
            gap_feature_idx = [i for i in range(len(features_start)) if features_start[i] > gap_pos]
            
            gap_feat = SeqFeature(FeatureLocation(gap_pos,gap_pos+gapN), strand=1, type='assembly_gap')
            gap_feat.qualifiers['gap_type']='within scaffold'; 
            gap_feat.qualifiers['estimated_length']='unknown'
            gap_feat.qualifiers['linkage_evidence']='unspecified'
            if len(gap_feature_idx)>0:
                gap_feature_idx = gap_feature_idx[0]
                record.features.insert(gap_feature_idx, gap_feat)
            elif len(gap_feature_idx)==0: #if position is further than any other feature, append at the end of feature list
                record.features.append(gap_feat)
    return record

# https://ena-docs.readthedocs.io/en/latest/faq/locus_tags.html
def add_locus_tag(record, locus_tag, ls_types = ['gene','CDS','rRNA','tRNA','intron','exon']):
    locus_tags_genes = {}; idx = 1
    for feature in record.features:
        if feature.type in ls_types:
            if feature.qualifiers['gene'][0] not in locus_tags_genes:
                locus_tags_genes[feature.qualifiers['gene'][0]] = locus_tag + '_' + str(idx)
                idx += 1
            feature.qualifiers['locus_tag']=locus_tags_genes[feature.qualifiers['gene'][0]]
    return record

def gzip_file(in_file):
    import gzip
    import shutil
    f_gz = in_file + '.gz'
    with open(in_file, 'rb') as f_in:
        with gzip.open(f_gz, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

In [16]:
# ENA files
if not os.path.exists(sdir + '/EMBL_files/'):
    os.makedirs(sdir + '/EMBL_files/')
commands = []
for idx, row in submit_pt.iterrows():
    ### Convert .gb to EMBL flat files
    file_gb = sdir + 'Genbank_files/' + row.SeqID + '_pt.gb'
    print(row.SeqID,row.ENA_SeqID,end=' > ')
    try:
        record = list(SeqIO.parse(file_gb, "genbank"))[0]
        print(len(record))

        # Deleted features preventing ena submission
        record_clean = del_features(record = record, feat_del_ls = ['translation','info','annotator'])
        # Delete fragmentary annotations (e.g. rr16-fragment)
        record_clean = del_fragment(record = record_clean); record_clean = del_fragment(record = record_clean);
        # Delete abnormaly long genes
        record_clean = del_longfeat(record = record_clean,len_thrs=10000);
        # Add gap features for partial plastomes. Recognises gaps of 100 Ns by default
        record_clean = add_gap_features(record = record_clean)
        # Add locus tag if sequence has gaps
        if 'assembly_gap' in [feature.type for feature in record.features]:
              record_clean = add_locus_tag(record = record_clean, locus_tag = project_locus_tag)

        file_embl = sdir + '/EMBL_files/' + row.ENA_SeqID + '.embl'
        SeqIO.write(record_clean, file_gb, "genbank")
        SeqIO.write(record_clean, file_embl, "embl")
        gzip_file(file_embl)

        ### Chromosome list file
        # https://ena-docs.readthedocs.io/en/latest/submit/fileprep/assembly.html?highlight=chloroplast#chromosome-list-file
        if row.Assembly in ['circular genome','circular genome with gaps']:
            list_file=row.ENA_SeqID + '	PT	Circular-Chromosome	Chloroplast'       
        elif row.Assembly in ['scaffold']:
            list_file=row.ENA_SeqID + '	PT	Linear-Chromosome	Chloroplast'
        with open(file_embl.replace('.embl','_CHR.txt'), 'wb') as f:
            f.write(list_file.encode("ascii"))
        gzip_file(file_embl.replace('.embl','_CHR.txt'))

        ### Manifest file 
        # https://ena-docs.readthedocs.io/en/latest/submit/assembly/genome.html 
        manifest_file = file_embl.replace('.embl','_manifest.txt')
        if row.Assembly in ['circular genome','circular genome with gaps']:
            tmp_description=' chloroplast, complete genome' 
        elif row.Assembly in ['scaffold']:
            tmp_description=' chloroplast, partial genome (concatenated contigs)' 
        manifest={
            'STUDY':ENA_project_number,
            'SAMPLE':row.ENASampleNum,
            'ASSEMBLYNAME':row.ENA_SeqID,
            'ASSEMBLY_TYPE':'clone or isolate',
            'COVERAGE':row.Coverage_base,
            'PROGRAM':'GetOrganelle v1.7.5 +  GeSeq 2.03',
            'PLATFORM':'ILLUMINA',
            'MOLECULETYPE':'genomic DNA',
            'FLATFILE': row.ENA_SeqID + '.embl.gz',
            'CHROMOSOME_LIST': row.ENA_SeqID + '_CHR.txt.gz',
            'RUN_REF': row.ENARunNumber,
            'DESCRIPTION': row.NCBI_sciname + tmp_description
        }
#         if row.Assembly in ['scaffold']:
#             manifest['PARTIAL']='TRUE'
        manifest = pd.DataFrame.from_dict(manifest,orient='index')
        manifest.to_csv(manifest_file,header=None,sep='\t')

        # Submission command
        commands.append('java -jar C:/Data/PAFTOL/ENA_submissions/webin-cli-4.3.0.jar ' + 
              '-username Webin-52995 -passwordFile C:/Data/PAFTOL/ENA_submissions/ena_pwd.txt -context genome -manifest ' +
              row.ENA_SeqID + '_manifest.txt' + ' -validate') 
    except:
        print('issue',row.SeqID)
pd.DataFrame(commands).to_csv(sdir + '/ENA_Submission_Commands_' + DataSource + '.txt',index=False,header=None)

Pis_007905 ERR7619260_pt > 27886
removed 0 partial features
removed 0 partial features
29 gaps
Pis_014479 ERR7621142_pt > 39876
removed 0 partial features
removed 0 partial features
29 gaps
Pis_006622 ERR7618796_pt > 158632
removed 2 partial features
removed 2 partial features
removing  ['ycf2'] 56526
removing  ['ndhB'] 49707
removing  ['trnI-GAU'] 50686
removing  ['trnA-UGC'] 50527
removing  ['rrn23'] 49689
8 gaps
Pis_010719 ERR7620148_pt > 56570
removed 0 partial features
removed 0 partial features
25 gaps
Pis_012011 ERR7620451_pt > 127768
removed 1 partial features
removed 1 partial features
7 gaps
Pis_004680 ERR5033456_pt > 63389
removed 1 partial features
removed 0 partial features
46 gaps
Pis_018841 ERR7621734_pt > 58859
removed 1 partial features
removed 0 partial features
43 gaps
Pis_015265 ERR7621379_pt > 76256
removed 1 partial features
removed 0 partial features
42 gaps
Pis_000975 ERR4180144_pt > 39785
removed 0 partial features
removed 0 partial features
17 gaps
Pis_004459 

cd C:\Data\PAFTOL\cpDNA\ENA_submissions\DataSource\EMBL_files

then launch commands