In [2]:
import pandas as pd
from Bio import SeqIO; from Bio.Seq import Seq; from Bio.SeqRecord import SeqRecord; from Bio.SeqFeature import SeqFeature, FeatureLocation
import os
import re
#### TESTING

### Refs
* https://biopython.org/wiki/SeqRecord
* https://ena-docs.readthedocs.io/en/latest/faq/locus_tags.html


In [3]:
# Notebook configuration: input tables, organelle directory and the ENA project
# that matches the data source being processed.

# Latest paftol export containing ENA sample accessions
###pft_file = '../../paftol_export/paftol_export_files/2022-06-06_paftol_export.csv'
### Paul B. - NB the June version of the paftol export table doesn't have these keys in the index:
### Sample_Name, NCBI_sciname and sci_name, NCBI_TaxID
### It is because Kevin's table has these extra columns added from elsewhere, hence 'stats' in ../paftol_export_stats.csv
### pft_file = '../../PAFTOL_DB/2022-08-04_paftol_export_stats.csv' - created updated file here:
pft_file = '../../paftol_export/paftol_export_files/2022-08-04_paftol_export_stats.csv'
### 13.7.2023 - used the above file except for some small exceptions just below:
###pft_file = 'ENA_submissions/PAFTOL/paftol_export_8_missing_samples_ONLY.csv'
cptree_file = 'cpTree_v7/cpTree_v7_tree_clean.csv'
org_dir = 'Organelles/'

# ENA_project_number = 'PRJEB49212'
DataSource = 'SRA' # PAFTOL, SRA, GAP

# ENA project accession for each supported data source. The same accession is
# used both as project number and as locus-tag prefix.
# NOTE(review): the original code annotated all three accessions with the same
# link (https://www.ncbi.nlm.nih.gov/bioproject/588607) - confirm which project
# that link actually belongs to.
ENA_PROJECT_BY_SOURCE = {
    'PAFTOL': 'PRJEB51960',
    'SRA': 'PRJEB55358',
    'GAP': 'PRJEB55354',
}
if DataSource not in ENA_PROJECT_BY_SOURCE:
    # Fail here rather than with a NameError in a later cell.
    raise ValueError('Unknown DataSource ' + repr(DataSource) +
                     '; expected one of ' + ', '.join(ENA_PROJECT_BY_SOURCE))
ENA_project_number = ENA_PROJECT_BY_SOURCE[DataSource]
project_locus_tag = ENA_project_number

In [4]:
## All Validated Samples
# Read the cleaned cpTree table and keep only the rows of the configured
# DataSource; `subset` is the list of their sample names.
R = pd.read_csv(cptree_file)
R = R.loc[R['DataSource'] == DataSource]
subset = R['Sample_Name'].tolist()
print(len(R), subset[:2])

1611 ['SRR9611034', 'SRR8655284']


## Load data and subset

In [5]:
# Per-data-source output directory for everything prepared for the ENA submission.
pdir = DataSource + '/'
sdir = 'ENA_submissions/' + pdir
# exist_ok=True creates the directory only when needed, without a separate
# existence check.
os.makedirs(sdir, exist_ok=True)

In [5]:
# Load GetOrg result table
Org_df = pd.read_csv(org_dir + DataSource + '/' + DataSource + '_Organelle_Recovery.csv').astype({'idSequencing':'int'})

if DataSource == 'SRA':
    # SRA has a second recovery table: append it and keep the first row found
    # for each sample (rows of the main table come first).
    sra2_pt = pd.read_csv('cpSRA_GetOrg/cpSRA_GetOrg_Organelle_Recovery.csv')
    Org_df = pd.concat([Org_df,sra2_pt],ignore_index=True)
    # .copy() gives an independent frame, so adding SeqID below does not write
    # into a slice of the concatenated table.
    Org_df = Org_df.groupby('Sample_Name').head(1).copy()

# SeqID is the identifier used for output sequences and file names.
if DataSource == 'PAFTOL':
    # regex=False: plain substring replacement, independent of the pandas version.
    Org_df['SeqID'] = Org_df.Sample_Name.str.replace('PAFTOL_','Pis_', regex=False)
elif DataSource in ['GAP', 'SRA']:
    Org_df['SeqID'] = Org_df.Sample_Name
else:
    raise ValueError('No SeqID rule defined for DataSource ' + repr(DataSource))
print(Org_df.shape[0])
###Org_df[ 'order' ]
###Org_df[ 'Sample_Name' ]
###Org_df[ Org_df.Sample_Name == 'SRR5237176']### Paul B - present
# Spot check for one GAP sample; presumably only matches when DataSource is 'GAP'.
Org_df[ Org_df.Sample_Name == 'GAP_028675']### Paul B - present

1455


Unnamed: 0,idPaftol,idSequencing,Sample_Name,Project,order,family,genus,species,SumContigLength,sum_len,...,Assembly,Coverage_Kmer,Coverage_base,maxK,NRepeat_Pattern,NPath,Redo_FastPlast,error_pt,error_nr,SeqID
1376,14611.0,28675,GAP_028675,GAP,Poales,Poaceae,Tripogonella,loliiformis,240504.0,240504.0,...,scaffold,25.6,76.1,85.0,10.0,1000.0,True,,,GAP_028675


In [6]:
# Restrict the recovery table to the validated sample names, when a subset is defined.
if subset is not None:
    in_subset = Org_df['Sample_Name'].isin(subset)
    Org_df = Org_df.loc[in_subset]
    print(len(Org_df), Org_df['Sample_Name'].nunique())

344 344


In [7]:
# Load paftol export and merge ena sample IDs
# Columns kept from the export; rows without an idPaftol are discarded before
# the id columns are cast to integers.
paftol_columns = [
    'Sample_Name', 'idPaftol', 'DataSource', 'Project', 'Order', 'Family', 'Genus', 'Species',
    'Taxonomical_Notes', 'ENASampleNum', 'idSequencing', 'ExternalSequenceID', 'ENAExpNumber',
    'ENARunNumber', 'NumReads', 'NumRecoveredGenes', 'SumContigLength', 'NCBI_TaxID', 'NCBI_sciname',
]
#        'title', 'taxId','id', 'alias', 'firstCreated', 'firstPublic', 'releaseStatus', 'secondaryId', 'scientificName', 'commonName']]
### Paul B. - temporary alternative column list used to process a few samples:
###paftol_columns = ['Sample_Name','DataSource','ENASampleNum', 'ENAExpNumber','ENARunNumber','NCBI_TaxID','NCBI_sciname']
db = pd.read_csv(pft_file)
db = db.dropna(subset=['idPaftol']).astype({'idPaftol':'int','idSequencing':'int'})
db = db[paftol_columns]
db.head(2)

  db = pd.read_csv(pft_file)


Unnamed: 0,Sample_Name,idPaftol,DataSource,Project,Order,Family,Genus,Species,Taxonomical_Notes,ENASampleNum,idSequencing,ExternalSequenceID,ENAExpNumber,ENARunNumber,NumReads,NumRecoveredGenes,SumContigLength,NCBI_TaxID,NCBI_sciname
0,PAFTOL_000564,1,PAFTOL,Asteraceae,Asterales,Asteraceae,Gymnolaena,sp.,Initially recorded as Gymnolaena litoralis but...,,564,,,,,,,,
1,PAFTOL_000961,2,PAFTOL,Pilot,Ranunculales,Circaeasteraceae,Circaeaster,agrestis,,ERS5501508,961,,ERX4839538,ERR5033191,625228.0,332.0,138795.0,39288.0,Circaeaster agrestis


In [8]:
# Attach the ENA accessions and NCBI taxonomy from the paftol export to the
# recovery table (left join on Sample_Name), keeping one row per sample.
ena_columns = ['Sample_Name','DataSource','ENASampleNum','ENAExpNumber','ENARunNumber','NCBI_TaxID','NCBI_sciname']
Org_df = (
    Org_df
    .merge(db[ena_columns], how='left', on='Sample_Name')
    .groupby('Sample_Name')
    .head(1)
)
# Number of missing values per column, then row count vs. distinct samples.
print(Org_df.isna().sum().to_dict())
print(len(Org_df), Org_df['Sample_Name'].nunique())
###Org_df[ Org_df.Sample_Name == 'SRR5237176']### Paul B - present - OK
###Org_df[ Org_df.Sample_Name == 'GAP_026371' ] ### Paul B - present
###Org_df[ Org_df.Sample_Name == 'GAP_028675']### Paul B - present

344 344


In [9]:
# Directory receiving the FASTA files that are uploaded to GeSeq.
geseq_dir = sdir + 'fasta/'
# exist_ok=True creates the directory only when needed, without a separate
# existence check.
os.makedirs(geseq_dir, exist_ok=True)

# Whole plastomes
### Annotation

In [10]:
# Whole plastomes: assemblies reported as circular (with or without gaps)
# whose plastid length exceeds 80,000 bp.
is_circular = Org_df['Assembly'].isin(['circular genome','circular genome with gaps'])
is_long_enough = Org_df['Sum_len_pt'] > 80000
whole_pt = Org_df[is_circular & is_long_enough]
print(len(whole_pt),'samples',whole_pt['genus'].nunique(),'genera')
whole_pt.head(2)

330 samples 304 genera


Unnamed: 0,Sample_Name,idSequencing,order,family,genus,species,SumContigLength,sum_len,sci_name,log_pt,...,Redo_FastPlast,error_pt,error_nr,SeqID,DataSource,ENASampleNum,ENAExpNumber,ENARunNumber,NCBI_TaxID,NCBI_sciname
2,SRR12649607,19711.0,Malvales,Malvaceae,Abutilon,theophrasti,175491.0,175491.0,Abutilon theophrasti,True,...,False,,,SRR12649607,SRA,,SRX9130922,SRR12649607,3631.0,Abutilon theophrasti
3,SRR11342831,19713.0,Pandanales,Velloziaceae,Acanthochlamys,bracteata,7875.0,7875.0,Acanthochlamys bracteata,True,...,False,,,SRR11342831,SRA,,SRX7945768,SRR11342831,145231.0,Acanthochlamys bracteata


In [11]:
# Output fasta files
# One '<SeqID>-circular.fasta' per whole plastome, with the record renamed to
# the SeqID and the description cleared.
for idx, row in whole_pt.iterrows():
    fasta_path = org_dir + DataSource + '/' + 'fasta_pt/' + row.Sample_Name + '_pt.fasta'
    seqs = list(SeqIO.parse(fasta_path,format='fasta'))
    # A whole plastome must be exactly one sequence. The original `pass` did
    # not skip anything, and an empty file crashed on seqs[0].
    if len(seqs) != 1:
        print('error,', len(seqs), 'sequences in', fasta_path, '- expected exactly 1, sample skipped')
        continue
    seq = seqs[0]
    print(row.Sample_Name, row.SeqID, row.family, row.sci_name,len(seq),seq.seq.count('N'))
    seq.id = row.SeqID
    seq.description = ''
    SeqIO.write(seq, geseq_dir + row.SeqID + '-circular.fasta',format='fasta')

SRR12649607 SRR12649607 Malvaceae Abutilon theophrasti 160446 0
SRR11342831 SRR11342831 Velloziaceae Acanthochlamys bracteata 153642 10
ERR4210270 ERR4210270 Brassicaceae Pseudocamelina glaucophylla 153384 77
SRR3478564 SRR3478564 Dioscoreaceae Tacca chantrieri 163007 0
SRR6425650 SRR6425650 Apocynaceae Alafia barteri 155272 0
ERR2789774 ERR2789774 Brassicaceae Ballantinia pumilio 154346 0
SRR1145773 SRR1145773 Rosaceae Bencomia exstipulata 153731 0
SRR10679069 SRR10679069 Lamiaceae Caryopteris trichosphaera 151382 0
SRR6425648 SRR6425648 Apocynaceae Cascabela thevetia 155029 0
SRR13385162 SRR13385162 Poaceae Chimonobambusa sichuanensis 139602 0
SRR2155068 SRR2155068 Asteraceae Conoclinium coelestinum 151335 0
ERR4210279 ERR4210279 Brassicaceae Delpinophytum patagonicum 154932 0
SRR9309785 SRR9309785 Phrymaceae Diplacus longiflorus 153798 0
SRR8666566 SRR8666566 Fabaceae Gastrolobium grandiflorum 152366 10
SRR13089735 SRR13089735 Gentianaceae Gentianopsis paludosa 151569 0
SRR12649932 

SRR5265130 SRR5265130 Lamiaceae Tectona grandis 153928 0
SRR6940074 SRR6940074 Mazaceae Lancea tibetica 154098 0
SRR14626653 SRR14626653 Asteraceae Adenostemma lavenia 150063 0
SRR13700329 SRR13700329 Bromeliaceae Aechmea bromeliifolia 159807 0
ERR5439620 ERR5439620 Poaceae Ampelocalamus scandens 139609 10
ERR5439634 ERR5439634 Poaceae Chimonocalamus pallens 139683 0
SRR7121716 SRR7121716 Asteraceae Cyathocline purpurea 151086 0
SRR11788051 SRR11788051 Apocynaceae Cynanchum wilfordii 161234 0
ERR5439644 ERR5439644 Poaceae Fargesia denudata 139776 10
SRR13364360 SRR13364360 Asteraceae Gymnanthemum amygdalinum 153133 0
SRR14240564 SRR14240564 Lamiaceae Hanceola exserta 153296 0
SRR13985460 SRR13985460 Araliaceae Heptapleurum heptaphyllum 156527 0
SRR14328332 SRR14328332 Lamiaceae Lagopsis supina 151728 0
DRR151838 DRR151838 Meliaceae Lansium domesticum 159567 0
SRR13284716 SRR13284716 Asteraceae Neopallasia pectinata 152938 0
ERR2990308 ERR2990308 Brassicaceae Neslia paniculata 154936 0


SRR7901696 SRR7901696 Crassulaceae Pistorinia breviflora 145822 0
SRR10121871 SRR10121871 Lauraceae Alseodaphnopsis hainanensis 152828 0
SRR10121870 SRR10121870 Lauraceae Alseodaphnopsis petiolaris 154159 0
SRR11342808 SRR11342808 Velloziaceae Barbaceniopsis castillonii 155669 0
SRR5574039 SRR5574039 Annonaceae Klarobelia inundata 158399 0
SRR5574040 SRR5574040 Annonaceae Mosannona costaricensis 159774 0
SRR5574037 SRR5574037 Annonaceae Mosannona discolor 159850 0
SRR5574038 SRR5574038 Annonaceae Onychopetalum periquino 160413 0
SRR5574035 SRR5574035 Annonaceae Oxandra asbeckii 171100 0
SRR5574036 SRR5574036 Annonaceae Oxandra polyantha 171139 0
SRR5574041 SRR5574041 Annonaceae Xylopia peruviana 183356 0
SRR11319310 SRR11319310 Brassicaceae Hirschfeldia incana 152980 0
SRR10669182 SRR10669182 Brassicaceae Camelina sativa 153079 0
ERR2560442 ERR2560442 Brassicaceae Erucastrum elatum 153094 0
ERR2559998 ERR2559998 Brassicaceae Odontarrhena argentea 153749 0
SRR1801304 SRR1801304 Brassica

Submit files to GeSeq https://chlorobox.mpimp-golm.mpg.de/geseq.html and extract zip file in wdir/GeSeq_output

In [12]:
# Save the whole-plastome sample table (index column not written).
whole_pt_csv = sdir + DataSource + '_wholePT.csv'
whole_pt.to_csv(whole_pt_csv, index=False)

# Contigs

In [10]:
# Table of the selected NCBI reference plastomes used to order contigs.
references_csv = 'References/NCBI_plastid_ref/NCBI_plastid_ref_list_SELECTED.csv'
references = pd.read_csv(references_csv)
print(references.shape)
references.iloc[:2]

(124, 24)


Unnamed: 0,SeqID,Ini_Species,TaxID,mol_type,length,Ini_sci_name,sci_name_query,Similar_match,Duplicates,kew_id,...,sci_name,authors,rank,taxonomic_status,Ini_kew_id,Ini_taxonomic_status,Duplicate_type,Nsp,class,order
0,NC_045294.1,Acorus tatarinowii,123564,genomic DNA,153296,Acorus tatarinowii,Acorus tatarinowii,,False,77188885-1,...,Acorus calamus var. angustatus,Besser,VARIETY,Accepted,84028-1,Synonym,,2,Magnoliopsida,Acorales
1,NC_053912.1,Aponogeton lakhonensis,217757,genomic DNA,154860,Aponogeton lakhonensis,Aponogeton lakhonensis,,False,82954-1,...,Aponogeton lakhonensis,A.Camus,SPECIES,Accepted,,,,62,Magnoliopsida,Alismatales


In [12]:
# Incomplete plastomes: plastid recovered, but the assembly is not circular.
not_circular = ~Org_df['Assembly'].isin(['circular genome','circular genome with gaps'])
pt_found = Org_df['pt_recovered'] == True
incomplete_pt = Org_df[not_circular & pt_found]
print(len(incomplete_pt),'samples',incomplete_pt['genus'].nunique(),'genera')
# Keep assemblies shorter than 200,000 bp. Sample_Name becomes the index from
# here on, so it is no longer available as a column.
incomplete_pt = incomplete_pt[incomplete_pt['Sum_len_pt'] < 200000].set_index('Sample_Name')
print(len(incomplete_pt),'samples',incomplete_pt['genus'].nunique(),'genera')
### Random subset for test
### Paul B. - changed from random sampling: incomplete_pt = incomplete_pt.sample(n=500)
###incomplete_pt = incomplete_pt.head(500)
print(len(incomplete_pt),'samples',incomplete_pt['genus'].nunique(),'genera')
incomplete_pt.head(2)
#incomplete_pt[ incomplete_pt.SeqID == 'SRR9611034' ] ### PaulB. - this works
###incomplete_pt[ incomplete_pt.Sample_Name == 'SRR9611034' ] ### PaulB. - fails: Sample_Name is now the index, not a column.
###incomplete_pt.loc['Sample_Name','SeqID']### Paul B. - fails: 'Sample_Name' is the index name, not an index label.
###Org_df[ Org_df.Sample_Name == 'SRR8666819']### Paul B - still present at this point - OK


327 samples 322 genera
327 samples 322 genera
327 samples 322 genera


Unnamed: 0_level_0,idPaftol,idSequencing,Project,order,family,genus,species,SumContigLength,sum_len,sci_name,...,Redo_FastPlast,error_pt,error_nr,SeqID,DataSource,ENASampleNum,ENAExpNumber,ENARunNumber,NCBI_TaxID,NCBI_sciname
Sample_Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GAP_026365,12049.0,26365,GAP,Poales,Poaceae,Astrebla,lappacea,238665.0,238665.0,Astrebla lappacea,...,True,,,GAP_026365,GAP,ERS9105966,ERX7170297,ERR7599717,66004.0,Astrebla lappacea
GAP_026369,12052.0,26369,GAP,Poales,Poaceae,Phragmites,australis,240276.0,240276.0,Phragmites australis,...,True,,,GAP_026369,GAP,ERS9105985,ERX7170316,ERR7599736,29695.0,Phragmites australis


In [13]:
# Spot check: list the reference plastomes available for one order
# (useful when choosing substitute orders for order_dc).
###references.loc[references['order'] == 'Sapindales']
references.loc[references['order'] == 'Dipsacales']

Unnamed: 0,SeqID,Ini_Species,TaxID,mol_type,length,Ini_sci_name,sci_name_query,Similar_match,Duplicates,kew_id,...,sci_name,authors,rank,taxonomic_status,Ini_kew_id,Ini_taxonomic_status,Duplicate_type,Nsp,class,order
43,NC_054306.1,Nardostachys jatamansi,179860,genomic DNA,155268,Nardostachys jatamansi,Nardostachys jatamansi,,True,859418-1,...,Nardostachys jatamansi,(D.Don) DC.,SPECIES,Accepted,60457704-2,Synonym,Same_Taxon,1,Magnoliopsida,Dipsacales
44,NC_033878.1,Sambucus williamsii,180062,genomic DNA,158305,Sambucus williamsii,Sambucus williamsii,,False,149409-1,...,Sambucus williamsii,Hance,SPECIES,Accepted,,,,25,Magnoliopsida,Dipsacales


In [14]:
# Substitute orders for samples whose own order has no reference plastome.
# key = order not in reference set; value = nearest order in species tree
### Paul B. - added other orders - references.order might not have the order for the current sample (row.order).
### Filter 'references' by order (see the cell above) to test whether an order exists.
### Previous version: {'Gunnerales':'Dilleniales','Picramniales':'Sapindales'}
order_dc = {
    'Gunnerales': 'Dilleniales',
    'Picramniales': 'Sapindales',
    'Vahliales': 'Lamiales',
    'Metteniusales': 'Icacinales',
    'Escalloniales': 'Dipsacales',
}

In [15]:
# For every incomplete plastome: pick a reference plastome, blast the contigs
# against it, order and orient the contigs by their best hit, and write them as
# one N-padded sequence ('<SeqID>-concat.fasta'). Per-sample results are
# recorded in new 'Ref-*', 'Blast-*' and 'Concat-*' columns of incomplete_pt.
# Samples that cannot be processed are reported and skipped; their result
# columns stay empty.
os.makedirs(sdir + '/out_blast/', exist_ok=True)

# Settings shared by all samples
# program_path = 'C:/Program Files/NCBI/blast-BLAST_VERSION+/bin/blastn.exe'
program_path = 'blastn'
gapN = 100  # number of Ns inserted between consecutive contigs
# Single source of truth for both the blastn -outfmt string and the column names.
blast_columns = ['qseqid','sseqid','pident','length','slen','qlen','mismatch','gapopen',
                 'qstart','qend','sstart','send','evalue','bitscore']

commands = []  # every blastn command line issued
for Sample_Name, row in incomplete_pt.iterrows():
    print(Sample_Name,row.order,row.family)
    ### Identify best reference: same family, otherwise same order,
    ### otherwise the substitute order listed in order_dc
    best_ref = references[references.family==row.family]
    if best_ref.shape[0]<1:
        best_ref = references[references.order==row.order]
    if best_ref.shape[0]<1:
        # .get() instead of [] so that an order missing from order_dc reaches
        # the warning below instead of raising KeyError.
        closest_order = order_dc.get(row.order)
        if closest_order is not None:
            best_ref = references[references.order==closest_order]
    if best_ref.shape[0]<1:
        print('\t WARNING: no reference available for', Sample_Name, '- order', row.order,
              'is neither in references nor usable via order_dc; sample skipped')
        continue
    print('\t Ref',best_ref.shape, end = ': ')
    best_ref = best_ref.iloc[0]  # first candidate when several references match
    print(best_ref.SeqID,best_ref.order,best_ref.family)
    incomplete_pt.loc[Sample_Name,'Ref-SeqID'] = best_ref.SeqID
    incomplete_pt.loc[Sample_Name,'Ref-order'] = best_ref.order
    incomplete_pt.loc[Sample_Name,'Ref-family'] = best_ref.family

    ### launch blastn (contigs = query, reference plastome = subject)
    ref_path = 'References/NCBI_plastid_ref/' + best_ref['SeqID'] + '.fasta'
    ### Paul B. contigs_path = '../Organelles/' + DataSource + '/fasta_pt/' + Sample_Name + '_pt.fasta'
    contigs_path = org_dir + DataSource + '/fasta_pt/' + Sample_Name + '_pt.fasta'
    out_path = 'ENA_submissions/' + DataSource + '/out_blast/'+ Sample_Name + '.blast'
    command = program_path + ' -query ' + contigs_path + ' -subject ' + ref_path + \
                ' -outfmt "6 ' + ' '.join(blast_columns) + '"' + \
                ' -out ' + out_path
    commands.append(command)
    exit_status = os.system(command)
    if exit_status != 0:
        # Do not parse a missing or stale result file after a failed run.
        print('\t WARNING: blastn returned status', exit_status, 'for', Sample_Name, '- sample skipped')
        continue
    if os.path.getsize(out_path) == 0:
        # No hit at all: the table is empty and cannot be parsed.
        print('\t WARNING: no blast hit for any contig of', Sample_Name, '- sample skipped')
        continue

    ### read blast output
    # Sequence ids are read as text so they match the FASTA record ids used below.
    blast_output = pd.read_table(out_path, sep='\t', header=None, names=blast_columns,
                                 dtype={'qseqid': str, 'sseqid': str})
#     print('\t',blast_output.shape[0],'matched for', blast_output.qseqid.nunique(),'qseqid',end='; ')
    incomplete_pt.loc[Sample_Name,'Blast-N_qseqid_match'] = blast_output.qseqid.nunique()
    # Keep best match (by bitscore) per queried contig
    blast_output = blast_output.sort_values('bitscore',ascending=False).groupby('qseqid').head(1).copy()
    # Hit coordinates on the reference, independent of strand
    blast_output['oriented_sstart'] = blast_output[['sstart','send']].min(axis=1)
    blast_output['oriented_ssend'] = blast_output[['sstart','send']].max(axis=1)
    # sstart > send means the contig matches the reverse strand of the reference
    blast_output['Need_RevComp'] = blast_output.sstart > blast_output.oriented_sstart
    # Order contigs by their position along the reference
    blast_output = blast_output.sort_values('oriented_sstart').set_index('qseqid')
#     print('Contigs to reverse complement:',blast_output.Need_RevComp.value_counts().to_dict())

    ### Load contigs and write concatenated fasta
    records = SeqIO.to_dict(SeqIO.parse(contigs_path, "fasta"))
    print('\t',blast_output.shape[0],'/',len(records),'contigs mapped to reference')
    ### Paul B - Q: what happens if a contig can't be placed due to no blast hit?
    ### A: it has no row in blast_output, so it is left out of the concatenated
    ### sequence (this is what makes Concat-%_mapped drop below 100).
    concat_seq = Seq('')
    for qseqid, blast_row in blast_output.iterrows():
        contig_seq = records[qseqid].seq
        if blast_row.Need_RevComp:
            contig_seq = contig_seq.reverse_complement()
        concat_seq += contig_seq + Seq(gapN * 'N')
    concat_seq = concat_seq[:-gapN] # Remove trailing Ns
    mapped_len = blast_output.qlen.sum()
    pct_mapped = round(mapped_len/int(row.Sum_len_pt)*100,1)
    print('\t sum len concat:',mapped_len,'/',int(row.Sum_len_pt),pct_mapped,'%')
    incomplete_pt.loc[Sample_Name,'Concat-len'] = mapped_len
    incomplete_pt.loc[Sample_Name,'Concat-%_mapped'] = pct_mapped
    incomplete_pt.loc[Sample_Name,'Concat-len_with_Ns'] = len(concat_seq)
    Concat_record = SeqRecord(
        concat_seq,
        id=row.SeqID + '_concat_pt',
        name=Sample_Name + '_concat_pt',
        description=''
    )
    SeqIO.write(Concat_record,'ENA_submissions/' + DataSource + '/fasta/' + row.SeqID + '-concat.fasta','fasta')

GAP_026365 Poales Poaceae
	 Ref (1, 24): NC_035050.1 Poales Poaceae
	 14 / 14 contigs mapped to reference
	 sum len concat: 109428 / 109428 100.0 %
GAP_026369 Poales Poaceae
	 Ref (1, 24): NC_035050.1 Poales Poaceae
	 9 / 9 contigs mapped to reference
	 sum len concat: 116810 / 116810 100.0 %
GAP_026371 Poales Poaceae
	 Ref (1, 24): NC_035050.1 Poales Poaceae
	 8 / 8 contigs mapped to reference
	 sum len concat: 216528 / 216528 100.0 %
GAP_026381 Boraginales Boraginaceae
	 Ref (1, 24): NC_053780.1 Boraginales Boraginaceae
	 32 / 32 contigs mapped to reference
	 sum len concat: 90101 / 90101 100.0 %
GAP_026383 Brassicales Brassicaceae
	 Ref (1, 24): NC_049578.1 Brassicales Brassicaceae
	 2 / 2 contigs mapped to reference
	 sum len concat: 155050 / 155050 100.0 %
GAP_026385 Boraginales Boraginaceae
	 Ref (1, 24): NC_053780.1 Boraginales Boraginaceae
	 10 / 10 contigs mapped to reference
	 sum len concat: 143733 / 143733 100.0 %
GAP_026391 Brassicales Brassicaceae
	 Ref (1, 24): NC_049578

	 4 / 4 contigs mapped to reference
	 sum len concat: 128090 / 128090 100.0 %
GAP_026461 Caryophyllales Montiaceae
	 Ref (2, 24): NC_046575.1 Caryophyllales Amaranthaceae
	 48 / 49 contigs mapped to reference
	 sum len concat: 78649 / 79071 99.5 %
GAP_026463 Poales Poaceae
	 Ref (1, 24): NC_035050.1 Poales Poaceae
	 34 / 34 contigs mapped to reference
	 sum len concat: 40930 / 40930 100.0 %
GAP_026467 Solanales Convolvulaceae
	 Ref (1, 24): NC_042940.1 Solanales Convolvulaceae
	 46 / 46 contigs mapped to reference
	 sum len concat: 31228 / 31228 100.0 %
GAP_026469 Brassicales Gyrostemonaceae
	 Ref (2, 24): NC_049578.1 Brassicales Brassicaceae
	 24 / 24 contigs mapped to reference
	 sum len concat: 34959 / 34959 100.0 %
GAP_026475 Gentianales Gentianaceae
	 Ref (1, 24): NC_054359.1 Gentianales Gentianaceae
	 39 / 39 contigs mapped to reference
	 sum len concat: 74107 / 74107 100.0 %
GAP_022217 Gentianales Rubiaceae
	 Ref (2, 24): NC_054359.1 Gentianales Gentianaceae
	 27 / 29 contigs ma

	 5 / 5 contigs mapped to reference
	 sum len concat: 118503 / 118503 100.0 %
GAP_026653 Poales Poaceae
	 Ref (1, 24): NC_035050.1 Poales Poaceae
	 21 / 21 contigs mapped to reference
	 sum len concat: 108615 / 108615 100.0 %
GAP_026655 Lamiales Phrymaceae
	 Ref (2, 24): NC_053620.1 Lamiales Lamiaceae
	 41 / 42 contigs mapped to reference
	 sum len concat: 38631 / 38899 99.3 %
GAP_026669 Caryophyllales Caryophyllaceae
	 Ref (2, 24): NC_046575.1 Caryophyllales Amaranthaceae
	 3 / 3 contigs mapped to reference
	 sum len concat: 146645 / 146645 100.0 %
GAP_026671 Boraginales Boraginaceae
	 Ref (1, 24): NC_053780.1 Boraginales Boraginaceae
	 42 / 42 contigs mapped to reference
	 sum len concat: 93693 / 93693 100.0 %
GAP_026673 Poales Poaceae
	 Ref (1, 24): NC_035050.1 Poales Poaceae
	 1 / 1 contigs mapped to reference
	 sum len concat: 23720 / 23720 100.0 %
GAP_026681 Poales Restionaceae
	 Ref (2, 24): NC_042211.1 Poales Eriocaulaceae
	 31 / 40 contigs mapped to reference
	 sum len concat:

	 2 / 2 contigs mapped to reference
	 sum len concat: 129417 / 129417 100.0 %
GAP_026823 Fabales Fabaceae
	 Ref (1, 24): NC_025745.1 Fabales Fabaceae
	 13 / 13 contigs mapped to reference
	 sum len concat: 132519 / 132519 100.0 %
GAP_026829 Proteales Proteaceae
	 Ref (1, 24): NC_036416.1 Proteales Proteaceae
	 10 / 11 contigs mapped to reference
	 sum len concat: 51819 / 52529 98.6 %
GAP_026833 Proteales Proteaceae
	 Ref (1, 24): NC_036416.1 Proteales Proteaceae
	 8 / 9 contigs mapped to reference
	 sum len concat: 26076 / 26969 96.7 %
GAP_026835 Proteales Proteaceae
	 Ref (1, 24): NC_036416.1 Proteales Proteaceae
	 5 / 7 contigs mapped to reference
	 sum len concat: 32614 / 34171 95.4 %
GAP_026845 Proteales Proteaceae
	 Ref (1, 24): NC_036416.1 Proteales Proteaceae
	 24 / 26 contigs mapped to reference
	 sum len concat: 45195 / 46627 96.9 %
GAP_026847 Proteales Proteaceae
	 Ref (1, 24): NC_036416.1 Proteales Proteaceae
	 14 / 16 contigs mapped to reference
	 sum len concat: 130575 / 1

	 25 / 25 contigs mapped to reference
	 sum len concat: 39561 / 39561 100.0 %
GAP_027739 Asparagales Orchidaceae
	 Ref (1, 24): NC_045400.1 Asparagales Orchidaceae
	 39 / 39 contigs mapped to reference
	 sum len concat: 95933 / 95933 100.0 %
GAP_027761 Asparagales Orchidaceae
	 Ref (1, 24): NC_045400.1 Asparagales Orchidaceae
	 15 / 15 contigs mapped to reference
	 sum len concat: 34951 / 34951 100.0 %
GAP_027767 Asparagales Orchidaceae
	 Ref (1, 24): NC_045400.1 Asparagales Orchidaceae
	 35 / 35 contigs mapped to reference
	 sum len concat: 92520 / 92520 100.0 %
GAP_027777 Asparagales Orchidaceae
	 Ref (1, 24): NC_045400.1 Asparagales Orchidaceae
	 39 / 39 contigs mapped to reference
	 sum len concat: 37595 / 37595 100.0 %
GAP_027781 Asparagales Orchidaceae
	 Ref (1, 24): NC_045400.1 Asparagales Orchidaceae
	 27 / 27 contigs mapped to reference
	 sum len concat: 31308 / 31308 100.0 %
GAP_027783 Asparagales Orchidaceae
	 Ref (1, 24): NC_045400.1 Asparagales Orchidaceae
	 11 / 12 contig

	 3 / 3 contigs mapped to reference
	 sum len concat: 155732 / 155732 100.0 %
GAP_028175 Malvales Malvaceae
	 Ref (1, 24): NC_038057.1 Malvales Malvaceae
	 17 / 17 contigs mapped to reference
	 sum len concat: 129866 / 129866 100.0 %
GAP_028177 Malvales Malvaceae
	 Ref (1, 24): NC_038057.1 Malvales Malvaceae
	 11 / 11 contigs mapped to reference
	 sum len concat: 33803 / 33803 100.0 %
GAP_028179 Malvales Malvaceae
	 Ref (1, 24): NC_038057.1 Malvales Malvaceae
	 55 / 55 contigs mapped to reference
	 sum len concat: 49566 / 49566 100.0 %
GAP_028181 Malvales Malvaceae
	 Ref (1, 24): NC_038057.1 Malvales Malvaceae
	 51 / 51 contigs mapped to reference
	 sum len concat: 84695 / 84695 100.0 %
GAP_028187 Santalales Loranthaceae
	 Ref (1, 24): NC_040862.1 Santalales Loranthaceae
	 29 / 29 contigs mapped to reference
	 sum len concat: 84106 / 84106 100.0 %
GAP_028189 Apiales Araliaceae
	 Ref (1, 24): NC_022812.1 Apiales Araliaceae
	 5 / 5 contigs mapped to reference
	 sum len concat: 132367 / 1

	 9 / 9 contigs mapped to reference
	 sum len concat: 114731 / 114731 100.0 %
GAP_028691 Poales Poaceae
	 Ref (1, 24): NC_035050.1 Poales Poaceae
	 42 / 42 contigs mapped to reference
	 sum len concat: 33418 / 33418 100.0 %
GAP_028695 Poales Poaceae
	 Ref (1, 24): NC_035050.1 Poales Poaceae
	 13 / 13 contigs mapped to reference
	 sum len concat: 79108 / 79108 100.0 %
GAP_028697 Poales Poaceae
	 Ref (1, 24): NC_035050.1 Poales Poaceae
	 20 / 20 contigs mapped to reference
	 sum len concat: 26494 / 26494 100.0 %
GAP_028599 Poales Poaceae
	 Ref (1, 24): NC_035050.1 Poales Poaceae
	 8 / 8 contigs mapped to reference
	 sum len concat: 29869 / 29869 100.0 %
GAP_028605 Poales Poaceae
	 Ref (1, 24): NC_035050.1 Poales Poaceae
	 8 / 8 contigs mapped to reference
	 sum len concat: 33139 / 33139 100.0 %
GAP_028611 Asparagales Boryaceae
	 Ref (2, 24): NC_053784.1 Asparagales Asparagaceae
	 29 / 29 contigs mapped to reference
	 sum len concat: 116549 / 116549 100.0 %
GAP_029179 Asparagales Orchidac

* The above files are written to the /fasta/ directory.
* Submit files for the concatenated/partial plastids to GeSeq https://chlorobox.mpimp-golm.mpg.de/geseq.html
    * To process only a subset of the data, upload only the corresponding files; the rest of the pipeline will then process just those samples.
* Extract zip file into the /geseq/ folder
* In code below, geseq Genbank files are modified and written to /Genbank_files folder
* Those modified Genbank files are then converted to EMBL format and written to the /EMBL_files folder


In [16]:
# Save the incomplete-plastome table, including the reference/blast/concat
# columns; the Sample_Name index is written as the first column.
concat_pt_csv = sdir + DataSource + '_concatPT_test.csv'
incomplete_pt.to_csv(concat_pt_csv)

### ENA submission
https://ena-docs.readthedocs.io/en/latest/submit/fileprep/assembly.html?highlight=chloroplast#



In [6]:
# Publication details attached to the submitted records.
paper_title = 'Building the largest, curated plastid genome dataset and phylogeny by leveraging nuclear-oriented sequence data'
# The author list is kept as four chunks purely for line length; they are
# joined without any separator.
paper_authors = ''.join((
    'Leempoel K., Zuntini A. R., Bailey P., Eiserhardt W. L., Gallego B., Maurin O., Allnutt T., Bellot S., Clarkson J. J., ',
    'Clements M., Cowan R. S., Crayn D. D., de Lírio E., Françoso E., Höwener A., Hu A., Joyce E., Larson D., Li H., Li D., Liu J., Lum M., McGinnie C., ',
    'McLay T., Nargar K., Perez Escobar O., Przelomska N., Roy S., Schmidt-Lebuhn A., Shah T., Simpson L., Yang J., Yi T., Forest F., Kersey P. J., ',
    'Leitch I. J., Baker W. J.',
))
paper_journal = 'Unpublished'
paper_id = 'NA'
paper_authors

'Leempoel K., Zuntini A. R., Bailey P., Eiserhardt W. L., Gallego B., Maurin O., Allnutt T., Bellot S., Clarkson J. J., Clements M., Cowan R. S., Crayn D. D., de Lírio E., Françoso E., Höwener A., Hu A., Joyce E., Larson D., Li H., Li D., Liu J., Lum M., McGinnie C., McLay T., Nargar K., Perez Escobar O., Przelomska N., Roy S., Schmidt-Lebuhn A., Shah T., Simpson L., Yang J., Yi T., Forest F., Kersey P. J., Leitch I. J., Baker W. J.'

In [7]:
### Paul B. - I think that, from this point, only cells 1 - 4 need to have been run before the cells
### under the "ENA submission" title. In this cell, alter the variables below to process either
### whole plastomes OR partial plastomes - I don't think you can do both together.
### ALSO: when repeating a run you MUST re-run both main cells, not just the final cell that creates
### the EMBL file, otherwise the assembly_gap features end up on the record twice!!!
### The /geseq/ dir is still needed.

### Paul B. - actually, the line below concatenates circular and concat. samples, so they could be
### processed together:
#submit_pt = pd.concat([pd.read_csv(sdir + DataSource + '_wholePT.csv'), pd.read_csv(sdir + DataSource + '_concatPT.csv')],ignore_index=True)
### 12.7.2023 - Trying this now:
#submit_pt = pd.concat([pd.read_csv(sdir + DataSource + '_wholePT.29_still_to_do.csv'), pd.read_csv(sdir + DataSource + '_concatPT.68_still_to_do.csv')],ignore_index=True)
### NB - this is possible, but the geseq* dirs would have to be combined so that whole and concat
### annotations are in the same dir.
#submit_pt = pd.read_csv(sdir + DataSource + '_concatPT.csv')
submit_pt_csv = sdir + DataSource + '_concatPT.31_still_to_do.csv'
submit_pt = pd.read_csv(submit_pt_csv)
print(len(submit_pt))
# Keep only samples that have every field required below: ENA sample and run
# accessions, then NCBI scientific name, then NCBI taxon id. The row count is
# printed after each filter.
submit_pt = submit_pt.dropna(subset=['ENASampleNum', 'ENARunNumber'])
print(len(submit_pt))
submit_pt = submit_pt.dropna(subset=['NCBI_sciname'])
print(len(submit_pt), submit_pt['Assembly'].value_counts().to_dict())
submit_pt = submit_pt.dropna(subset=['NCBI_TaxID'])
print(len(submit_pt))
submit_pt.head(2)

# In a separate run, selecting samples that have no NCBI_sciname or NCBI_TaxID:
# (NB - comment out the lines below to process the main data set above)
#submit_pt = pd.read_csv(sdir + DataSource + '_concatPT.csv')
#submit_pt = submit_pt[submit_pt.NCBI_sciname.isnull() | submit_pt.NCBI_TaxID.isnull()]
#print(submit_pt.shape[0])
#submit_pt.to_csv(sdir + DataSource + '_concatPT_no_sciname_or_taxid.csv')
# Now alter the file to add NCBI_TaxID=2738895 and NCBI_sciname=spermatophyte_root to each row,
# then load it again into submit_pt:
#submit_pt = pd.read_csv(sdir + DataSource + '_concatPT_sciname_spermatophyte_root_taxid_2738895.csv')
# Also have 55 samples from the main batch that failed validation due to no taxon_id in the ENA accession:
#submit_pt = pd.read_csv(sdir + DataSource + '_concatPT_sciname_spermatophyte_root_taxid_2738895_plus_55_samples.csv')
#print(submit_pt.shape[0])

31
31
31 {'scaffold': 31}
31


Unnamed: 0,Sample_Name,idSequencing,order,family,genus,species,SumContigLength,sum_len,sci_name,log_pt,...,Ref-family,Blast-N_qseqid_match,Concat-len,Concat-%_mapped,Concat-len_with_Ns,ENASampleNum,ENAExpNumber,ENARunNumber,NCBI_TaxID,NCBI_sciname
0,ERR4180132,21989.0,Myrtales,Penaeaceae,Penaea,petraea,126591.0,126591.0,Penaea petraea,True,...,,,,,,ERS4591104,ERX4143556,ERR4180132,186937,Sonderothamnus petraeus
1,ERR420512,23727.0,Fabales,Fabaceae,Senegalia,senegal,134874.0,134874.0,Senegalia senegal,True,...,,,,,,ERS399685,ERX386805,ERR420512,138043,Senegalia senegal


In [8]:
# Optional: restrict the submission to a single run while debugging.
### submit_pt = submit_pt[submit_pt.ENARunNumber=='ERR7619260']
###submit_pt = submit_pt[ submit_pt.ENARunNumber=='ERR5010320' ]
# Show the identifier and taxonomy columns of the samples to submit. The former
# bare `len(submit_pt)` and `submit_pt.columns` statements were dropped: only
# the last expression of a cell is displayed, so they had no effect.
submit_pt[ ['SeqID','NCBI_TaxID', 'NCBI_sciname'] ]

Unnamed: 0,SeqID,NCBI_TaxID,NCBI_sciname
0,ERR4180132,186937,Sonderothamnus petraeus
1,ERR420512,138043,Senegalia senegal
2,ERR420513,162639,Acrocarpus fraxinifolius
3,ERR420518,54878,Enterolobium cyclocarpum
4,ERR420521,185774,Jacaranda mimosifolia
5,ERR4604253,2065072,Ruilopezia viridis
6,SRR11342819,2707190,Barbacenia involucrata
7,SRR12664414,2746967,Cryptocoryne hudoroi
8,SRR13390744,2778903,Tolumnia bahamensis
9,SRR14294427,134525,Triosteum himalayanum


In [9]:
# Table of recovered gene sequences, restricted to the genes listed in
# cp_genes_82.txt (one gene name per line, no header).
all_seq = pd.read_csv('cpTree_v7/AllSamples_Allgenes_All_SeqTable.csv')
print(all_seq.shape)
genes_82 = pd.read_table('cp_genes_82.txt', header=None)
genes_82 = genes_82.rename(columns={0: 'gene'})
in_gene_list = all_seq['gene'].isin(genes_82['gene'])
all_seq = all_seq.loc[in_gene_list]
print(all_seq.shape)
all_seq.head(2)

(1678470, 4)
(1172782, 4)


Unnamed: 0,seqid,len,Sample_Name,gene
0,GAP_026547-rrn23,2812,GAP_026547,rrn23
1,GAP_026547-rrn16,1491,GAP_026547,rrn16


In [10]:
def list_genes(record, ls_types = ['gene','CDS','rRNA']):
    """Return the distinct gene names annotated on ``record``.

    Only features whose ``type`` is listed in ``ls_types`` are considered; the
    name is the first entry of the feature's ``gene`` qualifier (a feature of a
    selected type without that qualifier raises ``KeyError``).  Duplicates are
    collapsed through a set, so the order of the returned list is arbitrary.
    """
    gene_names = {
        feat.qualifiers['gene'][0]
        for feat in record.features
        if feat.type in ls_types
    }
    return list(gene_names)

In [14]:
### Paul B. - modified this cell to add a dictionary to assess samples not in gb_files.
### NB - check /geseq/ first - it is a symlink that points either to the whole or to the concat geseq directory

# Rename and modify the GeSeq GenBank files in this cell (they are converted to EMBL flat files in a downstream cell).
## Filename example: GeSeqJob-20220601-70838_Pis_010569_concat_pt_GenBank.gb
if not os.path.exists(sdir + '/Genbank_files/'):
    os.makedirs(sdir + '/Genbank_files/')
gb_files = [file for file in os.listdir(sdir + 'geseq/') if file.endswith('.gb')]

# Map SeqID -> GeSeq file name so each sample is found with a key lookup and
# samples without a GeSeq result can be listed at the end.
gbFilesDict = {}
for file in gb_files:
    fields = file.split('_')
    ### Paul B. - for SRA data the SeqID comprises only the first field after the job prefix
    if DataSource == 'SRA':
        key = fields[1]
    else:
        key = fields[1] + '_' + fields[2]
    gbFilesDict[key] = file

gbFilesAbsent = [] # SeqIDs with no GeSeq GenBank file - these need to be re-run in GeSeq
print(len(gb_files))
for idx, row in submit_pt.iterrows():
    if row.SeqID not in gbFilesDict:
        print('WARNING: no geseq file in submit_pt table for: ', row.SeqID)
        gbFilesAbsent.append(row.SeqID)
        continue

    # NOTE: the per-sample gene summary (record_genes / blast_genes) that used to be
    # computed here only fed a commented-out debug print and has been removed; it
    # filtered the whole all_seq table once per sample.
    try:
        # Parsing is inside the try so that one unreadable file is reported
        # instead of aborting the whole loop.
        record = list(SeqIO.parse(sdir + 'geseq/' + gbFilesDict[row.SeqID], "genbank"))[0]

        ### 23.11.2022 - not submitting SRA or GAP data via TPA now, so no 'TPA_asm: ' prefix
        if row.Assembly in ['circular genome','circular genome with gaps']:
            record.description = row.ENARunNumber + ' chloroplast, complete genome'
        elif row.Assembly in ['scaffold']:
            record.description = row.ENARunNumber + ' chloroplast, partial genome (concatenated contigs)'
            record.annotations['topology'] = 'linear'

        ### Paul B. - the lines below apply to circular as well as scaffold samples
        submit_pt.loc[idx,'ENA_SeqID'] = row.ENARunNumber + '_pt'
        record.id = submit_pt.loc[idx,'ENA_SeqID']
        record.name = submit_pt.loc[idx,'ENA_SeqID']
        record.version = submit_pt.loc[idx,'ENA_SeqID'] + '.1'
        record.annotations['source'] = row.ENASampleNum + ', ' + row.sci_name + ', isolate'
        record.annotations['accessions'] = row.ENASampleNum
        record.annotations['organism'] = row.NCBI_sciname
        record.annotations["data_file_division"] = "PLN"
        # A record without a taxonomy annotation is fine - there is simply nothing to drop.
        record.annotations.pop('taxonomy', None)
        record.annotations['references'][0].title = paper_title
        record.annotations['references'][0].authors = paper_authors
        record.annotations['references'][0].journal = paper_journal
        record.annotations['references'][0].pubmed_id = paper_id
        record.features[0].qualifiers['organism'] = row.NCBI_sciname
        record.features[0].qualifiers['db_xref'] = ['taxon:' + str(int(row['NCBI_TaxID']))]
        SeqIO.write(record,sdir + 'Genbank_files/' + row.SeqID + '_pt.gb',format='genbank')
    except Exception as err:
        # Report the reason as well as the file; a bare except also swallowed
        # KeyboardInterrupt and hid why a sample failed.
        print('issue with ',gbFilesDict[row.SeqID], '->', repr(err))

print('\n\nList of missing Genbank files that need to be run through GeSeq again:')
for sample in gbFilesAbsent:
    print(sample)

1281


List of missing Genbank files that need to be run through GeSeq again:


In [16]:
def del_features(record, feat_del_ls = ['translation','info','annotator']):
    """Drop the qualifiers named in ``feat_del_ls`` from every feature of ``record``.

    The feature objects held in ``record.features`` are edited in place (which
    is why the record changes although nothing is removed from the record
    itself); the same ``record`` is returned for convenience.
    """
    for feat in record.features:
        for unwanted in feat_del_ls:
            # pop() with a default is a no-op when the qualifier is absent
            feat.qualifiers.pop(unwanted, None)
    return record

def del_fragment(record, ls_types = ['gene','CDS','rRNA','intron']):
    """Remove '<gene>-fragment' features when the full gene is also annotated.

    ### Paul B.: 'intron' is included because some intron fragments appear when
    ### BLAST and Chloe are used together (only for rps12 though); tRNA appears
    ### not to be an issue.

    Only features whose ``type`` is in ``ls_types`` are examined.  A feature is
    dropped when its gene name contains 'fragment' and the part of the name
    before the first '-' is the name of a non-fragment feature of one of those
    types.  Fragments without a full-length counterpart are kept.

    ``record.features`` is updated in place and ``record`` is returned; the
    number of removed features is printed.

    The kept features are collected in a new list rather than calling
    ``record.features.remove()`` inside the loop: removing while iterating
    skips the element that follows each removed one, so of two adjacent
    fragment features (e.g. the gene and its CDS/rRNA) only the first was
    deleted.
    """
    # names of the full-length (non-fragment) features
    full_names = {
        feature.qualifiers['gene'][0]
        for feature in record.features
        if feature.type in ls_types and 'fragment' not in feature.qualifiers['gene'][0]
    }

    kept = []
    rm_count = 0
    for feature in record.features:
        if feature.type in ls_types:
            name = feature.qualifiers['gene'][0]
            if 'fragment' in name and name.split('-')[0] in full_names:
                rm_count += 1
                continue
        kept.append(feature)
    # slice assignment keeps the same list object attached to the record
    record.features[:] = kept
    print('removed',rm_count,'partial features')
    return record

def del_longfeat(record, ls_types = ['gene','CDS','rRNA'], len_thrs = 5000):
    """Remove implausibly long features from ``record``.

    ### Paul B. - longest plastid gene I found was 6,791 in sample 000259

    A feature is dropped when its ``type`` is in ``ls_types`` and
    ``len(feature)`` is strictly greater than ``len_thrs``; each removal is
    printed.  ``record.features`` is updated in place and ``record`` returned.

    The survivors are collected in a new list instead of calling
    ``record.features.remove()`` during iteration, which skipped the feature
    following every removed one (so a long gene was removed but its equally
    long CDS directly after it was left behind).
    """
    kept = []
    for feature in record.features:
        if feature.type in ls_types and len(feature) > len_thrs:
            print('removing long feature ',feature.qualifiers['gene'],len(feature))
            continue
        kept.append(feature)
    # slice assignment keeps the same list object attached to the record
    record.features[:] = kept
    return record

def add_gap_features(record, gapN = 100):
    """Annotate every run of ``gapN`` consecutive 'N's as an ``assembly_gap`` feature.

    The sequence is scanned for non-overlapping matches of ``gapN`` 'N'
    characters and the number found is printed.  For each match a feature
    spanning exactly ``gapN`` bases is created with the qualifiers
    ``gap_type='within scaffold'``, ``estimated_length='unknown'`` and
    ``linkage_evidence='unspecified'``.  The feature is inserted in front of
    the first existing feature that starts after the gap, or appended when no
    feature starts later.  ``record`` is modified in place and returned.

    The strand is set on the location rather than passed to ``SeqFeature``:
    the ``strand=`` constructor argument is, as far as I know, no longer
    accepted by recent Biopython releases, while a strand on the location
    works on old and new versions alike.
    """
    gaps_idx = [m.start() for m in re.finditer(gapN * 'N', str(record.seq))]
    print(len(gaps_idx),'gaps')
    for gap_pos in gaps_idx:
        gap_feat = SeqFeature(FeatureLocation(gap_pos, gap_pos + gapN, strand=1), type='assembly_gap')
        gap_feat.qualifiers['gap_type'] = 'within scaffold'
        gap_feat.qualifiers['estimated_length'] = 'unknown'
        gap_feat.qualifiers['linkage_evidence'] = 'unspecified'

        # index of the first feature (including gaps inserted so far) starting after this gap
        insert_at = next(
            (i for i, feat in enumerate(record.features) if feat.location.start > gap_pos),
            None,
        )
        if insert_at is None:
            # gap lies beyond the start of every feature: put it at the end of the list
            record.features.append(gap_feat)
        else:
            record.features.insert(insert_at, gap_feat)
    return record

# https://ena-docs.readthedocs.io/en/latest/faq/locus_tags.html
def add_locus_tag(record, locus_tag, ls_types = ['gene','CDS','rRNA','tRNA','intron','exon']):
    """Give every selected feature a ``locus_tag`` qualifier, one tag per gene NAME.

    Features whose ``type`` is in ``ls_types`` are visited in order; the first
    time a gene name is seen it is assigned ``<locus_tag>_<n>`` with ``n``
    counting up from 1, and every later feature with the same name reuses that
    tag.  The qualifier is written on the feature objects themselves, which the
    record references, so ``record`` changes in place; it is also returned.
    """
    tag_by_gene = {}
    for feat in record.features:
        if feat.type not in ls_types:
            continue
        gene = feat.qualifiers['gene'][0]
        if gene not in tag_by_gene:
            # next running number = number of distinct genes seen so far + 1
            tag_by_gene[gene] = locus_tag + '_' + str(len(tag_by_gene) + 1)
        feat.qualifiers['locus_tag'] = tag_by_gene[gene]
    return record

def add_locus_tag_v1(record, locus_tag, ls_types = ['gene','CDS','rRNA','tRNA','intron','exon']):
    ''' Paul B. - extension of add_locus_tag(): locus tags are assigned per 'gene'
        FEATURE along the genome rather than per gene name.

        Features are walked in list order (assumed to be genome order):
        * every 'gene' feature increments the running counter, becomes the
          "current gene" and records its counter under its gene name (a later
          gene feature with the same name overwrites the earlier entry);
        * a feature in ls_types that belongs to the current gene gets
          <locus_tag>_<counter>;
        * a feature naming another gene triggers a WARNING; it reuses the
          counter recorded for that gene name if there is one;
        * otherwise an ERROR is printed and a tag from a separate counter that
          starts at 1001 is used; consecutive such features with the same gene
          name share that tag.

        The qualifier is set on the feature objects referenced by the record,
        so the record is modified in place (nothing is removed); it is returned.
    '''
    gene_counter = 0          # locus tag counter, bumped for each 'gene' feature
    counter_by_gene = {}      # gene name -> counter (as str) of its latest 'gene' feature
    current_gene = ''         # name of the most recent 'gene' feature
    orphan_counter = 1000     # counter for features whose gene has not been defined yet
    orphan_gene = ''          # gene name that received the latest orphan counter value
    print('Number of feats', len(record.features))
    for feat in record.features:
        if feat.type == 'gene':
            gene_counter += 1
            current_gene = feat.qualifiers['gene'][0]  # NB - the qualifier value is a list
            counter_by_gene[current_gene] = str(gene_counter)

        if feat.type not in ls_types:
            continue

        feat_gene = feat.qualifiers['gene'][0]
        if feat_gene == current_gene:
            suffix = str(gene_counter)
        else:
            # The feature does not belong to the gene currently being walked.
            print('WARNING: current feature NOT part of current gene!!!', feat_gene)
            if feat_gene in counter_by_gene:
                # reuse the tag of the matching gene defined earlier along the genome
                suffix = counter_by_gene[feat_gene]
            else:
                print("ERROR: can't assign a locus tag ID for a gene that hasn't been defined along the sequence yet!")
                # force-create a tag; only move to a new number when the gene name changes
                if feat_gene != orphan_gene:
                    orphan_counter += 1
                    orphan_gene = feat_gene
                suffix = str(orphan_counter)
        feat.qualifiers['locus_tag'] = locus_tag + '_' + suffix
    return record

def gzip_file(in_file):
    """Write a gzip-compressed copy of ``in_file`` to ``in_file + '.gz'``.

    The input file is left in place; nothing is returned.
    """
    import gzip
    import shutil
    gz_path = in_file + '.gz'
    # stream the bytes across so the whole file is never held in memory
    with open(in_file, 'rb') as src, gzip.open(gz_path, 'wb') as dst:
        shutil.copyfileobj(src, dst)

In [17]:
# Now ready to modify ENA files and delete certain features.
### Paul B - NB: the /Genbank_files/ are modified again here and written back to the same file,
### so to troubleshoot this cell, first re-run the cell above that creates the
### /Genbank_files/ for the first time.
if not os.path.exists(sdir + '/EMBL_files/'):
    os.makedirs(sdir + '/EMBL_files/')
commands = []
commands.append('#!/bin/bash') ### Paul B. added shebang line
for idx, row in submit_pt.iterrows():  ### Paul B. - NB - samples without a Genbank file trigger the except clause - OK
    ### Convert .gb to EMBL flat files
    file_gb = sdir + 'Genbank_files/' + row.SeqID + '_pt.gb'
    print(row.SeqID,row.ENA_SeqID,end=' > ')
    try:
        record = list(SeqIO.parse(file_gb, "genbank"))[0]
        print(len(record))

        # Resolve the topology-dependent strings once, before any file is written.
        # An unknown Assembly value is rejected explicitly; previously it left
        # list_file / tmp_description unset (NameError) or, worse, silently
        # reused the values of the previous sample.
        if row.Assembly in ['circular genome','circular genome with gaps']:
            chromosome_type = 'Circular-Chromosome'
            tmp_description = ' chloroplast, complete genome'
        elif row.Assembly in ['scaffold']:
            chromosome_type = 'Linear-Chromosome'
            tmp_description = ' chloroplast, partial genome (concatenated contigs)'
        else:
            raise ValueError('unsupported Assembly value: ' + repr(row.Assembly))

        # Delete qualifiers preventing ENA submission
        record_clean = del_features(record = record, feat_del_ls = ['translation','info','annotator'])
        # Delete fragmentary annotations (e.g. rrn16-fragment)
        record_clean = del_fragment(record = record_clean)
        # Delete abnormally long genes
        record_clean = del_longfeat(record = record_clean,len_thrs=10000)
        # Add gap features for partial plastomes. Recognises gaps of 100 Ns by default
        record_clean = add_gap_features(record = record_clean)
        ### Paul B. - locus tags are applied to every record (concatenated and circular),
        ### not only to records that contain assembly_gap features.
        record_clean = add_locus_tag_v1(record = record_clean, locus_tag = project_locus_tag)

        file_embl = sdir + '/EMBL_files/' + row.ENA_SeqID + '.embl'
        ### Paul B. NBNB - the Genbank file is overwritten here with the cleaned record
        SeqIO.write(record_clean, file_gb, "genbank")
        SeqIO.write(record_clean, file_embl, "embl")
        gzip_file(file_embl)

        ### Chromosome list file (tab-separated: name, chromosome name, type, location)
        # https://ena-docs.readthedocs.io/en/latest/submit/fileprep/assembly.html?highlight=chloroplast#chromosome-list-file
        list_file = row.ENA_SeqID + '\tPT\t' + chromosome_type + '\tChloroplast'
        with open(file_embl.replace('.embl','_CHR.txt'), 'wb') as f:
            f.write(list_file.encode("ascii"))
        gzip_file(file_embl.replace('.embl','_CHR.txt'))

        ### Manifest file
        # https://ena-docs.readthedocs.io/en/latest/submit/assembly/genome.html
        manifest_file = file_embl.replace('.embl','_manifest.txt')
        manifest={
            'STUDY':ENA_project_number,
            'SAMPLE':row.ENASampleNum,
            'ASSEMBLYNAME':row.ENA_SeqID,
            'ASSEMBLY_TYPE':'clone or isolate',
            'COVERAGE':row.Coverage_base,
            'PROGRAM':'GetOrganelle v1.7.5 +  GeSeq 2.03',
            'PLATFORM':'ILLUMINA',
            'MOLECULETYPE':'genomic DNA',
            'FLATFILE': row.ENA_SeqID + '.embl.gz',
            'CHROMOSOME_LIST': row.ENA_SeqID + '_CHR.txt.gz',
            'RUN_REF': row.ENARunNumber,
            'DESCRIPTION': row.NCBI_sciname + tmp_description,
        }
        ### 23.11.2022 - not submitting SRA or GAP data via third party annotation (TPA) now,
        ### so no manifest['TPA'] entry and no 'TPA_asm: ' description prefix.
        ### The PARTIAL field is intentionally not set for scaffold assemblies.
        manifest = pd.DataFrame.from_dict(manifest,orient='index')
        manifest.to_csv(manifest_file,header=None,sep='\t')

        # Submission command (validation only; run from inside EMBL_files/)
        commands.append('java -jar $WEBIN ' +
              '-username Webin-52995 -passwordFile ../../../../ENA_submissions/ena_pwd.txt -context genome -manifest ' +
              row.ENA_SeqID + '_manifest.txt' + ' -validate')
    except Exception as err:
        # Report the reason too; a bare except also swallowed KeyboardInterrupt
        # and made every failure look the same.
        print('issue modifying Genbank file or making embl file',row.SeqID, '->', repr(err))
pd.DataFrame(commands).to_csv(sdir + '/ENA_Submission_Commands_' + DataSource + '.sh',index=False,header=None)

ERR4180132 ERR4180132_pt > 32036
removed 0 partial features
32 gaps
Number of feats 127
ERR420512 ERR420512_pt > 39515
removed 1 partial features
14 gaps
Number of feats 106
ERROR: can't assign a locus tag ID for a gene that hasn't been defined along the sequence yet!
ERR420513 ERR420513_pt > 28388
removed 0 partial features
30 gaps
Number of feats 117
ERR420518 ERR420518_pt > 64889
removed 0 partial features
23 gaps
Number of feats 161
ERROR: can't assign a locus tag ID for a gene that hasn't been defined along the sequence yet!
ERROR: can't assign a locus tag ID for a gene that hasn't been defined along the sequence yet!
ERROR: can't assign a locus tag ID for a gene that hasn't been defined along the sequence yet!
ERR420521 ERR420521_pt > 52371
removed 2 partial features
21 gaps
Number of feats 150
ERROR: can't assign a locus tag ID for a gene that hasn't been defined along the sequence yet!
ERROR: can't assign a locus tag ID for a gene that hasn't been defined along the sequence yet

SRR6425615 SRR6425615_pt > 155587
removed 2 partial features
3 gaps
Number of feats 337
ERROR: can't assign a locus tag ID for a gene that hasn't been defined along the sequence yet!
ERROR: can't assign a locus tag ID for a gene that hasn't been defined along the sequence yet!
SRR6425630 SRR6425630_pt > 129587
removed 1 partial features
2 gaps
Number of feats 282
ERROR: can't assign a locus tag ID for a gene that hasn't been defined along the sequence yet!
SRR6425632 SRR6425632_pt > 154463
removed 2 partial features
0 gaps
Number of feats 334
ERROR: can't assign a locus tag ID for a gene that hasn't been defined along the sequence yet!
ERROR: can't assign a locus tag ID for a gene that hasn't been defined along the sequence yet!
SRR6787480 SRR6787480_pt > 44480
removed 0 partial features
15 gaps
Number of feats 112
SRR799414 SRR799414_pt > 35591
removed 0 partial features
37 gaps
Number of feats 137
ERROR: can't assign a locus tag ID for a gene that hasn't been defined along the sequen

SRR9119016 SRR9119016_pt > 128291
removed 2 partial features
2 gaps
Number of feats 281
ERROR: can't assign a locus tag ID for a gene that hasn't been defined along the sequence yet!
ERROR: can't assign a locus tag ID for a gene that hasn't been defined along the sequence yet!
SRR9118957 SRR9118957_pt > 27587
removed 0 partial features
9 gaps
Number of feats 88


In [14]:
# List every column of submit_pt, including the ENA_SeqID column that the
# GenBank-renaming cell adds.
submit_pt.columns

Index(['Sample_Name', 'idPaftol', 'idSequencing', 'Project', 'order', 'family',
       'genus', 'species', 'Taxonomical_Notes', 'SumContigLength', 'sci_name',
       'log_pt', 'log_nr', 'pt_recovered', 'Nseq_pt', 'Sum_len_pt',
       'Nanybase_pt', 'nr_recovered', 'Nseq_nr', 'Sum_len_nr', 'Nanybase_nr',
       'Completed_Reads', 'Completed_Extending', 'Completed_Assembly',
       'Info_Disentangling_Failed', 'Assembly', 'Coverage_Kmer',
       'Coverage_base', 'Run_Time', 'maxK', 'NRepeat_Pattern', 'NPath',
       'Redo_FastPlast', 'error_pt', 'error_nr', 'SeqID', 'DataSource',
       'ENASampleNum', 'ENAExpNumber', 'ENARunNumber', 'NCBI_TaxID',
       'NCBI_sciname', 'Ref-SeqID', 'Ref-order', 'Ref-family',
       'Blast-N_qseqid_match', 'Concat-len', 'Concat-%_mapped',
       'Concat-len_with_Ns', 'ENA_SeqID'],
      dtype='object')

* The ENA_Submission_Commands_${DataSource}.sh file (e.g. ENA_Submission_Commands_PAFTOL.sh) contains one Webin-CLI command for each sample annotation.
* Command assumes that you move into the /EMBL_files/ folder to run it.
* Launch command is now a shell script e.g.:
    * cd /EMBL_files
    * chmod 744 ../ENA_Submission_Commands_${DataSource}.sh
    * ../ENA_Submission_Commands_${DataSource}.sh > ENA_Submission_Commands_${DataSource}.log 2>&1 &
* Can validate samples (-validate flag) on Macbook but need to use JHI cluster to submit samples (-submit flag)
* Should also use Slurm on the JHI cluster to submit:
    * sbatch --wrap "./ENA_Submission_Commands_${DataSource}_-submit.sh > ./ENA_Submission_Commands_${DataSource}_-submit.log 2>&1 "