In [1]:
import pandas as pd
import os

In [2]:
project='cpTree_v3'

In [3]:
### List all samples for the tree
# Load paftol_pt
paftol_pt = pd.read_csv('../Organelles/PAFTOL/PAFTOL_Organelle_Recovery.csv').rename(columns={'Order':'order',
                                            'Family':'family','Genus':'genus','Sum_len_pt':'length'})
paftol_pt['species'] = paftol_pt.sci_name.str.split(' ').str[1:].str.join(' ')
print('PAFTOL:',paftol_pt.shape[0],end=' < ')
paftol_ls = [file.replace('_pt.fasta','') for file in os.listdir('../Organelles/PAFTOL/fasta_pt/') if file.endswith('.fasta')]
paftol_pt = paftol_pt[paftol_pt.Sample_Name.isin(paftol_ls)]
paftol_pt['DataSource']='PAFTOL'
print(len(paftol_ls), '>',paftol_pt.shape[0])

# Load sra_pt
sra_pt = pd.read_csv('../Organelles/SRA/SRA_Organelle_Recovery.csv').rename(columns={'Sum_len_pt':'length'})
sra_pt['species'] = sra_pt.sci_name.str.split(' ').str[1:].str.join(' ')
print('SRA:',sra_pt.shape[0],end=' < ')
SRA_ls = [file.replace('_pt.fasta','') for file in os.listdir('../Organelles/SRA/fasta_pt/') if file.endswith('.fasta')]
sra_pt = sra_pt[sra_pt.Sample_Name.isin(SRA_ls)]
sra_pt['DataSource']='SRA'
print(len(SRA_ls), '>',sra_pt.shape[0])

# Load gap_pt
gap_pt = pd.read_csv('../Organelles/GAP/GAP_Organelle_Recovery.csv').rename(columns={'Sum_len_pt':'length'})
gap_pt['species'] = gap_pt.sci_name.str.split(' ').str[1:].str.join(' ')
print('GAP:',gap_pt.shape[0],end=' < ')
GAP_ls = [file.replace('_pt.fasta','') for file in os.listdir('../Organelles/GAP/fasta_pt/') if file.endswith('.fasta')]
gap_pt = gap_pt[gap_pt.Sample_Name.isin(GAP_ls)]
gap_pt['DataSource']='GAP'
print(len(GAP_ls), '>',gap_pt.shape[0])

# Load 1kp_pt
okp_pt = pd.read_csv('1KP/okp_pt.csv').rename(columns={'Sample':'Sample_Name',"('len', 'sum_len')":'length'})
okp_pt['species'] = okp_pt.sci_name.str.split(' ').str[1:].str.join(' ')
print('1KP:',okp_pt.shape[0],end=' < ')
okp_ls = [file.replace('_CPgenes.fasta','') for file in os.listdir('1KP/1kp_fasta_pt/') if file.endswith('.fasta')]
okp_pt = okp_pt[okp_pt.Sample_Name.isin(okp_ls)]
okp_pt['DataSource']='OneKP'
print(len(okp_ls), '>',okp_pt.shape[0])

# Load Li et al
Lietal_pt = pd.read_csv('Li_et_al/Lietal_PPA_fasta_wcvp.csv').rename(columns={'Sample':'Sample_Name'})
print('Li et al:',Lietal_pt.shape[0],end=' < ')
lietal_ls = [file.replace('_CPgenes.fasta','') for file in os.listdir('Li_et_al/Lietal_PPA/') if file.endswith('.fasta')]
Lietal_pt = Lietal_pt[Lietal_pt.Sample_Name.isin(lietal_ls)]
Lietal_pt['DataSource']='Lietal'; Lietal_pt['length']=0
print(len(lietal_ls), '>',Lietal_pt.shape[0])

# Load NCBI refseq
refseq_pt = pd.read_csv('NCBI/RefSeq/refseq_sp.csv').rename(columns={'SeqID':'Sample_Name'})
print('RefSeq:',refseq_pt.shape[0],end=' < ')
refseq_ls = [file.replace('_pt.fasta','') for file in os.listdir('NCBI/RefSeq/Refseq_fasta_pt/') if file.endswith('.fasta')]
refseq_pt = refseq_pt[refseq_pt.Sample_Name.isin(refseq_ls)]
refseq_pt['DataSource']='REFSEQ'
print(len(refseq_ls), '>',refseq_pt.shape[0])

# Load NCBI INSDC (other plastomes, partial)
insdc_pt = pd.read_csv('NCBI/INSDC/insdc_sp.csv').rename(columns={'SeqID':'Sample_Name'})
print('INSDC:',insdc_pt.shape[0],end=' < ')
insdc_ls = [file.replace('_pt.fasta','') for file in os.listdir('NCBI/INSDC/INSDC_fasta_pt/') if file.endswith('.fasta')]
insdc_pt = insdc_pt[insdc_pt.Sample_Name.isin(insdc_ls)]
insdc_pt['DataSource']='INSDC'
print(len(insdc_ls), '>',insdc_pt.shape[0])

# List gymno
gymno_pt = pd.read_csv('NCBI/Gymnosperms/Gymnosperms_raw_SeqID.csv').rename(columns={'SeqID':'Sample_Name'})
gymno_pt['genus'] = gymno_pt.sci_name.str.split(' ').str[0]
gymno_pt['species'] = gymno_pt.sci_name.str.split(' ').str[1:].str.join(' ')
print('Root (Gymnosperms):',gymno_pt.shape[0],end=' < ')
gymno_ls = [file.replace('_pt.fasta','') for file in os.listdir('NCBI/Gymnosperms/') if file.endswith('.fasta')]
gymno_pt = gymno_pt[gymno_pt.Sample_Name.isin(gymno_ls)]
gymno_pt['DataSource']='Gymno'
print(len(gymno_ls), '>',gymno_pt.shape[0])

PAFTOL: 8125 < 7890 > 7530
SRA: 1298 < 1538 > 1226
GAP: 1455 < 740 > 740
1KP: 744 < 744 > 744
Li et al: 942 < 935 > 942
RefSeq: 5176 < 5176 > 5176
INSDC: 2916 < 2916 > 2916
Root (Gymnosperms): 10 < 11 > 10


In [9]:
col_keep=['Sample_Name','family','genus','species','sci_name','DataSource','length']
pd.concat([ paftol_pt[col_keep], sra_pt[col_keep], gap_pt[col_keep], okp_pt[col_keep], Lietal_pt[col_keep],
                    refseq_pt[col_keep], insdc_pt[col_keep], gymno_pt[col_keep]], ignore_index=True)\
    .to_csv(project + '/AllSamples_' + project + '.csv',index=False)

In [7]:
# Reduce public accessions to one by species, 3 per genus, with priority to Refseq
NCBI_pt = refseq_pt.groupby('sci_name').head(1).groupby('genus').head(3)
NCBI_genus = NCBI_pt.groupby('genus').genus.count().to_frame().rename(columns={'genus':'N'}).reset_index()
print(NCBI_genus.groupby('N').size().to_dict())
NCBI_pt = pd.concat([NCBI_pt,
        insdc_pt[(insdc_pt.Sample_Name.isin(NCBI_pt.Sample_Name)==False) & (insdc_pt.genus.isin(NCBI_genus[NCBI_genus.N<3].genus))] ])
NCBI_pt = NCBI_pt.groupby('sci_name').head(1).groupby('genus').head(3)
NCBI_genus = NCBI_pt.groupby('genus').genus.count().to_frame().rename(columns={'genus':'N'}).reset_index()
print(NCBI_genus.groupby('N').size().to_dict())

{1: 1313, 2: 288, 3: 422}
{1: 1133, 2: 311, 3: 579}


In [8]:
# Merge lists of samples
col_keep=['Sample_Name','genus','species','sci_name','DataSource','length']
all_pt = pd.concat([ paftol_pt[col_keep], sra_pt[col_keep], gap_pt[col_keep], okp_pt[col_keep], Lietal_pt[col_keep],
                    NCBI_pt[col_keep], gymno_pt[col_keep]], ignore_index=True)
print(all_pt.shape[0],'s:',all_pt.sci_name.nunique(),'g:',all_pt.genus.nunique())

14685 s: 12080 g: 7855


In [9]:
# Define root samples
root_1kp = pd.read_table('../Phylogenetic_Tree/Release_1.0/OneKP_root.txt',sep=' ',header=None).rename(columns={
    0:'order',1:'family',2:'genus',3:'species',4:'Sample_Name'})
root = pd.concat([root_1kp,gymno_pt]).drop(columns=['genus','species','sci_name','DataSource','length'])
root = pd.merge(root,all_pt,on='Sample_Name',how='inner')
print(root.shape[0], 'o:',root.order.nunique(),'f:',root.family.nunique(),'g:',root.genus.nunique(),'s:',root.sci_name.nunique())
root[:2]

20 o: 8 f: 9 g: 17 s: 20


Unnamed: 0,order,family,Sample_Name,TaxID,mol_type,genus,species,sci_name,DataSource,length
0,Cycadales,Cycadaceae,XZUY,,,Cycas,micholitzii,Cycas micholitzii,OneKP,36702.0
1,Ginkgoales,Ginkgoaceae,SGTW,,,Ginkgo,biloba,Ginkgo biloba,OneKP,36327.0


In [10]:
# Add class, order and genus info from WCVP
wcvp = pd.read_csv('../PAFTOL_DB/wcvp_genus.csv'); wcvp.columns = map(str.lower, wcvp.columns)
wcvp = wcvp.sort_values('nentries',ascending=False).groupby('genus').head(1).sort_values('genus')
all_pt = pd.merge(all_pt,wcvp,how='inner',on='genus')
print(all_pt.groupby('class').size().to_dict())
all_pt = pd.concat([ all_pt, root ])

{'Incertae Sedis': 23, 'Plantae': 6, 'angiosperms': 130, 'asterids': 839, 'campanulids': 1659, 'core eudicots': 23, 'eudicots': 377, 'fabids': 1597, 'lamiids': 1774, 'magnoliids': 384, 'malvids': 2265, 'monocots': 3069, 'rosids': 1206, 'superasterids': 987, 'superrosids': 197}


In [11]:
print('o:',all_pt.order.nunique(),'f:',all_pt.family.nunique(),'g:',all_pt.genus.nunique(),'s:',all_pt.sci_name.nunique())
print(all_pt.groupby('DataSource').size().to_dict())
print(all_pt.isna().sum().to_dict())
print(all_pt.shape[0],end=' > ')
all_pt = all_pt.groupby('Sample_Name').head(1) # Make sure IDs are unique
print(all_pt.shape[0],end=' > ')
# all_pt = all_pt[(all_pt.order.notnull()) & (all_pt.family.notnull()) & (all_pt.genus!='NILL')
#                & (all_pt.species.notnull()) & (all_pt.sci_name.str.contains(' sp.',regex=False)==False)]
all_pt = all_pt[(all_pt.order.notnull()) & (all_pt.family.notnull()) & (all_pt.genus!='NILL')
               & (all_pt.species.notnull())]
print(all_pt.shape[0])
print(all_pt.isna().sum().to_dict())
print(all_pt.groupby('DataSource').size().to_dict())
all_pt = all_pt.sort_values(['DataSource','order','family','genus']).reset_index(drop=True)
print('o:',all_pt.order.nunique(),'f:',all_pt.family.nunique(),'g:',all_pt.genus.nunique(),'s:',all_pt.sci_name.nunique())

o: 73 f: 425 g: 7762 s: 11958
{'GAP': 737, 'Gymno': 10, 'INSDC': 337, 'Lietal': 942, 'OneKP': 677, 'PAFTOL': 7484, 'REFSEQ': 3148, 'SRA': 1221}
{'Sample_Name': 0, 'genus': 0, 'species': 54, 'sci_name': 9, 'DataSource': 0, 'length': 0, 'order': 0, 'class': 20, 'family': 0, 'nentries': 20, 'TaxID': 14546, 'mol_type': 14546}
14556 > 14542 > 14488
{'Sample_Name': 0, 'genus': 0, 'species': 0, 'sci_name': 0, 'DataSource': 0, 'length': 0, 'order': 0, 'class': 20, 'family': 0, 'nentries': 20, 'TaxID': 14478, 'mol_type': 14478}
{'GAP': 737, 'Gymno': 10, 'INSDC': 329, 'Lietal': 904, 'OneKP': 677, 'PAFTOL': 7475, 'REFSEQ': 3141, 'SRA': 1215}
o: 73 f: 425 g: 7759 s: 11917


In [12]:
# Make list and labels
# all_pt['Label'] = all_pt.order + '_' + all_pt.family + '_' + all_pt.genus + '_' + all_pt.species.str.replace(' ','-') + '_' \
#      + (all_pt.length/1000).round(0).astype(int).astype('str') + 'kb_' + all_pt.Sample_Name.str.replace('_','')  + \
#     '_' + all_pt.DataSource.str[:6]
all_pt['Label'] = all_pt.order + '_' + all_pt.family + '_' + all_pt.genus + '_' + all_pt.species.str.replace(' ','-') + '_' \
     + all_pt.Sample_Name.str.replace('_','')
all_pt.Label=all_pt.Label.str.replace(' ','')#.str.replace('.','_')
print(all_pt.isna().sum().to_dict())
all_pt.to_csv(project + '/' + project + '.csv',index=False)
all_pt[['Sample_Name','Label']].to_csv(project + '/' + project + '_labels.txt',index=False,header=None,sep=' ')

{'Sample_Name': 0, 'genus': 0, 'species': 0, 'sci_name': 0, 'DataSource': 0, 'length': 0, 'order': 0, 'class': 20, 'family': 0, 'nentries': 20, 'TaxID': 14478, 'mol_type': 14478, 'Label': 0}


In [13]:
# wcvp = wcvp[wcvp['class']=='Magnoliopsida']
print(wcvp[wcvp.order.isin(all_pt.order)].order.nunique(),'/',wcvp.order.nunique(),'orders')
print(wcvp[wcvp.family.isin(all_pt.family)].family.nunique(),'/',wcvp.family.nunique(),'families')
print('missing families:',wcvp[wcvp.family.isin(all_pt.family)==False].family.unique())
print(wcvp[wcvp.genus.isin(all_pt.genus)].genus.nunique(),'/',wcvp.genus.nunique(),'genera')

65 / 65 orders
416 / 418 families
missing families: ['Euphroniaceae' 'Mitrastemonaceae']
7742 / 13543 genera


## Validation

In [6]:
import pandas as pd
import numpy as np
from ete3 import Tree, NodeStyle, TreeStyle, TextFace, AttrFace, faces
import warnings; warnings.filterwarnings('ignore')
import os

In [5]:
def Treetaxo(df):
    print('N:',df.shape[0],'o:',df.order.nunique(),'f:',df.family.nunique(),'g:',df.genus.nunique())

In [19]:
Release_df = pd.read_csv(project + '/' + project + '.csv').rename(columns={'Sample_Name':'Sample'})
print(Release_df.shape[0])
Release_df[:2]

14488


Unnamed: 0,Sample,genus,species,sci_name,DataSource,length,order,class,family,Label
0,GAP_026547,Albidella,oligococca,Albidella oligococca,GAP,59917,Alismatales,monocots,Alismataceae,Alismatales_Alismataceae_Albidella_oligococca_...
1,GAP_028289,Astonia,australiensis,Astonia australiensis,GAP,8956,Alismatales,monocots,Alismataceae,Alismatales_Alismataceae_Astonia_australiensis...


In [20]:
PV = pd.read_csv(project + '/' + project + '_0.2_iqtree_labelled_table.csv').reset_index().rename(columns={
    'index':'Tidx','label':'Label'})
# PV['Sample'] = PV['Sample'].str.replace('GAP','GAP_').str.replace('PAFTOL','PAFTOL_').str.replace('NC','NC_').str.replace('AC','AC_')
print(PV.shape[0])
print('Phylogenetic Validation:',PV.groupby('Phylogenetic_Validation').size().to_dict())
PV[:2]

8330
Phylogenetic Validation: {'Confirmed': 8162, 'Inconclusive': 56, 'Rejected': 112}


Unnamed: 0,Tidx,Label,order,family,genus,species,Sample,Phylogenetic_Validation
0,0,Apiales_Araliaceae_Motherwellia_haplosciadea_G...,Apiales,Araliaceae,Motherwellia,haplosciadea,GAP022353,Inconclusive
1,1,Apiales_Pittosporaceae_Auranticarpa_rhombifoli...,Apiales,Pittosporaceae,Auranticarpa,rhombifolia,GAP026913,Confirmed


In [21]:
BV = pd.read_excel(project + '/' + project + '_Barcode_Validation.xlsx',sheet_name='Validation_Results')
print(BV.shape[0])
print('Barcode Validation:',BV.groupby('Validation').size().to_dict())
BV[:2]

14478
Barcode Validation: {'Confirmed': 12599, 'Inconclusive': 1447, 'Rejected': 432}


Unnamed: 0,Sample,genus,species,sci_name,DataSource,length,order,class,family,Label,...,VATfam,VATgen,Vfam_pc,Vgen_pc,best matching family,N best family,best matching genus,N best genus,Validation,Validation_genus
0,GAP_026547,Albidella,oligococca,Albidella oligococca,GAP,59917,Alismatales,monocots,Alismataceae,Alismatales_Alismataceae_Albidella_oligococca_...,...,5,1,83.0,50.0,Butomaceae,1,Ranalisma,1.0,Confirmed,Confirmed
1,GAP_028289,Astonia,australiensis,Astonia australiensis,GAP,8956,Alismatales,monocots,Alismataceae,Alismatales_Alismataceae_Astonia_australiensis...,...,1,0,100.0,,,0,,,Confirmed,Inconclusive


In [22]:
Release_df = pd.merge(left = Release_df, right = PV[['Label','Phylogenetic_Validation','Tidx']].rename(columns={
    'Phylogenetic_Validation':'PhylogeneticValidation'}),
                     how = 'inner', on = 'Label')
Release_df = pd.merge(left = Release_df, right = BV[['Sample','Validation']].rename(columns={'Validation':'BarcodeValidation'}),
                     how = 'left', on = 'Sample').sort_values('Tidx').reset_index(drop=True)
print(Release_df.shape[0])

8329


In [23]:
print(Release_df.isna().sum().to_dict())
print(PV[PV.Label.isin(Release_df.Label)==False])

{'Sample': 0, 'genus': 0, 'species': 0, 'sci_name': 0, 'DataSource': 0, 'length': 0, 'order': 0, 'class': 20, 'family': 0, 'Label': 0, 'PhylogeneticValidation': 0, 'Tidx': 0, 'BarcodeValidation': 10}
      Tidx                                         Label       order  \
3071  3071  Sapindales_Rutaceae_Citrus_Ã—-aurantium_UHJR  Sapindales   

        family   genus       species Sample Phylogenetic_Validation  
3071  Rutaceae  Citrus  Ã—-aurantium   UHJR               Confirmed  


In [24]:
Release_df['Decision']=np.nan
Release_df.loc[(Release_df.PhylogeneticValidation=='Confirmed') & (Release_df.BarcodeValidation=='Confirmed'), 
           'Decision']='Include'
Release_df.loc[(Release_df.PhylogeneticValidation=='Confirmed') & (Release_df.BarcodeValidation=='Inconclusive'), 
           'Decision']='Include'
Release_df.loc[(Release_df.PhylogeneticValidation=='Confirmed') & (Release_df.BarcodeValidation=='Rejected'), 
           'Decision']='Review'

Release_df.loc[(Release_df.PhylogeneticValidation=='Inconclusive') & (Release_df.BarcodeValidation=='Confirmed'), 
           'Decision']='Review'
Release_df.loc[(Release_df.PhylogeneticValidation=='Inconclusive') & (Release_df.BarcodeValidation=='Inconclusive'), 
           'Decision']='Review'
Release_df.loc[(Release_df.PhylogeneticValidation=='Inconclusive') & (Release_df.BarcodeValidation=='Rejected'), 
           'Decision']='Exclude'

Release_df.loc[(Release_df.PhylogeneticValidation=='Rejected') & (Release_df.BarcodeValidation=='Confirmed'), 
           'Decision']='Review'
Release_df.loc[(Release_df.PhylogeneticValidation=='Rejected') & (Release_df.BarcodeValidation=='Inconclusive'), 
           'Decision']='Exclude'
Release_df.loc[(Release_df.PhylogeneticValidation=='Rejected') & (Release_df.BarcodeValidation=='Rejected'), 
           'Decision']='Exclude'
# barcode validation against phylogenetic validation
Release_df.groupby(['PhylogeneticValidation','BarcodeValidation']).size().to_frame().reset_index().rename(columns={0:'Count'})\
    .pivot(index='PhylogeneticValidation', columns='BarcodeValidation', values='Count').to_clipboard()
Release_df.groupby(['PhylogeneticValidation','BarcodeValidation']).size().to_frame().reset_index().rename(columns={0:'Count'})\
    .pivot(index='PhylogeneticValidation', columns='BarcodeValidation', values='Count')

BarcodeValidation,Confirmed,Inconclusive,Rejected
PhylogeneticValidation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Confirmed,8026,89,37
Inconclusive,50,2,3
Rejected,16,1,95


In [25]:
Treetaxo(Release_df)
print(Release_df.groupby('DataSource').size().to_dict())
print(Release_df.groupby('Decision').size().to_dict())
Release_df[:2]

N: 8329 o: 72 f: 396 g: 4406
{'GAP': 163, 'Gymno': 10, 'INSDC': 326, 'Lietal': 902, 'OneKP': 666, 'PAFTOL': 2507, 'REFSEQ': 3118, 'SRA': 637}
{'Exclude': 99, 'Include': 8115, 'Review': 105}


Unnamed: 0,Sample,genus,species,sci_name,DataSource,length,order,class,family,Label,PhylogeneticValidation,Tidx,BarcodeValidation,Decision
0,GAP_022353,Motherwellia,haplosciadea,Motherwellia haplosciadea,GAP,120594,Apiales,campanulids,Araliaceae,Apiales_Araliaceae_Motherwellia_haplosciadea_G...,Inconclusive,0,Confirmed,Review
1,GAP_026913,Auranticarpa,rhombifolia,Auranticarpa rhombifolia,GAP,79637,Apiales,campanulids,Pittosporaceae,Apiales_Pittosporaceae_Auranticarpa_rhombifoli...,Confirmed,1,Confirmed,Include


In [26]:
root_orders = ['Pinales','Gnetales','Cupressales','Ginkgoales','Ephedrales','Cupressales','Araucariales','Cycadales','Taxales']
Treetaxo(Release_df[Release_df.order.isin(root_orders)==False])

N: 8309 o: 64 f: 387 g: 4389


In [27]:
print('% of Exclusion or Review by source')
round(Release_df[Release_df.Decision!='Include'].groupby('DataSource').size()/Release_df.groupby('DataSource').size()*100,1)

% of Exclusion or Review by source


DataSource
GAP         3.1
Gymno     100.0
INSDC       0.6
Lietal      1.7
OneKP       2.1
PAFTOL      4.5
REFSEQ      1.3
SRA         2.2
dtype: float64

In [29]:
# Output file for review process
review_columns=['Sample', 'Label', 'DataSource', 'order', 'family', 'genus', 'species','length',
                 'PhylogeneticValidation', 'BarcodeValidation','Decision']
review_df = pd.concat( [Release_df[review_columns],
                    pd.DataFrame(columns=['DecisionReview','DecisionReason','ValidationComments']) ] )
review_df.to_excel(project + '/' + project + '_Review.xlsx',index=False)

In [None]:
## Tree for review
# https://www.biostars.org/p/172568/
from ete3 import Tree, NodeStyle, TreeStyle, TextFace, AttrFace, faces

tree_file = project + '/' + project + '_0.2_iqtree_labelled_rooted.nwk'
T=Tree(tree_file)
#Convert to ultrametric
most_distant_leaf, tree_length = T.get_farthest_leaf()
current_dist = 0
for postorder, node in T.iter_prepostorder():
    if postorder:
        current_dist -= node.dist
    else:
        if node.is_leaf():
            node.dist += tree_length - (current_dist + node.dist)
        elif node.up: # node is internal
            current_dist += node.dist
            
ts = TreeStyle()
# # ts.show_branch_length = False
# # ts.scale = 2
# # ts.mode = "r"
# ts.show_branch_support = True
# ts.layout_fn = layout
ts.show_leaf_name = True
            
for n in T.iter_leaves():
#     print(n.name, end=',')
    nstyle = NodeStyle()
    print(n.name, end=',')
    if n.name == '':
        nstyle["size"] = 0
        n.set_style(nstyle)
    else:
        tmp=Release_df[Release_df.Label==n.name]
#         print(tmp)

        ## FIX point style should be based on Barcode Validation
        if tmp.shape[0]>0:
            if (tmp.Decision.isin(['Include']).bool()==True):
                nstyle["fgcolor"] = "green"
                nstyle["size"] = 10
            elif (tmp.Decision.isin(['Exclude']).bool()==True):
                nstyle["fgcolor"] = "red"
                nstyle["size"] = 10
            elif (tmp.Decision.isin(['Review']).bool()==True):
                nstyle["fgcolor"] = "black"
                nstyle["size"] = 10
            else:
                nstyle["fgcolor"] = "blue"
                nstyle["size"] = 8

            if (tmp.BarcodeValidation.isin(['Confirmed']).bool()==True):
                nstyle["shape"] = "circle"
            elif (tmp.BarcodeValidation.isin(['Rejected']).bool()==True):
                nstyle["shape"] = "circle"
            elif (tmp.BarcodeValidation.isin(['Inconclusive']).bool()==True):
                nstyle["shape"] = "square"
            
            n.set_style(nstyle)
        else:
            nstyle["size"] = 0
            n.set_style(nstyle)
T.render(project + '/' + project + "_pretree.svg", w=150, units="mm",dpi=150,tree_style=ts);

## Final tree

In [9]:
# Read review file
Review = pd.read_excel(project + '/' + project + '_pretree_Review.xlsx')
print(Review.shape[0])
Treetaxo(Review)
Review[:2]

8329
N: 8329 o: 72 f: 396 g: 4406


Unnamed: 0,Sample,Label,DataSource,order,family,genus,species,length,PhylogeneticValidation,BarcodeValidation,Decision,DecisionReview,DecisionReason,ValidationComments
0,PAFTOL_008669,Commelinales_Commelinaceae_Floscopa_glomerata_...,PAFTOL,Commelinales,Commelinaceae,Floscopa,glomerata,40762,Confirmed,Rejected,Review,Include,,
1,GAP_026913,Apiales_Pittosporaceae_Auranticarpa_rhombifoli...,GAP,Apiales,Pittosporaceae,Auranticarpa,rhombifolia,79637,Confirmed,Confirmed,Include,,,


In [17]:
Review = Review[(Review.Decision=='Include') | (Review.DecisionReview=='Include')]
Treetaxo(Review)
root_orders = ['Pinales','Gnetales','Cupressales','Ginkgoales','Ephedrales','Cupressales','Araucariales','Cycadales','Taxales']
Treetaxo(Review[Review.order.isin(root_orders)==False])
print(Review.isna().sum().to_dict())
print(round(Review.groupby('DataSource').size()/Review.shape[0]*100,1))

N: 8222 o: 72 f: 396 g: 4346
N: 8202 o: 64 f: 387 g: 4329
{'Sample': 0, 'Label': 0, 'DataSource': 0, 'order': 0, 'family': 0, 'genus': 0, 'species': 0, 'length': 0, 'PhylogeneticValidation': 0, 'BarcodeValidation': 10, 'Decision': 10, 'DecisionReview': 8115, 'DecisionReason': 8222, 'ValidationComments': 8162}
DataSource
GAP        2.0
Gymno      0.1
INSDC      3.9
Lietal    10.9
OneKP      8.1
PAFTOL    29.5
REFSEQ    37.8
SRA        7.7
dtype: float64


In [14]:
Review.to_csv(project + '/' + project + '_final.csv',index=False)
Review[['Sample','Label']].to_csv(project + '/' + project + '_final_labels.txt',index=False,header=None,sep=' ')