# Phylogeny of *Muscari* using genomic ddRAD data

In [1]:
## conda install ipyrad -c ipyrad
## conda install toytree -c eaton-lab
## conda install sra-tools -c bioconda
## conda install entrez-direct -c bioconda

In [1]:
## import
import ipyrad as ip
import ipyrad.analysis as ipa
import ipyparallel as ipp
import pandas as pd
import toytree
import toyplot

## print Version of ipyrad und toytree
print("ipyrad v. {}".format(ip.__version__))
print("toytree v. {}".format(toytree.__version__))

## print Version of Python
from platform import python_version
print("Python v.", python_version())

ipyrad v. 0.9.64
toytree v. 2.0.5
Python v. 3.7.9


#### Parallel processes on independent Python kernels
To start a parallel client you must run the command-line program 'ipcluster'. This will essentially start a number of independent Python processes (kernels) which we can then send bits of work to do. The cluster can be stopped and restarted independently of this notebook, which is convenient for working on a cluster where connecting to many cores is not always immediately available.

Open a terminal and type the following command to start an ipcluster instance with N engines.

In [3]:
## ipcluster start --n=16

In [2]:
## connect to cluster
ipyclient = ipp.Client()
print(ip.cluster_info(ipyclient))

Parallel connection | Cryptantha: 48 cores
None


## Data Assembly
### Create an Assembly object and modify *ipyrad* params file
This object stores the parameters of the assembly and the organization of the data

In [261]:
## Provide a name for the assembly
data = ip.Assembly("Muscari")

New Assembly: Muscari


In [262]:
## set parameters
data.set_params("project_dir", "Mus_Assembly")
data.set_params("sorted_fastq_path", "./Mus_fastq/*.fastq.gz")
data.set_params("clust_threshold", "0.85")
data.set_params("max_Hs_consens", (0.05))
data.set_params("restriction_overhang", ('TGCAG', 'GGCC'))
data.set_params("output_formats", "*")
data.set_params("datatype", "ddrad")

## see / print all parameters
data.get_params()

0   assembly_name               Muscari                                      
1   project_dir                 ./Mus_Assembly                               
2   raw_fastq_path                                                           
3   barcodes_path                                                            
4   sorted_fastq_path           ./Mus_fastq/*.fastq.gz                       
5   assembly_method             denovo                                       
6   reference_sequence                                                       
7   datatype                    ddrad                                        
8   restriction_overhang        ('TGCAG', 'GGCC')                            
9   max_low_qual_bases          5                                            
10  phred_Qscore_offset         33                                           
11  mindepth_statistical        6                                            
12  mindepth_majrule            6                               

### Assemble the data from step 1 to 6

In [263]:
## run step 1 to 6 of the assembly
data.run("123456", force = True)

Parallel connection | Cryptantha: 48 cores
[####################] 100% 0:00:02 | loading reads        | s1 |
[####################] 100% 0:00:24 | processing reads     | s2 |
[####################] 100% 0:00:05 | dereplicating        | s3 |
[####################] 100% 0:06:55 | clustering/mapping   | s3 |
[####################] 100% 0:00:01 | building clusters    | s3 |
[####################] 100% 0:00:00 | chunking clusters    | s3 |
[####################] 100% 0:04:08 | aligning clusters    | s3 |
[####################] 100% 0:00:16 | concat clusters      | s3 |
[####################] 100% 0:00:01 | calc cluster stats   | s3 |
[####################] 100% 0:00:14 | inferring [H, E]     | s4 |
[####################] 100% 0:00:01 | calculating depths   | s5 |
[####################] 100% 0:00:01 | chunking clusters    | s5 |
[####################] 100% 0:02:00 | consens calling      | s5 |
[####################] 100% 0:00:02 | indexing alleles     | s5 |
[####################] 100% 0:00:

In [264]:
#data.stats.sort_values(by=['reads_consens'])
data.stats

Unnamed: 0,state,reads_raw,reads_passed_filter,clusters_total,clusters_hidepth,hetero_est,error_est,reads_consens
Bellevalia_dubia_W6083,6,1030736,1029284,95294,20981,0.013,0.003,19338
Bellevalia_paradoxa_ED1272,6,1636142,1634727,108498,30267,0.013,0.003,27749
Bellevalia_speciosa_W6085,6,1416391,1414294,95536,25347,0.016,0.003,22957
Brimeura_amethystina_W6084,6,1554459,1551802,424844,28296,0.034,0.005,20711
Leopoldia_caucasica_ED1262,6,1462581,1461153,77305,22469,0.013,0.003,20653
Leopoldia_comosa_ED1256,6,1299389,1298312,90402,25831,0.015,0.003,23363
Leopoldia_comosa_ED1274,6,1464810,1463759,90898,24322,0.015,0.002,22075
Leopoldia_comosa_ED3539,6,2065757,2064748,368808,46895,0.013,0.003,42303
Leopoldia_comosa_ED3965,6,1232250,1231244,94455,25479,0.013,0.003,23472
Leopoldia_cycladica_W6082,6,1664161,1661171,152200,36680,0.025,0.003,30683


In [10]:
## show assemby stats until step 6
#data.stats
data.stats.sort_values(by = ['reads_consens'])

Unnamed: 0,state,reads_raw,reads_passed_filter,clusters_total,clusters_hidepth,hetero_est,error_est,reads_consens
Muscari_anatolicum_W6087,6,1224126,1221220,56984,18627,0.015,0.003,17017
Muscari_adilii_W6090,6,1234147,1231484,117636,19475,0.018,0.003,17235
Muscari_pulchellum_ED3231,6,1007408,1006385,69798,19257,0.013,0.003,17715
Muscari_armeniacum_W6089,6,1436401,1433387,83343,19937,0.018,0.003,17821
Muscari_mirum_ED1250,6,1147548,1146637,80280,20292,0.013,0.002,18748
Bellevalia_dubia_W6083,6,1030736,1029284,95294,20981,0.013,0.003,19338
Pseudomuscari_coeruleum_ED1261,6,1533268,1532125,93250,21469,0.012,0.002,19834
Leopoldia_tenuiflora_ED1263,6,983193,982239,73834,21497,0.009,0.003,20148
Leopoldia_caucasica_ED1262,6,1462581,1461153,77305,22469,0.013,0.003,20653
Leopoldia_neumannii_ED1243,6,1022622,1022000,74881,21887,0.008,0.003,20671


## Final assembly with different `min_samples_locus` settings for different analysis

1. Phylogenetic analysis 
    - RAxML
    - MrBayes
    - tetRAD
2. Population analysis of selected clades
    - PCA
    - STRUCTURE
    - TreeMix
3. Test for introgression using abba-baba test
    - ...
    
#### In case comming back to continue from here, load assembly object to continue after step 6

In [2]:
## load assembly object when comming back
data = ip.load_json("./Mus_Assembly/Muscari.json")

## check again the stat-file sorted by number of consensus reads
#data.stats.sort_values(by=['reads_consens'])

## check name
#data.stats

loading Assembly: Muscari
from saved path: ~/GBS/Muscari/Mus_Assembly/Muscari.json


## 1. Assembly for Phylogenetic analysis
#### *But first lets exclude samples with low read number (< 1000 reads after step 6), which are outsite the target group or with odd placements in preliminary analysis:*

**Samples outsite the target group are:**
- ...

In [None]:
## exclude samples from assembly with ...
keep_list = [i for i in data.samples.keys() if i not in [
    ## ... low read number (< 5000 )
    #"", "",
    
    ## ... other samples to exclude
    "", "", "",
]]

## make a new data branch from the keep_list
data = data.branch("data", subsamples = keep_list, force = True)

## double check taxon sampling
#data.stats.sort_values(by=['reads_consens'])
data.stats

In [None]:
################################################################
#############    TEMPLATE :::: do not run    ###################
################################################################

## ::: Template for step 7 assembly with in- and outgroup ::: ##
## create a branch for outputs with min_samples = x
min4 = data.branch("min_4")
min4.set_params("min_samples_locus", 4)
min4.run("7")

## ::: Template for step 7 assembly with in- and outgroup ::: ##
## create a branch for outputs with min_samples = x BUT only for ingroup
pops = data.branch("pops")
pops.population = {
    "ingroup": (20, [i for i in pops.samples if "Frai" in i]),
    "outgroup": (0, [i for i in pops.samples if "Frai" not in i])
}
pops.run("7", force = True)

################################################################
#############    TEMPLATE :::: do not run    ###################
################################################################

In [265]:
## ::::::: WORK IN PROGRESS
## WRITE THE RESULTS OF THE PERCENTAGE LOOP INTO A DICTIONARY 
## WHICH THEN CAN BE USED IN THE FOLLOWING STEPS
## INSTEAD OF MAKING THE DICTIONARY BY HAND 


## first check number of remaining samples
ingroup = data.stats.state.count() - 4
print("Number of ingroup taxa:", ingroup)
print("Calculate different sets of missing data:")

## for loop to calculate different values for min_sample_locus
percent = [10, 15, 20, 25, 30, 35, 40]
for i in percent:
    res = ingroup / 100 * i
    print(i,"% = ", round(res))

Number of ingroup taxa: 39
Calculate different sets of missing data:
10 % =  4
15 % =  6
20 % =  8
25 % =  10
30 % =  12
35 % =  14
40 % =  16


In [266]:
## Run the final assembly step 7 through for loop with different min_sample_locus
## based on estimated number of remaining samples MINUS outgroup

## make a dictionary with the percentage of missing data as keys and 
## the actual min_sample_locus specified as values based on the number of "ingroup samples"
sample_dict = {10: 4,
               15: 6,
               20: 8,
               25: 10,
               30: 12,
               35: 14,
               40: 16}

## define list with ingroup wildcards
#ingroup = ["Mus", "Pseu", "Leop"]

## loop over the dictionary 
for key, value in sample_dict.items():
    newname = "pops_{}".format(key)
    newdata = data.branch(newname)
    newdata.populations = {
        "ingroup":  (value, [i for i in newdata.samples if "B" not in i]),
        "outgroup": (0,     [i for i in newdata.samples if "B" in i]),
         }
    
    newdata.run("7", force = True)

Parallel connection | Cryptantha: 48 cores
[####################] 100% 0:00:02 | applying filters     | s7 |
[####################] 100% 0:00:17 | building arrays      | s7 |
[####################] 100% 0:00:07 | writing conversions  | s7 |
[####################] 100% 0:00:17 | indexing vcf depths  | s7 |
[####################] 100% 0:00:48 | writing vcf output   | s7 |
Parallel connection | Cryptantha: 48 cores
[####################] 100% 0:00:03 | applying filters     | s7 |
[####################] 100% 0:00:11 | building arrays      | s7 |
[####################] 100% 0:00:04 | writing conversions  | s7 |
[####################] 100% 0:00:07 | indexing vcf depths  | s7 |
[####################] 100% 0:00:29 | writing vcf output   | s7 |
Parallel connection | Cryptantha: 48 cores
[####################] 100% 0:00:03 | applying filters     | s7 |
[####################] 100% 0:00:07 | building arrays      | s7 |
[####################] 100% 0:00:03 | writing conversions  | s7 |
[############

In [None]:
## Does the same as above but without ingroup and outgroup
sample_dict = {10: 4,
               15: 6,
               20: 8,
               25: 10,
               30: 11,
               35: 13,
               40: 15}

## loop over the dictionary 
for key, value in sample_dict.items():
    newname = "min_{}".format(key)
    newdata = data.branch(newname)
    newdata.set_params("min_samples_locus", value)
    newdata.run("7", force = True)

### Phylogenetic downstream analysis
First, check if you need to install additional packages which are not included in the ipyrad package dependencies. Use the following commands to install the packages in the terminal.

In [None]:
## following programs are required
# conda install toytree -c eaton-lab
# conda install tetrad -c eaton-lab -c conda-forge
# conda install raxml -c bioconda

#### RAxML

In [None]:
## create a raxml analysis object for the Backbone tree
rax = ipa.raxml(
    name = Cris_pops30.name,
    data = Cris_pops30.outfiles.phy,
    workdir = "./Mus_Analysis/Mus_RAxML",
    T = 16,
    N = 200,
    o = "Bellevallia_pycantha_ED1272",
    )

In [None]:
## Plot the resulting tree

tre = toytree.tree("./Mus_Analysis/Mus_IQtree/pops_30.phy.contree")
rtre = tre.root(wildcard = "Belle")
#rtre.draw(tip_labels_align=True, node_labels="support")

# use canvas and axes function in order use export function
canvas, axes, mark = rtre.ladderize(1).draw(
    width = 1400,
    height = 900,
    use_edge_length = False,
    tip_labels_align = True,
    node_labels='support',
    node_sizes=0,
    node_labels_style={"font-size": "16px",
                       "baseline-shift": "7px",
                       "-toyplot-anchor-shift": "-13px"},
    );

In [60]:
## Plot all three RAxML trees together

## Load trees
tre15 = toytree.tree("/home/tim/GBS/Muscari/Mus_Analysis/Mus_RAxML/Mus_RAxML_20210802/RAxML_bipartitions.pops_15.phy")
tre20 = toytree.tree("/home/tim/GBS/Muscari/Mus_Analysis/Mus_RAxML/Mus_RAxML_20210802/RAxML_bipartitions.pops_20.phy")
tre25 = toytree.tree("/home/tim/GBS/Muscari/Mus_Analysis/Mus_RAxML/Mus_RAxML_20210802/RAxML_bipartitions.pops_25.phy")
tre30 = toytree.tree("/home/tim/GBS/Muscari/Mus_Analysis/Mus_RAxML/Mus_RAxML_20210802/RAxML_bipartitions.pops_30.phy")
tre35 = toytree.tree("/home/tim/GBS/Muscari/Mus_Analysis/Mus_RAxML/Mus_RAxML_20210802/RAxML_bipartitions.pops_35.phy")
tre40 = toytree.tree("/home/tim/GBS/Muscari/Mus_Analysis/Mus_RAxML/Mus_RAxML_20210802/RAxML_bipartitions.pops_40.phy")

tre15 = tre15.root(wildcard = "Brimeura")
tre20 = tre20.root(wildcard = "Brimeura")
tre25 = tre25.root(wildcard = "Brimeura")
tre30 = tre30.root(wildcard = "Brimeura")
tre35 = tre35.root(wildcard = "Brimeura")
tre40 = tre40.root(wildcard = "Brimeura")


## set dimensions of the canvas
canvas = toyplot.Canvas(width = 2000, height = 2000)

## dissect canvas into multiple cartesian areas (x1, x2, y1, y2)
ax0 = canvas.cartesian(bounds=('2%',  '30%', '5%',  '47.5%'))
ax1 = canvas.cartesian(bounds=('33%', '63%', '5%',  '47.5%'))
ax2 = canvas.cartesian(bounds=('66%', '96%', '5%',  '47.5%'))
ax3 = canvas.cartesian(bounds=('2%',  '30%', '50%', '97.5%'))
ax4 = canvas.cartesian(bounds=('33%', '63%', '50%', '97.5%'))
ax5 = canvas.cartesian(bounds=('66%', '96%', '50%', '97.5%'))

# call draw with the 'axes' argument to pass it to a specific cartesian area
style = {
    "tip_labels_align": True,
    "tip_labels_style": {"font-size": "11px"},
    "node_labels_style":{"font-size": "12px",
                        "baseline-shift": "7px",
                        "-toyplot-anchor-shift": "-13px"},
}
tre15.ladderize(1).draw(
    axes = ax0,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre20.ladderize(1).draw(
    axes = ax1,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre25.ladderize(1).draw(
    axes = ax2,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre30.ladderize(1).draw(
    axes = ax3,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre35.ladderize(1).draw(
    axes = ax4,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tre40.ladderize(1).draw(
    axes = ax5,
    **style,
    node_sizes = 0,
    node_labels = 'support');

# hide the axes (e.g, ticks and splines)
ax0.show = False; ax1.show = False; ax2.show = False;
ax3.show = False; ax4.show = False; ax5.show = False;

In [61]:
import toyplot.pdf
toyplot.pdf.render(canvas, "/home/tim/GBS/Muscari/Mus_Analysis/Mus_RAxML/RAxML_Figures/Muscari_20210802_15-20-25-30-35-40_NoRoot.pdf");

#### tetRAD
##### run a single tetRAD analysis

In [36]:
# the path to your sequence data in HDF5 format
data = "/home/tim/GBS/Muscari/Mus_Assembly/pops_15_outfiles/pops_15.snps.hdf5"

In [12]:
# init analysis object with input data and (optional) parameter options
tet = ipa.tetrad(
    name = "Mus_pops_15",
    data = data,
    workdir = "./Mus_Analysis/Mus_tetRAD",
    nquartets = 1e6,
    nboots = 200,
)

loading snps array [44 taxa x 114197 snps]
max unlinked SNPs per quartet [nloci]: 14705
quartet sampler [full]: 135751 / 135751


In [13]:
tet.run(auto = True, force = True)

Parallel connection | Cryptantha: 64 cores
initializing quartet sets database
[####################] 100% 0:00:07 | full tree * | avg SNPs/qrt: 1014 
[####################] 100% 0:00:04 | boot rep. 1 | avg SNPs/qrt: 1017 
Keyboard Interrupt by user



###### run multiple retRAD analysis in a for loop

In [3]:
dict = {
    "pop15": "/home/tim/GBS/Muscari/Mus_Assembly/pops_15_outfiles/pops_15.snps.hdf5",
    "pop20": "/home/tim/GBS/Muscari/Mus_Assembly/pops_20_outfiles/pops_20.snps.hdf5",
    "pop25": "/home/tim/GBS/Muscari/Mus_Assembly/pops_25_outfiles/pops_25.snps.hdf5",
    "pop30": "/home/tim/GBS/Muscari/Mus_Assembly/pops_30_outfiles/pops_30.snps.hdf5",
    "pop35": "/home/tim/GBS/Muscari/Mus_Assembly/pops_35_outfiles/pops_35.snps.hdf5",
    "pop40": "/home/tim/GBS/Muscari/Mus_Assembly/pops_40_outfiles/pops_40.snps.hdf5"
}

In [4]:
#test = [pops15, pops20, pops25, pops30, pops35, pops40]

for key, value in dict.items():
    tet = ipa.tetrad(
        name = "Mus_tet_" + str(key),
        data = value,
        workdir = "./Mus_Analysis/Mus_tetRAD",
        nquartets = 1e6,
        nboots = 200)
    ## run 
    tet.run(auto = True, force = True)

loading snps array [43 taxa x 110524 snps]
max unlinked SNPs per quartet [nloci]: 14156
quartet sampler [full]: 123410 / 123410
Parallel connection | Cryptantha: 64 cores
initializing quartet sets database
[####################] 100% 0:00:07 | full tree * | avg SNPs/qrt: 1057 
[####################] 100% 0:00:03 | boot rep. 1 | avg SNPs/qrt: 1035 
[####################] 100% 0:00:04 | boot rep. 2 | avg SNPs/qrt: 1043 
[####################] 100% 0:00:03 | boot rep. 3 | avg SNPs/qrt: 1074 
[####################] 100% 0:00:03 | boot rep. 4 | avg SNPs/qrt: 1103 
[####################] 100% 0:00:04 | boot rep. 5 | avg SNPs/qrt: 1063 
[####################] 100% 0:00:04 | boot rep. 6 | avg SNPs/qrt: 1031 
[####################] 100% 0:00:03 | boot rep. 7 | avg SNPs/qrt: 1054 
[####################] 100% 0:00:04 | boot rep. 8 | avg SNPs/qrt: 1029 
[####################] 100% 0:00:04 | boot rep. 9 | avg SNPs/qrt: 959 
[####################] 100% 0:00:04 | boot rep. 10 | avg SNPs/qrt: 1000 
[#

[####################] 100% 0:00:02 | boot rep. 16 | avg SNPs/qrt: 1028 
[####################] 100% 0:00:02 | boot rep. 17 | avg SNPs/qrt: 1039 
[####################] 100% 0:00:02 | boot rep. 18 | avg SNPs/qrt: 994 
[####################] 100% 0:00:02 | boot rep. 19 | avg SNPs/qrt: 1037 
[####################] 100% 0:00:02 | boot rep. 20 | avg SNPs/qrt: 1038 
[####################] 100% 0:00:02 | boot rep. 21 | avg SNPs/qrt: 1045 
[####################] 100% 0:00:02 | boot rep. 22 | avg SNPs/qrt: 1080 
[####################] 100% 0:00:02 | boot rep. 23 | avg SNPs/qrt: 1036 
[####################] 100% 0:00:02 | boot rep. 24 | avg SNPs/qrt: 1064 
[####################] 100% 0:00:02 | boot rep. 25 | avg SNPs/qrt: 993 
[####################] 100% 0:00:02 | boot rep. 26 | avg SNPs/qrt: 1038 
[####################] 100% 0:00:02 | boot rep. 27 | avg SNPs/qrt: 1015 
[####################] 100% 0:00:02 | boot rep. 28 | avg SNPs/qrt: 1033 
[####################] 100% 0:00:02 | boot rep. 29 | 

[####################] 100% 0:00:02 | boot rep. 34 | avg SNPs/qrt: 997 
[####################] 100% 0:00:02 | boot rep. 35 | avg SNPs/qrt: 1007 
[####################] 100% 0:00:02 | boot rep. 36 | avg SNPs/qrt: 1040 
[####################] 100% 0:00:02 | boot rep. 37 | avg SNPs/qrt: 1034 
[####################] 100% 0:00:02 | boot rep. 38 | avg SNPs/qrt: 1019 
[####################] 100% 0:00:02 | boot rep. 39 | avg SNPs/qrt: 1024 
[####################] 100% 0:00:02 | boot rep. 40 | avg SNPs/qrt: 1071 
[####################] 100% 0:00:02 | boot rep. 41 | avg SNPs/qrt: 1038 
[####################] 100% 0:00:02 | boot rep. 42 | avg SNPs/qrt: 1014 
[####################] 100% 0:00:02 | boot rep. 43 | avg SNPs/qrt: 1059 
[####################] 100% 0:00:02 | boot rep. 44 | avg SNPs/qrt: 1042 
[####################] 100% 0:00:02 | boot rep. 45 | avg SNPs/qrt: 1044 
[####################] 100% 0:00:02 | boot rep. 46 | avg SNPs/qrt: 984 
[####################] 100% 0:00:02 | boot rep. 47 | 

[####################] 100% 0:00:02 | boot rep. 53 | avg SNPs/qrt: 999 
[####################] 100% 0:00:02 | boot rep. 54 | avg SNPs/qrt: 1034 
[####################] 100% 0:00:02 | boot rep. 55 | avg SNPs/qrt: 1028 
[####################] 100% 0:00:02 | boot rep. 56 | avg SNPs/qrt: 1014 
[####################] 100% 0:00:02 | boot rep. 57 | avg SNPs/qrt: 1043 
[####################] 100% 0:00:02 | boot rep. 58 | avg SNPs/qrt: 1032 
[####################] 100% 0:00:02 | boot rep. 59 | avg SNPs/qrt: 1052 
[####################] 100% 0:00:02 | boot rep. 60 | avg SNPs/qrt: 1085 
[####################] 100% 0:00:02 | boot rep. 61 | avg SNPs/qrt: 1016 
[####################] 100% 0:00:02 | boot rep. 62 | avg SNPs/qrt: 973 
[####################] 100% 0:00:02 | boot rep. 63 | avg SNPs/qrt: 1025 
[####################] 100% 0:00:02 | boot rep. 64 | avg SNPs/qrt: 1006 
[####################] 100% 0:00:01 | boot rep. 65 | avg SNPs/qrt: 996 
[####################] 100% 0:00:02 | boot rep. 66 | a

[####################] 100% 0:00:02 | boot rep. 72 | avg SNPs/qrt: 997 
[####################] 100% 0:00:02 | boot rep. 73 | avg SNPs/qrt: 1020 
[####################] 100% 0:00:02 | boot rep. 74 | avg SNPs/qrt: 1040 
[####################] 100% 0:00:02 | boot rep. 75 | avg SNPs/qrt: 954 
[####################] 100% 0:00:02 | boot rep. 76 | avg SNPs/qrt: 939 
[####################] 100% 0:00:02 | boot rep. 77 | avg SNPs/qrt: 997 
[####################] 100% 0:00:02 | boot rep. 78 | avg SNPs/qrt: 981 
[####################] 100% 0:00:02 | boot rep. 79 | avg SNPs/qrt: 996 
[####################] 100% 0:00:02 | boot rep. 80 | avg SNPs/qrt: 1007 
[####################] 100% 0:00:02 | boot rep. 81 | avg SNPs/qrt: 1014 
[####################] 100% 0:00:02 | boot rep. 82 | avg SNPs/qrt: 1006 
[####################] 100% 0:00:02 | boot rep. 83 | avg SNPs/qrt: 980 
[####################] 100% 0:00:02 | boot rep. 84 | avg SNPs/qrt: 1020 
[####################] 100% 0:00:02 | boot rep. 85 | avg S

[####################] 100% 0:00:02 | boot rep. 92 | avg SNPs/qrt: 989 
[####################] 100% 0:00:02 | boot rep. 93 | avg SNPs/qrt: 965 
[####################] 100% 0:00:02 | boot rep. 94 | avg SNPs/qrt: 980 
[####################] 100% 0:00:02 | boot rep. 95 | avg SNPs/qrt: 1010 
[####################] 100% 0:00:02 | boot rep. 96 | avg SNPs/qrt: 1012 
[####################] 100% 0:00:02 | boot rep. 97 | avg SNPs/qrt: 1022 
[####################] 100% 0:00:02 | boot rep. 98 | avg SNPs/qrt: 969 
[####################] 100% 0:00:02 | boot rep. 99 | avg SNPs/qrt: 968 
[####################] 100% 0:00:02 | boot rep. 100 | avg SNPs/qrt: 1003 
[####################] 100% 0:00:02 | boot rep. 101 | avg SNPs/qrt: 943 
[####################] 100% 0:00:02 | boot rep. 102 | avg SNPs/qrt: 1018 
[####################] 100% 0:00:02 | boot rep. 103 | avg SNPs/qrt: 966 
[####################] 100% 0:00:01 | boot rep. 104 | avg SNPs/qrt: 980 
[####################] 100% 0:00:02 | boot rep. 105 | 

In [5]:
## Plot all six tetRAD coalescent trees together

## Load trees
tet15 = toytree.tree("./Mus_Analysis/Mus_tetRAD/Mus_tet_pop15.tree.cons").root(wildcard = "Brimeura")
tet20 = toytree.tree("./Mus_Analysis/Mus_tetRAD/Mus_tet_pop20.tree.cons").root(wildcard = "Brimeura")
tet25 = toytree.tree("./Mus_Analysis/Mus_tetRAD/Mus_tet_pop25.tree.cons").root(wildcard = "Brimeura")
tet30 = toytree.tree("./Mus_Analysis/Mus_tetRAD/Mus_tet_pop30.tree.cons").root(wildcard = "Brimeura")
tet35 = toytree.tree("./Mus_Analysis/Mus_tetRAD/Mus_tet_pop35.tree.cons").root(wildcard = "Brimeura")
tet40 = toytree.tree("./Mus_Analysis/Mus_tetRAD/Mus_tet_pop40.tree.cons").root(wildcard = "Brimeura")

## set dimensions of the canvas
canvas = toyplot.Canvas(width = 2000, height = 2000)

## dissect canvas into multiple cartesian areas (x1, x2, y1, y2)
ax0 = canvas.cartesian(bounds=('2%',  '30%', '5%',  '47.5%'))
ax1 = canvas.cartesian(bounds=('33%', '63%', '5%',  '47.5%'))
ax2 = canvas.cartesian(bounds=('66%', '96%', '5%',  '47.5%'))
ax3 = canvas.cartesian(bounds=('2%',  '30%', '50%', '97.5%'))
ax4 = canvas.cartesian(bounds=('33%', '63%', '50%', '97.5%'))
ax5 = canvas.cartesian(bounds=('66%', '96%', '50%', '97.5%'))

# call draw with the 'axes' argument to pass it to a specific cartesian area
style = {
    "tip_labels_align": True,
    "tip_labels_style": {"font-size": "11px"},
    "node_labels_style":{"font-size": "12px",
                        "baseline-shift": "7px",
                        "-toyplot-anchor-shift": "-13px"},
}
tet15.ladderize(1).draw(
    axes = ax0,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tet20.ladderize(1).draw(
    axes = ax1,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tet25.ladderize(1).draw(
    axes = ax2,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tet30.ladderize(1).draw(
    axes = ax3,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tet35.ladderize(1).draw(
    axes = ax4,
    **style,
    node_sizes = 0,
    node_labels = 'support');

tet40.ladderize(1).draw(
    axes = ax5,
    **style,
    node_sizes = 0,
    node_labels = 'support');

# hide the axes (e.g, ticks and splines)
ax0.show = False; ax1.show = False; ax2.show = False;
ax3.show = False; ax4.show = False; ax5.show = False;

In [8]:
import toyplot.pdf
toyplot.pdf.render(canvas, "/home/tim/GBS/Muscari/Mus_Analysis/Mus_tetRAD/tetRAD_Figures/Suppl-Fig_Mus_tet_20210811_15-20-25-30-35-40.pdf");

In [22]:
treeorder = ["Brimeura_amethystina_W6084", "Bellevalia_paradoxa_ED1272",
           "Bellevalia_dubia_W6083", "Bellevalia_speciosa_W6085",
           "Muscari_racemosum_ED1258", "Muscari_macrocarpum_ED1252",
           "Pseudomuscari_chalusicum_ED1255", "Pseudomuscari_azureum_ED1270",
           "Muscari_parviflorum_ED1245", "Pseudomuscari_inconstrictum_ED3234",
           "Muscari_commutatum_ED3538", "Muscari_sivrihisardaghlarensis_ED1278",
           "Muscari_anatolicum_W6087", "Muscari_vularlii_ED3232",
           "Muscari_discolor_ED1266", "Pseudomuscari_pallens_ED1267",
           "Pseudomuscari_coeruleum_ED1261", "Muscari_adilii_W6090",
           "Muscari_armeniacum_ED1244", "Muscari_armeniacum_W6089",
           "Muscari_neglectum_ED1253", "Muscari_baeticum_ED1281",
           "Muscari_botryoides_ED1279", "Muscari_neglectum_ED1254",
           "Muscari_pulchellum_ED3231", "Muscari_kerkis_ED1280",
           "Muscari_bourgaei_ED1259", "Muscari_latifolium_ED1265",
           "Leopoldia_tenuiflora_ED1263", "Leopoldia_longipes_ED3233",
           "Muscari_massayanum_ED1251", "Leopoldia_neumannii_ED1243",
           "Leopoldia_neumannii_ED1607", "Muscari_mirum_ED1250",
           "Leopoldia_matritensis_ED1282", "Leopoldia_spreitzenhoferi_ED1248",
           "Leopoldia_cycladica_W6082", "Leopoldia_weissii_W6081",
           "Leopoldia_caucasica_ED1262", "Leopoldia_comosa_ED3539",
           "Leopoldia_comosa_ED3965", "Leopoldia_comosa_ED1274", "Leopoldia_comosa_ED1256"]

In [23]:
## Load the 200 bootstrap trees from pops30 TetRad analysisis and root it
tetcloud30 = toytree.mtree("./Mus_Analysis/Mus_tetRAD/Mus_tet_pop30.tree.boots")
tetcloud30.treelist = [i.root(["Brimeura_amethystina_W6084"]) for i in tetcloud30.treelist]

## plot the rooted bootstrap trees as a cloud tree
canvas, axes, mark = tetcloud30.draw_cloud_tree(
    height = 600,
    width = 400,
    
    ## define a fix tree order to make it comparable with the cons tree
    fixed_order = treeorder,
    use_edge_lengths = False,
    edge_style = {"stroke-opacity": 0.05,
                  "stroke-width": 1}
);


In [37]:
## Load TetRad tree and consensus tree and root ith with Brimeura
constree30 = toytree.tree("./Mus_Analysis/Mus_tetRAD/Mus_tet_pop30.tree.cons" ).root(wildcard = "Brimeura")

## Load TetRad bootstrap trees and root it with Brimeura
cloudtree30 = toytree.mtree("./Mus_Analysis/Mus_tetRAD/Mus_tet_pop30.tree.boots")
cloudtree30.treelist = [i.root(["Brimeura_amethystina_W6084"]) for i in cloudtree30.treelist]

## set dimensions of the canvas
canvas = toyplot.Canvas(width = 1300, height = 900)

## dissect canvas into multiple cartesian areas (x1, x2, y1, y2)
ax0 = canvas.cartesian(bounds=('5%',  '47.5%', '5%',  '95%'))
ax1 = canvas.cartesian(bounds=('52.5%', '95%', '5%',  '95%'))

# call draw with the 'axes' argument to pass it to a specific cartesian area
style = {"tip_labels_align": True,
         "tip_labels_style": {"font-size": "12px"},
         "node_labels_style":{"font-size": "12px",
                              "baseline-shift": "7px",
                              "-toyplot-anchor-shift": "-13px"},
}

cstyle = {"tip_labels_align": True,
          "layout": 'l',
          "tip_labels_style": {"font-size": "12px"},
          "node_labels_style":{"font-size": "12px",
                               "baseline-shift": "7px",
                               "-toyplot-anchor-shift": "-13px"},
}

constree30.ladderize(1).draw(
    axes = ax0,
    **style,
    node_sizes = 0,
    node_labels = constree.get_node_values("support"));

## plot the rooted bootstrap trees as a cloud tree
cloudtree30.draw_cloud_tree(
    axes = ax1,
    fixed_order = treeorder,  ## define a fix tree order to make it comparable with the cons tree
    **cstyle,
    use_edge_lengths = False,
    #tip_labels = False,
    edge_style = {"stroke-opacity": 0.05,
                  "stroke-width": 1}
);

# hide the axes (e.g, ticks and splines)
ax0.show = False; ax1.show = False;

In [38]:
import toyplot.pdf
toyplot.pdf.render(canvas, "/home/tim/GBS/Muscari/Mus_Analysis/Mus_tetRAD/tetRAD_Figures/Fig_Mus_tet_cons-cloud_20210811_pops30.pdf");

In [27]:
## Load TetRad tree and consensus tree and root ith with Brimeura
fulltree30 = toytree.tree("./Mus_Analysis/Mus_tetRAD/Mus_tet_pop30.tree"      ).root(wildcard = "Brimeura")
constree30 = toytree.tree("./Mus_Analysis/Mus_tetRAD/Mus_tet_pop30.tree.cons" ).root(wildcard = "Brimeura")

## Load TetRad bootstrap trees and root it with Brimeura
cloudtree30 = toytree.mtree("./Mus_Analysis/Mus_tetRAD/Mus_tet_pop30.tree.boots")
cloudtree30.treelist = [i.root(["Brimeura_amethystina_W6084"]) for i in cloudtree30.treelist]

## set dimensions of the canvas
canvas = toyplot.Canvas(width = 1800, height = 900)

## dissect canvas into multiple cartesian areas (x1, x2, y1, y2)
ax0 = canvas.cartesian(bounds=('2%',  '30%', '5%',  '97.5%'))
ax1 = canvas.cartesian(bounds=('33%', '61%', '5%',  '97.5%'))
ax2 = canvas.cartesian(bounds=('64%', '91%', '5%',  '97.5%'))

# call draw with the 'axes' argument to pass it to a specific cartesian area
style = {
    "tip_labels_align": True,
    "tip_labels_style": {"font-size": "12px"},
    "node_labels_style":{"font-size": "12px",
                        "baseline-shift": "7px",
                        "-toyplot-anchor-shift": "-13px"},
}
fulltree30.ladderize(1).draw(
    axes = ax0,
    **style,
    node_sizes = 0,
    node_labels = fulltree.get_node_values("support"));

constree30.ladderize(1).draw(
    axes = ax1,
    **style,
    node_sizes = 0,
    node_labels = constree.get_node_values("support"));

## plot the rooted bootstrap trees as a cloud tree
cloudtree30.draw_cloud_tree(
    axes = ax2,
    fixed_order = treeorder,  ## define a fix tree order to make it comparable with the cons tree
    **style,
    use_edge_lengths = False,
    edge_style = {"stroke-opacity": 0.05,
                  "stroke-width": 1}
);

# hide the axes (e.g, ticks and splines)
ax0.show = False; ax1.show = False; ax2.show = False;

## 2. Population analysis of Muscari with outgroups removed

In [6]:
## load assembly object when comming back
data = ip.load_json("./Mus_Assembly/Muscari.json")

## check name
#data.stats

loading Assembly: Muscari
from saved path: ~/GBS/Muscari/Mus_Assembly/Muscari.json


Unnamed: 0,state,reads_raw,reads_passed_filter,clusters_total,clusters_hidepth,hetero_est,error_est,reads_consens
Bellevalia_dubia_W6083,6,1030736,1029284,95294,20981,0.013,0.003,19338
Bellevalia_paradoxa_ED1272,6,1636142,1634727,108498,30267,0.013,0.003,27749
Bellevalia_speciosa_W6085,6,1416391,1414294,95536,25347,0.016,0.003,22957
Brimeura_amethystina_W6084,6,1554459,1551802,424844,28296,0.034,0.005,20711
Leopoldia_caucasica_ED1262,6,1462581,1461153,77305,22469,0.013,0.003,20653
Leopoldia_comosa_ED1256,6,1299389,1298312,90402,25831,0.015,0.003,23363
Leopoldia_comosa_ED1274,6,1464810,1463759,90898,24322,0.015,0.002,22075
Leopoldia_comosa_ED3539,6,2065757,2064748,368808,46895,0.013,0.003,42303
Leopoldia_comosa_ED3965,6,1232250,1231244,94455,25479,0.013,0.003,23472
Leopoldia_cycladica_W6082,6,1664161,1661171,152200,36680,0.025,0.003,30683


In [9]:
## exclude samples from assembly with ...
keep_list = [i for i in data.samples.keys() if i not in [
    ## ... low read number (< 5000 )
    "Bellevalia_dubia_W6083", "Bellevalia_paradoxa_ED1272",
    "Bellevalia_speciosa_W6085", "Brimeura_amethystina_W6084"
]]

## make a new data branch from the keep_list
nouts = data.branch("nouts", subsamples = keep_list, force = True)

## double check taxon sampling
#data.stats.sort_values(by=['reads_consens'])
#data.stats

Unnamed: 0,state,reads_raw,reads_passed_filter,clusters_total,clusters_hidepth,hetero_est,error_est,reads_consens
Leopoldia_caucasica_ED1262,6,1462581,1461153,77305,22469,0.013,0.003,20653
Leopoldia_comosa_ED1256,6,1299389,1298312,90402,25831,0.015,0.003,23363
Leopoldia_comosa_ED1274,6,1464810,1463759,90898,24322,0.015,0.002,22075
Leopoldia_comosa_ED3539,6,2065757,2064748,368808,46895,0.013,0.003,42303
Leopoldia_comosa_ED3965,6,1232250,1231244,94455,25479,0.013,0.003,23472
Leopoldia_cycladica_W6082,6,1664161,1661171,152200,36680,0.025,0.003,30683
Leopoldia_longipes_ED3233,6,1165697,1164740,78102,24618,0.014,0.003,22643
Leopoldia_matritensis_ED1282,6,1539833,1538882,125542,22566,0.008,0.003,21059
Leopoldia_neumannii_ED1243,6,1022622,1022000,74881,21887,0.008,0.003,20671
Leopoldia_neumannii_ED1607,6,1036009,1035263,69270,23509,0.009,0.003,22197


In [17]:
## run final assembly without outgroups and no missing data allowed for the ingroup
nouts.set_params("min_samples_locus", 20)
nouts.run("7", force = True)

Parallel connection | Cryptantha: 48 cores
[####################] 100% 0:00:02 | applying filters     | s7 |
[####################] 100% 0:00:03 | building arrays      | s7 |
[####################] 100% 0:00:01 | writing conversions  | s7 |
[####################] 100% 0:00:01 | indexing vcf depths  | s7 |
[####################] 100% 0:00:05 | writing vcf output   | s7 |


In [71]:
## load the hdf5 data for the STRUCTURE analysis
data = "/home/tim/GBS/Muscari/Mus_Assembly/nouts_outfiles/nouts.snps.hdf5"

In [34]:
# group individuals into populations
imap = {
    "Leop": ["Leopoldia_tenuiflora_ED1263", "Muscari_massayanum_ED1251", "Leopoldia_longipes_ED3233", 
             "Leopoldia_neumannii_ED1243", "Leopoldia_neumannii_ED1607", "Muscari_mirum_ED1250",
             "Leopoldia_caucasica_ED1262", "Leopoldia_matritensis_ED1282", "Leopoldia_comosa_ED3539",
             "Leopoldia_comosa_ED1274", "Leopoldia_comosa_ED3965", "Leopoldia_comosa_ED1256",
             "Leopoldia_weissii_W6081", "Leopoldia_weissii_ED1608", "Leopoldia_cycladica_W6082",
             "Leopoldia_spreitzenhoferi_ED1248"],
    "Musc": ["Pseudomuscari_pallens_ED1267", "Pseudomuscari_coeruleum_ED1261", 
             "Muscari_sivrihisardaghlarensis_ED1278", "Muscari_anatolicum_W6087", "Muscari_vularlii_ED3232",
             "Muscari_discolor_ED1266", "Muscari_adilii_W6090", "Muscari_armeniacum_ED1244", 
             "Muscari_armeniacum_W6089", "Muscari_neglectum_ED1253", "Muscari_neglectum_ED1254",
             "Muscari_baeticum_ED1281", "Muscari_botryoides_ED1279", "Muscari_commutatum_ED3538"],
    "Pull": ["Muscari_pulchellum_ED3231", "Muscari_kerkis_ED1280", "Muscari_bourgaei_ED1259", "Muscari_latifolium_ED1265"],
    "Pseu": ["Pseudomuscari_chalusicum_ED1255", "Pseudomuscari_inconstrictum_ED3234",
             "Pseudomuscari_azureum_ED1270", "Muscari_parviflorum_ED1245"],
    "Mosc": ["Muscari_racemosum_ED1258", "Muscari_macrocarpum_ED1252"],
}

# require that 50% of samples have data in each group
minmap = {i: 0.5 for i in imap}

In [72]:
struct = ipa.structure(
    name = "Mus_STRUC",
    data = data,
    imap = imap,
    minmap = minmap,
    mincov = 0.5,
    workdir = "./Mus_Analysis/Mus_Structure"
)

Samples: 40
Sites before filtering: 20294
Filtered (indels): 1423
Filtered (bi-allel): 2541
Filtered (mincov): 16097
Filtered (minmap): 12309
Filtered (subsample invariant): 9
Filtered (minor allele frequency): 0
Filtered (combined): 17017
Sites after filtering: 3280
Sites containing missing values: 2859 (87.16%)
Missing values in SNP matrix: 7277 (5.55%)
SNPs (total): 3280
SNPs (unlinked): 449


#### Run STRUCTURE and plot results
The `burnin` and `numreps` parameters determine the length of the run.

In [74]:
struct.mainparams.burnin  = 100000
struct.mainparams.numreps = 500000

## see all mainparams
print(struct.mainparams)

#see or ser extraparams
print(struct.extraparams)

burnin             100000              
extracols          0                   
label              1                   
locdata            0                   
mapdistances       0                   
markernames        0                   
markovphase        0                   
missing            -9                  
notambiguous       -999                
numreps            500000              
onerowperind       0                   
phased             0                   
phaseinfo          0                   
phenotype          0                   
ploidy             2                   
popdata            0                   
popflag            0                   
recessivealleles   0                   

admburnin           500                 
alpha               1.0                 
alphamax            10.0                
alphapriora         1.0                 
alphapriorb         2.0                 
alphapropsd         0.025               
ancestdist          0            

In [75]:
## set a range of k-values to test
kvalues = [2, 3, 4, 5, 6, 7]

In [78]:
## submit batches of 10 replicates jobs for each value of k
for kpop in kvalues:
    struct.run(kpop = kpop, nreps = 10, seed = 12345, ipyclient = ipyclient)#, force = True)

[####################] 100% 0:10:16 | running 10 structure jobs 
[####################] 100% 0:12:31 | running 10 structure jobs 
[####################] 100% 0:14:03 | running 10 structure jobs 
[####################] 100% 0:16:09 | running 10 structure jobs 
[####################] 100% 0:18:14 | running 10 structure jobs 
[####################] 100% 0:20:17 | running 10 structure jobs 


#### Analyze results: check results in evanno table

In [79]:
etable = struct.get_evanno_table(kvalues)
etable

Unnamed: 0,Nreps,lnPK,lnPPK,deltaK,estLnProbMean,estLnProbStdev
2,10,0.0,0.0,0.0,-5346.13,216.063
3,10,-2299.41,5288.81,0.602,-7645.54,8778.327
4,10,2989.4,6689.71,10.067,-4656.14,664.502
5,10,-3700.31,6784.16,1.135,-8356.45,5978.378
6,10,3083.85,2561.59,0.856,-5272.6,2990.892
7,10,522.26,0.0,0.0,-4750.34,1016.641


In [90]:
etable = struct.get_evanno_table(kvalues, max_var_multiple=100, quiet=True)
etable

Unnamed: 0,Nreps,lnPK,lnPPK,deltaK,estLnProbMean,estLnProbStdev
2,10,0.0,0.0,0.0,-5346.13,216.063
3,9,475.586,261.181,1.069,-4870.544,244.222
4,10,214.404,3914.714,5.891,-4656.14,664.502
5,10,-3700.31,6784.16,1.135,-8356.45,5978.378
6,10,3083.85,2561.59,0.856,-5272.6,2990.892
7,10,522.26,0.0,0.0,-4750.34,1016.641


#### Get permuted reps with CLUMPP

Calculate a permuted table of results across replicate runs for each value of K while excluding reps based on the max_var_multiple parameter

In [81]:
## summarize results
struct.clumppparams.m = 3                ## use largegreedy algorithm
struct.clumppparams.greedy_option = 2    ## test nrepeat possible orders
struct.clumppparams.repeats = 100000     ## number of repeats

In [92]:
qtable = struct.get_clumpp_table(kvalues, max_var_multiple=100.)

[K2] 10/10 results permuted across replicates (max_var=100.0).
[K3] 9/10 results permuted across replicates (max_var=100.0).
[K4] 10/10 results permuted across replicates (max_var=100.0).
[K5] 10/10 results permuted across replicates (max_var=100.0).
[K6] 10/10 results permuted across replicates (max_var=100.0).
[K7] 10/10 results permuted across replicates (max_var=100.0).


In [91]:
# get canvas object and set size
canvas = toyplot.Canvas(width=400, height=300)

# plot the mean log probability of the models in red
axes = canvas.cartesian(ylabel="estLnProbMean")
axes.plot(etable.estLnProbMean * -1, color="darkred", marker="o")
axes.y.spine.style = {"stroke": "darkred"}

# plot delta K with its own scale bar of left side and in blue
axes = axes.share("x", ylabel="deltaK", ymax=etable.deltaK.max() + etable.deltaK.max() * .25)
axes.plot(etable.deltaK, color="steelblue", marker="o");
axes.y.spine.style = {"stroke": "steelblue"}

# set x labels
axes.x.ticks.locator = toyplot.locator.Explicit(range(len(etable.index)), etable.index)
axes.x.label.text = "K (N ancestral populations)"

#### Analyze results: Barplots

In [145]:
k = 3
table = struct.get_clumpp_table(k)

[K3] 10/10 results permuted across replicates (max_var=0).


In [146]:
# sort list by columns
table.sort_values(by=list(range(k)), inplace=True)

# or, sort by a list of names (here taken from imap)
import itertools
onames = list(itertools.chain(*imap.values()))
table = table.loc[onames]

In [147]:
# build barplot
canvas = toyplot.Canvas(width=1000, height=500)
axes = canvas.cartesian(bounds=("10%", "90%", "10%", "45%"))
axes.bars(table)

# add labels to x-axis
ticklabels = [i for i in table.index.tolist()]
axes.x.ticks.locator = toyplot.locator.Explicit(labels=ticklabels)
axes.x.ticks.labels.angle = -60
axes.x.ticks.show = True
axes.x.ticks.labels.offset = 10
axes.x.ticks.labels.style = {"font-size": "12px"}

In [86]:
## Plot the resulting tree
## modified tree version with reduced branchlength for outgroup (Brimeura)
tre = toytree.tree("/home/tim/GBS/Muscari/Mus_Analysis/Mus_RAxML/Mus_RAxML_20210802/RAxML_bipartitions.pops_30_STRUCT.phy")
rtre = tre.root(wildcard = "Brimeura")
          
# use canvas and axes function in order use export function
canvas, axes, mark = rtre.ladderize(1).draw(
    width = 1400,
    height = 900,
    #use_edge_length = False,
    tip_labels_align = True,
    node_labels='support',
    node_sizes=0,
    node_labels_style={"font-size": "15px",
                       "baseline-shift": "7px",
                       "-toyplot-anchor-shift": "-13px"},
    );

In [190]:
myorder = ["Muscari_racemosum_ED1258", "Muscari_macrocarpum_ED1252",
           "Pseudomuscari_chalusicum_ED1255", "Pseudomuscari_azureum_ED1270",
           "Pseudomuscari_inconstrictum_ED3234", "Muscari_parviflorum_ED1245",
           "Muscari_commutatum_ED3538", "Muscari_sivrihisardaghlarensis_ED1278",
           "Muscari_anatolicum_W6087", "Muscari_vularlii_ED3232",
           "Muscari_discolor_ED1266", "Pseudomuscari_coeruleum_ED1261",
           "Pseudomuscari_pallens_ED1267", "Muscari_armeniacum_ED1244",
           "Muscari_armeniacum_W6089", "Muscari_adilii_W6090",
           "Muscari_neglectum_ED1253", "Muscari_baeticum_ED1281",
           "Muscari_botryoides_ED1279", "Muscari_neglectum_ED1254",
           "Muscari_pulchellum_ED3231", "Muscari_kerkis_ED1280",
           "Muscari_latifolium_ED1265", "Muscari_bourgaei_ED1259",
           "Leopoldia_tenuiflora_ED1263", "Muscari_massayanum_ED1251",
           "Leopoldia_longipes_ED3233", "Leopoldia_neumannii_ED1607",
           "Leopoldia_neumannii_ED1243", "Muscari_mirum_ED1250",
           "Leopoldia_caucasica_ED1262", "Leopoldia_matritensis_ED1282",
           "Leopoldia_comosa_ED3539", "Leopoldia_comosa_ED1274",
           "Leopoldia_comosa_ED3965", "Leopoldia_comosa_ED1256",
           "Leopoldia_weissii_ED1608", "Leopoldia_spreitzenhoferi_ED1248",
           "Leopoldia_cycladica_W6082", "Leopoldia_weissii_W6081"]
print("custom ordering")
print(qtable[2].loc[myorder])

custom ordering
                                           0      1
Muscari_racemosum_ED1258               0.759  0.241
Muscari_macrocarpum_ED1252             0.751  0.249
Pseudomuscari_chalusicum_ED1255        0.382  0.618
Pseudomuscari_azureum_ED1270           0.396  0.604
Pseudomuscari_inconstrictum_ED3234     0.384  0.616
Muscari_parviflorum_ED1245             0.381  0.619
Muscari_commutatum_ED3538              0.574  0.426
Muscari_sivrihisardaghlarensis_ED1278  0.005  0.995
Muscari_anatolicum_W6087               0.005  0.995
Muscari_vularlii_ED3232                0.007  0.993
Muscari_discolor_ED1266                0.007  0.993
Pseudomuscari_coeruleum_ED1261         0.003  0.997
Pseudomuscari_pallens_ED1267           0.002  0.998
Muscari_armeniacum_ED1244              0.007  0.993
Muscari_armeniacum_W6089               0.007  0.993
Muscari_adilii_W6090                   0.002  0.998
Muscari_neglectum_ED1253               0.002  0.998
Muscari_baeticum_ED1281                0.002  0.

#### Plot all STRUCTURE results against Phylogeny

In [142]:
etable

Unnamed: 0,Nreps,lnPK,lnPPK,deltaK,estLnProbMean,estLnProbStdev
2,10,0.0,0.0,0.0,-5346.13,216.063
3,9,475.586,261.181,1.069,-4870.544,244.222
4,10,214.404,3914.714,5.891,-4656.14,664.502
5,10,-3700.31,6784.16,1.135,-8356.45,5978.378
6,10,3083.85,2561.59,0.856,-5272.6,2990.892
7,10,522.26,0.0,0.0,-4750.34,1016.641


In [238]:
## get tree from RAxML results
## modified tree version with reduced branchlength for outgroup (Brimeura)
tre = toytree.tree("/home/tim/GBS/Muscari/Mus_Analysis/Mus_RAxML/Mus_RAxML_20210802/RAxML_bipartitions.pops_30_STRUCT.phy")
rtre = tre.root(wildcard = "Brimeura")

## further styling of plot with css 
style = {"stroke":toyplot.color.near_black, 
         "stroke-width": 0.25}

##    y1
## x1    x2
##    y2

## built & dissect canvas into multiple cartesian areas (x1, x2, y1, y2)
c = toyplot.Canvas(width = 900, height = 700)
a1 = c.cartesian(bounds=('1%', '46.5%', '5%', '95%'))       # The tree
a2 = c.cartesian(bounds=('50.5%', '59%', '5.25%', '86.25%'))  # K=2
a3 = c.cartesian(bounds=('59.5%', '68%', '5.25%', '86.25%'))  # K=3
a4 = c.cartesian(bounds=('68.5%', '77%', '5.25%', '86.25%'))  # K=4
a5 = c.cartesian(bounds=('77.5%', '86%', '5.25%', '86.25%'))  # K=5
a6 = c.cartesian(bounds=('86.5%', '95%', '5.25%', '86.25%'))  # K=6
a1.show = False
a2.show = False
a3.show = False
a4.show = False
a5.show = False
a6.show = False

## draw the tree
rtre.ladderize(1).draw(
    axes = a1,
    use_edge_lengths = True,
    tip_labels_align = True,
    tip_labels_style = {"font-size": "9px"},
    node_labels = "support",
    node_sizes = 0,
    node_labels_style={"font-size": "9px",
                       "baseline-shift": "7px",
                       "-toyplot-anchor-shift": "-8px"});

## draw the STRUCTURE bar plots
## 'along' defines plot orientation; x = vertical; y = horizontal
a2.bars(qtable[2].loc[myorder], style = style, along = 'y');
a3.bars(qtable[3].loc[myorder], style = style, along = 'y');
a4.bars(qtable[4].loc[myorder], style = style, along = 'y');
a5.bars(qtable[5].loc[myorder], style = style, along = 'y');
a6.bars(qtable[6].loc[myorder], style = style, along = 'y');

## add header for the bar plots
c.text(495, 23, 'K = 2', style={"font-size": "13px"})
c.text(575, 23, 'K = 3', style={"font-size": "13px"})
c.text(655, 23, 'K = 4', style={"font-size": "13px"})
c.text(735, 23, 'K = 5', style={"font-size": "13px"})
c.text(815, 23, 'K = 6', style={"font-size": "13px"})

## add deltaK values below the bar plots
c.text(495, 615, '0.0', style={"font-size": "10px"})
c.text(575, 615, '1.1', style={"font-size": "10px"})
c.text(655, 615, '5.9', style={"font-size": "10px"})
c.text(735, 615, '1.1', style={"font-size": "10px"})
c.text(815, 615, '0.9', style={"font-size": "10px"})
c.text(655, 630, 'delta <b>K</b>', style={"font-size": "10px"});

In [253]:
import toyplot.pdf
toyplot.pdf.render(c, "/home/tim/GBS/Muscari/Mus_Analysis/FiguresForPaper/Mus_RAxML_STRUCTURE_20210811.pdf");

### PCA

In [250]:
# init pca object with input data and (optional) parameter options
pca = ipa.pca(
    data = data,
    imap = imap,
    minmap = minmap,
    mincov = 0.25,
    impute_method = "sample",
)

Samples: 40
Sites before filtering: 20294
Filtered (indels): 1423
Filtered (bi-allel): 2541
Filtered (mincov): 148
Filtered (minmap): 12309
Filtered (subsample invariant): 9
Filtered (minor allele frequency): 0
Filtered (combined): 13594
Sites after filtering: 6703
Sites containing missing values: 6282 (93.72%)
Missing values in SNP matrix: 37600 (14.02%)
SNPs (total): 6703
SNPs (unlinked): 979
Imputation: 'sampled'; (0, 1, 2) = 88.8%, 7.7%, 3.4%


In [251]:
# run the PCA analysis
pca.run()

Subsampling SNPs: 979/6703


In [248]:
# store the PC axes as a dataframe
df = pd.DataFrame(pca.pcaxes[0], index=pca.names)

# write the PC axes to a CSV file
df.to_csv("pca_analysis.csv")

# show the first ten samples and the first 10 PC axes
df.iloc[:10, :10].round(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Leopoldia_caucasica_ED1262,-6.47,-2.69,-0.96,-0.91,-1.05,-0.66,-1.1,-1.07,0.06,0.33
Leopoldia_comosa_ED1256,-6.01,-2.63,-1.11,-1.45,-3.07,0.4,-3.12,-0.13,-0.55,1.0
Leopoldia_comosa_ED1274,-6.09,-3.05,-0.3,-1.11,-2.49,0.11,-2.42,-1.42,0.37,0.62
Leopoldia_comosa_ED3539,-6.32,-2.95,-1.26,-0.81,-2.17,0.18,-2.56,-0.98,-0.64,1.22
Leopoldia_comosa_ED3965,-6.21,-2.63,-1.04,-1.12,-1.92,0.46,-2.55,0.74,-0.65,0.68
Leopoldia_cycladica_W6082,-5.22,-2.4,-0.85,-1.27,-0.24,-0.18,-1.12,0.19,0.38,-0.88
Leopoldia_longipes_ED3233,-4.15,-0.61,1.08,0.59,1.02,-0.23,4.16,-3.48,0.01,1.25
Leopoldia_matritensis_ED1282,-5.81,-1.56,-0.71,-0.48,0.57,-0.04,0.7,-0.12,0.96,-2.15
Leopoldia_neumannii_ED1243,-4.97,-1.52,-1.77,0.28,2.31,0.05,3.54,6.91,-1.25,-0.26
Leopoldia_neumannii_ED1607,-4.73,-1.4,-1.23,0.57,3.11,0.61,4.25,7.04,-1.0,-0.73


In [255]:
pca.draw(0, 2);
pca.draw(0, 1);
toyplot.pdf.render("/home/tim/GBS/Muscari/Mus_Analysis/FiguresForPaper/Mus_PCA_20210811.pdf");

ValueError: Expected <class 'toyplot.canvas.Canvas'>, received <class 'str'>.

In [258]:
import toyplot.pdf

# save returned plot objects as variables
canvas, axes, mark = pca.draw(0, 2)

# pass the canvas object to toyplot render function
toyplot.pdf.render(canvas, "/home/tim/GBS/Muscari/Mus_Analysis/FiguresForPaper/Mus_PCA_20210811.pdf")


ValueError: not enough values to unpack (expected 3, got 2)

In [127]:
# plot PC axes 0 and 2
# get canvas object and set size
c = toyplot.Canvas(width=800, height=400)

## built & dissect canvas into multiple cartesian areas 
##    y1
## x1    x2
##    y2
a1 = c.cartesian(bounds=('5%', '47.5%', '5%', '95%'))
a2 = c.cartesian(bounds=('52.5%', '95%', '5%', '95%'))

a1.show = False
a2.show = False

a1.plot(pca1);
a2.plot(pca1);

#pca.draw(0, 2);

TypeError: float() argument must be a string or a number, not 'Canvas'

In [245]:
# init pca object with input data and (optional) parameter options
pca2 = ipa.pca(
    data=data,
    imap=imap,
    minmap=minmap,
    mincov=0.5,
    impute_method="sample",
)

# run and draw results for impute_method=None and mincov=1.0
pca2.run(nreplicates=25, seed=123)
pca2.draw(0, 2);
pca2.draw(0, 1);
pca2.draw(0, 3);


Samples: 40
Sites before filtering: 20294
Filtered (indels): 1423
Filtered (bi-allel): 2541
Filtered (mincov): 843
Filtered (minmap): 12309
Filtered (subsample invariant): 9
Filtered (minor allele frequency): 0
Filtered (combined): 13594
Sites after filtering: 6703
Sites containing missing values: 6282 (93.72%)
Missing values in SNP matrix: 37600 (14.02%)
SNPs (total): 6703
SNPs (unlinked): 979
Imputation: 'sampled'; (0, 1, 2) = 88.7%, 7.7%, 3.6%
Subsampling SNPs: 979/6703
