# Run STDR (Spectral Top-Down Recovery) on the HIV sequences

Mamie Wang 2020/11/29

## 5007

In [1]:
import dendropy
import numpy as np
import sys, os

sys.path.append("../spectraltree")

import generation
import reconstruct_tree
import time
import utils
import pandas as pd
import argparse

In [2]:
def prepare_inputs(seqs):
    """Turn a dendropy DNA character matrix into inputs for tree reconstruction.

    Taxa whose label starts with a space are dropped; the remaining taxa are
    treated as the leaves. Each retained sequence is encoded symbol by symbol
    as A=0, C=1, G=2, T=3, gap ('-')=4.

    Args:
        seqs: character matrix exposing ``taxon_namespace`` and per-taxon
            sequences whose elements have a ``symbol`` attribute (e.g. the
            result of ``dendropy.DnaCharacterMatrix.get``).

    Returns:
        A ``(ch_list_num, taxa_meta)`` tuple. ``ch_list_num`` is an integer
        NumPy array with one row per retained taxon (2-D when all sequences
        have the same length -- presumably an alignment; confirm for new
        data). ``taxa_meta`` is the ``utils.TaxaMetadata`` built from the
        full namespace and the retained taxa.

    Raises:
        ValueError: if a retained sequence contains a symbol other than
            A, C, G, T or '-' (for example an ambiguity code).
    """
    # Explicit lookup table instead of chained np.where calls on a string
    # array: no implicit int->str->int round trip, and unknown symbols are
    # reported by name rather than as "invalid literal for int()".
    symbol_codes = {"A": 0, "C": 1, "G": 2, "T": 3, "-": 4}

    namespace = seqs.taxon_namespace
    # startswith() also tolerates an empty label, which label[0] would not.
    leaf_taxa = [t for t in namespace if not t.label.startswith(" ")]

    rows = []
    for taxon in leaf_taxa:
        symbols = [x.symbol for x in seqs[taxon]]
        unknown = set(symbols).difference(symbol_codes)
        if unknown:
            raise ValueError(
                "Unsupported symbol(s) %s in sequence %r; expected one of %s"
                % (sorted(map(str, unknown)), taxon.label, sorted(symbol_codes))
            )
        rows.append([symbol_codes[s] for s in symbols])

    ch_list_num = np.array(rows, dtype=int)

    taxa_meta = utils.TaxaMetadata(
        namespace, leaf_taxa, alphabet=dendropy.DNA_STATE_ALPHABET
    )
    return ch_list_num, taxa_meta

In [3]:
# Load the trimmed 5007 FASTA file. The context manager closes the file
# handle as soon as dendropy has finished parsing it.
with open("../data/11.24.20 5007 trimmed.fasta") as fasta_file:
    seqs = dendropy.DnaCharacterMatrix.get(file=fasta_file, schema="fasta")

# Integer-encoded sequence matrix plus the matching taxa metadata.
ch_list_num, taxa_meta = prepare_inputs(seqs)

In [4]:
# Reconstruction driver, configured with NeighborJoining and the HKY
# similarity function from reconstruct_tree.
spectral_method = reconstruct_tree.SpectralTreeReconstruction(
    reconstruct_tree.NeighborJoining,
    reconstruct_tree.HKY_similarity_matrix,
)

# Time only the reconstruction call itself.
start_time = time.time()
tree_rec = spectral_method.deep_spectral_tree_reconstruction(
    ch_list_num,
    reconstruct_tree.HKY_similarity_matrix,
    taxa_metadata=taxa_meta,
    threshhold=6,  # keyword spelling as used throughout this notebook
    min_split=2,
    verbose=False,
)
end_time = time.time()
print(end_time - start_time)

# Label the output with the dataset id, then print the tree as text.
print("5007")
tree_rec.print_plot()

0.033689022064208984
5007
          /--------------------------------------- YCHO-5007M34-Nesty8_043     
          |                                                                    
          |                   /------------------- YCHO-5007M36-Nesty8_094     
/---------+         /---------+                                                
|         |         |         |         /--------- YCHO-5007M33-Nesty8_010     
|         |         |         \---------+                                      
|         \---------+                   \--------- YCHO-5007M37-Nesty8_061     
|                   |                                                          
|                   |         /------------------- YCHO-5007B2-Nesty8_045      
+                   \---------+                                                
|                             |         /--------- YCHO-5007B1-Nesty8_028      
|                             \---------+                                      
|             

  self.explained_variance_ratio_ = exp_var / full_var


## 5009 

In [5]:
# Load the trimmed 5009 FASTA file. The context manager closes the file
# handle as soon as dendropy has finished parsing it.
with open("../data/11.24.20 5009 trimmed.fasta") as fasta_file:
    seqs = dendropy.DnaCharacterMatrix.get(file=fasta_file, schema="fasta")

# Integer-encoded sequence matrix plus the matching taxa metadata.
ch_list_num, taxa_meta = prepare_inputs(seqs)

# Reconstruction driver, configured with NeighborJoining and the HKY
# similarity function from reconstruct_tree.
spectral_method = reconstruct_tree.SpectralTreeReconstruction(
    reconstruct_tree.NeighborJoining,
    reconstruct_tree.HKY_similarity_matrix,
)

# Time only the reconstruction call itself.
start_time = time.time()
tree_rec = spectral_method.deep_spectral_tree_reconstruction(
    ch_list_num,
    reconstruct_tree.HKY_similarity_matrix,
    taxa_metadata=taxa_meta,
    threshhold=6,  # keyword spelling as used throughout this notebook
    min_split=2,
    verbose=False,
)
end_time = time.time()
print(end_time - start_time)

# Label the output with the dataset id, then print the tree as text.
print(5009)
tree_rec.print_plot()

0.009598493576049805
5009
          /--------------------------------------- YCHO-11.23_SP-5009_M3_29    
/---------+                                                                    
|         |         /----------------------------- YCHO-11.23_SP-5009_M3_38    
|         \---------+                                                          
|                   |         /------------------- YCHO-11.23_SP-5009_M3_5     
|                   \---------+                                                
+                             |         /--------- YCHO-5009B13-Nesty8_091     
|                             \---------+                                      
|                                       \--------- YCHO-11.23_SP-5009_B_6      
|                                                                              
|                   /----------------------------- YCHO-5009B10-Nesty8_075     
|                   |                                                          
\-------------

## 5014

In [6]:
# Load the trimmed 5014 FASTA file. The context manager closes the file
# handle as soon as dendropy has finished parsing it.
with open("../data/11.24.20 5014 trimmed.fasta") as fasta_file:
    seqs = dendropy.DnaCharacterMatrix.get(file=fasta_file, schema="fasta")

# Integer-encoded sequence matrix plus the matching taxa metadata.
ch_list_num, taxa_meta = prepare_inputs(seqs)

# Reconstruction driver, configured with NeighborJoining and the HKY
# similarity function from reconstruct_tree.
spectral_method = reconstruct_tree.SpectralTreeReconstruction(
    reconstruct_tree.NeighborJoining,
    reconstruct_tree.HKY_similarity_matrix,
)

# Time only the reconstruction call itself.
start_time = time.time()
tree_rec = spectral_method.deep_spectral_tree_reconstruction(
    ch_list_num,
    reconstruct_tree.HKY_similarity_matrix,
    taxa_metadata=taxa_meta,
    threshhold=6,  # keyword spelling as used throughout this notebook
    min_split=2,
    verbose=False,
)
end_time = time.time()
print(end_time - start_time)

# Label the output with the dataset id, then print the tree as text.
print(5014)
tree_rec.print_plot()

0.05663418769836426
5014
                                        /--------- YCHO-11.23_SP-5014_M3_13    
     /----------------------------------+                                      
     |                                  |    /---- YCHO-11.23_SP-5014_B_43     
     |                                  \----+                                 
/----+                                       \---- YCHO-11.23_SP-5014_M3_15    
|    |                                                                         
|    |    /--------------------------------------- YCHO-11.23_SP-5014_B_39     
|    \----+                                                                    
|         |    /---------------------------------- YCHO-11.23_SP-5014_B_27     
|         \----+                                                               
|              |    /----------------------------- YCHO-11.23_SP-5014_M3_40    
|              \----+                                                          
|              

## 5017

In [7]:
# Load the trimmed 5017 FASTA file. The context manager closes the file
# handle as soon as dendropy has finished parsing it.
with open("../data/11.24.20 5017 trimmed.fasta") as fasta_file:
    seqs = dendropy.DnaCharacterMatrix.get(file=fasta_file, schema="fasta")

# Integer-encoded sequence matrix plus the matching taxa metadata.
ch_list_num, taxa_meta = prepare_inputs(seqs)

# Reconstruction driver, configured with NeighborJoining and the HKY
# similarity function from reconstruct_tree.
spectral_method = reconstruct_tree.SpectralTreeReconstruction(
    reconstruct_tree.NeighborJoining,
    reconstruct_tree.HKY_similarity_matrix,
)

# Time only the reconstruction call itself.
start_time = time.time()
tree_rec = spectral_method.deep_spectral_tree_reconstruction(
    ch_list_num,
    reconstruct_tree.HKY_similarity_matrix,
    taxa_metadata=taxa_meta,
    threshhold=6,  # keyword spelling as used throughout this notebook
    min_split=2,
    verbose=False,
)
end_time = time.time()
print(end_time - start_time)

# Label the output with the dataset id, then print the tree as text.
print(5017)
tree_rec.print_plot()

0.05739235877990723
5017
    /--------------------------------------------- YCHO-11.23_SP-5017_M3_20    
    |                                                                          
    |                                        /---- YCHO-11.23_SP-5017_M3_17    
/---+   /------------------------------------+                                 
|   |   |                                    \---- YCHO-11.23_SP-5017_B_3      
|   |   |                                                                      
|   \---+                                    /---- YCHO-11.23_SP-5017_M3_12    
|       |   /--------------------------------+                                 
|       |   |                                \---- YCHO-11.23_SP-5017_M3_26    
|       |   |                                                                  
|       \---+   /--------------------------------- YCHO-11.23_SP-5017_M3_11    
|           |   |                                                              
|           |  

## 5020

In [8]:
# Load the trimmed 5020 FASTA file. The context manager closes the file
# handle as soon as dendropy has finished parsing it.
with open("../data/11.24.20 5020 trimmed.fasta") as fasta_file:
    seqs = dendropy.DnaCharacterMatrix.get(file=fasta_file, schema="fasta")

# Integer-encoded sequence matrix plus the matching taxa metadata.
ch_list_num, taxa_meta = prepare_inputs(seqs)

# Reconstruction driver, configured with NeighborJoining and the HKY
# similarity function from reconstruct_tree.
spectral_method = reconstruct_tree.SpectralTreeReconstruction(
    reconstruct_tree.NeighborJoining,
    reconstruct_tree.HKY_similarity_matrix,
)

# Time only the reconstruction call itself.
start_time = time.time()
tree_rec = spectral_method.deep_spectral_tree_reconstruction(
    ch_list_num,
    reconstruct_tree.HKY_similarity_matrix,
    taxa_metadata=taxa_meta,
    threshhold=10,  # this dataset uses 10; the other cells in this notebook use 6
    min_split=2,
    verbose=False,
)
end_time = time.time()
print(end_time - start_time)

# Label the output with the dataset id, then print the tree as text.
print(5020)
tree_rec.print_plot()

0.18118000030517578
5020
   /---------------------------------------------- YCHO-5020B2-Nesty8_077      
   |                                                                           
   |                                          /--- YCHO-11.23_SP-5020B_31      
   |     /------------------------------------+                                
   |     |                                    \--- YCHO-11.23_SP-5020_B_35     
   |  /--+                                                                     
   |  |  |  /------------------------------------- YCHO-11.23_SP-5020B36       
   |  |  |  |                                                                  
   |  |  \--+  /---------------------------------- YCHO-11.23_SP-5020_M3_20    
   |  |     |  |                                                               
   |  |     \--+  /------------------------------- YCHO-11.23_SP-5020M3_17     
   |  |        |  |                                                            
   |  |        

  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
