In [1]:
#import libraries
from IPython.core.display import display, HTML
from sequence_alignment import main_algorithm
from clustering import convert_to_distance_matrix, hierarchical_clustering
from hierarchical_validation import validation
from synthetic_data import compute_jump_matrix, ctmc_sequences
import itertools
import numpy as np
import pandas as pd

In [None]:
###############################################################
#             TNW ALGORITHM PARAMETERS
###############################################################

#scores given to a pair of identical / different letters
match = 1.0
mismatch = -1.1

#every ordered pair that can be formed from the alphabet letters
comb = list(itertools.product('ABCDEFGHIJZ', repeat=2))

#scoring dictionary keyed by the two-letter pair; the 'OO' pair is registered
#first, then each alphabet pair scores `match` when both letters are equal
#and `mismatch` otherwise
s = {'OO': match}
s.update({
    first + second: (match if first == second else mismatch)
    for first, second in comb
})

#gap penalties to iterate over (a single value for now)
#alternative sweep: gap_values = np.linspace(-1,1,21)
gap_values = [0.5]

#temporal penalty
T = 0.25

In [None]:
#############################################################################
#       PARAMETERS FOR HIERARCHICAL CLUSTERING & VALIDATION
#############################################################################

#clustering method name, handed to hierarchical_clustering() and validation()
#NOTE(review): 'ward' is presumably a linkage criterion rather than a distance metric - confirm
method = 'ward'

#number of bootstrap samples M for validation step (first argument of validation())
M = 250

#maximum number of clusters to analyze in the validation step
#(validation() is called with max_K+1)
max_K = 5

In [30]:
###############################################################################
#           TEMPORAL SEQUENCE GENERATION - 2 SEQUENCES A->B
###############################################################################
#seed NumPy's global random generator so that Restart & Run All regenerates
#the same sequences every time
#(assumes synthetic_data draws from np.random - TODO confirm)
SEED = 0
np.random.seed(SEED)

#number of clusters
clusters = 2
#transition rate used for each cluster (one entry per cluster)
#NOTE(review): in the output saved with this cell the first five rows (rate 1000)
#show much longer A->B times than the last five (rate 1) - check whether
#ctmc_sequences treats this value as a rate or as a mean time
rates = [1000, 1]
#n_sequences/cluster
n_sequences = 5

#list of the per-cluster dataframes that will be concatenated after the loop
concat = []

#generate sequences
for i in range(clusters):

    alfa = [1,0]        #initial distribution for the states
    rate = rates[i]     #rate of the transition for this cluster

    #Q-matrix: only the first row is filled in, the second row stays at zero
    Q = np.zeros((2,2))
    Q[0, :] = [-rate,rate]

    P = compute_jump_matrix(Q)     #jump matrix

    #temporal sequences for this cluster
    #NOTE(review): the meaning of the leading 5 is not visible here - name it once confirmed
    df_aux = ctmc_sequences(5,alfa,Q,P,n_sequences)
    concat.append(df_aux)

#stack all clusters into one dataframe with a fresh 0..N-1 index
df_encoded = pd.concat(concat,ignore_index = True)
#number the patients from 0 to N-1, where N is the number of patients
df_encoded['id_patient'] = df_encoded.index.tolist()
df_encoded

Unnamed: 0,id_patient,aux_encode
0,0,"0.A,3222.92.B"
1,1,"0.A,1515.94.B"
2,2,"0.A,1566.29.B"
3,3,"0.A,1474.22.B"
4,4,"0.A,172.87.B"
5,5,"0.A,1.18.B"
6,6,"0.A,0.14.B"
7,7,"0.A,0.96.B"
8,8,"0.A,0.07.B"
9,9,"0.A,0.24.B"


In [None]:
###############################################################################
#            SEQUENCE ALIGNMENT, HIERARCHICAL CLUSTERING & VALIDATION
###############################################################################
#copy of df_encoded with its index exposed as a column, built ONCE before the loop.
#Doing `df_encoded = df_encoded.reset_index()` inside the loop added one more
#column on every pass ('index', then 'level_0') and raised ValueError on the
#third pass; it also changed df_encoded itself, so re-running the cell failed.
#df_encoded is now left untouched.
df_validation = df_encoded.reset_index()

for gap in gap_values:

    #pairwise sequence alignment results
    results = main_algorithm(df_encoded,gap,T,s,0)

    #convert similarity matrix into distance matrix
    results['score'] = convert_to_distance_matrix(results['score'])

    #when every converted score is 0 (all the scores were the same) skip this
    #gap value and continue with the next one
    if (results['score'] == 0).all():
        continue

    Z = hierarchical_clustering(results['score'],method,gap)

    validation(M,df_validation,results,Z,method,max_K+1)
    
