# <font color=green>deepBreaks Applications</font>
## Modeling spectral tuning sites of opsin proteins based on amino-acid sequence...  

# <font color=red>STEP 3: deepBreaks</font>
## THIS IS A LONG SECTION! 
### **Output** = folder containing all results from model training, including comparison of model performances, an amino-acid site importance report + figures, and the top 5 trained models in a .pkl file format.

In [8]:
# importing deepBreaks libraries 
from deepBreaks.utils import get_models, get_scores, get_params, make_pipeline
from deepBreaks.preprocessing import MisCare, ConstantCare, URareCare, CustomOneHotEncoder, AminoAcidPropertyEncoder
from deepBreaks.preprocessing import FeatureSelection, CollinearCare
from deepBreaks.preprocessing import read_data
from deepBreaks.models import model_compare_cv, finalize_top, importance_from_pipe, mean_importance, summarize_results
from deepBreaks.visualization import plot_scatter, dp_plot, plot_imp_model, plot_imp_all
from deepBreaks.preprocessing import write_fasta
import warnings
import datetime
import os
import shutil 

In [9]:
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')

In [56]:
# defining user params, file pathes, analysis type

#assign your path to folder containing all the datasplits
path = './vpod_1.1_data_splits_2024-05-02_16-58-09'
# path to sequences of interest
seqFileName = f'{path}/wt_aligned_VPOD_1.1_het.fasta' 
# path to corresponding metadata of interest
metaDataFileName = f'{path}/wt_meta.tsv' 

# name of the phenotype
mt = 'Lambda_Max'

# type of the sequences
seq_type = 'aa'

# type of the analysis if it is a classification model, then we put cl instead of reg
ana_type = 'reg' 

gap_threshold = 0.50

#Whether or not you want to drop the reference sequence from the training data- Usually 'Bovine' or 'Squid'
drop_ref = False


In [57]:
props_to_keep = ['H1', 'H3', 'NCI']

In [58]:
# making a unique directory for saving the reports of the analysis
print('direcory preparation')
dt_label = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
seqFile = seqFileName.split('/')[2]
#print(seqFile)
seqFile = seqFile.split('.')[0]
#print(seqFile)
report_dir = str(seqFile +'_' + mt + '_' + dt_label)
os.makedirs(report_dir)

direcory preparation


In [59]:
%%time
print('reading meta-data')
# importing metadata
meta_data = read_data(metaDataFileName, seq_type = None, is_main=False)
# importing sequences data
print('reading fasta file')

tr = read_data(seqFileName, seq_type = seq_type, is_main=True, gap_threshold=gap_threshold)

shutil.copy2(f'{seqFileName}',report_dir)
write_fasta(dat = tr, fasta_file = f'{seqFile}_gap_dropped.fasta' , report_dir = report_dir)

reading meta-data
reading fasta file
rod_aligned_VPOD_1_gap_dropped.fasta was saved successfully
CPU times: total: 141 ms
Wall time: 147 ms


In [60]:
try:
    reference_seq = tr.loc['Bovine'].copy()
    ref_seq_name = 'bovine'
    if drop_ref == True:
        meta_data = meta_data.drop('Bovine')
    #print(bovine)
except:
    reference_seq = tr.loc['Squid'].copy()
    ref_seq_name = 'squid'
    #print(squid)
reference_seq.to_csv(path_or_buf= f'{report_dir}/ref_sequence.csv',index = True,mode="w")

In [61]:
tr = tr.merge(meta_data.loc[:, mt],  left_index=True, right_index=True)
tr.shape

(396, 353)

In [62]:
tr.head(10)

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,...,p344,p345,p346,p347,p348,p349,p350,p351,p352,Lambda_Max
Bovine,M,N,G,T,E,G,P,N,F,Y,...,,,T,S,Q,V,A,P,A,500.0
S1,M,N,G,T,E,G,P,Y,F,Y,...,S,S,,S,S,V,S,P,A,502.0
S2,,,,T,E,G,P,Y,F,Y,...,S,S,,S,S,V,S,P,A,502.0
S3,,,,T,E,G,P,D,F,Y,...,S,S,,S,S,V,S,P,A,481.0
S4,,,,T,E,G,P,F,F,Y,...,S,S,,S,S,V,S,P,A,494.0
S5,,,,T,E,G,P,Y,F,Y,...,S,S,,S,S,V,S,P,A,494.0
S6,M,N,G,T,E,G,P,F,F,Y,...,S,S,,S,S,V,S,P,A,491.0
S7,,,,T,E,G,P,Y,F,Y,...,S,S,,S,S,V,S,P,A,486.0
S8,,,,T,E,G,P,D,F,Y,...,S,S,,S,S,V,S,P,A,490.0
S9,,,,T,E,G,P,D,F,Y,...,S,S,,S,S,V,S,P,A,490.0


In [25]:
tr = tr.reindex(tip_to_fold.keys())

In [26]:
tr.head(10)

Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,...,p346,p347,p348,p349,p350,p351,p352,p353,p354,Lambda_Max
Bovine,M,,N,G,T,E,G,P,N,F,...,,,,S,Q,V,A,P,A,500.0
S1,M,,N,G,T,E,G,P,Y,F,...,S,A,,S,S,V,S,P,A,502.0
S2,,,,,T,E,G,P,Y,F,...,S,A,,S,S,V,S,P,A,502.0
S4,,,,,T,E,G,P,F,F,...,S,A,,S,S,V,S,P,A,494.0
S3,,,,,T,E,G,P,D,F,...,S,A,,S,S,V,S,P,A,481.0
S8,,,,,T,E,G,P,D,F,...,S,A,,S,S,V,S,P,A,490.0
S9,,,,,T,E,G,P,D,F,...,S,A,,S,S,V,S,P,A,490.0
S5,,,,,T,E,G,P,Y,F,...,S,A,,S,S,V,S,P,A,494.0
S6,M,,N,G,T,E,G,P,F,F,...,S,A,,S,S,V,S,P,A,491.0
S7,,,,,T,E,G,P,Y,F,...,S,A,,S,S,V,S,P,A,486.0


In [27]:
import numpy as np
import csv
import pandas as pd

In [63]:
y = tr.loc[:, mt].values
tr.drop(mt, axis=1, inplace=True)
print('Shape of data is: ', tr.shape)

Shape of data is:  (396, 352)


In [64]:
y_wv = 1e7 / np.array(y)


**Attention**: metadata and sequences data should have the names as their row names and for each sequence their must be a value in the meta data file.

In [65]:
print('metadata looks like this:')
meta_data.head(10)

metadata looks like this:


Unnamed: 0,Lambda_Max,Species,Opsin_Family,Phylum,Class,Accession,Mutations
Bovine,500.0,Bos_tarus,Rh1,Chordata,Mammalia,NM_001014890,
S1,502.0,Neoniphon_sammara,Rh1,Chordata,Actinopteri,U57536.1,
S2,502.0,Neoniphon_argenteus,Rh1,Chordata,Actinopteri,U57540.1,
S3,481.0,Neoniphon_aurolineatus,Rh1,Chordata,Actinopteri,U57541.1,
S4,494.0,Sargocentron_punctatissimum,Rh1,Chordata,Actinopteri,U57543.1,
S5,494.0,Sargocentron_microstoma,Rh1,Chordata,Actinopteri,U57542.1,
S6,491.0,Sargocentron_diadema,Rh1,Chordata,Actinopteri,U57537.1,
S7,486.0,Sargocentron_xantherythrum,Rh1,Chordata,Actinopteri,U57546.1,
S8,490.0,Sargocentron_spiniferum,Rh1,Chordata,Actinopteri,U57544.1,
S9,490.0,Sargocentron_tiere,Rh1,Chordata,Actinopteri,U57545.1,


In [66]:
print('sequence data looks like this:')
tr.head(10)

sequence data looks like this:


Unnamed: 0,p1,p2,p3,p4,p5,p6,p7,p8,p9,p10,...,p343,p344,p345,p346,p347,p348,p349,p350,p351,p352
Bovine,M,N,G,T,E,G,P,N,F,Y,...,,,,T,S,Q,V,A,P,A
S1,M,N,G,T,E,G,P,Y,F,Y,...,A,S,S,,S,S,V,S,P,A
S2,,,,T,E,G,P,Y,F,Y,...,A,S,S,,S,S,V,S,P,A
S3,,,,T,E,G,P,D,F,Y,...,A,S,S,,S,S,V,S,P,A
S4,,,,T,E,G,P,F,F,Y,...,A,S,S,,S,S,V,S,P,A
S5,,,,T,E,G,P,Y,F,Y,...,A,S,S,,S,S,V,S,P,A
S6,M,N,G,T,E,G,P,F,F,Y,...,A,S,S,,S,S,V,S,P,A
S7,,,,T,E,G,P,Y,F,Y,...,A,S,S,,S,S,V,S,P,A
S8,,,,T,E,G,P,D,F,Y,...,A,S,S,,S,S,V,S,P,A
S9,,,,T,E,G,P,D,F,Y,...,A,S,S,,S,S,V,S,P,A


### Preprocessing
In this step, we do all these steps:
1. dropping columns with a number of missing values above a certain threshold  
2. dropping zero entropy columns  
3. imputing missing values with the mode of that column  
4. replacing cases with a frequency below a threshold (default 1.5%) with the mode of that column
5. dropping zero entropy columns
6. use statistical tests (each position against the phenotype) and drop columns with p-values below a threshold (default 0.25)
7. one-hot encode the remaining columns
8. calculate the pair-wise distance matrix for all of the columns
9. use the distance matrix for DBSCAN and cluster the correlated positions together
10. keep only one column (closes to center of each cluster) for each group and drop the rest from the training data set

In [75]:
prep_pipeline = make_pipeline(
    steps=[
        ('mc', MisCare(missing_threshold=0.05)),
        ('cc', ConstantCare()),
        ('ur', URareCare(threshold=0.025)),
        ('cc2', ConstantCare()),
        ('one_hot', CustomOneHotEncoder()),
        ('feature_selection', FeatureSelection(model_type=ana_type, alpha=0.25, keep=False)),
        ('collinear_care', CollinearCare(dist_method='correlation', threshold=0.001, keep=False))
    ])

In [78]:
prep_pipeline = make_pipeline(
    steps=[
        ('mc', MisCare(missing_threshold=0.05)),
        ('cc', ConstantCare()),
        ('aa_prop', AminoAcidPropertyEncoder(props_to_keep = props_to_keep)),
        ('feature_selection', FeatureSelection(model_type=ana_type, alpha=0.10, keep=False)),
        ('collinear_care', CollinearCare(dist_method='correlation', threshold=0.001, keep=False))
    ])

In [79]:
%%time
report, top = model_compare_cv(X=tr, y=y, preprocess_pipe=prep_pipeline,
                               models_dict=get_models(ana_type=ana_type),
                               scoring=get_scores(ana_type=ana_type),
                               report_dir=report_dir,
                               cv=10, ana_type=ana_type, cache_dir=report_dir)

Fitting rf...


MAE = Mean Absolute Error

MSE = Mean Squared Error

RMSE = Rooted Mean Square Error

MAPE = Mean Absolute % Error - the average magnitude of error produced by a model, or how far off predictions are on average. A MAPE value of 20% means that the average absolute percentage difference between the predictions and the actuals is 20%

In [77]:
report

Unnamed: 0,R2,MAE,MSE,RMSE,MAPE
Lasso,0.8370288,154.967,53991.61,227.3443,0.007627078
LassoLars,0.8370213,154.9682,53990.93,227.3457,0.007627143
gbr,0.8345323,150.299,54436.27,228.6599,0.007386128
BayesianRidge,0.8321333,154.6919,56106.39,232.9799,0.007612612
lgbm,0.8012492,170.6518,68117.27,256.4867,0.008368746
rf,0.7957933,165.6351,66023.0,252.7897,0.008118852
xgb,0.769621,157.934,70057.03,255.5304,0.007747022
et,0.7579657,175.0658,75847.18,269.2851,0.008601856
dt,0.7306253,183.5926,85231.14,285.3169,0.009014904
Adaboost,0.5682326,280.6036,136397.4,366.7324,0.01383364


In [None]:
prep_pipeline = make_pipeline(
    steps=[
        ('mc', MisCare(missing_threshold=0.05)),
        ('cc', ConstantCare()),
        ('ur', URareCare(threshold=0.025)),
        ('cc2', ConstantCare()),
        ('one_hot', CustomOneHotEncoder()),
        ('feature_selection', FeatureSelection(model_type=ana_type, alpha=0.10, keep=True)),
        ('collinear_care', CollinearCare(dist_method='correlation', threshold=0.001, keep=True))
    ])

In [37]:
prep_pipeline = make_pipeline(
    steps=[
        ('mc', MisCare(missing_threshold=0.05)),
        ('cc', ConstantCare()),
        ('aa_prop', AminoAcidPropertyEncoder(props_to_keep = props_to_keep)),
        ('feature_selection', FeatureSelection(model_type=ana_type, alpha=0.10, keep=True)),
        ('collinear_care', CollinearCare(dist_method='correlation', threshold=0.001, keep=True))
    ])

In [38]:
modified_top = []
mtml = []
for model in top:
    modified_top.append(make_pipeline(steps=[('prep', prep_pipeline), model.steps[-1]]))
    my_top_models = str(model[1:])
    #print(my_top_models)
    my_top_models = my_top_models.split("'")[3]
    mtml.append(my_top_models)
    #print(my_top_models)

In [39]:
modified_top[0]

In [40]:
%%time
top = finalize_top(X=tr, y=y, top_models=modified_top, grid_param=get_params(),report_dir=report_dir, cv=block_fold_bal_weighted)


Tuning HubR...
Tuning xgb...


In [None]:
%%time
sr = summarize_results(top_models=top, report_dir=report_dir)

In [None]:
sr.head()

In [None]:
scatter_plot = plot_scatter(summary_result=sr, report_dir=report_dir)

In [None]:
%%time
mean_imp = mean_importance(top, report_dir=report_dir)

In [None]:
dp_plot(importance=mean_imp,imp_col='mean', model_name='mean', report_dir=report_dir)

In [None]:
tr = prep_pipeline[:4].fit_transform(tr)

In [None]:
for model in top:
    model_name = model.steps[-1][0]
    dp_plot(importance=importance_from_pipe(model),
            imp_col='standard_value',
            model_name = model_name, report_dir=report_dir)
    
    plot_imp_model(importance=importance_from_pipe(model), 
               X_train=tr, y_train=y, model_name=model_name,
                   meta_var='meta', model_type=ana_type, report_dir=report_dir)

In [None]:
pl = plot_imp_all(final_models=top,
                  X_train=tr, y_train=y,
                  model_type = ana_type,
                  report_dir=report_dir, max_plots=100,
                  figsize=(2.5, 3))

In [None]:
from deepBreaks.utils import load_obj
import joblib
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error

In [None]:
top_per_mod = report_dir + '/' + mtml[0] + '.pkl'
load_top_mod = load_obj(top_per_mod)

In [None]:
print(f'Here is a list of your top performing models to test...\n{mtml}')

# <font color=red>STEP 4: Translate Candidate STSs</font> 
## This section is used to translate candidate STSs to the bovine or squid equivalent.
### The bovine and squid sequence dataframes that were saved earlier and are called again here


In [None]:
import pandas as pd
import os

In [None]:
#STEP 4: Translate Candidate STSs 
#translate candidate STSs to the bovine or squid equivalent 
#bovine and squid sequence dataframes were saved earlier and are called again here
m = 0
tm = ''
k=0
gaps=0
#import importance_report.csv from report_dir
true_pos = []
aa = []
tmd = []
df = pd.read_csv(f'{report_dir}\importance_report.csv')
#take the list of important sites and translate them to the bovine standard equivalent, 
#we do this by taking the site number and subtracting the number of '-' between the start of the sequence and the desired site. 
for rows in reference_seq.values:  
    rows = str(rows)
    #print(rows)
    if rows == 'nan':
    #We want to write the 'true_pos', 'aa', and 'TMD' to the 'importance_report' csv file
        gaps += 1
        k += 1
        true_pos.append('NA')
        aa.append('-')
        tmd.append('NA')
    else:
        #print("The number of gaps is " + str(gaps))
        k+=1
        trans_site = k - gaps
        if ref_seq_name == 'bovine':
            if trans_site in range(3,37):
                tm = 'N-Termina'
            elif trans_site in range(37,62):
                tm = '1'
            elif trans_site in range(74,96):
                tm = '2'
            elif trans_site in range(111,133):
                tm = '3'
            elif trans_site in range(153,174):
                tm = '4'
            elif trans_site in range(203,225):
                tm = '5'
            elif trans_site in range(253,275):
                tm = '6'
            elif trans_site in range(287,309):
                tm = '7'
            else:
                tm = 'CT/EC'
        else:
            if trans_site in range(3,34):
                tm = 'N-Termina'
            elif trans_site in range(34,59):
                tm = '1'
            elif trans_site in range(71,97):
                tm = '2'
            elif trans_site in range(110,132):
                tm = '3'
            elif trans_site in range(152,173):
                tm = '4'
            elif trans_site in range(200,225):
                tm = '5'
            elif trans_site in range(262,284):
                tm = '6'
            elif trans_site in range(294,315):
                tm = '7'
            else:
                tm = 'CT/EC'                
        
        true_pos.append(str(trans_site))
        aa.append(rows)
        tmd.append(tm)
true_pos.pop()
aa.pop()
tmd.pop()

df['true_position'] = true_pos
df['TMD'] = tmd
df['amino_acid'] = aa
df.to_csv(path_or_buf= os.path.join(report_dir,r'importance_report.csv'),index = 'Feature',mode="w")
#df.head()


# <font color=red>STEP 5: Query the Model to Predict NEW Sequences</font> 
## Takes new sequences, inserts them into existing alignment to properly format for model query, then returns prediction of the λmax value for each sequence...

In [None]:
import os
import subprocess
from deepBreaks.utils import load_obj
from deepBreaks.preprocessing import read_data
import joblib
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from vpod_scripts.prediction_functions_db import process_sequences_from_file

This is a version of the prediction method which can be used DIRECTLY after model training... 

In [None]:
#path to the mafft.bat file - change to your own directory!
mafft_exe = 'C:/Users/safra/mafft-win/mafft.bat'
#path to sequences we want to add to an existing alignment in FASTA format
input_file = './subtests/supp_test_data/msp_erg_raw.txt'
#name for desired output file
output_file = 'opsin_predictions.tsv'
#path to target/selected model
selected_model = report_dir + '/' + mtml[0] + '.pkl'
#function for querying model - this will take care of creating an output file for you.
process_sequences_from_file(mafft_exe,input_file,output_file,selected_model,seqFileName, gap_threshold=gap_threshold)


NameError: name 'report_dir' is not defined

This is a version of the prediction method which can be used to ACCESS EXISTING MODELS in a SEPERATE SESSION after model training... 

In [None]:
#path to the mafft.bat file - change to your own directory!
mafft_exe = 'C:/Users/safra/mafft-win/mafft.bat'
#path to sequences we want to add to an existing alignment in FASTA format
input_file = './subtests/supp_test_data/msp_erg_raw.txt'
#name for desired output file
output_file = 'opsin_predictions.tsv'
#path to the primary alignment used for training the model - if trying access a specific file later
path = 'c:/Users/safra/Documents/GitHub/visual-physiology-opsin-db/vpod_data/VPOD_1.0/formatted_data_subsets/vpod_2023-10-16_12-13-11'
seqFileName = f'{path}/VPOD_wds_het_1.0.fasta' 
#path to target/selected model
#can use method below or enter path manually if coming back to notebook with no variables intialized
report_dir ='C:/Users/safra/Documents/GitHub/visual-physiology-opsin-db/result_files/main_model_results/mafft/wds_model_2023-10-16_12-13-40'
selected_model = report_dir + '/gbc.pkl'
#function for querying model - this will take care of creating an output file for you.
process_sequences_from_file(mafft_exe,input_file,output_file,selected_model,seqFileName)



In [60]:
import numpy as np
import pandas as pd
from Bio import Phylo
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial.distance import squareform, pdist


def block_k_fold_cv(tree_file, fold_list, linkage="complete"):
    """
    Performs phylogenetic block k-fold cross-validation.

    Args:
        tree_file (str): Path to the phylogenetic tree file (Newick format).
        fold_list (list): List of the number of folds to test.

    Returns:
        list: List of dictionaries, each containing fold assignments and distance metrics.
    """

    # Load phylogenetic tree
    tree = Phylo.read(tree_file, "newick")
    # Get tip names
    tip_names = [terminal.name for terminal in tree.get_terminals()]
    #dist_matrix = tree.distance_matrix().values

    #Create distance matrix from the tree
    dist_matrix = np.zeros((len(tree.get_terminals()), len(tree.get_terminals())))
    for i, terminal1 in enumerate(tree.get_terminals()):
        for j, terminal2 in enumerate(tree.get_terminals()):
            if i < j:  # Only calculate upper triangle to avoid redundancy
                dist_matrix[i, j] = tree.distance(terminal1, terminal2)
                dist_matrix[j, i] = dist_matrix[i, j]  # Mirror for symmetry
                
    
    results = []

    for n_folds in fold_list:
        # Cluster-based fold assignment
        clustering = AgglomerativeClustering(n_clusters=n_folds, metric ='precomputed', linkage=linkage).fit(dist_matrix)
        block_fold = clustering.labels_
        # Create dictionary to map tip names to fold assignments
        tip_to_fold = dict(zip(tip_names, block_fold))
        
        #block_fold = balance_folds(block_fold, max_imbalance=0.1)

        # Calculate distance metrics for each fold
        block_dist = pd.DataFrame({"FOLD": range(n_folds), "min_DIST": np.nan, "p50_DIST": np.nan, "avg_DIST": np.nan})
        for i in range(n_folds):
            mask = block_fold == i
            distances = dist_matrix[np.ix_(mask, ~mask)]
            block_dist.iloc[i, 1:] = np.array([distances.min(), np.quantile(distances, 0.5), distances.mean()])
            
        results.append({"n_folds": n_folds, "block_fold": block_fold, "tip_to_fold": tip_to_fold, "block_dist": block_dist})
            
    return results, dist_matrix


In [272]:
import numpy as np
import pandas as pd
from Bio import Phylo
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial.distance import squareform, pdist, cdist


def block_k_fold_cv(tree_file, fold_list, linkage="complete"):
    """
    Performs phylogenetic block k-fold cross-validation.

    Args:
        tree_file (str): Path to the phylogenetic tree file (Newick format).
        fold_list (list): List of the number of folds to test.

    Returns:
        list: List of dictionaries, each containing fold assignments and distance metrics.
    """

    # Load phylogenetic tree
    tree = Phylo.read(tree_file, "newick")
    # Get tip names
    tip_names = [terminal.name for terminal in tree.get_terminals()]
    #dist_matrix = tree.distance_matrix().values

    #Create distance matrix from the tree
    dist_matrix = np.zeros((len(tree.get_terminals()), len(tree.get_terminals())))
    for i, terminal1 in enumerate(tree.get_terminals()):
        for j, terminal2 in enumerate(tree.get_terminals()):
            if i < j:  # Only calculate upper triangle to avoid redundancy
                dist_matrix[i, j] = tree.distance(terminal1, terminal2)
                dist_matrix[j, i] = dist_matrix[i, j]  # Mirror for symmetry
                
    
    results = []

    for n_folds in fold_list:
        # Cluster-based fold assignment
        clustering = AgglomerativeClustering(n_clusters=n_folds, metric ='precomputed', linkage=linkage, compute_distances=True).fit(dist_matrix)        
        block_fold = clustering.labels_
        unique_labels, cluster_sizes = np.unique(block_fold, return_counts= True)

        # Create dictionary to map tip names to fold assignments
        tip_to_fold = dict(zip(tip_names, block_fold))
        
        cluster_dict = {}
        for index, class_label in enumerate(block_fold):
            if class_label not in cluster_dict:
                cluster_dict[class_label] = []
            cluster_dict[class_label].append(index)
        sorted_keys = sorted(cluster_dict.keys())
        cluster_dict = {key: cluster_dict[key] for key in sorted_keys}
        cluster_indices = []    
        for values in cluster_dict.values():
            cluster_indices.append(values)
        target_size = len(tip_to_fold.keys()) // n_folds
        over_represented_clusters = [i for i, size in enumerate(cluster_sizes) if size > target_size]
        under_represented_clusters = [i for i, size in enumerate(cluster_sizes) if size < target_size]
        
        
        # Reassign members from over-represented clusters to under-represented clusters
        for over_cluster in over_represented_clusters:
            for index in cluster_indices[over_cluster]:
                point_distances = []
                point_indexes = []
                for under_cluster in under_represented_clusters:                                
                        point_index = np.where(block_fold == under_cluster)[0]
                        distances_to_centroid = [dist_matrix[index, ci] for ci in point_index]
                        min_dist = np.argmin(distances_to_centroid)
                        point_distances.append(min_dist)
                        point_indexes.append(under_cluster)
                  
                # Return the minimum distance to the nearest centroid
                add = 0
                while add == 0:
                    min_distance = min(point_distances)
                    closest_under_cluster = point_distances.index(min_distance)
                
                    if cluster_sizes[closest_under_cluster] >= target_size:
                        point_distances=point_distances.remove(min(point_distances))
                    else:
                        block_fold[index] = point_indexes[closest_under_cluster]
                        add+=1

                # Update cluster indices and sizes
                cluster_indices[over_cluster] = cluster_indices[over_cluster].remove(index)
                cluster_indices[closest_under_cluster] = cluster_indices[closest_under_cluster].append(index)
                cluster_sizes[over_cluster] -= 1
                cluster_sizes[closest_under_cluster] += 1
                if cluster_sizes[over_cluster] <= target_size:
                    break
        
        # Update dictionary to map tip names to fold assignments
        tip_to_fold = dict(zip(tip_names, block_fold))
            
        results.append({"n_folds": n_folds, "block_fold": block_fold, "tip_to_fold": tip_to_fold})
            
    return results, dist_matrix


In [3]:
import numpy as np
import pandas as pd
from Bio import Phylo
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial.distance import squareform, pdist, cdist
import random
from sklearn.neighbors import kneighbors_graph


def block_k_fold_cv(tree_file, fold_list, linkage="complete"):
    """
    Performs phylogenetic block k-fold cross-validation.

    Args:
        tree_file (str): Path to the phylogenetic tree file (Newick format).
        fold_list (list): List of the number of folds to test.

    Returns:
        list: List of dictionaries, each containing fold assignments and distance metrics.
    """

    # Load phylogenetic tree
    tree = Phylo.read(tree_file, "newick")
    # Get tip names
    tip_names = [terminal.name for terminal in tree.get_terminals()]
    #dist_matrix = tree.distance_matrix().values

    #Create distance matrix from the tree
    dist_matrix = np.zeros((len(tree.get_terminals()), len(tree.get_terminals())))
    for i, terminal1 in enumerate(tree.get_terminals()):
        for j, terminal2 in enumerate(tree.get_terminals()):
            if i < j:  # Only calculate upper triangle to avoid redundancy
                dist_matrix[i, j] = tree.distance(terminal1, terminal2)
                dist_matrix[j, i] = dist_matrix[i, j]  # Mirror for symmetry
                
    
    results = []

    for n_folds in fold_list:
        # Cluster-based fold assignment
        connectivity = kneighbors_graph(dist_matrix, n_neighbors=30, include_self=False)
        clustering = AgglomerativeClustering(n_clusters=n_folds, metric ='precomputed', linkage=linkage, compute_distances=True).fit(dist_matrix)        
        block_fold = clustering.labels_
        unique_labels, cluster_sizes = np.unique(block_fold, return_counts= True)

        # Create dictionary to map tip names to fold assignments
        tip_to_fold = dict(zip(tip_names, block_fold))
        
        cluster_dict = {}
        for index, class_label in enumerate(block_fold):
            if class_label not in cluster_dict:
                cluster_dict[class_label] = []
            cluster_dict[class_label].append(index)
        sorted_keys = sorted(cluster_dict.keys())
        cluster_dict = {key: cluster_dict[key] for key in sorted_keys}
        cluster_indices = []    
        for values in cluster_dict.values():
            cluster_indices.append(values)
        target_size = len(tip_to_fold.keys()) // n_folds
        print(f"Targe Size = {target_size}")
        for x, cluster in enumerate(cluster_sizes):
            print(f"Cluster {x}: {cluster} members")
        over_represented_clusters = [i for i, size in enumerate(cluster_sizes) if size > target_size]
        print(f"Over-Represented Clusters = {over_represented_clusters}")
        under_represented_clusters = [i for i, size in enumerate(cluster_sizes) if size < target_size]
        print(f"Under-Represented Clusters = {under_represented_clusters}")
        
        # Reassign members from over-represented clusters to under-represented clusters
        for over_cluster in over_represented_clusters:
            print(f"Target Over-Represented Cluster = {over_cluster}")
            print(f"Here are the elements of the Target Over-represented Cluster = {cluster_indices[over_cluster]}")
            #cluster_indices[over_cluster] = random.shuffle(cluster_indices[over_cluster])
            #print(f"Here are the elements of the Target Over-represented Cluster Now Shuffled = {cluster_indices[over_cluster]}")

            for index in cluster_indices[over_cluster]:
                #distances to nearest point per cluster
                point_distances = []
                #Index for the closest points per cluster 
                point_indexes = []
                for under_cluster in under_represented_clusters:      
                    #start here - need to use cluster_indices         
                        print(f"Target Under-Represented Cluster = {over_cluster}")
                        print(f"Here are the elements of the Target Under-represented Cluster = {cluster_indices[under_cluster]}")
                        point_index = cluster_indices[under_cluster]
                        distances_to_index = [dist_matrix[index, ci] for ci in point_index]
                        min_dist = min(distances_to_index)
                        point_distances.append(min_dist)
                        point_indexes.append(under_cluster)
                  
                # Return the minimum distance to the nearest centroid
                print(f"Here are is the list of point distances for {index}: {point_distances} \n And the list of corresponding clusters: {point_indexes}")
                min_distance = min(point_distances)
                print(f"Here is the minimum distance: {min_distance}")
                target = point_distances.index(min_distance)
                print(f"Here is the closest cluster: {point_indexes[target]}")
                closest_under_cluster = point_indexes[target]
                block_fold[index] = closest_under_cluster
                cluster_sizes[over_cluster] -= 1
                cluster_sizes[closest_under_cluster] += 1
                for x, cluster in enumerate(cluster_sizes):
                    print(f"Cluster {x}: {cluster} members")

                for under_rep in under_represented_clusters:
                    if cluster_sizes[under_rep] >= target_size:
                        under_represented_clusters.remove(under_rep)
                if len(under_represented_clusters) == 0:
                    break
                if cluster_sizes[over_cluster] == target_size:
                    break
            if len(under_represented_clusters) == 0:
                break
                     
        # Update dictionary to map tip names to fold assignments
        tip_to_fold = dict(zip(tip_names, block_fold))
            
        results.append({"n_folds": n_folds, "block_fold": block_fold, "tip_to_fold": tip_to_fold})
            
    return results, dist_matrix


In [4]:
# Example usage
tree_file = "wt_aligned_VPOD_1.1_het.fasta.treefile"
fold_list = [10]  # Example folds to test

cv_results, dist_matrix = block_k_fold_cv(tree_file, fold_list, linkage = "complete")

Targe Size = 36
Cluster 0: 3 members
Cluster 1: 84 members
Cluster 2: 183 members
Cluster 3: 17 members
Cluster 4: 48 members
Cluster 5: 1 members
Cluster 6: 23 members
Cluster 7: 1 members
Cluster 8: 1 members
Cluster 9: 1 members
Over-Represented Clusters = [1, 2, 4]
Under-Represented Clusters = [0, 3, 5, 6, 7, 8, 9]
Target Over-Represented Cluster = 1
Here are the elements of the Target Over-represented Cluster = [180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263]
Target Under-Represented Cluster = 1
Here are the elements of the Target Under-represented Cluster = [174, 175, 176]
Target Under-Represented Cluster = 1
Here 

In [5]:
cv_results

[{'n_folds': 10,
  'block_fold': array([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
         6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7,
         7, 7, 7, 7, 7, 7, 7, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
         4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
         6, 6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 0, 0,
         0, 9, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
         5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8, 8,
         8, 8, 8, 8, 8, 8, 8, 8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 3, 3, 

In [6]:
n_folds = cv_results[0]["n_folds"]
block_fold = cv_results[0]["block_fold"].tolist()
tip_to_fold = cv_results[0]["tip_to_fold"]

In [91]:
import numpy as np
import pandas as pd
from Bio import Phylo
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from scipy.spatial.distance import squareform
import matplotlib.pyplot as plt


def block_k_fold_cv(tree_file, fold_list, criterion='maxclust'):
    """
    Performs phylogenetic block k-fold cross-validation.

    Args:
        tree_file (str): Path to the phylogenetic tree file (Newick format).
        fold_list (list): List of the number of folds to test.

    Returns:
        list: List of dictionaries, each containing fold assignments and distance metrics.
    """

    # Load phylogenetic tree
    tree = Phylo.read(tree_file, "newick")
    # Get tip names
    tip_names = [terminal.name for terminal in tree.get_terminals()]
    #dist_matrix = tree.distance_matrix().values

    #Create distance matrix from the tree
    dist_matrix = np.zeros((len(tree.get_terminals()), len(tree.get_terminals())))
    for i, terminal1 in enumerate(tree.get_terminals()):
        for j, terminal2 in enumerate(tree.get_terminals()):
            if i < j:  # Only calculate upper triangle to avoid redundancy
                dist_matrix[i, j] = tree.distance(terminal1, terminal2)
                dist_matrix[j, i] = dist_matrix[i, j]  # Mirror for symmetry
    dist_matrix = squareform(dist_matrix)
    
    results = []

    for n_folds in fold_list:
        # Perform hierarchical clustering
        Z = linkage(dist_matrix, method='ward')
        print(Z)
        # Plot the dendrogram
        plt.figure(figsize=(10, 7))
        dendrogram(Z)
        plt.title("Dendrogram")
        plt.xlabel("Taxa")
        plt.ylabel("Distance")
        plt.show()
        # Define the number of clusters and ensure equal-sized clusters
        block_fold = fcluster(Z, n_folds, criterion=criterion)
        print(block_fold)
        # Create dictionary to map tip names to fold assignments
        tip_to_fold = dict(zip(tip_names, block_fold))
            
        results.append({"n_folds": n_folds, "block_fold": block_fold, "tip_to_fold": tip_to_fold})
            
    return results, dist_matrix


In [None]:
# Example usage
tree_file = "wt_aligned_VPOD_1.1_het.fasta.treefile"
fold_list = [5,10]  # Example folds to test

cv_results, dist_matrix = block_k_fold_cv(tree_file, fold_list)

In [78]:
cv_results

[{'n_folds': 10,
  'block_fold': array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4,
         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
         4, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
         6, 6, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 0, 0,
         0, 9, 8, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

In [269]:
n_folds = cv_results[0]["n_folds"]
block_fold = cv_results[0]["block_fold"].tolist()
tip_to_fold = cv_results[0]["tip_to_fold"]

In [270]:
unique_classes, class_counts = np.unique(block_fold, return_counts=True)
print(class_counts)

[48 35 42 48 29 39 43 39 39]


In [271]:
unique_classes

array([0, 1, 3, 4, 5, 6, 7, 8, 9])

In [None]:

def balance_folds(fold_assignments, max_imbalance):
    """
    Balances fold assignments by iteratively moving samples between folds.

    Args:
        fold_assignments (np.ndarray): Initial fold assignments for each sample.
        max_imbalance (float): Maximum allowed imbalance ratio (0 to 1).

    Returns:
        np.ndarray: Balanced fold assignments.
    """
    n_folds = len(np.unique(fold_assignments))
    fold_sizes = np.bincount(fold_assignments)  # Count samples per fold
    
    # Ensure all folds are used initially
    if (fold_sizes == 0).any(): 
        raise ValueError("All folds must be used for balancing to work.")

    # Convert to condensed distance for later use
    tree_dist = squareform(dist_matrix)

    while True:
        min_size = fold_sizes.min()
        max_size = fold_sizes.max()
        imbalance = (max_size - min_size) / max_size

        if imbalance <= max_imbalance:
            break  # Imbalance within tolerance

        min_fold = np.argmin(fold_sizes)
        max_fold = np.argmax(fold_sizes)

        # Find sample in max_fold closest to min_fold (use average distance to all in min_fold)
        max_fold_samples = np.where(fold_assignments == max_fold)[0]
        min_fold_samples = np.where(fold_assignments == min_fold)[0]
        avg_dists = np.mean(tree_dist[np.ix_(max_fold_samples, min_fold_samples)])
        sample_to_move = max_fold_samples[np.argmin(avg_dists)]

        # Move sample
        fold_assignments[sample_to_move] = min_fold
        fold_sizes[min_fold] += 1
        fold_sizes[max_fold] -= 1

    return fold_assignments



In [34]:
import numpy as np

def rebalance_classes_proportional(class_list, max_diff_ratio=0.1):
    """
    Rebalances class labels in a list with proportional maximum difference.

    Args:
        class_list: List of integers representing class labels (0 to 9).
        max_diff_ratio: Maximum allowed difference ratio between the most and least frequent classes.

    Returns:
        List of rebalanced class labels.
    """
    unique_classes, class_counts = np.unique(class_list, return_counts=True)
    target_count = len(class_list) // len(unique_classes)  # Ideal count per class
    
    # Calculate maximum allowed difference as a ratio of target count
    max_diff = max_diff_ratio * target_count

    while max(class_counts) - min(class_counts) > max_diff:
        over_represented = np.argmax(class_counts)
        under_represented = np.argmin(class_counts)

        over_indices = np.where(class_list == over_represented)[0]
        random_index = np.random.choice(over_indices)
        class_list[random_index] = under_represented

        class_counts[over_represented] -= 1
        class_counts[under_represented] += 1

    return class_list


In [43]:
block_fold_bal = rebalance_classes_proportional(block_fold)

ValueError: 'a' cannot be empty unless no samples are taken

In [None]:
unique_classes, class_counts = np.unique(block_fold_bal, return_counts=True)

In [None]:
class_counts

array([24, 25, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 25, 23, 23],
      dtype=int64)

In [None]:
import numpy as np

def rebalance_classes_closeness(class_list, max_diff_ratio=0.1):
    """
    Rebalances class labels with proportional maximum difference and closeness preference.

    Args:
        class_list: List of integers representing class labels (0 to 9).
        max_diff_ratio: Maximum allowed difference ratio between the most and least frequent classes.

    Returns:
        List of rebalanced class labels.
    """
    unique_classes, class_counts = np.unique(class_list, return_counts=True)
    target_count = len(class_list) // len(unique_classes)
    max_diff = max_diff_ratio * target_count

    while max(class_counts) - min(class_counts) > max_diff:
        over_represented = np.argmax(class_counts)
        potential_under_represented = []

        # Check neighboring classes for under-representation
        for neighbor in [over_represented - 1, over_represented + 1]:
            if 0 <= neighbor < len(unique_classes) and class_counts[neighbor] < target_count:
                potential_under_represented.append(neighbor)

        # If no under-represented neighbors, fall back to the least frequent class
        if not potential_under_represented:
            potential_under_represented = [np.argmin(class_counts)]

        under_represented = np.random.choice(potential_under_represented)

        over_indices = np.where(class_list == over_represented)[0]
        random_index = np.random.choice(over_indices)
        class_list[random_index] = under_represented

        class_counts[over_represented] -= 1
        class_counts[under_represented] += 1

    return class_list


In [None]:
block_fold_bal_closeness = rebalance_classes_closeness(block_fold)

In [None]:
unique_classes, class_counts = np.unique(block_fold_bal_closeness, return_counts=True)

In [None]:
class_counts

array([24, 25, 24, 25, 25, 25, 24, 24, 24, 24, 24, 23, 25, 23, 23],
      dtype=int64)

In [68]:
import numpy as np

def rebalance_classes_weighted(class_list, max_diff_ratio=0.1):
    """
    Rebalances class labels with proportional maximum difference, circular neighbors, and closeness weighting.

    Args:
        class_list: List of integers representing class labels (0 to 9).
        max_diff_ratio: Maximum allowed difference ratio between the most and least frequent classes.

    Returns:
        List of rebalanced class labels.
    """
    unique_classes, class_counts = np.unique(class_list, return_counts=True)
    target_count = len(class_list) // len(unique_classes)
    max_diff = max_diff_ratio * target_count
    num_classes = len(unique_classes)

    while max(class_counts) - min(class_counts) > max_diff:
        over_represented = np.argmax(class_counts)

        # Find under-represented neighbors (circular wrapping)
        potential_under_represented = []
        neighbor_weights = []
        for offset in range(-over_represented, len(class_counts)-over_represented):
            neighbor = (over_represented + offset) % num_classes  # Circular wrapping
            if class_counts[neighbor] < target_count:
                potential_under_represented.append(neighbor)
                neighbor_weights.append(1 / abs(offset))  # Closer neighbor gets higher weight

        # If no neighbors, fall back to least frequent class
        if not potential_under_represented:
            potential_under_represented = [np.argmin(class_counts)]
            neighbor_weights = [1]  # Equal weight in fallback case

        # Choose under-represented class based on weights
        under_represented = np.random.choice(potential_under_represented, p=neighbor_weights/np.sum(neighbor_weights))

        over_indices = np.where(class_list == over_represented)[0]
        random_index = np.random.choice(over_indices)
        class_list[random_index] = under_represented

        class_counts[over_represented] -= 1
        class_counts[under_represented] += 1

    return class_list


In [79]:
block_fold_bal_weighted = rebalance_classes_weighted(block_fold)
unique_classes, class_counts = np.unique(block_fold_bal_weighted, return_counts=True)
class_counts

array([36, 37, 37, 36, 37, 36, 36, 36, 35, 36], dtype=int64)

In [None]:
unique_classes, class_counts = np.unique(block_fold, return_counts=True)

In [156]:

print(clust_inx_list)

[[174, 175, 176], [180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309,

In [138]:
centroids = np.array([np.mean(dist_matrix[indices][:, indices], axis=0) for indices in clust_inx_list])
print(f"Cluster Centroids: {centroids}")

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (10,) + inhomogeneous part.

In [157]:
centroids = []


In [158]:
for indices in clust_inx_list:
    cluster_distances = dist_matrix[indices][:, indices]
    centroid = np.mean(cluster_distances, axis=1)
    centroids.append(centroid)

In [159]:
centroids 

[array([0.84521336, 0.84299025, 1.6247732 ]),
 array([1.49908782, 1.58305427, 0.9500227 , 0.9500227 , 0.95002179,
        0.96504995, 1.05384413, 1.081684  , 1.04185651, 1.00812182,
        0.98193217, 0.928692  , 1.08700225, 1.05347598, 0.75589548,
        0.82505041, 0.63841822, 0.76795457, 0.69439228, 0.69439209,
        0.81677302, 0.78183355, 0.74181101, 0.74181101, 0.80462941,
        0.80462962, 0.73530975, 0.80903195, 0.80903195, 0.80903192,
        0.75975985, 0.75976118, 0.75659886, 0.66527665, 0.64580484,
        0.63620611, 0.81153119, 0.81153099, 0.8082148 , 0.86570415,
        0.80663475, 0.7781148 , 0.80689291, 0.8547426 , 0.8449757 ,
        0.8449757 , 0.80470083, 0.72392795, 0.69042127, 0.68163795,
        0.66233079, 0.73982392, 0.74084575, 0.62684903, 0.62374993,
        0.8705888 , 0.62550635, 1.32441363, 1.32441363, 1.24132957,
        1.24132957, 1.18727163, 1.15973846, 1.16421296, 1.16421296,
        1.19608227, 1.21636295, 1.21636295, 1.25063702, 1.25388388,
  

In [160]:
block_fold_corr = block_fold.copy()

In [181]:
# Identify over-represented and under-represented clusters
cluster_sizes = [len(indices) for indices in clust_inx_list]
target_size = len(tip_to_fold.keys()) // n_folds
over_represented_clusters = np.array([i for i, size in enumerate(cluster_sizes) if size > target_size])
under_represented_clusters = np.array([i for i, size in enumerate(cluster_sizes) if size < target_size])


In [182]:
over_represented_clusters

array([1, 2, 4])

In [183]:
under_represented_clusters

array([0, 3, 5, 6, 7, 8, 9])

In [195]:
centroids[0][2]

1.6247731950000002

In [232]:

# Initialize centroid distances matrix
centroids = np.zeros((len(unique_classes), len(unique_classes)))

# Calculate pairwise distances between clusters
for i in range(len(unique_classes)):
    for j in range(i, len(unique_classes)):
        points_i = np.where(block_fold == unique_classes[i])[0]
        points_j = np.where(block_fold == unique_classes[j])[0]
        
        if i == j:
            if len(points_i == 1):
                #print(points_i)
                #dist_sum = np.sum(dist_matrix[np.ix_(points_i, points_i)])
                centroids[i, j] = 0 
            else:
                # Distance within the same cluster
                dist_sum = np.sum(dist_matrix[np.ix_(points_i, points_i)]) / (len(points_i) * (len(points_i) - 1))
                centroids[i, j] = dist_sum
            
        else:
            # Distance between clusters
            dist_sum = np.sum(dist_matrix[np.ix_(points_i, points_j)]) / (len(points_i) * len(points_j))
            centroids[i, j] = dist_sum
            centroids[j, i] = dist_sum

In [233]:
centroids

array([[0.        , 4.64132991, 4.89048096, 5.34763693, 4.42286914,
        4.17780223, 4.80808204, 5.53398347, 4.28879572, 4.02246652],
       [4.64132991, 0.        , 1.644479  , 4.73906996, 1.94384228,
        2.91087522, 4.19951507, 4.9254165 , 3.38800752, 4.11810737],
       [4.89048096, 1.644479  , 0.        , 4.98822101, 2.19299333,
        3.16002627, 4.44866612, 5.17456755, 3.63715857, 4.36725842],
       [5.34763693, 4.73906996, 4.98822101, 0.        , 4.52060918,
        4.27554228, 2.66235895, 2.67419461, 4.38653577, 4.82441439],
       [4.42286914, 1.94384228, 2.19299333, 4.52060918, 0.        ,
        2.69241444, 3.9810543 , 4.70695573, 3.16954675, 3.8996466 ],
       [4.17780223, 2.91087522, 3.16002627, 4.27554228, 2.69241444,
        0.        , 3.73598739, 4.46188882, 2.92447984, 3.65457969],
       [4.80808204, 4.19951507, 4.44866612, 2.66235895, 3.9810543 ,
        3.73598739, 0.        , 2.84870549, 3.84698088, 4.2848595 ],
       [5.53398347, 4.9254165 , 5.1745675

In [225]:

# Initialize centroid distances matrix
centroids = np.zeros((len(unique_classes), len(unique_classes)))

# Calculate pairwise distances between clusters
for i in range(len(unique_classes)):
    for j in range(i, len(unique_classes)):
        points_i = np.where(block_fold == unique_classes[i])[0]
        points_j = np.where(block_fold == unique_classes[j])[0]
        
        if i == j:
            # Distance within the same cluster
            dist_sum = np.sum(dist_matrix[np.ix_(points_i, points_i)]) / (len(points_i) * (len(points_i) - 1))
            centroids[i, j] = dist_sum
            
        else:
            # Distance between clusters
            dist_sum = np.sum(dist_matrix[np.ix_(points_i, points_j)]) / (len(points_i) * len(points_j))
            centroids[i, j] = dist_sum
            centroids[j, i] = dist_sum

  dist_sum = np.sum(dist_matrix[np.ix_(points_i, points_i)]) / (len(points_i) * (len(points_i) - 1))


In [226]:
centroids

array([[1.65648841, 4.64132991, 4.89048096, 5.34763693, 4.42286914,
        4.17780223, 4.80808204, 5.53398347, 4.28879572, 4.02246652],
       [4.64132991, 0.96098866, 1.644479  , 4.73906996, 1.94384228,
        2.91087522, 4.19951507, 4.9254165 , 3.38800752, 4.11810737],
       [4.89048096, 1.644479  , 0.64932319, 4.98822101, 2.19299333,
        3.16002627, 4.44866612, 5.17456755, 3.63715857, 4.36725842],
       [5.34763693, 4.73906996, 4.98822101, 1.15686933, 4.52060918,
        4.27554228, 2.66235895, 2.67419461, 4.38653577, 4.82441439],
       [4.42286914, 1.94384228, 2.19299333, 4.52060918, 0.28132368,
        2.69241444, 3.9810543 , 4.70695573, 3.16954675, 3.8996466 ],
       [4.17780223, 2.91087522, 3.16002627, 4.27554228, 2.69241444,
               nan, 3.73598739, 4.46188882, 2.92447984, 3.65457969],
       [4.80808204, 4.19951507, 4.44866612, 2.66235895, 3.9810543 ,
        3.73598739, 0.8580246 , 2.84870549, 3.84698088, 4.2848595 ],
       [5.53398347, 4.9254165 , 5.1745675

In [243]:
index = 100

In [245]:
unique_labels = np.unique(block_fold)
point_label = block_fold[index]
point_distances = []
    
for i, label in enumerate(unique_labels):
    if label == point_label:
        # Skip centroid calculation for the point's own cluster
        continue
    # Distance from the point to this centroid
    else:
        point_index = np.where(block_fold == label)[0]
        if len(point_index) == 0:
            continue
    
        distances_to_centroid = [dist_matrix[index, ci] for ci in point_index]
        point_distances.append(np.argmin(distances_to_centroid))
        
if len(point_distances) == 0:
    # If no valid centroids found
    np.nan
    
# Return the minimum distance to the nearest centroid
min_distance = min(point_distances)

print(point_distances.index(min_distance))

4
1
