In [None]:
import os
import sys

In [None]:
import numpy as np
import pandas as pd

In [None]:
from biclustlib.algorithms import ChengChurchAlgorithm

In [None]:
import pygad

In [None]:
from evaluation import MSR

### Load Data

In [None]:
# Load the input table: the first CSV column becomes the row index, the first row the column labels.
df = pd.read_csv("Data_File.csv", index_col=0, header=0)

In [None]:
# Preview the first five rows (five is the default row count of head()).
df.head()

In [None]:
# NumPy array holding the table values (row/column labels dropped); this is the
# matrix every biclustering algorithm below is run on.
data = df.to_numpy()

In [None]:
# MSR of the full data matrix; used below as the upper bound of the MSR-threshold gene.
# NOTE(review): MSR comes from the local `evaluation` module - presumably the mean
# squared residue score; confirm against that module.
MAX_MSR = MSR(data)

### Cheng and Church

In [None]:
# --- GA settings for the Cheng & Church parameter search ---

# Population and generations
num_generations = 250
sol_per_pop = 10
num_parents_mating = 10
keep_parents = 4

# Chromosome layout, in the order fitness_func_CCA reads it:
#   gene 0 -> number of biclusters (integer)
#   gene 1 -> MSR threshold (float, 4 decimals), bounded above by the MSR of the full matrix
#   gene 2 -> multiple node deletion threshold (float, 4 decimals)
num_genes = 3
gene_type = [int, [float, 4], [float, 4]]
gene_space = [
    {'low': 2, 'high': 20},
    {'low': 0.01, 'high': MAX_MSR},
    {'low': 1.0001, 'high': 10.0},
]

# Selection ("sss" is PyGAD's steady-state selection)
parent_selection_type = "sss"

# Crossover
crossover_type = "uniform"
crossover_probability = 0.2

# Mutation
# NOTE(review): PyGAD documents that mutation_percent_genes is not needed when
# mutation_probability is supplied - confirm which of the two actually takes effect.
mutation_type = "random"
mutation_probability = 0.2
mutation_percent_genes = 10

In [None]:
def fitness_func_CCA(solution, solution_idx):
    """Score one GA chromosome by running Cheng & Church and summing bicluster MSRs.

    Parameters
    ----------
    solution : sequence
        Genes in the order (number of biclusters, MSR threshold,
        multiple node deletion threshold).
    solution_idx : int
        Supplied by PyGAD; not used here.

    Returns
    -------
    number
        Sum of ``MSR`` over every bicluster found in the module-level ``data``
        matrix (``0`` when the algorithm returns no bicluster).

    NOTE(review): PyGAD treats a larger fitness as better, while a later cell picks
    the run with the *smallest* cumulative MSR - confirm the intended direction.
    """
    n_biclusters, msr_threshold, deletion_threshold = solution[0], solution[1], solution[2]

    # Configure and run Cheng & Church on the full matrix.
    cca = ChengChurchAlgorithm(num_biclusters=n_biclusters,
                               msr_threshold=msr_threshold,
                               multiple_node_deletion_threshold=deletion_threshold)
    result = cca.run(data)

    # Accumulate the MSR of each bicluster's sub-matrix (rows first, then columns).
    return sum(MSR(data[bic.rows][:, bic.cols]) for bic in result.biclusters)

In [None]:
# Genetic algorithm that searches the Cheng & Church parameters, built from the
# settings defined in the configuration cell above.
ga_instance_CCA = pygad.GA(
    num_generations=num_generations,
    num_parents_mating=num_parents_mating,
    fitness_func=fitness_func_CCA,
    sol_per_pop=sol_per_pop,
    num_genes=num_genes,
    gene_type=gene_type,
    gene_space=gene_space,
    parent_selection_type=parent_selection_type,
    keep_parents=keep_parents,
    crossover_type=crossover_type,
    crossover_probability=crossover_probability,
    mutation_type=mutation_type,
    mutation_probability=mutation_probability,
    mutation_percent_genes=mutation_percent_genes,
    parallel_processing=4,  # evaluate fitness with 4 parallel workers
)

In [None]:
# Run the GA five times and record the best chromosome found after each run.
# NOTE(review): the same GA object is re-run every iteration; confirm whether PyGAD
# restarts from a fresh population or continues from the previous run's final one.
output_path = "Output/CCA_1067_Common_Neuro_Output.csv"
# Create the output folder up front so a long GA run is not lost to a missing directory.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

data_saver_columns = ["number of clusters", "MSR Threshold", "NODE Deletion Threshold", "Best Cummulative MSR"]
data_saver = []
for itr in range(5):
    ga_instance_CCA.run()

    best_solution, best_solution_fitness, best_solution_idx = ga_instance_CCA.best_solution()
    nClust = best_solution[0]
    MSRT = best_solution[1]
    MNDT = best_solution[2]
    data_saver.append([nClust,
                       MSRT,
                       MNDT,
                       best_solution_fitness])

    # Rewrite the CSV after every run so partial results survive an interruption
    # (same checkpointing as the Plaid and Spectral sections).
    df_data_saver = pd.DataFrame(data_saver, columns=data_saver_columns)
    df_data_saver.to_csv(output_path)

In [None]:
# Select the run with the smallest recorded "Best Cummulative MSR" (first one on ties).
# NOTE(review): the GA itself maximises the fitness value - confirm that taking the
# minimum across runs is the intended selection rule.
best_run = df_data_saver["Best Cummulative MSR"].idxmin()

# Read each parameter from its own column so the integer column keeps its dtype;
# the cluster count is converted to a plain int before being handed to the algorithm.
nClust = int(df_data_saver.loc[best_run, "number of clusters"])
MSRT = df_data_saver.loc[best_run, "MSR Threshold"]
MNDT = df_data_saver.loc[best_run, "NODE Deletion Threshold"]

# Refit Cheng & Church on the full matrix with the selected parameters.
model_instance = ChengChurchAlgorithm(num_biclusters=nClust,
                                      msr_threshold=MSRT,
                                      multiple_node_deletion_threshold=MNDT)

# Obtain clustering results
model_biclusters = model_instance.run(data)

In [None]:
import plotly.express as px

# Write one heat-map PNG per bicluster of the selected Cheng & Church model;
# files are numbered from 1 in the order the biclusters are returned.
for bic_num, bic in enumerate(model_biclusters.biclusters, start=1):
    bic_data = df.iloc[bic.rows, bic.cols]
    fig = px.imshow(bic_data)
    fig.write_image(f"CCA_nClus_{nClust}_bic_num_{bic_num}.png")

## Plaid Biclustering

In [None]:
from biclustlib.algorithms import Plaid

In [None]:
# --- GA settings for the Plaid parameter search ---

# Population and generations
num_generations = 250
sol_per_pop = 10
num_parents_mating = 10
keep_parents = 4

# Chromosome layout, in the order fitness_func_Plaid reads it:
#   gene 0 -> number of biclusters (integer)
#   gene 1 -> row pruning threshold (float, 4 decimals)
#   gene 2 -> column pruning threshold (float, 4 decimals)
num_genes = 3
gene_type = [int, [float, 4], [float, 4]]
gene_space = [
    {'low': 2, 'high': 20},
    {'low': 0.01, 'high': 0.99},
    {'low': 0.01, 'high': 0.99},
]

# Selection ("sss" is PyGAD's steady-state selection)
parent_selection_type = "sss"

# Crossover
crossover_type = "uniform"
crossover_probability = 0.2

# Mutation
# NOTE(review): PyGAD documents that mutation_percent_genes is not needed when
# mutation_probability is supplied - confirm which of the two actually takes effect.
mutation_type = "random"
mutation_probability = 0.2
mutation_percent_genes = 10

In [None]:
def fitness_func_Plaid(solution, solution_idx):
    """Score one GA chromosome by running Plaid and summing bicluster MSRs.

    Parameters
    ----------
    solution : sequence
        Genes in the order (number of biclusters, row pruning threshold,
        column pruning threshold).
    solution_idx : int
        Supplied by PyGAD; not used here.

    Returns
    -------
    number
        Sum of ``MSR`` over every bicluster found in the module-level ``data``
        matrix (``0`` when the algorithm returns no bicluster).

    NOTE(review): PyGAD treats a larger fitness as better, while a later cell picks
    the run with the *smallest* cumulative MSR - confirm the intended direction.
    """
    n_biclusters, row_threshold, col_threshold = solution[0], solution[1], solution[2]

    # Configure the Plaid model; only the first three arguments are searched by the GA,
    # the remaining ones are fixed.
    plaid = Plaid(num_biclusters=n_biclusters,
                  fit_background_layer=True,
                  row_prunning_threshold=row_threshold,
                  col_prunning_threshold=col_threshold,
                  significance_tests=0,
                  back_fitting_steps=1,
                  initialization_iterations=6,
                  iterations_per_layer=10)
    result = plaid.run(data)

    # Accumulate the MSR of each bicluster's sub-matrix (rows first, then columns).
    return sum(MSR(data[bic.rows][:, bic.cols]) for bic in result.biclusters)

In [None]:
# Genetic algorithm that searches the Plaid parameters, built from the settings
# defined in the Plaid configuration cell above.
ga_instance_Plaid = pygad.GA(
    num_generations=num_generations,
    num_parents_mating=num_parents_mating,
    fitness_func=fitness_func_Plaid,
    sol_per_pop=sol_per_pop,
    num_genes=num_genes,
    gene_type=gene_type,
    gene_space=gene_space,
    parent_selection_type=parent_selection_type,
    keep_parents=keep_parents,
    crossover_type=crossover_type,
    crossover_probability=crossover_probability,
    mutation_type=mutation_type,
    mutation_probability=mutation_probability,
    mutation_percent_genes=mutation_percent_genes,
    parallel_processing=4,  # evaluate fitness with 4 parallel workers
)

In [None]:
# Run the GA five times and record the best chromosome found after each run.
# NOTE(review): the same GA object is re-run every iteration; confirm whether PyGAD
# restarts from a fresh population or continues from the previous run's final one.
output_path = "Output/Plaid_1067_Common_Neuro_Output.csv"
# Create the output folder up front so a long GA run is not lost to a missing directory.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Column labels never change, so they are defined once outside the loop.
data_saver_columns = ["number of Biclusters",
                      "Row pruning threshold",
                      "Col pruning threshold",
                      "Best Cummulative MSR"]
data_saver = []
for itr in range(5):
    ga_instance_Plaid.run()

    best_solution, best_solution_fitness, best_solution_idx = ga_instance_Plaid.best_solution()
    nClust = best_solution[0]
    rpth = best_solution[1]
    cpth = best_solution[2]
    data_saver.append([nClust,
                       rpth,
                       cpth,
                       best_solution_fitness])

    # Rewrite the CSV after every run so partial results survive an interruption.
    df_data_saver = pd.DataFrame(data_saver, columns=data_saver_columns)
    df_data_saver.to_csv(output_path)

In [None]:
# Select the run with the smallest recorded "Best Cummulative MSR" (first one on ties).
# NOTE(review): the GA itself maximises the fitness value - confirm that taking the
# minimum across runs is the intended selection rule.
best_run = df_data_saver["Best Cummulative MSR"].idxmin()

# Read each parameter from its own column so the integer column keeps its dtype;
# the bicluster count is converted to a plain int before being handed to the algorithm.
nClust = int(df_data_saver.loc[best_run, "number of Biclusters"])
rpth = df_data_saver.loc[best_run, "Row pruning threshold"]
cpth = df_data_saver.loc[best_run, "Col pruning threshold"]

In [None]:
# Refit Plaid on the full matrix with the selected parameters; the fixed arguments
# match the ones used inside fitness_func_Plaid.
model_instance = Plaid(
    num_biclusters=nClust,
    fit_background_layer=True,
    row_prunning_threshold=rpth,
    col_prunning_threshold=cpth,
    significance_tests=0,
    back_fitting_steps=1,
    initialization_iterations=6,
    iterations_per_layer=10,
)

# Obtain clustering results
model_biclusters = model_instance.run(data)

In [None]:
import plotly.express as px

# Write one heat-map PNG per bicluster of the selected Plaid model;
# files are numbered from 1 in the order the biclusters are returned.
for bic_num, bic in enumerate(model_biclusters.biclusters, start=1):
    bic_data = df.iloc[bic.rows, bic.cols]
    fig = px.imshow(bic_data)
    fig.write_image(f"Plaid_nClus_{nClust}_bic_num_{bic_num}.png")

## Spectral Biclustering

In [None]:
from sklearn.cluster import SpectralBiclustering

In [None]:
# --- GA settings for the Spectral Biclustering parameter search ---

# Population and generations
num_generations = 250
sol_per_pop = 10
num_parents_mating = 10
keep_parents = 4

# Chromosome layout, in the order fitness_func_Spectral reads it:
#   gene 0 -> number of row clusters (integer)
#   gene 1 -> number of column clusters (integer)
num_genes = 2
gene_type = [int, int]
gene_space = [
    {'low': 2, 'high': 20},
    {'low': 2, 'high': 20},
]

# Selection ("sss" is PyGAD's steady-state selection)
parent_selection_type = "sss"

# Crossover
crossover_type = "uniform"
crossover_probability = 0.2

# Mutation
# NOTE(review): PyGAD documents that mutation_percent_genes is not needed when
# mutation_probability is supplied - confirm which of the two actually takes effect.
mutation_type = "random"
mutation_probability = 0.2
mutation_percent_genes = 10

In [None]:
def fitness_func_Spectral(solution, solution_idx):
    """Score one GA chromosome by fitting SpectralBiclustering and summing bicluster MSRs.

    Parameters
    ----------
    solution : sequence
        Genes in the order (number of row clusters, number of column clusters).
    solution_idx : int
        Supplied by PyGAD; not used here.

    Returns
    -------
    float
        Sum of ``MSR`` over all ``row clusters x column clusters`` biclusters of the
        module-level ``data`` matrix.

    NOTE(review): PyGAD treats a larger fitness as better, while a later cell picks
    the run with the *smallest* cumulative MSR - confirm the intended direction.
    """
    nclusRow, nclusCol = solution[0], solution[1]

    # Fit the spectral biclustering model (fixed seed, "log" normalisation).
    model = SpectralBiclustering(n_clusters=(nclusRow, nclusCol),
                                 method="log",
                                 random_state=0)
    model.fit(data)

    # biclusters_ holds one row-membership mask and one column-membership mask per bicluster.
    row_masks, col_masks = model.biclusters_

    # Fill a (row cluster, column cluster) grid; bicluster k sits at grid cell
    # (k // nclusCol, k % nclusCol), i.e. k = r * nclusCol + c.
    msr_grid = np.zeros((nclusRow, nclusCol))
    for r in range(nclusRow):
        for c in range(nclusCol):
            k = r * nclusCol + c
            msr_grid[r, c] = MSR(data[row_masks[k]][:, col_masks[k]])

    return np.sum(msr_grid)

In [None]:
# Genetic algorithm that searches the row/column cluster counts, built from the
# settings defined in the Spectral configuration cell above.
ga_instance_Spectral = pygad.GA(
    num_generations=num_generations,
    num_parents_mating=num_parents_mating,
    fitness_func=fitness_func_Spectral,
    sol_per_pop=sol_per_pop,
    num_genes=num_genes,
    gene_type=gene_type,
    gene_space=gene_space,
    parent_selection_type=parent_selection_type,
    keep_parents=keep_parents,
    crossover_type=crossover_type,
    crossover_probability=crossover_probability,
    mutation_type=mutation_type,
    mutation_probability=mutation_probability,
    mutation_percent_genes=mutation_percent_genes,
    parallel_processing=4,  # evaluate fitness with 4 parallel workers
)

In [None]:
# Run the GA five times and record the best chromosome found after each run.
# NOTE(review): the same GA object is re-run every iteration; confirm whether PyGAD
# restarts from a fresh population or continues from the previous run's final one.
# The file name previously ended in ".csv.csv"; it now follows the CCA/Plaid naming.
output_path = "Output/Spectral_1067_Common_Neuro_Output.csv"
# Create the output folder up front so a long GA run is not lost to a missing directory.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Column labels never change, so they are defined once outside the loop.
data_saver_columns = ["number of Biclusters",
                      "Row Clusters",
                      "Col Clusters",
                      "Best Cummulative MSR"]
data_saver = []
for itr in range(5):
    ga_instance_Spectral.run()

    best_solution, best_solution_fitness, best_solution_idx = ga_instance_Spectral.best_solution()
    nclusRow = best_solution[0]
    nclusCol = best_solution[1]
    # The total number of biclusters is the product of row and column cluster counts.
    data_saver.append([nclusRow*nclusCol,
                       nclusRow,
                       nclusCol,
                       best_solution_fitness])

    # Rewrite the CSV after every run so partial results survive an interruption.
    df_data_saver = pd.DataFrame(data_saver, columns=data_saver_columns)
    df_data_saver.to_csv(output_path)

In [None]:
# Select the run with the smallest recorded "Best Cummulative MSR" (first one on ties).
# NOTE(review): the GA itself maximises the fitness value - confirm that taking the
# minimum across runs is the intended selection rule.
best_run = df_data_saver["Best Cummulative MSR"].idxmin()

# Convert to plain ints: the counts are used below as n_clusters and, in the
# plotting cell, as a range() bound.
nclusRow = int(df_data_saver.loc[best_run, "Row Clusters"])
nclusCol = int(df_data_saver.loc[best_run, "Col Clusters"])

n_clusters = (nclusRow, nclusCol)
# Refit spectral biclustering with the selected cluster counts (same fixed
# method and seed as in fitness_func_Spectral).
model_instance = SpectralBiclustering(n_clusters=n_clusters,
                                      method="log",
                                      random_state=0)

# Obtain clustering results
model_instance.fit(data)

In [None]:
# Imported here as well so this section does not depend on the CCA/Plaid plotting
# cells having been executed first.
import plotly.express as px

# Write one heat-map PNG per bicluster of the fitted spectral model; biclusters_
# provides a row mask and a column mask for each of the nclusRow*nclusCol biclusters.
for num, itr in enumerate(range(nclusRow*nclusCol), start=1):
    bic_data = df.iloc[model_instance.biclusters_[0][itr, :], model_instance.biclusters_[1][itr, :]]
    fig = px.imshow(bic_data)
    fig.write_image(f"Spectral_clust{num}.png")