#### Description

This code finds the optimal values of the following parameters to minimize error and algorithm runtime. It uses the pLac promoter with the -10, spacer, and -35 regions removed. The algorithm can already explore the sequence landscape precisely and find pLac inserts that meet relative expressions of 0.1 and higher. Therefore, we optimize the algorithm parameters for the remaining case: finding pLac inserts with a relative expression of 0.

Parameters:
* pop_size
* generations
* base_mutation_rate
* chromosomes
* islands
    * gene_flow_rate
* surval_rate
* num_parents
* num_competitors
* selection
    * boltzmann_temperature

Notes:
* In these tests, we find which parameters are interdependent or influence each other, and grid search each parameter (or combination of parameters, if correlated) separately.
* Because there is stochasticity in the results, we run each combination of parameters multiple times.
* More robust testing could optimize over a grid of target relative expression levels, or over randomly sampled ones.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import time

from GA_2_2 import GeneticAlgorithm

# Shared experiment settings; test_combination forwards the first four to
# GeneticAlgorithm on every run.

# Path to the saved Keras CNN model, passed to GeneticAlgorithm as cnn_model_path.
cnn_model_path = '../Models/CNN_6_1_2.keras'
# Promoter template. The run of N characters is the masked region (a later cell
# notes the mask is 30 nucleotides long).
# NOTE(review): presumably the GA fills in the N positions - confirm in GA_2_2.
masked_sequence = 'AATACTAGAGGTCTTCCGACNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTGTGGAGCGGGAAGACAACTAGGGG'
# Target relative expression; test_combination reports
# abs(best_prediction - target_expression) as the error.
target_expression = 0
# Passed to GeneticAlgorithm as precision.
# NOTE(review): presumably an early-stopping tolerance - confirm in GA_2_2.
precision = 0.001
# Number of GA runs averaged per parameter combination (results are stochastic).
run_per_combination = 20

In [2]:
def test_combination(**kwargs):
    """Run the GA repeatedly with one parameter combination and average the results.

    A fresh GeneticAlgorithm is built for each of the run_per_combination
    repetitions, using the notebook-level cnn_model_path, masked_sequence,
    target_expression and precision. Only ga.run() is timed; construction
    of the GeneticAlgorithm object is excluded from the measurement.

    Args:
        **kwargs: Extra keyword arguments forwarded unchanged to
            GeneticAlgorithm (the hyperparameters under test).

    Returns:
        A (mean_error, mean_run_time) tuple, where the error of a run is
        abs(best_prediction - target_expression) and the run time is in
        seconds.
    """
    errors = []
    run_times = []
    for _ in range(run_per_combination):
        ga = GeneticAlgorithm(
            cnn_model_path=cnn_model_path,
            masked_sequence=masked_sequence,
            target_expression=target_expression,
            precision=precision,
            print_progress=False,
            **kwargs
        )

        # perf_counter() is monotonic and high-resolution, so the measured
        # duration cannot be skewed by system clock adjustments the way
        # time.time() can.
        start_time = time.perf_counter()
        _, best_prediction = ga.run()
        elapsed = time.perf_counter() - start_time

        # Record the results
        errors.append(abs(best_prediction - target_expression))
        run_times.append(elapsed)

    return np.mean(errors), np.mean(run_times)

def heatmap(results_df, index, columns):
    """Show side-by-side heatmaps of mean error and mean run time.

    Args:
        results_df: DataFrame with 'error' and 'run_time' columns plus the
            columns named by index and columns.
        index: Column of results_df used for the heatmap rows (y axis).
        columns: Column of results_df used for the heatmap columns (x axis).

    Returns:
        None. The figure is displayed with plt.show().
    """
    # Aggregate first so a bad column name fails before any figure is created.
    panels = [
        ('Mean Error', results_df.pivot_table(values='error', index=index, columns=columns, aggfunc='mean')),
        ('Run Time', results_df.pivot_table(values='run_time', index=index, columns=columns, aggfunc='mean')),
    ]
    fig, axes = plt.subplots(1, 2, figsize=(18, 6))

    # Drawing both panels in one loop guarantees each axis gets its own
    # title and labels (the run-time panel previously never had its labels
    # set because axes[0] was labelled twice).
    for ax, (title, pivot_table) in zip(axes, panels):
        sns.heatmap(pivot_table, annot=True, fmt=".2f", cmap="viridis", ax=ax)
        ax.set_title(f'{title} for {index} and {columns}')
        ax.set_xlabel(columns)
        ax.set_ylabel(index)

    # Adjust layout for better display
    plt.tight_layout()
    plt.show()

def scatter_plot(results_df, index, polynomial_degree):
    """Scatter error and run time against one parameter, with polynomial fits.

    Args:
        results_df: DataFrame with 'error' and 'run_time' columns plus the
            column named by index.
        index: Column of results_df plotted on the x axis of both panels.
        polynomial_degree: Degree of the least-squares polynomial fitted to
            each panel with np.polyfit.

    Returns:
        None. Two panels are drawn on a new figure: error vs index and
        run time vs index.
    """
    # Name the fit after the degree actually used instead of always
    # claiming it is quadratic.
    degree_names = {1: 'Linear', 2: 'Quadratic', 3: 'Cubic'}
    degree_name = degree_names.get(polynomial_degree, f'Degree-{polynomial_degree}')
    fit_label = f'{degree_name} Best Fit'

    x_values = results_df[index]
    # Evaluate the fit on a sorted, dense grid. Plotting it at the raw x
    # values draws spurious chords whenever x is unsorted or repeats (as in
    # grid-search results, where each value occurs once per other setting).
    x_curve = np.linspace(x_values.min(), x_values.max(), 200)

    # (column, title/axis word, y-axis label, scatter label, fit colour)
    panels = (
        ('error', 'Error', 'Error', 'Error Data', 'red'),
        ('run_time', 'Runtime', 'Runtime (s)', 'Runtime Data', 'blue'),
    )

    # Create the subplots
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    for ax, (column, title_word, y_label, data_label, fit_colour) in zip(axes, panels):
        ax.scatter(x_values, results_df[column], label=data_label, alpha=0.7)
        fitted_polynomial = np.poly1d(np.polyfit(x_values, results_df[column], polynomial_degree))
        ax.plot(x_curve, fitted_polynomial(x_curve), color=fit_colour, linestyle='--', label=fit_label)
        ax.set_xlabel(index)
        ax.set_ylabel(y_label)
        ax.set_title(f'{title_word} vs {index}')
        ax.legend()

#### Initializing sub-parameters

Before interaction analysis, we know gene_flow_rate is directly related to islands and boltzmann_temperature is only needed for boltzmann selection. So we find the optimal values for these two parameters first, and set them in the interaction analysis.

**An error occurred while testing. You need to rerun this before using it for anything final.**

In [None]:
# Grid search over islands x gene_flow_rate: every pair is evaluated with
# test_combination and its mean error / mean run time is collected.

range_islands = np.linspace(1, 10, 10, dtype=int)
range_gene_flow_rate = np.linspace(0.1, 1, 10, dtype=float)

total_combinations = len(range_islands) * len(range_gene_flow_rate)

# Flatten the two sweeps into one list of (islands, gene_flow_rate) pairs,
# islands varying slowest.
island_flow_grid = [
    (island_count, flow_rate)
    for island_count in range_islands
    for flow_rate in range_gene_flow_rate
]

current_combination = 0
gene_flow_rate_results = []

for current_combination, (islands, gene_flow_rate) in enumerate(island_flow_grid, start=1):
    print(f'Testing Combination {current_combination}/{total_combinations}', end='\r')
    error, run_time = test_combination(
        pop_size=100,  # Just an initial value
        islands=islands,
        gene_flow_rate=gene_flow_rate,
    )

    # Store one row per combination; the rate is rounded so it reads
    # cleanly as a heatmap label.
    gene_flow_rate_results.append(dict(
        islands=islands,
        gene_flow_rate=round(gene_flow_rate, 1),
        error=error,
        run_time=run_time,
    ))

gene_flow_rate_results_df = pd.DataFrame(gene_flow_rate_results)


Instructions for updating:
Use tf.identity with explicit device placement instead.


  saveable.load_own_variables(weights_store.get(inner_path))


Testing Combination 97/100

In [None]:
# Visualize the islands x gene_flow_rate grid search: the joint heatmaps
# first, then one quadratic-fit scatter plot per swept parameter.
heatmap(gene_flow_rate_results_df, 'islands', 'gene_flow_rate')
for swept_parameter in ('islands', 'gene_flow_rate'):
    scatter_plot(gene_flow_rate_results_df, swept_parameter, 2)

In [None]:
# Find optimal boltzmann_temperature: sweep it with boltzmann selection and
# record the mean error / mean run time returned by test_combination.

range_boltzmann_temperature = np.linspace(1, 10, 10, dtype=float)
boltzmann_temperature_results = []

# Count from 1 so the progress line runs 1/10 .. 10/10, matching the
# progress output of the islands / gene_flow_rate grid search.
for i, boltzmann_temperature in enumerate(range_boltzmann_temperature, start=1):
    print(f'Testing Combination {i}/{len(range_boltzmann_temperature)}', end='\r')
    error, run_time = test_combination(
        pop_size=100,  # Just an initial value
        selection='boltzmann',
        boltzmann_temperature=boltzmann_temperature
    )

    # Record the results
    boltzmann_temperature_results.append({
        'boltzmann_temperature': round(boltzmann_temperature, 1),
        'error': error,
        'run_time': run_time
    })


boltzmann_temperature_results_df = pd.DataFrame(boltzmann_temperature_results)

In [None]:
# Pass the DataFrame, not the raw list of result dicts: scatter_plot indexes
# its first argument by column name, which raises TypeError on a list.
scatter_plot(boltzmann_temperature_results_df, 'boltzmann_temperature', 2)

In [None]:
# Hyperparameter ranges for partial dependence plots. For simplicity, each has 10 values
# gene_flow_rate is set to 0.5 and boltzmann_temperature is set to 1
# Note: with dtype=int, np.linspace rounds fractional grid points down, so an
# integer range whose step is not whole (range_chromosomes) is not evenly spaced.

range_pop_size=np.linspace(50, 500, 10, dtype=int)  # 50, 100, ..., 500
range_generations=np.linspace(50, 500, 10, dtype=int)  # 50, 100, ..., 500
range_base_mutation_rate=np.linspace(0.1, 1, 10, dtype=float)  # 0.1, 0.2, ..., 1.0
range_chromosomes=np.linspace(1, 30, 10, dtype=int) # The mask is 30 nucleotides long
range_islands=np.linspace(1, 10, 10, dtype=int)  # 1, 2, ..., 10
range_surval_rate=np.linspace(0.1, 1, 10, dtype=float)  # 0.1, 0.2, ..., 1.0
range_num_parents=np.linspace(1, 10, 10, dtype=int)  # 1, 2, ..., 10
range_num_competitors=np.linspace(1, 10, 10, dtype=int)  # 1, 2, ..., 10
# Selection strategies to compare; 'boltzmann' is the one that takes boltzmann_temperature.
range_selection=['tournament', 'tournament_pop', 'roulette', 'rank_based', 'truncation', 'boltzmann']