#### OLD Figure S3
---------

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind
import numpy as np
from itertools import combinations, chain

def get_file_mse_effect(file_names, data_for_plot):
    """Index MSE by file combination and measure the effect of adding each file.

    Args:
        file_names: Sequence of file (dataset) names. Every non-empty
            combination of these names gets an entry in the returned index.
        data_for_plot: Iterable of rows where ``row[1]`` is the MSE and
            ``row[2]`` is a ``', '``-separated string of the names in the
            combination.

    Returns:
        A ``(file_mse, mse_effect)`` tuple:

        * ``file_mse`` maps a sorted tuple of names to its MSE, or to ``None``
          when no row was supplied for that combination.
        * ``mse_effect`` maps each name to a list of
          ``MSE(combo) - MSE(combo + name)`` values, one per combination that
          lacks the name and for which both MSEs are known.
    """
    all_combos = chain.from_iterable(
        combinations(file_names, size) for size in range(1, len(file_names) + 1)
    )
    file_mse = {tuple(sorted(combo)): None for combo in all_combos}

    for row in data_for_plot:
        combo_key = tuple(sorted(row[2].split(', ')))
        file_mse[combo_key] = row[1]

    mse_effect = {file: [] for file in file_names}

    for combo, value in file_mse.items():
        # Combinations without a measurement cannot contribute a difference.
        if value is None:
            continue
        for file in file_names:
            if file in combo:
                continue
            extended_value = file_mse.get(tuple(sorted(combo + (file,))))
            # Skip rather than pretend the missing measurement is an MSE of 0.
            if extended_value is None:
                continue
            mse_effect[file].append(value - extended_value)

    return file_mse, mse_effect

def group_mse_by_count(file_mse, file_of_interest, total_files):
    """Split MSE values by combination size and by membership of one file.

    Args:
        file_mse: Mapping from a tuple of file names to its MSE. Entries whose
            value is ``None`` (no measurement) are ignored.
        file_of_interest: Name whose presence in a combination decides which
            group the MSE goes to.
        total_files: Largest combination size; both returned dicts have the
            keys ``1..total_files``.

    Returns:
        ``(include_mse, exclude_mse)``: two dicts mapping combination size to
        the list of MSE values for combinations that do / do not contain
        ``file_of_interest``.
    """
    sizes = range(1, total_files + 1)
    grouped = {
        True: {size: [] for size in sizes},
        False: {size: [] for size in sizes},
    }

    for combo, mse in file_mse.items():
        # Unmeasured combinations are placeholders, not observations.
        if mse is None:
            continue
        grouped[file_of_interest in combo][len(combo)].append(mse)

    return grouped[True], grouped[False]

def plot_mse_distributions(include_mse, exclude_mse, start_from, end_at, file_of_interest):
    """Plot per-size KDEs of MSE for combinations with vs. without a file.

    One panel is drawn for every combination size in ``start_from..end_at``
    (inclusive), laid out on a two-column grid. For each size a Welch t-test
    (``equal_var=False``) between the two groups is printed when both groups
    hold more than one value.

    Args:
        include_mse: Mapping from combination size to MSE values of
            combinations containing the file of interest.
        exclude_mse: Same mapping for combinations not containing it.
        start_from: First combination size to plot.
        end_at: Last combination size to plot (inclusive).
        file_of_interest: Not referenced by the plotting code; accepted so
            existing call sites keep working.

    Returns:
        The matplotlib figure holding the panels.

    Raises:
        ValueError: If ``end_at`` is smaller than ``start_from``.
    """
    num_plots = end_at - start_from + 1
    if num_plots < 1:
        raise ValueError("end_at must be greater than or equal to start_from")

    n_cols = 2
    n_rows = (num_plots + n_cols - 1) // n_cols

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 6, n_rows * 5))
    axes = axes.flatten()

    for ax, i in zip(axes, range(start_from, end_at + 1)):
        inc, exc = include_mse[i], exclude_mse[i]

        if len(inc) > 1 and len(exc) > 1:
            t_stat, p_val = ttest_ind(inc, exc, equal_var=False)
            sig = " (Significant)" if p_val < 0.05 else ""
            print(f"{i} Files: t-stat = {t_stat:.4f}, p = {p_val:.4f}{sig}")
        else:
            print(f"{i} Files: Not enough data for t-test.")

        sns.kdeplot(inc, label='Including', fill=True, warn_singular=False, ax=ax)
        sns.kdeplot(exc, label='Excluding', fill=True, warn_singular=False, ax=ax)

        ax.set_title(f'MSE Distribution for {i} Files')
        ax.set_xlabel('MSE Value')
        ax.set_ylabel('Density')
        ax.set_xlim(-7, 0)
        ax.legend(title='Condition')

    # With an odd number of panels the grid has one spare cell; hide it so the
    # saved figure does not show an empty axes frame.
    for spare_ax in axes[num_plots:]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.show()
    return fig

# Main script: build the per-size MSE distribution panels and save them.
file_names = pd.read_csv('Data/LaFleur_supp.csv')['File Name'].unique()
# Select the columns by name so every row is (Number of Files, MSE, Name):
# get_file_mse_effect reads row[1] as the MSE and row[2] as the combination
# string, which would silently break if the CSV column order ever changed.
data_for_plot = pd.read_csv('Data/data_comparison.csv')[['Number of Files', 'MSE', 'Name']].values
file_of_interest = 'Urtecho et al'
start_from, end_at = 3, 6

file_mse, mse_effect = get_file_mse_effect(file_names, data_for_plot)
include_mse, exclude_mse = group_mse_by_count(file_mse, file_of_interest, len(file_names))
# A falsy end_at (0 / None) falls back to one less than the number of files.
fig = plot_mse_distributions(include_mse, exclude_mse, start_from, end_at or len(file_names) - 1, file_of_interest)

# Save the figure
fig.savefig('Figures/FigureS3_option1.pdf', dpi=300, bbox_inches='tight')


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load file names, keeping only the datasets highlighted in this figure
file_names = pd.read_csv('Data/LaFleur_supp.csv')['File Name'].unique()
target_includes = ['La Fleur et al', 'Urtecho et al']
file_names = [f for f in file_names if f in target_includes]

# Each row of `data` is (Number of Files, MSE, Name)
data_df = pd.read_csv('Data/data_comparison.csv')
data = data_df[['Number of Files', 'MSE', 'Name']].values

fig, ax = plt.subplots(figsize=(10, 6))

colors = {
    'La Fleur et al': 'cornflowerblue',
    'Urtecho et al': 'mediumseagreen'
}

x_all = sorted(set(int(n) for n, _, _ in data))

# Scatter plot (disabled)
##scatter_data = [(int(n), l) for n, l, _ in data]
##x_vals, y_vals = zip(*scatter_data)
##ax.scatter(x_vals, y_vals, s=100, alpha=0.2, color='grey')

# Boxplots: one grey box per dataset count for all combinations, flanked by
# one coloured box per highlighted dataset.
# Legend labels are attached the first time a series is actually drawn, so a
# series with no data at the smallest x still gets exactly one legend entry.
labeled_series = set()
for x in x_all:
    y_vals = [l for n, l, combo in data if int(n) == x]
    all_label = "All combinations"
    ax.boxplot([y_vals], positions=[x], widths=0.3, patch_artist=True,
               boxprops=dict(facecolor='grey'),
               medianprops=dict(color='black'),
               label=None if all_label in labeled_series else all_label)
    labeled_series.add(all_label)

    for i, file_name in enumerate(target_includes):
        # Substring match against the combination name string
        y_vals = [l for n, l, combo in data if int(n) == x and file_name in str(combo)]
        if not y_vals:
            continue

        # Offset x slightly for visual separation
        offset = -0.25 if i == 0 else 0.25
        series_label = f"With {file_name}"
        ax.boxplot([y_vals], positions=[x + offset], widths=0.2, patch_artist=True,
                   boxprops=dict(facecolor=colors[file_name]), showfliers=False,
                   medianprops=dict(color='black'),
                   label=None if series_label in labeled_series else series_label)
        labeled_series.add(series_label)

ax.set_xticks(x_all)
ax.set_xticklabels(x_all)

ax.set_xlabel('Number of Datasets')
ax.set_ylabel('log10(MSE)')
ax.legend()

plt.tight_layout()
plt.show()

# Save the figure
fig.savefig('Figures/Figure2.pdf', dpi=300, bbox_inches='tight')


### OLD Saliency plot

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import load_model  # type: ignore

# -------------------- Data Preprocessing Functions --------------------

def combine_columns(df):
    """Return the promoter sequences (cast to ``str``) and their target values.

    The first element comes from the 'Promoter Sequence' column and the second
    from the 'Normalized Observed log(TX/Txref)' column of ``df``.
    """
    sequence_column = df['Promoter Sequence']
    target_column = df['Normalized Observed log(TX/Txref)']
    return sequence_column.astype(str), target_column

def padded_one_hot_encode(seq):
    """One-hot encode a nucleotide string, case-insensitively.

    'A', 'C', 'G' and 'T' map to the four unit vectors (in that order) and the
    padding character '0' maps to the all-zero vector. Any other character
    raises ``KeyError``.
    """
    lookup = {'0': [0, 0, 0, 0]}
    for position, base in enumerate('ACGT'):
        vector = [0, 0, 0, 0]
        vector[position] = 1
        lookup[base] = vector

    encoded = []
    for symbol in seq:
        encoded.append(lookup[symbol.upper()])
    return encoded

def preprocess_sequences(X, max_length=150):
    """Left-pad each sequence with '0' to ``max_length`` and one-hot encode it.

    ``str.zfill`` only pads; sequences already longer than ``max_length`` are
    encoded at their full length. The per-sequence encodings are stacked into
    a single NumPy array.
    """
    encoded_batch = []
    for raw_seq in X:
        padded_seq = raw_seq.zfill(max_length)
        encoded_batch.append(padded_one_hot_encode(padded_seq))
    return np.array(encoded_batch)


# ----------------------- Generate Saliency Maps -----------------------

def generate_saliency_map(model, sequence):
    """Return the max-normalised gradient magnitude of the model output.

    A batch axis is prepended to ``sequence``, the gradient of output element
    ``[0, 0]`` with respect to that input is taken, reduced with ``tf.norm``
    over the last axis, and divided by its largest value. The result is
    returned as a NumPy array and still carries the leading batch axis.

    NOTE(review): if every gradient is zero the division is 0/0 and the result
    is non-finite; confirm callers sanitise the output.
    """
    batched_input = tf.convert_to_tensor(sequence[np.newaxis, ...], dtype=tf.float32)

    with tf.GradientTape() as tape:
        # The input is a plain tensor, not a variable, so it must be watched.
        tape.watch(batched_input)
        score = model(batched_input)[0, 0]

    grads = tape.gradient(score, batched_input)
    magnitude = tf.norm(grads, axis=-1)
    peak = tf.reduce_max(magnitude)
    normalised = magnitude / peak
    return normalised.numpy()

def plot_saliency_map_grid(
    model_filename,
    data,
    num_samples=100,
    random_state=42,
    sort_by_prediction=False,
    title=None,
    colorbar=False,
    save_path='Figures/Figure3.pdf',
):
    """Draw a heat map of gradient saliency for a random sample of promoters.

    Each row of the image is the saliency map of one sampled sequence.

    Args:
        model_filename: Path of the Keras model to load.
        data: DataFrame with a 'Promoter Sequence' column.
        num_samples: Maximum number of sequences to sample from ``data``.
        random_state: Seed forwarded to ``DataFrame.sample``.
        sort_by_prediction: If true, rows are ordered by ascending model
            prediction instead of sampling order.
        title: Optional plot title.
        colorbar: Whether to add a colour bar.
        save_path: Where to save the figure; pass ``None`` to skip saving.
            Defaults to the path that was previously hard-coded.

    Raises:
        ValueError: If ``data`` yields no sequences to plot.
    """
    model = load_model(model_filename)

    sampled = data.sample(n=min(num_samples, len(data)), random_state=random_state)['Promoter Sequence']
    sequences = preprocess_sequences(sampled)
    if len(sequences) == 0:
        raise ValueError("No sequences available to plot a saliency map grid.")

    saliency_maps = []
    predictions = []

    for seq in sequences:
        # Non-finite values (e.g. from an all-zero gradient) are zeroed.
        saliency = np.nan_to_num(np.abs(generate_saliency_map(model, seq)))
        saliency_maps.append(saliency)
        # Predictions are only needed to order the rows.
        if sort_by_prediction:
            batch = tf.convert_to_tensor(seq[np.newaxis, ...], dtype=tf.float32)
            predictions.append(model(batch)[0, 0].numpy())

    if sort_by_prediction:
        saliency_maps = [saliency_maps[i] for i in np.argsort(predictions)]

    saliency_matrix = np.vstack(saliency_maps)

    im = plt.imshow(
        saliency_matrix,
        cmap='magma',
        aspect='auto',
        vmin=saliency_matrix.min(),
        vmax=saliency_matrix.max(),
    )
    if colorbar:
        plt.colorbar(im, label='Gradient Saliency')
    plt.xticks([])
    plt.yticks([])
    if title:
        plt.title(title)
    plt.tight_layout()

    # Save before show(): showing can leave the current figure empty.
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')

    plt.show()

# Promoter table that supplies the sequences to visualise.
data = pd.read_csv('Data/LaFleur_supp.csv')

# Saliency grid for 100 sampled promoters, rows ordered by model prediction.
plot_saliency_map_grid(
    'Models/CNN_6_1_2.keras',
    data,
    colorbar=True,
    sort_by_prediction=True,
    random_state=1,
    num_samples=100,
)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv('Data/repeat_evalute_each_file.csv', index_col=0)

# Columns are consumed as adjacent pairs: "<name> (Self)" followed by the
# matching "all" column. An odd column count means a pair is incomplete.
if len(df.columns) % 2:
    raise ValueError("Expected an even number of columns (Self/All pairs).")

file_names = []
cv_self = []
cv_all = []

for col_self, col_all in zip(df.columns[::2], df.columns[1::2]):
    file_name = col_self.replace(' (Self)', '')

    self_values = df[col_self]
    all_values = df[col_all]

    # Coefficient of variation = standard deviation / mean
    cv_s = self_values.std() / self_values.mean() if not self_values.empty else None
    cv_a = all_values.std() / all_values.mean() if not all_values.empty else None

    file_names.append(file_name)
    cv_self.append(cv_s)
    cv_all.append(cv_a)

cv_df = pd.DataFrame({
    "File Name": file_names,
    "Coefficient of Variation (Self)": cv_self,
    "Coefficient of Variation (All)": cv_all
})

# Merge with file name counts.
# reset_index(name=...) names the count column explicitly instead of relying
# on the 'count' name that value_counts() only produces in pandas >= 2.0.
file_name_counts = pd.read_csv('Data/LaFleur_supp.csv')['File Name'].value_counts()
dataset_sizes = file_name_counts.rename_axis('File Name').reset_index(name='Dataset Size')
cv_df = cv_df.merge(dataset_sizes, on='File Name')

# Rename, reorder columns
cv_df.rename(columns={'File Name': 'Dataset'}, inplace=True)
cv_df = cv_df[['Dataset', 'Dataset Size', 'Coefficient of Variation (Self)', 'Coefficient of Variation (All)']]

# Save the DataFrame to a CSV file
cv_df.to_csv('Figures/Table1.csv', index=False)


cv_df