This notebook provides code to test all models with validation datasets (either held out test sets or external validation datasets).

In [67]:
# import statements 
import sys
sys.path.insert(1, './BioAutoMATED/main_classes/')
sys.path.append('./BioAutoMATED')
from wrapper import run_bioautomated
from integrated_design_helpers import *
from generic_automl_classes import convert_generic_input, read_in_data_file
from generic_deepswarm import print_summary
from transfer_learning_helpers import transform_classification_target, transform_regression_target, fit_final_deepswarm_model
from generic_tpot import reformat_data_traintest
from sklearn.metrics import r2_score
import scipy.stats as sp
from keras.initializers import glorot_uniform
from keras.layers import BatchNormalization
from sklearn.model_selection import train_test_split
import autokeras
import torch
import pickle

In [68]:
result_list = []
for col in ["Gene","Promoter","RBS"]:
    for i in range(1,6):
        data_folder = './dataset/rigorous/'
        data_file = f'test_{col}_group{i}.csv'
        root_path = f"./ckpt/rigorous/{col}_group{i}"
        break
    break

# Data Process

In [69]:
rawdata = pd.read_csv(data_folder+data_file)
# rawdata.at[0, 'Seq'] = rawdata.iloc[0].Seq*5
# rawdata.Seq = rawdata.Seq.apply(lambda x:x[:826])
rawdata.to_csv('./output/experimental_data_fineturn.csv', index=False, encoding = 'utf_8_sig')
rawdata

Unnamed: 0,seq,target
0,CGCGCCTTGACGGCTAGCTCAGTCCTAGGTATTGTGCTAGCCGTCG...,11.688265
1,CGCGCCAAAAAGAGTATTGACTTCGCATCTTTTTGTACCCATAATT...,12.008913
2,CGCGCCTTGACATAAAGTCTAACCTATAGGTATAATGTGTGGATCT...,9.565730
3,CGCGCCTTGACAATTAATCATCCGGCTCGTATAATGTGTGGAATTG...,11.556572
4,CGCGCCTTGACGGCTAGCTCAGTCCTAGGTACAGTGCTAGCTTAAT...,9.913603
...,...,...
96,CGCGCCTTTATAGCTAGCTCAGCCCTTGGTACAATGCTAGCGCCTG...,9.061063
97,CGCGCCTTGACATTTATCCCTTGCGGCGATATAATGTGTGGATAAG...,12.504289
98,CGCGCCTTGACATAAAGTCTAACCTATAGGCATAATTATTTCATCC...,10.204993
99,CGCGCCTTGACAGCTAGCTCAGTCCTAGGTATAATGCTAGCACGAA...,10.430272


In [70]:
from sklearn.metrics import r2_score
from scipy.stats import pearsonr, spearmanr

def calculate_metrics(preds, y):
    """
    Calculate 'R2', 'Pearson', and 'Spearman' metrics.

    Parameters:
    - preds: Predicted values
    - y: True values

    Returns:
    - r2: R-squared (R2) score
    - pearson: Pearson correlation coefficient
    - spearman: Spearman rank correlation coefficient
    """
    # R-squared (R2) score
    r2 = r2_score(y, preds)

    # Pearson correlation coefficient
    pearson, _ = pearsonr(preds, y)

    # Spearman rank correlation coefficient
    spearman, _ = spearmanr(preds, y)

    return r2, pearson, spearman

# Example 1: Transfer Learning on a DeepSwarm Model 

In [71]:
# Load DeepSwarm Model and freeze all except last two layers (randomly chose this - feel free to customize)
final_model_path = f'{root_path}/outputs/deepswarm/regression/'
final_model_name = 'deepswarm_deploy_model.h5'
# get sequences with help from https://stackoverflow.com/questions/53183865/unknown-initializer-glorotuniform-when-loading-keras-model
with CustomObjectScope({'GlorotUniform': glorot_uniform(), 'BatchNormalizationV1': BatchNormalization()}): # , 'BatchNormalizationV1': BatchNormalization()
    model = tf.keras.models.load_model(final_model_path + final_model_name)
print(model.summary())
print('model is originally trainable: ' + str(model.trainable))
print('number of layers in the model: ' + str(len(model.layers)))

# set all layers except last two dense ones to be fixed
for layer_idx, layer in enumerate(model.layers):
    if layer_idx > len(model.layers) - 3:
        print(str(layer_idx) + ': ' + str(layer) + ', keeping trainable = ' + str(layer.trainable))
    else:
        layer.trainable = False
        print(str(layer_idx) + ': ' + str(layer) + ', setting trainable to ' + str(layer.trainable))


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
1710078614.194792 (InputLaye (None, 248, 4, 1)         0         
_________________________________________________________________
1710078614.195567 (Conv2D)   (None, 248, 4, 8)         208       
_________________________________________________________________
1710078614.2090578 (Flatten) (None, 7936)              0         
_________________________________________________________________
1710078614.2131774 (Dense)   (None, 1)                 7937      
Total params: 8,145
Trainable params: 8,145
Non-trainable params: 0
_________________________________________________________________
None
model is originally trainable: True
number of layers in the model: 4
0: <tensorflow.python.keras.engine.input_layer.InputLayer object at 0x7f4229c4e150>, setting trainable to False
1: <tensorflow.python.keras.layers.convolutional.Conv2D object at 0x7f4229c53b50>, set

In [72]:
# Transform the test set RBS data to fine-tune this model
data_folder = './output/'
data_file = 'experimental_data_fineturn.csv'

# Give inputs for data generation
input_col = 'seq'
target_col = 'target'
pad_seqs = 'max'
augment_data = 'none'
sequence_type = 'nucleic_acid'
task = 'regression'
model_type = 'deepswarm'

# allows user to interpret model with data not in the original training set
# so apply typical cleaning pipeline
df_data_input, df_data_output, _ = read_in_data_file(data_folder + data_file, input_col, target_col)
    
# format data inputs appropriately for autoML platform    
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_generic_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type, model_type = model_type)

# transform output (target) into bins for classification
transformed_output, transform_obj = transform_regression_target(df_data_output)
    
# now, we have completed the pre-processing needed to feed our data into deepswarm
# deepswarm input: numerical_data_input
# deepswarm output: transformed_output
X = numerical_data_input
y = transformed_output

Confirmed: All sequence characters are in alphabet
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.


In [73]:
# 使用微调前的模型进行预测
preds = model.predict(X)
r2, pearson, spearman = calculate_metrics(preds, y)
result_list.append([r2,pearson[0],spearman])
print(f"R2: {r2}")
print(f"Pearson: {pearson}")
print(f"Spearman: {spearman}")

R2: 0.22676253116173395
Pearson: [0.66088369]
Spearman: 0.6971074464428115


# Example 2: Transfer Learning on an AutoKeras Model 

In [81]:
data_folder = './output/'
data_file = 'experimental_data_fineturn.csv'


# Give inputs for data generation
input_col = 'seq'
target_col = 'target'
pad_seqs = 'max'
augment_data = 'none'
sequence_type = 'nucleic_acid'
task = 'regression'
model_type = 'autokeras'

# allows user to interpret model with data not in the original training set
# so apply typical cleaning pipeline
df_data_input, df_data_output, _ = read_in_data_file(data_folder + data_file, input_col, target_col)
    
# format data inputs appropriately for autoML platform    
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_generic_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type, model_type = model_type)

# Format data inputs appropriately for autoML platform
transformed_output, transform_obj = transform_regression_target(df_data_output)

# now, we have completed the pre-processing needed to feed our data into autokeras
# autokeras input: oh_data_input
# autokeras output: transformed_output
X = oh_data_input
y = transformed_output # don't convert to categorical for autokeras

Confirmed: All sequence characters are in alphabet
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.


In [82]:
final_model_path = f'{root_path}/models/autokeras/regression/'
final_model_name = 'optimized_autokeras_pipeline_regression.h5'

In [83]:
clf = autokeras.utils.pickle_from_file(final_model_path+final_model_name)
preds = clf.predict(np.array(X))
r2 = r2_score(np.array(y), preds)
print("R-squared after no retraining: ", r2)
print('Evaluation after no retraining: ', evaluation)
r2, pearson, spearman = calculate_metrics(preds, np.array(y).flatten())
print(f"R2: {r2}")
print(f"Pearson: {pearson}")
print(f"Spearman: {spearman}")

R-squared after no retraining:  0.01829332498048042
Evaluation after no retraining:  0.3801518752715207
R2: 0.01829332498048042
Pearson: 0.367636133405826
Spearman: 0.38268602995511697


# Part 3: Transfer Learning on TPOT Model

In [89]:
# read in data file
data_folder = './output/'
data_file = 'experimental_data_fineturn.csv'

# give inputs for data generation
input_col_name = 'seq'
target_col = 'target'
pad_seqs = 'max'
augment_data = 'none'
sequence_type = 'nucleic_acid'
task = 'regression'
model_type = 'tpot'

# allows user to interpret model with data not in the original training set
# so apply typical cleaning pipeline
df_data_input, df_data_output, _ = read_in_data_file(data_folder + data_file, input_col, target_col)
    
# format data inputs appropriately for autoML platform    
numerical_data_input, oh_data_input, df_data_output, scrambled_numerical_data_input, scrambled_oh_data_input, alph = convert_generic_input(df_data_input, df_data_output, pad_seqs, augment_data, sequence_type, model_type = model_type)

# Format data inputs appropriately for autoML platform
transformed_output, transform_obj = transform_regression_target(df_data_output)

X = numerical_data_input
y = transformed_output # don't convert to categorical for tpot
X, y = reformat_data_traintest(X, y)

Confirmed: All sequence characters are in alphabet
Confirmed: No need to pad or truncate, all sequences same length
Confirmed: No data augmentation requested
Confirmed: Scrambled control generated.


In [90]:
root_path

'./ckpt/rigorous/Gene_group1'

In [91]:
# give inputs for paths
final_model_path = f'{root_path}/outputs/tpot/regression/'
final_model_name = 'final_model_tpot_regression.pkl'
output_folder = final_model_path

with open(final_model_path+final_model_name, 'rb') as file:  
    model = pickle.load(file)

In [93]:

preds = model.predict(X)

r2 = r2_score(np.array(y), preds)
print('Original model on new test data R2 : ', r2)
print('Original model on new test data: ', sp.pearsonr(y, preds))


r2, pearson, spearman = calculate_metrics(preds, np.array(y).flatten())
print(f"R2: {r2}")
print(f"Pearson: {pearson}")
print(f"Spearman: {spearman}")

Original model on new test data R2 :  -0.03797631535662571
Original model on new test data:  (0.49189595318857504, 1.7510188135108316e-07)
R2: -0.03797631535662571
Pearson: 0.49189595318857504
Spearman: 0.5182833962846561


### eval

In [5]:
import pickle
import pandas as pd
with open('output/rigorous.pkl', 'rb') as f:
    data = pickle.load(f)

In [6]:
import pandas as pd

# 转换字典为DataFrame
data = pd.DataFrame.from_dict(data, orient='index', columns=['DeepSwarm', 'AutoKeras', 'Tpot'])
metrics = ["r2", "pearson", "spearman"]

# 创建一个Excel writer对象
writer = pd.ExcelWriter('output/rigorous/rigorous_testset_results.xlsx', engine='xlsxwriter')

for i in range(3):
   # 对每个指标进行处理
   new_data = data.applymap(lambda x: x[i] if isinstance(x, list) else x)
   new_data.index = [x.split("/")[-1] for x in new_data.index]
   
   # 将处理后的数据写入不同的sheet
   new_data.to_excel(writer, sheet_name=metrics[i])

# 保存Excel文件
writer.save()