# Current Model

In [1]:
#imports
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import TextVectorization, BatchNormalization
import numpy as np
import tensorflow as tf
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import os
from twilio.rest import Client
import csv
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import load_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
import joblib

In [10]:
def preprocess_data(csv_file):
    data = pd.read_csv(csv_file)

    X = data.drop('c0', axis=1)
    Y = data['c0'].values

    categorical_cols = ['Kmer']
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
    X_categorical_encoded = encoder.fit_transform(X[categorical_cols])

    numerical_cols = ['skew', 'ratio']
    scaler_X = StandardScaler()
    X_numerical_scaled = scaler_X.fit_transform(X[numerical_cols])
    
    joblib.dump(encoder, 'encoder.pkl')
    joblib.dump(scaler_X, 'standard_scaler.pkl')

    return X_categorical_encoded, X_numerical_scaled, Y

def create_model(input_shapes):
    input_cat = tf.keras.layers.Input(shape=(input_shapes[0],))
    cat_branch = tf.keras.layers.Dense(128, activation='tanh')(input_cat)
    cat_branch = tf.keras.layers.Dropout(0.2)(cat_branch)

    input_num = tf.keras.layers.Input(shape=(input_shapes[1],))
    num_branch = tf.keras.layers.Dense(128, activation='relu')(input_num)
    num_branch = tf.keras.layers.Dropout(0.2)(num_branch)

    merged = tf.keras.layers.concatenate([cat_branch, num_branch])

    merged = tf.keras.layers.Dense(64, activation='relu')(merged)
    merged = tf.keras.layers.Dropout(0.2)(merged)
    merged = tf.keras.layers.Dense(32, activation='sigmoid')(merged)
    merged = tf.keras.layers.Dropout(0.2)(merged)
    output = tf.keras.layers.Dense(1, activation='linear')(merged)

    model = tf.keras.Model(inputs=[input_cat, input_num], outputs=output)
    return model

def compile_model(model):
    model.compile(optimizer='adam',loss='mse', metrics = ['mae', 'mse'])
    return model

def get_model_layers_info(model):
    model_summary = []
    for layer in model.layers:
        layer_info = {
            "Layer_Name": layer.name,
            "Layer_Type": type(layer).__name__,
            "Number_of_Nodes": layer.units if hasattr(layer, 'units') else None,
            "Activation_Function": layer.activation.__name__ if hasattr(layer, 'activation') else None
        }
        model_summary.append(layer_info)
    return model_summary
    
def write_model_summary_to_csv(model, model_summary, output_file, X_cat_test, X_num_test, Y_test, data_set):
    predictions = model.predict([X_cat_test, X_num_test])
    true_targets = Y_test
    predictions = np.squeeze(predictions)
    true_targets = np.squeeze(true_targets)
    mae = mean_absolute_error(true_targets, predictions)
    mse = mean_squared_error(true_targets, predictions)
    r2 = r2_score(true_targets, predictions)
    print(f"MAE: {mae}, MSE: {mse}, r2: {r2}, data set: {data_set}")
    
    with open(output_file, mode='a', newline='') as file:
        fieldnames = ["Layer_Name", "Layer_Type", "Number_of_Nodes", "Activation_Function"]
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writerow({"Layer_Name": "MAE", "Layer_Type": str(mae), "Number_of_Nodes": "MSE", "Activation_Function": str(mse)})
        writer.writerow({"Layer_Name": "R^2", "Layer_Type": str(r2), "Number_of_Nodes": "Data:", "Activation_Function": data_set})
        writer.writeheader()
        for layer_info in model_summary:
            writer.writerow(layer_info)

# Load and preprocess data
data_set = 'chrV.csv'
X_cat, X_num, Y = preprocess_data(data_set)
X_cat_train, X_cat_test, X_num_train, X_num_test, Y_train, Y_test = train_test_split(X_cat, X_num, Y, test_size=0.05, random_state=42)


early_stopping = EarlyStopping(monitor='mse', patience=10, restore_best_weights=True)
model = create_model(input_shapes=(X_cat_train.shape[1], X_num_train.shape[1]))
model.compile(optimizer='adam', loss='mse', metrics=['mae', 'mse'])
model.fit([X_cat_train, X_num_train], Y_train, epochs=20, batch_size=64, callbacks=[early_stopping])

model_summary = get_model_layers_info(model)
output_file = "1_model_summary.csv"
write_model_summary_to_csv(model, model_summary, output_file, X_cat_test, X_num_test, Y_test, data_set)


model.save('kmer_model.h5')




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
MAE: 0.07146828253474553, MSE: 0.020426804805398664, r2: 0.8834895531739935, data set: chrV.csv


## used for retraining

In [5]:

loaded_model = tf.keras.models.load_model('kmer_model.h5')

# Load and preprocess the new dataset
new_data_set = 'tiling.csv'
X_cat, X_num, Y = preprocess_data(new_data_set)  # Make sure to define your preprocess_data function

# Split new data into training and testing sets
_cat_train, X_cat_test, X_num_train, X_num_test, Y_train, Y_test = train_test_split(X_cat, X_num, Y, test_size=0.05, random_state=42)

# Define early stopping
early_stopping = EarlyStopping(monitor='mse', patience=10, restore_best_weights=True)

# Continue training the loaded model with the new dataset
loaded_model.compile(optimizer='adam', loss='mse', metrics=['mae', 'mse'])
loaded_model.fit([X_cat_train, X_num_train], Y_train, epochs=20, batch_size=64, callbacks=[early_stopping])

model_summary = get_model_layers_info(loaded_model)
output_file = "1_model_summary.csv"
write_model_summary_to_csv(loaded_model, model_summary, output_file, X_cat_test, X_num_test, Y_test, data_set)


loaded_model.save('kmer_model.h5')



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
MAE: 0.37005626701447436, MSE: 0.250283525984047, r2: -0.9467718804215572, data set: chrV.csv


In [7]:

def get_model_layers_info(model):
    model_summary = []
    for layer in model.layers:
        layer_info = {
            "Layer_Name": layer.name,
            "Layer_Type": type(layer).__name__,
            "Number_of_Nodes": layer.units if hasattr(layer, 'units') else None,
            "Activation_Function": layer.activation.__name__ if hasattr(layer, 'activation') else None
        }
        model_summary.append(layer_info)
    return model_summary


def write_model_summary_to_csv(model_summary, output_file, X_tes, Y_tes, data_set):
    predictions = model.predict([X_tes[:, :input_shape_dna], X_tes[:, input_shape_dna:]])
    true_targets = Y_tes
    predictions = np.squeeze(predictions)
    true_targets = np.squeeze(true_targets)
    mae = mean_absolute_error(true_targets, predictions)
    mse = mean_squared_error(true_targets, predictions)
    r2 = r2_score(true_targets, predictions)
    
    with open(output_file, mode='a', newline='') as file:
        fieldnames = ["Layer_Name", "Layer_Type", "Number_of_Nodes", "Activation_Function"]
        writer = csv.DictWriter(file, fieldnames=fieldnames)
                
        writer.writerow({})  # Write an empty row
        writer.writeheader()  # Write the header row
        
        writer.writerow({"Layer_Name": "MAE", "Layer_Type": str(mae), "Number_of_Nodes": "MSE", "Activation_Function": str(mse)})
        writer.writerow({"Layer_Name": "R^2", "Layer_Type": str(r2), "Number_of_Nodes": "Data:", "Activation_Function": data_set})

        
        for layer_info in model_summary:
            writer.writerow(layer_info)

def custom_dna_one_hot_encoder(data_frame, column_name, sequence_length):
    base_to_index = {'A': 1, 'C': 2, 'G': 3, 'T': 4}
    
    encoded_sequences = [
        [base_to_index.get(nucleotide, 0) for nucleotide in sequence]  # Use 0 for padding and unknown bases
        for sequence in data_frame[column_name]
    ]

    # Pad or truncate sequences to the desired length
    for i in range(len(encoded_sequences)):
        if len(encoded_sequences[i]) < sequence_length:
            encoded_sequences[i] += [0] * (sequence_length - len(encoded_sequences[i]))
        else:
            encoded_sequences[i] = encoded_sequences[i][:sequence_length]

    return np.array(encoded_sequences)

data_set = 'all.csv'
data = pd.read_csv(data_set)
X_numerical = data[['skew', 'ratio']]
sequence_length = 6
X_dna_encoded = custom_dna_one_hot_encoder(data, 'Kmer', sequence_length)

poly = PolynomialFeatures(degree=2, include_bias=False)
X_numerical_poly = poly.fit_transform(X_numerical)

# Data Normalization: Scale features to [0, 1] range
scaler = StandardScaler()
X_numerical_scaled = scaler.fit_transform(X_numerical_poly)
print(X_numerical_scaled)

X_final = np.concatenate([X_dna_encoded, X_numerical_scaled], axis=1)

Y = data['c0'].values

X_train, X_test, Y_train, Y_test = train_test_split(X_final, Y, test_size=0.005, random_state=42)

# Create the TensorFlow model using Keras
def create_model(input_shape):
    # DNA input branch
    dna_inputs = tf.keras.layers.Input(shape=(6,))
    x_dna = tf.keras.layers.Dense(512, activation='tanh', kernel_regularizer=tf.keras.regularizers.l2(.01))(dna_inputs)
    x_dna = tf.keras.layers.Dropout(0.2)(x_dna)
    x_dna = tf.keras.layers.Dense(256, activation='sigmoid', kernel_regularizer=tf.keras.regularizers.l2(.01))(x_dna)
    x_dna = tf.keras.layers.Dropout(0.2)(x_dna)
    x_dna = tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(.01))(x_dna)
    x_dna = tf.keras.layers.Dropout(0.2)(x_dna)
    
    # Numerical input branch
    numerical_inputs = tf.keras.layers.Input(shape=(5,))
    x_num = tf.keras.layers.Dense(128, activation='relu')(numerical_inputs)
    x_num = tf.keras.layers.Dense(64, activation = 'linear')(x_num)
    
    
    # Merge branches
    merged = tf.keras.layers.concatenate([x_dna, x_num])
    
    x = tf.keras.layers.Dense(4, activation='sigmoid')(merged)
    x = tf.keras.layers.Dropout(0.2)(x)
    outputs = tf.keras.layers.Dense(1, activation='linear')(x)
    
    model = tf.keras.Model(inputs=[dna_inputs, numerical_inputs], outputs=outputs)
    return model

# Compile and train the model
input_shape_dna = 6
input_shape_num = 5
model = create_model((input_shape_dna, input_shape_num))
model.compile(optimizer='adam', loss='mse', metrics=['mae', 'mse'])

# Early stopping callback
early_stopping = EarlyStopping(monitor='mse', patience=10, restore_best_weights=True)

# Train the model
model.fit([X_train[:, :input_shape_dna], X_train[:, input_shape_dna:]], Y_train,
          epochs=50, batch_size=64, callbacks=[early_stopping])

# Evaluate the model
loss = model.evaluate([X_test[:, :input_shape_dna], X_test[:, input_shape_dna:]], Y_test)
print(f"Test Loss: {loss}")

model_summary = get_model_layers_info(model)
output_file = "1_model_summary.csv"
write_model_summary_to_csv(model_summary, output_file, X_test, Y_test, data_set)


# Save the model
model.save('my_model.h5')

[[ 0.0243812   1.29044695 -0.0557848  -0.00295144  0.09518338]
 [-1.26117282  1.20498682  0.03296369 -0.15362114  0.07735078]
 [-0.94643374  1.20465036 -0.00581976 -0.11674812  0.07728254]
 ...
 [ 0.05762828 -0.24918125 -0.05563261 -0.00581419 -0.07353332]
 [ 0.07979301 -0.21923655 -0.05546259 -0.00556642 -0.07333217]
 [ 0.09087537 -0.25759268 -0.05535702 -0.00578457 -0.07356783]]
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Test Loss: [0.12133223563432693, 0.3406246304512024, 0.12130477279424667]
