In [None]:
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Read the short / medium / long lorfs test CSVs and order the rows of each
# by the 'length' column (ascending, the pandas default).
lorfs_short = pd.read_csv('Data/Test/lorfs_short.csv').sort_values(by='length')
lorfs_medium = pd.read_csv('Data/Test/lorfs_medium.csv').sort_values(by='length')
lorfs_long = pd.read_csv('Data/Test/lorfs_long.csv').sort_values(by='length')

In [None]:
def filter_non_aa(df):
    """Drop rows whose 'Sequence' contains a character from the rejected set.

    The rejected characters are ``*``, ``x``, the digits ``0``-``9``, ``;``,
    ``>``, ``_``, ``.`` and ``-``. Matching is case-insensitive, so ``X`` is
    rejected as well. Rows whose 'Sequence' is missing (NaN) never match and
    are therefore kept.

    The original shape, the filtered shape and the number of removed rows are
    printed as a side effect.

    Parameters:
    - df: DataFrame with a string column named 'Sequence'.

    Returns:
    - A DataFrame holding only the rows that passed the filter (original index
      labels are preserved).
    """
    # One raw-string character class covering the same characters as the old
    # alternation. The previous non-raw fragments for the escaped star, dot and
    # hyphen were invalid string escapes, which newer Python versions warn about.
    pattern = r'[*x0-9;>_.\-]'

    original_shape = df.shape
    print("Original shape:", original_shape)

    # Boolean mask: True where the sequence contains at least one rejected character.
    has_rejected_char = df['Sequence'].str.contains(pattern, regex=True, case=False, na=False)
    df = df[~has_rejected_char]

    print("New shape after filtering:", df.shape)
    print("Number of removed rows:", original_shape[0] - df.shape[0])
    print("\n")
    return df

In [None]:

def one_hot_encode_sequences(df, padding_length, tokenizer=None):
    """Filter, tokenize, pad and one-hot encode the 'Sequence' column of ``df``.

    Steps: rows with rejected characters are removed via ``filter_non_aa``;
    every remaining sequence is split into characters and mapped to integer
    indices; the index lists are post-padded / post-truncated to
    ``padding_length``; finally every index is expanded into a one-hot vector.
    Padding positions (index 0) end up as all-zero vectors.

    Parameters:
    - df: DataFrame with 'Sequence', 'has_hmm' and 'length' columns.
    - padding_length: Fixed number of positions per encoded sequence.
    - tokenizer: Optional, already fitted character-level Tokenizer. When
      given it is used as-is (not refitted), so channel order and channel
      count follow the data it was fitted on. When None (the default, and the
      original behaviour) a new Tokenizer is fitted on the filtered ``df``.

    Returns:
    - one_hot_sequences: float array of shape
      (n_rows, padding_length, len(tokenizer.word_index)).
    - The 'has_hmm' column of the filtered frame.
    - A single-column DataFrame built from the 'length' column of the filtered frame.
    """
    df = filter_non_aa(df)

    if tokenizer is None:
        # NOTE(review): a Tokenizer fitted here derives its character -> index
        # mapping, and therefore the channel order and the number of channels,
        # from this DataFrame alone (Keras assigns indices by frequency in the
        # fitted texts). If the saved models were trained with a differently
        # fitted Tokenizer the encodings will not line up -- pass that
        # Tokenizer via `tokenizer` in that case. TODO confirm against training code.
        tokenizer = Tokenizer(char_level=True)
        tokenizer.fit_on_texts(df['Sequence'])

    sequences_numeric = tokenizer.texts_to_sequences(df['Sequence'])
    sequences_padded = pad_sequences(sequences_numeric, maxlen=padding_length, padding='post', truncating='post')

    # Row k of the identity matrix is the one-hot vector for index k, so
    # indexing it with the padded index array encodes every position at once
    # (replaces the former per-element Python loop; same values, same dtype).
    n_classes = len(tokenizer.word_index) + 1
    one_hot_sequences = np.eye(n_classes)[sequences_padded]

    # Channel 0 belongs to the padding index; dropping it turns padded
    # positions into all-zero vectors.
    one_hot_sequences = one_hot_sequences[:, :, 1:]
    return one_hot_sequences, df['has_hmm'], pd.DataFrame(df['length'])

In [None]:
# Encode each test set with its own fixed sequence length (100 / 400 / 1000).
# Each call returns the one-hot array, the 'has_hmm' targets and the 'length' frame.
lorfs_short_one_hot, lorfs_short_target, lorfs_short_len = one_hot_encode_sequences(lorfs_short, padding_length=100)
lorfs_medium_one_hot, lorfs_medium_target, lorfs_medium_len = one_hot_encode_sequences(lorfs_medium, padding_length=400)
lorfs_long_one_hot, lorfs_long_target, lorfs_long_len = one_hot_encode_sequences(lorfs_long, padding_length=1000)

# Collapse everything after the first axis into one feature axis per sequence.
lorfs_short_one_hot_flat = lorfs_short_one_hot.reshape(len(lorfs_short_one_hot), -1)
lorfs_medium_one_hot_flat = lorfs_medium_one_hot.reshape(len(lorfs_medium_one_hot), -1)
lorfs_long_one_hot_flat = lorfs_long_one_hot.reshape(len(lorfs_long_one_hot), -1)

## Models

In [None]:
from sklearn.metrics import accuracy_score

def predict_and_evaluate(models, data, targets):
    """
    Score every model on its matching dataset and report the accuracy.

    For each key in `models`, the output of the model's `predict` is flattened,
    thresholded at 0.5 (values above 0.5 become 1, everything else 0) and
    compared with the true labels via `accuracy_score`. Each accuracy is
    printed as it is computed.

    Parameters:
    - models: Dict of loaded models keyed by dataset size ('short', 'medium', 'long').
    - data: Dict of preprocessed model inputs, keyed like `models`.
    - targets: Dict of true labels, keyed like `models`.

    Returns:
    - Dict mapping each dataset size to its accuracy score.
    """
    accuracies = {}
    for size, model in models.items():
        # Flatten so that both (n,) and (n, 1) shaped outputs are handled alike.
        raw_scores = model.predict(data[size]).ravel()
        predicted_labels = [int(score > 0.5) for score in raw_scores]

        accuracies[size] = accuracy_score(targets[size], predicted_labels)
        print(f'{size.capitalize()} dataset accuracy: {accuracies[size]}')
    return accuracies

# Inputs and labels for predict_and_evaluate, keyed by dataset size.
# Requires the flattened one-hot arrays (lorfs_*_one_hot_flat) and the
# target columns (lorfs_*_target) to be defined already.
data = dict(
    short=lorfs_short_one_hot_flat,
    medium=lorfs_medium_one_hot_flat,
    long=lorfs_long_one_hot_flat,
)

targets = dict(
    short=lorfs_short_target,
    medium=lorfs_medium_target,
    long=lorfs_long_target,
)



### Dense

In [None]:
# Directory holding the saved models for this section, one file per dataset size.
base_dir = 'Models/Dense'
dataset_sizes = ['short', 'medium', 'long']

# Load '<size>_dataset_model.keras' for every size into a dict keyed by size,
# reporting each file as soon as it has been loaded.
models = {}
for size in dataset_sizes:
    model_name = f'{size}_dataset_model.keras'
    model_path = os.path.join(base_dir, model_name)
    models.update({size: load_model(model_path)})
    print(f'{size.capitalize()} dataset model loaded from {model_path}')

# Evaluate the loaded models on the inputs in `data` against `targets`.
accuracies = predict_and_evaluate(models, data, targets)


### Logreg

In [None]:
import joblib

# Inputs for the models under Models/Logreg: the single-column 'length'
# frames returned by one_hot_encode_sequences.
# Stored under their own name instead of rebinding `data`, so the one-hot
# input dict read by the Dense cell is not overwritten and that cell can be
# re-run after this one without silently scoring on the wrong inputs.
logreg_data = {
    'short': lorfs_short_len,
    'medium': lorfs_medium_len,
    'long': lorfs_long_len
}

# Directory holding the saved models for this section, one file per dataset size.
base_dir = 'Models/Logreg'

# Model files are named after the dataset size they belong to.
dataset_sizes = ['short', 'medium', 'long']

# Loaded models, keyed by dataset size.
models = {}

for size in dataset_sizes:
    model_name = f'{size}_dataset_model.joblib'
    model_path = os.path.join(base_dir, model_name)
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files from a trusted source.
    models[size] = joblib.load(model_path)
    print(f'{size.capitalize()} dataset model loaded from {model_path}')

# Evaluate the loaded models on the length features against the shared targets.
accuracies = predict_and_evaluate(models, logreg_data, targets)
