In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import load_model

2024-02-07 07:41:42.625815: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-07 07:41:42.625837: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-07 07:41:42.626573: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-07 07:41:42.630430: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read the short, medium and long test sets and order each one by its 'length' column
lorfs_short = pd.read_csv('Data/Test/lorfs_short.csv').sort_values(by='length')
lorfs_medium = pd.read_csv('Data/Test/lorfs_medium.csv').sort_values(by='length')
lorfs_long = pd.read_csv('Data/Test/lorfs_long.csv').sort_values(by='length')

In [3]:
def filter_non_aa(df):
    """Drop rows whose ``Sequence`` contains a non-amino-acid character.

    A sequence is rejected when it contains any digit (0-9) or any of the
    characters ``*``, ``x``, ``;``, ``>``, ``_``, ``.`` or ``-``. Matching is
    case-insensitive, so an upper-case ``X`` is rejected as well. Rows whose
    ``Sequence`` is missing are treated as "no match" and are therefore kept.

    The shape before filtering, the shape after filtering and the number of
    removed rows are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named ``Sequence``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that passed the filter. ``df`` itself is not modified.
    """
    # One raw-string character class matching the same characters as before.
    # The previous non-raw literals used backslash escapes for '*', '.' and '-'
    # that are invalid string escapes and warn on recent Python versions.
    pattern = r'[*x0-9;>_.\-]'

    original_shape = df.shape
    print("Original shape:", original_shape)

    has_invalid_char = df['Sequence'].str.contains(pattern, regex=True, case=False, na=False)
    df = df[~has_invalid_char]

    print("New shape after filtering:", df.shape)
    print("Number of removed rows:", original_shape[0] - df.shape[0])
    print("\n")
    return df

In [4]:

def one_hot_encode_sequences(df, padding_length, tokenizer=None):
    """One-hot encode the ``Sequence`` column at character level.

    Rows are first passed through ``filter_non_aa``. Each remaining sequence is
    converted to integer character indices, padded or truncated at the end to
    ``padding_length`` positions, and expanded to one-hot vectors. Padding
    positions are all-zero vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Sequence``, ``has_hmm`` and ``length`` columns.
    padding_length : int
        Number of positions every encoded sequence is padded/truncated to.
    tokenizer : keras Tokenizer, optional
        An already fitted character-level tokenizer. When omitted, a new one is
        fitted on ``df['Sequence']`` (the original behaviour).
        NOTE(review): a freshly fitted Keras tokenizer orders its indices by
        character frequency in the data it sees, so the channel order (and the
        number of channels) can differ from the encoding the saved models were
        trained on. Pass the training tokenizer here if it is available.

    Returns
    -------
    tuple
        ``(one_hot, targets, lengths)`` where ``one_hot`` has shape
        ``(n_rows, padding_length, len(tokenizer.word_index))``, ``targets`` is
        ``df['has_hmm']`` and ``lengths`` is a one-column frame built from
        ``df['length']``, all restricted to the rows kept by the filter.
    """
    df = filter_non_aa(df)

    if tokenizer is None:
        tokenizer = Tokenizer(char_level=True)
        tokenizer.fit_on_texts(df['Sequence'])

    sequences_numeric = tokenizer.texts_to_sequences(df['Sequence'])
    sequences_padded = pad_sequences(sequences_numeric, maxlen=padding_length, padding='post', truncating='post')

    # Row k of the identity matrix is the one-hot vector for index k, so indexing
    # it with the padded index array encodes every position at once.
    vocab_size = len(tokenizer.word_index) + 1
    one_hot_sequences = np.eye(vocab_size)[sequences_padded]

    # Index 0 is the padding value; dropping channel 0 leaves padding positions
    # as all-zero vectors.
    one_hot_sequences = one_hot_sequences[:, :, 1:]
    return one_hot_sequences, df['has_hmm'], pd.DataFrame(df['length'])

In [5]:
# Encode every test set; the padding length differs per set (100 / 400 / 1000)
lorfs_short_one_hot, lorfs_short_target, lorfs_short_len = one_hot_encode_sequences(lorfs_short, padding_length=100)
lorfs_medium_one_hot, lorfs_medium_target, lorfs_medium_len = one_hot_encode_sequences(lorfs_medium, padding_length=400)
lorfs_long_one_hot, lorfs_long_target, lorfs_long_len = one_hot_encode_sequences(lorfs_long, padding_length=1000)

# Collapse the (position, channel) axes into a single feature axis per sample
lorfs_short_one_hot_flat = lorfs_short_one_hot.reshape(len(lorfs_short_one_hot), -1)
lorfs_medium_one_hot_flat = lorfs_medium_one_hot.reshape(len(lorfs_medium_one_hot), -1)
lorfs_long_one_hot_flat = lorfs_long_one_hot.reshape(len(lorfs_long_one_hot), -1)

Original shape: (11933, 5)
New shape after filtering: (11931, 5)
Number of removed rows: 2




Original shape: (6020, 5)
New shape after filtering: (6019, 5)
Number of removed rows: 1


Original shape: (840, 5)
New shape after filtering: (840, 5)
Number of removed rows: 0




## Models

In [6]:
def predict_and_evaluate(models, data, targets):
    """
    Run each model on its dataset and report the accuracy of the thresholded predictions.

    Parameters:
    - models: A dictionary of loaded models keyed by dataset size ('short', 'medium', 'long').
      Each model must expose a ``predict`` method whose result supports ``ravel()``.
    - data: A dictionary containing the preprocessed input data for each model, keyed by dataset size.
    - targets: A dictionary containing the true labels for each dataset size.

    Every key of ``models`` must also be present in ``data`` and ``targets``.
    The accuracy for each dataset size is printed as it is computed.

    Returns:
    - A dictionary containing the accuracy scores keyed by dataset size.
    """
    accuracies = {}

    for size, model in models.items():
        predictions = model.predict(data[size])

        # Flatten the model output and label a sample 1 only when its value is
        # strictly greater than 0.5, otherwise 0.
        binary_predictions = (predictions.ravel() > 0.5).astype(int)

        accuracy = accuracy_score(targets[size], binary_predictions)
        accuracies[size] = accuracy
        print(f'{size.capitalize()} dataset accuracy: {accuracy}')

    return accuracies

# Inputs for predict_and_evaluate, keyed by dataset size.
# The flattened one-hot encodings are used as the model input here.
data = dict(
    short=lorfs_short_one_hot_flat,
    medium=lorfs_medium_one_hot_flat,
    long=lorfs_long_one_hot_flat,
)

# True labels (the 'has_hmm' column of each filtered test set), keyed the same way
targets = dict(
    short=lorfs_short_target,
    medium=lorfs_medium_target,
    long=lorfs_long_target,
)



### Dense

In [7]:
# Directory holding the saved dense models, one file per dataset size
base_dir = 'Models/Dense'
dataset_sizes = ['short', 'medium', 'long']

# Loaded models, keyed by dataset size
models = {}

for size in dataset_sizes:
    # Files are named '<size>_dataset_model.keras' inside base_dir
    model_path = os.path.join(base_dir, f'{size}_dataset_model.keras')
    models[size] = load_model(model_path)
    print(f'{size.capitalize()} dataset model loaded from {model_path}')

# Score every dense model on its matching test set
accuracies = predict_and_evaluate(models, data, targets)


2024-02-07 07:41:48.811492: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-07 07:41:48.829746: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Short dataset model loaded from Models/Dense/short_dataset_model.keras
Medium dataset model loaded from Models/Dense/medium_dataset_model.keras
Long dataset model loaded from Models/Dense/long_dataset_model.keras
Short dataset accuracy: 0.668007711004945
Medium dataset accuracy: 0.6122279448413358
Long dataset accuracy: 0.6083333333333333


### RNN

In [8]:
# The recurrent models are fed the un-flattened 3-D one-hot encodings
data = {
    'short': lorfs_short_one_hot,
    'medium': lorfs_medium_one_hot,
    'long': lorfs_long_one_hot
}

base_dir = 'Models/RNN'
dataset_sizes = ['short', 'medium', 'long']

# Start from an empty dictionary so this cell does not depend on (or keep
# entries from) a `models` dict populated by an earlier cell.
models = {}

# Load the saved model for each dataset size and show the input shape it will receive
for size in dataset_sizes:
    model_name = f'{size}_dataset_model.keras'
    model_path = os.path.join(base_dir, model_name)
    models[size] = load_model(model_path)
    print(f'{size.capitalize()} dataset shape:', data[size].shape)

# Score every RNN model on its matching test set
accuracies = predict_and_evaluate(models, data, targets)

Short dataset shape: (11931, 100, 20)
Medium dataset shape: (6019, 400, 20)
Long dataset shape: (840, 1000, 20)
Short dataset accuracy: 0.7437767161176766
Medium dataset accuracy: 0.7027745472669878
Long dataset accuracy: 0.42857142857142855


### Logistic regression

In [9]:
# The logistic-regression models receive only the 'length' column as input
data = {
    'short': lorfs_short_len,
    'medium': lorfs_medium_len,
    'long': lorfs_long_len
}

base_dir = 'Models/Logreg'

# Model names based on the dataset size
dataset_sizes = ['short', 'medium', 'long']

# Initialize a dictionary to hold the loaded models
models = {}

# Loop through the dataset sizes, load each model, and add it to the dictionary
for size in dataset_sizes:
    model_name = f'{size}_dataset_model.joblib'
    model_path = os.path.join(base_dir, model_name)
    # NOTE: joblib.load is pickle-based and can execute arbitrary code while
    # loading; only load model files that come from a trusted source.
    models[size] = joblib.load(model_path)
    print(f'{size.capitalize()} dataset model loaded from {model_path}')

# Call the function with the models, data, and targets
accuracies = predict_and_evaluate(models, data, targets)


Short dataset model loaded from Models/Logreg/short_dataset_model.joblib
Medium dataset model loaded from Models/Logreg/medium_dataset_model.joblib
Long dataset model loaded from Models/Logreg/long_dataset_model.joblib
Short dataset accuracy: 0.5474813511021708
Medium dataset accuracy: 0.6745306529323808
Long dataset accuracy: 0.45
