In [1]:
from tensorflow.keras.models import load_model
import numpy as np
import pandas as pd
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

2024-02-14 08:19:33.413061: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-14 08:19:33.413105: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-14 08:19:33.462483: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-14 08:19:33.563074: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read the short / medium / long lORF tables from Data/Test and order each one
# by its 'length' column in a single chained expression.
lorfs_short = pd.read_csv('Data/Test/lorfs_short.csv').sort_values(by='length')
lorfs_medium = pd.read_csv('Data/Test/lorfs_medium.csv').sort_values(by='length')
lorfs_long = pd.read_csv('Data/Test/lorfs_long.csv').sort_values(by='length')

def filter_non_aa(df):
    """
    Drop rows whose 'Sequence' contains a character from the non-amino-acid blacklist.

    The blacklist is: '*', 'x', the digits '0'-'9', ';', '>', '_', '.' and '-'.
    Matching is case-insensitive, so 'X' is rejected as well. Rows whose
    'Sequence' is missing (NaN) are not flagged by the match (na=False) and are
    therefore kept.

    Prints the shape before and after filtering and the number of removed rows.

    Parameters:
    - df: DataFrame with a string 'Sequence' column.

    Returns:
    - A new DataFrame containing only the rows that passed the filter; the
      input frame is not modified.
    """
    # A raw-string character class matches exactly the same characters as the
    # former '|'-joined alternation, without relying on invalid escape
    # sequences such as '\*' inside ordinary (non-raw) string literals.
    pattern = r'[*x0-9;>_.\-]'

    print("Original shape:", df.shape)
    original_rows = df.shape[0]
    # True for every row whose sequence contains at least one blacklisted character
    has_invalid_char = df['Sequence'].str.contains(pattern, regex=True, case=False, na=False)
    filtered = df[~has_invalid_char]
    print("New shape after filtering:", filtered.shape)
    print("Number of removed rows:", original_rows - filtered.shape[0])
    print("\n")
    return filtered


def one_hot_encode_sequences(df, padding_length, tokenizer=None):
    """
    Filter a frame with filter_non_aa and one-hot encode its 'Sequence' column.

    Each sequence is tokenized character by character, post-padded /
    post-truncated to `padding_length`, and expanded into one channel per
    vocabulary character. Padding positions are all-zero rows.

    Parameters:
    - df: DataFrame with 'Sequence', 'has_hmm' and 'length' columns.
    - padding_length: Number of positions every encoded sequence is padded or
      truncated to.
    - tokenizer: Optional already-fitted character-level Tokenizer. When given,
      it is used as-is so that every dataset shares one character-to-channel
      mapping. When None (the default), a new Tokenizer is fitted on `df`.

    Returns:
    - A tuple (one_hot_sequences, has_hmm, length_frame) where
      one_hot_sequences is a float array of shape
      (rows, padding_length, len(tokenizer.word_index)), has_hmm is the
      'has_hmm' column of the filtered frame and length_frame is a
      single-column DataFrame holding its 'length' column.
    """
    df = filter_non_aa(df)
    if tokenizer is None:
        # NOTE(review): a Tokenizer fitted here assigns indices by character
        # frequency in *this* frame, so the channel order (and the channel
        # count, if a character is absent) can differ between datasets and
        # from the encoding the saved models were trained with. Pass the
        # training-time tokenizer via `tokenizer` to keep channels aligned.
        tokenizer = Tokenizer(char_level=True)
        tokenizer.fit_on_texts(df['Sequence'])
    sequences_numeric = tokenizer.texts_to_sequences(df['Sequence'])
    sequences_padded = pad_sequences(sequences_numeric, maxlen=padding_length, padding='post', truncating='post')

    vocab_size = len(tokenizer.word_index)
    # Row k of the lookup table is the one-hot vector for token index k with the
    # padding channel (column 0) removed; row 0 is therefore all zeros, so
    # padded positions stay empty.
    lookup = np.eye(vocab_size + 1)[:, 1:]
    # Integer-array indexing replaces the explicit double Python loop.
    one_hot_sequences = lookup[sequences_padded]
    return one_hot_sequences, df['has_hmm'], pd.DataFrame(df['length'])

In [3]:
folders = ['Nitrosomonas_europaea', 'Nitrosomonas_ureae', 'Nitrosospira_briensis', 'Candidatus_Methylopumilus_turicensis', 'Aquincola_tertiaricarbonis']  # Adjust as necessary

# Padding / truncation length applied to each length category.
# NOTE(review): presumably these have to equal the input lengths the saved
# models expect - confirm against the training notebook.
PADDING_LENGTHS = {'short': 100, 'medium': 400, 'long': 1000}

# Processed data indexed by length category, then by folder name. Every entry
# is the tuple returned by one_hot_encode_sequences.
processed_data = {size: {} for size in PADDING_LENGTHS}

for folder in folders:
    for size, padding_length in PADDING_LENGTHS.items():
        # One CSV per folder and length category, ordered by ascending length.
        # A dedicated local name is used so the frames loaded earlier in the
        # notebook are not silently overwritten by the last folder's data.
        lorfs_path = f'Data/Test/{folder}/lorfs_{size}.csv'
        folder_lorfs = pd.read_csv(lorfs_path).sort_values(by='length')
        processed_data[size][folder] = one_hot_encode_sequences(folder_lorfs, padding_length)

# Now `processed_data` contains all the processed data indexed by length category and folder name

Original shape: (25635, 5)
New shape after filtering: (25634, 5)
Number of removed rows: 1


Original shape: (4680, 5)
New shape after filtering: (4680, 5)
Number of removed rows: 0


Original shape: (711, 5)
New shape after filtering: (711, 5)
Number of removed rows: 0


Original shape: (27960, 5)
New shape after filtering: (27959, 5)
Number of removed rows: 1


Original shape: (3405, 5)
New shape after filtering: (3405, 5)
Number of removed rows: 0


Original shape: (762, 5)
New shape after filtering: (762, 5)
Number of removed rows: 0


Original shape: (29928, 5)
New shape after filtering: (29928, 5)
Number of removed rows: 0


Original shape: (5750, 5)
New shape after filtering: (5750, 5)
Number of removed rows: 0


Original shape: (779, 5)
New shape after filtering: (779, 5)
Number of removed rows: 0


Original shape: (13879, 5)
New shape after filtering: (13878, 5)
Number of removed rows: 1


Original shape: (1605, 5)
New shape after filtering: (1605, 5)
Number of removed rows: 0

## Models

In [None]:


def predict_and_evaluate(models, data, targets):
    """
    Run every model on its matching dataset and report how well it classifies.

    For each dataset size the model's outputs are thresholded at 0.5, then the
    accuracy and the confusion matrix against the true labels are printed.

    Parameters:
    - models: Dictionary of loaded models keyed by dataset size.
    - data: Dictionary of model inputs, keyed by the same dataset sizes.
    - targets: Dictionary of true labels, keyed by the same dataset sizes.

    Returns:
    - Dictionary mapping each dataset size to its accuracy score.
    """
    scores_by_size = {}

    for size, model in models.items():
        # Flatten the raw model outputs to one score per sample
        raw_scores = model.predict(data[size]).ravel()

        # A score strictly above 0.5 counts as the positive class
        binary_predictions = [int(score > 0.5) for score in raw_scores]

        accuracy = accuracy_score(targets[size], binary_predictions)
        scores_by_size[size] = accuracy

        print(f'{size.capitalize()} dataset accuracy: {accuracy}')
        print(f'{size.capitalize()} dataset confusion matrix:')
        print(confusion_matrix(targets[size], binary_predictions))
        print('\n')

    return scores_by_size

# Example inputs for predict_and_evaluate, keyed by dataset size.
# NOTE(review): the lorfs_*_one_hot_flat and lorfs_*_target names used below
# are not defined anywhere in the visible part of this notebook; unless they
# are created elsewhere first, running this cell raises NameError.

data = dict(
    short=lorfs_short_one_hot_flat,
    medium=lorfs_medium_one_hot_flat,
    long=lorfs_long_one_hot_flat,
)

targets = dict(
    short=lorfs_short_target,
    medium=lorfs_medium_target,
    long=lorfs_long_target,
)



In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


# Function to predict and evaluate using the processed_data dictionary
def predict_and_evaluate_with_processed_data(models, processed_data, flat=1):
    """
    Evaluate one model per length category on every folder in `processed_data`.

    Parameters:
    - models: Dictionary of loaded models keyed by 'short', 'medium', 'long'.
    - processed_data: Dictionary keyed by the same sizes; each value maps a
      folder name to a 3-tuple whose first item is the model input array and
      whose second item holds the true labels (the third item is ignored).
    - flat: When truthy, each sample is flattened to one dimension before
      prediction; when falsy, the input array is passed through unchanged.

    Returns:
    - Nested dictionary {size: {folder: {'accuracy': ..., 'confusion_matrix': ...}}}.
    """
    def _score_folder(model, bundle):
        # Unpack (inputs, labels, unused) and score one folder with one model
        features, targets, _ = bundle
        model_input = features.reshape(features.shape[0], -1) if flat else features

        predictions = model.predict(model_input)
        # Outputs strictly above 0.5 are treated as the positive class
        binary_predictions = [int(score > 0.5) for score in predictions.ravel()]

        return {
            'accuracy': accuracy_score(targets, binary_predictions),
            'confusion_matrix': confusion_matrix(targets, binary_predictions),
        }

    return {
        size: {
            folder: _score_folder(models[size], bundle)
            for folder, bundle in processed_data[size].items()
        }
        for size in ('short', 'medium', 'long')
    }

def print_summary(all_accuracies):
    """
    Print a plain-text report of evaluation results.

    Parameters:
    - all_accuracies: Nested dictionary {size: {folder: metrics}} where each
      metrics dictionary has an 'accuracy' number and a 'confusion_matrix'.

    Returns:
    - None; the report is written to standard output.
    """
    def _print_folder(folder, metrics):
        # One folder's block: name, accuracy to four decimals, then the matrix
        print(f"Folder: {folder}")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print("Confusion Matrix:")
        print(metrics['confusion_matrix'])
        print("\n")

    for size in all_accuracies:
        print(f"--- {size.upper()} DATASET RESULTS ---")
        for folder, metrics in all_accuracies[size].items():
            _print_folder(folder, metrics)
        # Extra spacing between dataset sizes
        print("\n")

In [5]:
# Load the saved models under Models/Dense, one per dataset size
base_dir = 'Models/Dense'
dataset_sizes = ['short', 'medium', 'long']

# Each model is stored as <base_dir>/<size>_dataset_model.keras
models = {
    size: load_model(os.path.join(base_dir, f'{size}_dataset_model.keras'))
    for size in dataset_sizes
}

# Evaluate on every folder in processed_data; `flat` is left at its default,
# so each sample is flattened before prediction
all_accuracies = predict_and_evaluate_with_processed_data(models, processed_data)

print_summary(all_accuracies)

2024-02-14 08:21:04.229392: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-14 08:21:04.416462: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


--- SHORT DATASET RESULTS ---
Folder: Nitrosomonas_europaea
Accuracy: 0.5765
Confusion Matrix:
[[13371 10165]
 [  692  1406]]


Folder: Nitrosomonas_ureae
Accuracy: 0.6067
Confusion Matrix:
[[15573  9944]
 [ 1052  1390]]


Folder: Nitrosospira_briensis
Accuracy: 0.5658
Confusion Matrix:
[[15467 12178]
 [  818  1465]]


Folder: Candidatus_Methylopumilus_turicensis
Accuracy: 0.6204
Confusion Matrix:
[[7946 4719]
 [ 549  664]]


Folder: Aquincola_tertiaricarbonis
Accuracy: 0.5749
Confusion Matrix:
[[19058 14126]
 [  957  1337]]




--- MEDIUM DATASET RESULTS ---
Folder: Nitrosomonas_europaea
Accuracy: 0.4737
Confusion Matrix:
[[1241 1385]
 [1078  976]]


Folder: Nitrosomonas_ureae
Accuracy: 0.7941
Confusion Matrix:
[[1123  207]
 [ 494 1581]]


Folder: Nitrosospira_briensis
Accuracy: 0.7165
Confusion Matrix:
[[2814  834]
 [ 796 1306]]


Folder: Candidatus_Methylopumilus_turicensis
Accuracy: 0.6922
Confusion Matrix:
[[267 181]
 [313 844]]


Folder: Aquincola_tertiaricarbonis
Accuracy: 0.602

### RNN

In [10]:
# Load the saved models under Models/RNN, one per dataset size
base_dir = 'Models/RNN'
dataset_sizes = ['short', 'medium', 'long']

# Each model is stored as <base_dir>/<size>_dataset_model.keras
models = {
    size: load_model(os.path.join(base_dir, f'{size}_dataset_model.keras'))
    for size in dataset_sizes
}

# Evaluate on every folder in processed_data; flat = 0 passes the one-hot
# arrays to the models without flattening them
all_accuracies = predict_and_evaluate_with_processed_data(models, processed_data, flat = 0)

print_summary(all_accuracies)

--- SHORT DATASET RESULTS ---
Folder: Nitrosomonas_europaea
Accuracy: 0.6209
Confusion Matrix:
[[14632  8904]
 [  813  1285]]


Folder: Nitrosomonas_ureae
Accuracy: 0.6507
Confusion Matrix:
[[16994  8523]
 [ 1244  1198]]


Folder: Nitrosospira_briensis
Accuracy: 0.6389
Confusion Matrix:
[[17822  9823]
 [  985  1298]]


Folder: Candidatus_Methylopumilus_turicensis
Accuracy: 0.6698
Confusion Matrix:
[[8695 3970]
 [ 612  601]]


Folder: Aquincola_tertiaricarbonis
Accuracy: 0.6460
Confusion Matrix:
[[21787 11397]
 [ 1162  1132]]




--- MEDIUM DATASET RESULTS ---
Folder: Nitrosomonas_europaea
Accuracy: 0.5415
Confusion Matrix:
[[2302  324]
 [1822  232]]


Folder: Nitrosomonas_ureae
Accuracy: 0.4628
Confusion Matrix:
[[1246   84]
 [1745  330]]


Folder: Nitrosospira_briensis
Accuracy: 0.5979
Confusion Matrix:
[[3293  355]
 [1957  145]]


Folder: Candidatus_Methylopumilus_turicensis
Accuracy: 0.2841
Confusion Matrix:
[[ 448    0]
 [1149    8]]


Folder: Aquincola_tertiaricarbonis
Accuracy: 0