In [1]:
import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# set the seed
np.random.seed(1337)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
2024-02-07 01:55:38.248798: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-07 01:55:38.248822: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-07 01:55:38.249352: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory 

In [2]:
df = pd.read_csv('Data/lorfs.csv')
df.head()

Unnamed: 0,Header,Sequence,length,genome_id,has_hmm
0,NC_010085.1;1000047;1000160;+,MSANMGLSIEGIPSFGGPPPLILSKGIRCLMPSVSII,37,GCF_000018465.1_ASM1846v1,0
1,NC_010085.1;1000357;1000449;+,MDRIWTNPFNDFSKARINEYRSNPKGKCIT,30,GCF_000018465.1_ASM1846v1,0
2,NC_010085.1;1000687;1000782;+,MGKWIFCFQRRMGMGFSNASCNTCNHSSRAR,31,GCF_000018465.1_ASM1846v1,0
3,NC_010085.1;1000385;1000831;+,MIFQRPESMNTEATLKGNVLHDITHWGISASIGVTFIVHSIKKFDP...,148,GCF_000018465.1_ASM1846v1,1
4,NC_010085.1;1000860;1001009;+,MNFFIFGFGITAPHALSCPPPEPPRSVANKLTILPKCTLQFLEPLTDTA,49,GCF_000018465.1_ASM1846v1,0


In [3]:
low = 100
high = 400
# divide df into 3 dfs based on "lenght"
df1 = df[df['length'] < low]
df2 = df[(df['length'] >= low) & (df['length'] < high)]
df3 = df[df['length'] >= high]

In [4]:
import pandas as pd

# Function to undersample a dataframe and print details
def undersample_dataframe(df, column_name='has_hmm'):
    # Print initial stats
    initial_total = len(df)
    initial_percentage_minority = df[column_name].mean() * 100
    print(f"Before undersampling: {initial_total} rows, {initial_percentage_minority:.2f}% minority class")

    # Separate the majority and minority classes
    minority_class_size = df[column_name].value_counts().min()
    majority_class_size = df[column_name].value_counts().max()

    # Identify the majority and minority class labels
    majority_class_label = df[column_name].value_counts().idxmax()
    minority_class_label = df[column_name].value_counts().idxmin()

    # Split the dataframe into majority and minority
    df_majority = df[df[column_name] == majority_class_label]
    df_minority = df[df[column_name] == minority_class_label]

    # Downsample majority class
    df_majority_downsampled = df_majority.sample(n=minority_class_size, random_state=42)  # Random state for reproducibility

    # Combine minority class with downsampled majority class
    df_balanced = pd.concat([df_majority_downsampled, df_minority])

    # Print final stats
    final_total = len(df_balanced)
    final_percentage_minority = df_balanced[column_name].mean() * 100
    print(f"After undersampling: {final_total} rows, {final_percentage_minority:.2f}% minority class")

    return df_balanced

# Example usage with df1, assuming 'has_hmm' is the column of interest
print("DF1:")
df1 = undersample_dataframe(df1, 'has_hmm')
print("\nDF2:")
df2 = undersample_dataframe(df2, 'has_hmm')
print("\nDF3:")
df3 = undersample_dataframe(df3, 'has_hmm')

DF1:
Before undersampling: 2146532 rows, 7.65% minority class
After undersampling: 328462 rows, 50.00% minority class

DF2:
Before undersampling: 531987 rows, 35.47% minority class
After undersampling: 377392 rows, 50.00% minority class

DF3:
Before undersampling: 106544 rows, 76.83% minority class
After undersampling: 49370 rows, 50.00% minority class


Removing the samples with unclear amino acids.

In [5]:

def filter_non_aa(df):
    non_amino_acids = ['\*','x','0','1',';','5','4','7','8','9','>','_','\.','2','\-','3','6']
    pattern = '|'.join(non_amino_acids)

    print("Original shape:", df.shape)
    orginal_shape = df.shape
    # Make sure the sequence column doesn't contain any of these characters
    df = df[~df['Sequence'].str.contains(pattern, regex=True, case=False, na=False)]
    # Print new shape
    print("New shape after filtering:", df.shape)
    print("Number of removed rows:", orginal_shape[0] - df.shape[0])
    print("\n")
    return df

df1 = filter_non_aa(df1)
df2 = filter_non_aa(df2)
df3 = filter_non_aa(df3)

Original shape: (328462, 5)
New shape after filtering: (328460, 5)
Number of removed rows: 2


Original shape: (377392, 5)
New shape after filtering: (377379, 5)
Number of removed rows: 13


Original shape: (49370, 5)
New shape after filtering: (49362, 5)
Number of removed rows: 8




# One hot encode

In [8]:

def one_hot_encode_sequences(df, padding_length, name):
    tokenizer = Tokenizer(char_level=True)
    tokenizer.fit_on_texts(df['Sequence'])
    sequences_numeric = tokenizer.texts_to_sequences(df['Sequence'])
    sequences_padded = pad_sequences(sequences_numeric, maxlen=padding_length, padding='post', truncating='post')
    one_hot_sequences = np.zeros((len(sequences_padded), padding_length, (len(tokenizer.word_index) + 1)))
    for i, sequence in enumerate(sequences_padded):
        for j, char_index in enumerate(sequence):
            if char_index != 0:  # Skip padding
                one_hot_sequences[i, j, char_index] = 1
    one_hot_sequences = one_hot_sequences[:,:,1:]
    X_train, X_test, y_train, y_test = train_test_split(one_hot_sequences, df['has_hmm'], test_size=0.2)
    np.savez_compressed('Data/' + name + '.npz', X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
    np.savez_compressed('Data/' + name + '_meta.npz', name = df['genome_id'], length = df['length'], target = df['has_hmm'])
    return None

In [9]:
one_hot_encode_sequences(df1, 100, 'lorfs_100')
one_hot_encode_sequences(df2, 400, 'lorfs_400')
one_hot_encode_sequences(df3, 1000, 'lorfs_1000')