In [1]:
import numpy as np
import random
from sklearn.model_selection import train_test_split

In [None]:
def read_fasta(input_f):
    """Read a FASTA file and return its sequences as a list of strings.

    Lines starting with '>' are treated purely as record separators; their
    text is discarded. Every other line is stripped of surrounding whitespace
    and appended to the record currently being read. Records that contain no
    sequence characters are skipped.

    Parameters
    ----------
    input_f : path of the file, passed straight to ``open``.

    Returns
    -------
    list of str
        One concatenated sequence per non-empty record, in file order.
    """
    records = []
    pieces = []  # non-empty lines of the record currently being read

    def flush():
        # Close the current record, if it holds any sequence text.
        if pieces:
            records.append("".join(pieces))
            pieces.clear()

    with open(input_f, 'r') as handle:
        for raw in handle:
            text = raw.strip()
            if text.startswith('>'):
                flush()
            elif text:
                pieces.append(text)

    # The last record has no trailing header to terminate it.
    flush()
    return records

def select_random_sequences(sequences, num_samples=10000, seed=None):
    """Draw ``num_samples`` items from ``sequences`` without replacement.

    Parameters
    ----------
    sequences : sequence (e.g. list of str)
        Population to sample from.
    num_samples : int, default 10000
        Number of items to draw.
    seed : optional
        When given, the draw is made with a private ``random.Random(seed)`` so
        the result is reproducible and the global ``random`` state is left
        untouched. When ``None`` (default) the global ``random`` module state
        is used, exactly as before.

    Returns
    -------
    list
        The sampled items, in selection order.

    Raises
    ------
    ValueError
        If ``num_samples`` is larger than ``len(sequences)``.
    """
    if len(sequences) < num_samples:
        raise ValueError("The number of requested sequences exceeds the initial list size.")

    if seed is None:
        return random.sample(sequences, num_samples)
    return random.Random(seed).sample(sequences, num_samples)


def encode_sequences_one_hot_with_gap(sequences, max_length=None):
    """One-hot encode sequences over the 21-symbol alphabet '-ACDEFGHIKLMNPQRSTVWY'.

    Sequences containing any character outside the alphabet are dropped
    entirely, so the first axis of the result can be shorter than
    ``len(sequences)``.

    Parameters
    ----------
    sequences : iterable of str
        Sequences to encode.
    max_length : int or None, default None
        Size of the position axis. Longer sequences are truncated; shorter
        ones are left as all-zero rows past their end. When ``None`` the
        length of the longest valid sequence is used (0 if none is valid).

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(n_valid, max_length, 21)``. The result
        is always 3-D, including when no sequence is valid, so callers can
        unpack ``.shape`` into three values unconditionally.
    """
    amino_acids = '-ACDEFGHIKLMNPQRSTVWY'
    aa_to_idx = {aa: idx for idx, aa in enumerate(amino_acids)}

    # Keep only sequences made up entirely of known symbols.
    valid_sequences = [seq for seq in sequences if all(aa in aa_to_idx for aa in seq)]

    if max_length is None:
        # default=0 keeps the "nothing valid" case from raising inside max().
        max_length = max((len(seq) for seq in valid_sequences), default=0)

    encoded_matrix = np.zeros(
        (len(valid_sequences), max_length, len(amino_acids)), dtype=np.float32
    )

    for i, seq in enumerate(valid_sequences):
        # Slicing truncates sequences longer than max_length.
        for j, aa in enumerate(seq[:max_length]):
            encoded_matrix[i, j, aa_to_idx[aa]] = 1.0
    return encoded_matrix

# --------------------------------------------------------------------------------------
# Load every sequence from the FASTA file.
seq_list = read_fasta('PF00069_noinserts_gaps_noduplicates.fasta')

N = 15000  # number of sequences to draw at random from seq_list

# Seed the global RNG before sampling so the random subset, and every array
# derived from it, is the same on each Restart & Run All.
random.seed(42)
small_batch_seq_list = select_random_sequences(seq_list,  N)

# The encoder drops sequences containing symbols outside its alphabet, so
# encoded_matrix can have fewer than N rows.
encoded_matrix = encode_sequences_one_hot_with_gap(small_batch_seq_list, max_length=None)

In [11]:
# Axes of the one-hot tensor: M sequences, L positions, A alphabet symbols.
M, L, A = encoded_matrix.shape

# Merge the position and symbol axes so each sequence becomes one flat
# feature vector of length L * A.
flattened_matrix = np.reshape(encoded_matrix, (M, L * A))
print(flattened_matrix.shape)

(14945, 5523)


In [12]:
# Step 1: hold out 15% of all rows as the test set; the other 85% is kept
# for training + validation.
X_train_val, X_test = train_test_split(
    flattened_matrix,
    test_size=0.15,
    random_state=42,
)

# Step 2: take 17.65% of the remaining rows as the validation set
# (0.1765 * 0.85 is about 0.15), which gives roughly 70% train / 15% validation /
# 15% test overall.
X_train, X_val = train_test_split(
    X_train_val,
    test_size=0.1765,
    random_state=42,
)

print("Train shape:", X_train.shape)
print("Validation shape:", X_val.shape)
print("Test shape:", X_test.shape)


Train shape: (10460, 5523)
Validation shape: (2243, 5523)
Test shape: (2242, 5523)


In [13]:
# Persist the three splits in a single .npz archive. Each array is stored under
# the keyword name used here (X_train, X_val, X_test), which is the key needed
# to read it back after np.load.
np.savez('kinase_data_splits.npz', X_train=X_train, X_val=X_val, X_test=X_test)

### loading

In [None]:
# Reload the saved splits. np.load on an .npz file returns an archive object
# that keeps the file open, so use it as a context manager to close it as soon
# as the three arrays have been read into memory.
with np.load('kinase_data_splits.npz') as data:
    X_train = data['X_train']
    X_val = data['X_val']
    X_test = data['X_test']

print("Train shape:", X_train.shape)
print("Validation shape:", X_val.shape)
print("Test shape:", X_test.shape)
