In [14]:
import numpy as np

def preprocess_fasta(fasta_file, save_path="processed_sequences.npy"):
    """
    Reads a FASTA file and encodes amino acid sequences into numerical indices.

    Each residue letter is mapped to its position in "ACDEFGHIKLMNPQRSTVWY"
    (A -> 0, C -> 1, ..., Y -> 19). Letters are upper-cased before lookup, so
    lower-case records are accepted. Records with no sequence lines are skipped.

    Args:
        fasta_file (str): Path to the input FASTA file.
        save_path (str): Path to save the processed sequences as a .npy file.

    Returns:
        sequences_encoded (list): List of encoded sequences (one list of ints
            per FASTA record).

    Raises:
        KeyError: If a sequence contains a letter outside the 20-letter alphabet
            (for example X, B, Z, U, '*' or '-').
    """
    amino_acids = "ACDEFGHIKLMNPQRSTVWY"
    aa_to_idx = {aa: i for i, aa in enumerate(amino_acids)}

    # Collect the lines of each record and join once per record instead of
    # growing a string with += on every line.
    sequences = []
    chunks = []
    with open(fasta_file, 'r') as f:
        for line in f:
            if line.startswith(">"):
                if chunks:
                    sequences.append("".join(chunks))
                    chunks = []
            else:
                text = line.strip()
                if text:  # ignore blank lines so empty records stay skipped
                    chunks.append(text)
        if chunks:
            sequences.append("".join(chunks))

    # Encode sequences
    sequences_encoded = []
    for seq_number, seq in enumerate(sequences, start=1):
        encoded_seq = []
        for position, aa in enumerate(seq.upper(), start=1):
            try:
                encoded_seq.append(aa_to_idx[aa])
            except KeyError:
                # Same exception type as a plain dict lookup, but say where it happened.
                raise KeyError(
                    f"Unsupported residue {aa!r} at position {position} of "
                    f"sequence {seq_number}; expected one of {amino_acids}"
                ) from None
        sequences_encoded.append(encoded_seq)

    # Save to a NumPy file.
    # Equal-length sequences form a regular integer array. Sequences of
    # different lengths cannot, and recent NumPy versions refuse to build an
    # array from a ragged list implicitly, so store them explicitly as a 1-D
    # object array holding one list per sequence.
    if len({len(encoded_seq) for encoded_seq in sequences_encoded}) <= 1:
        to_save = np.array(sequences_encoded)
    else:
        to_save = np.empty(len(sequences_encoded), dtype=object)
        for i, encoded_seq in enumerate(sequences_encoded):
            to_save[i] = encoded_seq
    np.save(save_path, to_save)
    print(f"Processed sequences saved to {save_path}")
    return sequences_encoded

# Example usage: encode the 4JRB FASTA record and save the result to disk.
fasta_file = "data/rcsb_pdb_4JRB.fasta"
preprocessed_sequences = preprocess_fasta(
    fasta_file,
    save_path="preprocessed/4jrb_sequences.npy",
)


Processed sequences saved to preprocessed/4jrb_sequences.npy


In [15]:
import numpy as np
from Bio.PDB import PDBParser

def extract_coordinates(pdb_file, save_path="processed_coordinates.npy"):
    """
    Extracts 3D atomic coordinates (x, y, z) for alpha carbons from a PDB file.

    Every model and chain in the parsed structure is visited in order, so a
    file with several models contributes one entry per residue per model.
    Residues that have no atom named "CA" are skipped. Calcium-ion residues
    (residue name "CA") are skipped as well: their single atom is also named
    "CA" and would otherwise be recorded as if it were an alpha carbon.

    Args:
        pdb_file (str): Path to the input PDB file.
        save_path (str): Path to save the extracted coordinates as a .npy file.

    Returns:
        coordinates (list): List of 3D coordinates for each residue, in the
            order the residues were visited.
    """
    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("protein", pdb_file)

    coordinates = []
    for model in structure:
        for chain in model:
            for residue in chain:
                if "CA" not in residue:
                    continue
                # A calcium ion is stored as a residue named "CA" whose only
                # atom is also called "CA"; it is not an alpha carbon.
                if residue.get_resname().strip().upper() == "CA":
                    continue
                coordinates.append(residue["CA"].coord)  # (x, y, z)

    # Save coordinates as a NumPy array
    np.save(save_path, np.array(coordinates))
    print(f"Processed coordinates saved to {save_path}")
    return coordinates

# Example usage: pull the alpha-carbon coordinates out of the 4JRB structure file.
pdb_file = "data/4jrb.pdb"
preprocessed_coordinates = extract_coordinates(
    pdb_file,
    save_path="preprocessed/4jrb_coordinates.npy",
)


Processed coordinates saved to preprocessed/4jrb_coordinates.npy


In [16]:
coordinates_file_path = "preprocessed/4jrb_coordinates.npy"
# extract_coordinates saves a plain numeric array, so pickle support is not
# needed here; leaving it off stops np.load from unpickling arbitrary objects
# if the file is ever replaced.
coordinates = np.load(coordinates_file_path)

# Count the number of coordinates
num_coordinates = len(coordinates)
num_coordinates

398

In [17]:
import numpy as np
import json

def combine_data(sequence_file, coordinate_file, save_path="dataset.json"):
    """
    Combines encoded sequences and 3D coordinates into a single dataset and saves as JSON.

    All sequences in the file are concatenated into one residue stream, which is
    then paired with the coordinates purely by position (residue i with
    coordinate i). No alignment is performed. If there are more residues than
    coordinates, the surplus residues at the end are dropped and a warning is
    issued.

    Args:
        sequence_file (str): Path to the .npy file containing encoded sequences.
            Both regular integer arrays and object arrays holding one list per
            sequence are accepted.
        coordinate_file (str): Path to the .npy file containing 3D coordinates.
        save_path (str): Path to save the combined dataset as a JSON file.

    Returns:
        dataset (list): List of dictionaries {residue, coordinates}.

    Raises:
        ValueError: If there are fewer residues than coordinate entries.
    """
    import warnings  # local import so this cell does not rely on another cell's imports

    # NOTE: allow_pickle=True is needed to read object-dtype (ragged) sequence
    # files, but unpickling can execute arbitrary code -- only load files you
    # produced yourself.
    raw_sequences = np.load(sequence_file, allow_pickle=True)
    if raw_sequences.dtype == object:
        # Each element is one sequence (or a single index); .flatten() alone
        # would leave the per-sequence lists intact, so concatenate them.
        parts = [np.asarray(item, dtype=np.int64).ravel() for item in raw_sequences.ravel()]
        sequences = np.concatenate(parts) if parts else np.empty(0, dtype=np.int64)
    else:
        sequences = raw_sequences.flatten()  # Ensure sequence is flattened
    coordinates = np.load(coordinate_file, allow_pickle=True)

    n_residues = len(sequences)
    n_coordinates = len(coordinates)

    if n_residues < n_coordinates:
        raise ValueError(
            "The number of sequences and coordinate entries must match!"
            f" Got {n_residues} residues but {n_coordinates} coordinates."
        )
    if n_residues > n_coordinates:
        # Trimming is intentional, but make it visible instead of silent.
        warnings.warn(
            f"Sequence has {n_residues} residues but only {n_coordinates} coordinates; "
            f"dropping the last {n_residues - n_coordinates} residues.",
            stacklevel=2,
        )

    # Trim sequence if necessary to match the number of coordinates
    sequence_trimmed = sequences[:n_coordinates]

    # Combine the sequence and coordinates into a list of dictionaries
    dataset = [
        {"residue": int(seq), "coordinates": coord.tolist()}
        for seq, coord in zip(sequence_trimmed, coordinates)
    ]

    # Save as a JSON file
    with open(save_path, "w") as f:
        json.dump(dataset, f, indent=4)
    print(f"Combined dataset saved to {save_path}")
    return dataset

# Example usage: merge the saved 4JRB sequence and coordinate arrays into one JSON file.
sequence_file = "preprocessed/4jrb_sequences.npy"
coordinate_file = "preprocessed/4jrb_coordinates.npy"
combined_dataset = combine_data(
    sequence_file,
    coordinate_file,
    save_path="JSON/4jrb_dataset.json",
)


Combined dataset saved to JSON/4jrb_dataset.json
