In [1]:
from Bio import SeqIO
import pickle
import numpy as np
from Bio import SeqIO
import pandas as pd
from sklearn.preprocessing import StandardScaler
import warnings
# Show each distinct warning only the first time it is raised.
warnings.filterwarnings(action='once')

def generate_kmers(dna_sequence, k=3):
    """Generate overlapping k-mers from a DNA sequence.

    Args:
        dna_sequence: Sliceable sequence (typically a ``str``) to split.
        k: Window length; must be at least 1.

    Returns:
        A list of every length-``k`` window, advancing one position at a
        time. The list is empty when the sequence is shorter than ``k``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    # k == 0 would yield len+1 empty strings and a negative k arbitrary
    # slices; both are meaningless as k-mers, so reject them up front.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    return [dna_sequence[i:i + k] for i in range(len(dna_sequence) - k + 1)]

def dna_vectorization(kmers, model, dimensions):
    """Vectorize DNA k-mers using a pre-trained model.

    Each k-mer found in ``model`` contributes ``model[kmer]``; k-mers that
    are not in the model contribute a zero row. The result is the mean over
    *all* rows, so unknown k-mers pull the average towards zero.

    Args:
        kmers: Sequence of k-mer keys to look up.
        model: Mapping-like object supporting ``kmer in model`` and
            ``model[kmer]``, yielding vectors of length ``dimensions``.
        dimensions: Length of each embedding vector.

    Returns:
        A 1-D ``numpy`` array of length ``dimensions``. An all-zero vector is
        returned when ``kmers`` is empty.
    """
    # With no k-mers (sequence shorter than k) np.mean would reduce over zero
    # rows, emitting a RuntimeWarning and returning NaNs that would then
    # contaminate any downstream scaling. Return a neutral zero vector instead.
    if len(kmers) == 0:
        return np.zeros(dimensions)

    vector = np.zeros((len(kmers), dimensions))
    for i, kmer in enumerate(kmers):
        if kmer in model:
            vector[i] = model[kmer]
    return np.mean(vector, axis=0)

def standardize_vectors(vectors):
    """Fit a fresh ``StandardScaler`` on ``vectors`` and return the transformed data.

    Args:
        vectors: Feature matrix passed unchanged to
            ``StandardScaler.fit_transform``.

    Returns:
        The array produced by ``fit_transform``.
    """
    return StandardScaler().fit_transform(vectors)

def main():
    """Vectorize every record of the FASTA file and save the result as CSV.

    Loads the pickled k-mer model, turns each FASTA record into the mean of
    its k-mer vectors, standardizes the resulting matrix and writes it to
    ``gyrase_vectorized.csv`` with the record ids as the index.

    Raises:
        ValueError: If the FASTA file contains no records.
    """
    k = 3  # Adjust k as needed
    dimensions = 100  # Assuming model vectors have 100 dimensions

    # Load your pre-trained k-mer vectorization model
    # SECURITY: pickle.load can execute arbitrary code while unpickling;
    # only point this at a model file from a trusted source.
    with open('path_to_your_model.pkl', 'rb') as model_file:
        vec_model = pickle.load(model_file)

    sequences = SeqIO.to_dict(SeqIO.parse("rcsb_pdb_6RKS.fasta", "fasta"))
    # Fail early with a clear message: an empty list would otherwise reach the
    # scaler as a 1-D array and raise an unrelated-looking shape error.
    if not sequences:
        raise ValueError("No sequences found in rcsb_pdb_6RKS.fasta")

    # NOTE(review): the functions assume DNA k-mers — confirm the records in
    # this FASTA use the same alphabet the model was trained on.
    vectorized_seqs = []
    seq_ids = []
    for seq_id, sequence in sequences.items():
        kmers = generate_kmers(str(sequence.seq), k=k)
        vec = dna_vectorization(kmers, vec_model, dimensions)
        vectorized_seqs.append(vec)
        seq_ids.append(seq_id)

    # Standardize vectors
    # NOTE(review): scaling is fitted across the records of this one file, so
    # the output presumably carries little information when there are very
    # few records — confirm this is intended.
    standardized_vectors = standardize_vectors(np.array(vectorized_seqs))

    # Save to CSV
    df = pd.DataFrame(standardized_vectors, index=seq_ids)
    df.to_csv('gyrase_vectorized.csv')

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


ID: 6RKS_1|Chains
Description: 6RKS_1|Chains A, C|DNA gyrase subunit A|Escherichia coli K-12 (83333)
Sequence: MSDLAREITPVNIEEELKSSYLDYAMSVIVGRALPDVRDGLKPVHRRVLYAMNVLGNDWNKAYKKSARVVGDVIGKYHPHGDSAVYDTIVRMAQPFSLRYMLVDGQGNFGSIDGDSAAAMRYTEIRLAKIAHELMADLEKETVDFVDNYDGTEKIPDVMPTKIPNLLVNGSSGIAVGMATNIPPHNLTEVINGCLAYIDDEDISIEGLMEHIPGPDFPTAAIINGRRGIEEAYRTGRGKVYIRARAEVEVDAKTGRETIIVHEIPYQVNKARLIEKIAELVKEKRVEGISALRDESDKDGMRIVIEVKRDAVGEVVLNNLYSQTQLQVSFGINMVALHHGQPKIMNLKDIIAAFVRHRREVVTRRTIFELRKARDRAHILEALAVALANIDPIIELIRHAPTPAEAKTALVANPWQLGNVAAMLERAGDDAARPEWLEPEFGVRDGLYYLTEQQAQAILDLRLQKLTGLEHEKLLDEYKELLDQIAELLRILGSADRLMEVIREELELVREQFGDKRRTEITANSADINLEDLITQEDVVVTLSHQGYVKYQPLSEYEAQRRGGKGKSAARIKEEDFIDRLLVANTHDHILCFSSRGRVYSMKVYQLPEATRGARGRPIVNLLPLEQDERITAILPVTEFEEGVKVFMATANGTVKKTVLTEFNRLRTAGKVAIKLVDGDELIGVDLTSGEDEVMLFSAEGKVVRFKESSVRAMGCNTTGVRGIRLGEGDKVVSLIVPRGDGAILTATQNGYGKRTAVAEYPTKSRATKGVISIKVTERNGLVVGAVQVDDCDQIMMITDAGTLVRTRVSEISIVGRNTQGVILIRTAEDENVVGLQRVAEPVDEEDLDTIDGSAAEGDDEIAPEVDVDDEPEEE

ID: 6RKS_2|C