In [None]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings(action='once')

def generate_kmers(prot_sequence, k=3):
    """Return every overlapping window of length ``k`` from a protein sequence.

    Windows are taken left to right with a stride of one residue. A sequence
    shorter than ``k`` produces an empty list.
    """
    window_count = len(prot_sequence) - k + 1
    kmers = []
    for start in range(window_count):
        kmers.append(prot_sequence[start:start + k])
    return kmers

def prot_vectorization(kmers, model_df, dimensions):
    """Average the embedding vectors of a sequence's k-mers.

    Args:
        kmers: Sequence of k-mer strings to embed.
        model_df: DataFrame whose index holds k-mers and whose row values are
            the embedding for that k-mer.
        dimensions: Length of each embedding vector (number of value columns).

    Returns:
        A 1-D array of length ``dimensions`` holding the mean over all k-mers.
        K-mers missing from ``model_df.index`` contribute an all-zero row to
        the mean. If ``kmers`` is empty, a zero vector is returned.
    """
    # Averaging over zero rows would produce an all-NaN vector (plus a
    # RuntimeWarning) that poisons later standardization, so short-circuit.
    if len(kmers) == 0:
        return np.zeros(dimensions)

    known_kmers = model_df.index
    vector = np.zeros((len(kmers), dimensions))
    for i, kmer in enumerate(kmers):
        if kmer in known_kmers:
            vector[i] = model_df.loc[kmer].values
    return np.mean(vector, axis=0)


def standardize_vectors(vectors):
    """Fit a fresh ``StandardScaler`` on ``vectors`` and return the scaled data."""
    return StandardScaler().fit_transform(vectors)

def main():
    """Vectorize each FASTA record with the k-mer model and save the result.

    Reads the embedding table from 'protVec_100d_3grams.csv', averages the
    3-mer embeddings of every record in 'rcsb_pdb_6RKS.fasta', standardizes
    the resulting vectors, and writes them to 'gyrase_vectorized.csv' with
    the record ids as the row index. Prints a few diagnostics along the way.
    If nothing could be vectorized, prints a message and returns without
    writing any output.
    """
    # The model file is tab-separated despite its .csv extension; its first
    # column becomes the index used for k-mer lookups.
    model_df = pd.read_csv('protVec_100d_3grams.csv', index_col=0, delimiter='\t')

    print(model_df.head())

    sequences = SeqIO.to_dict(SeqIO.parse("rcsb_pdb_6RKS.fasta", "fasta"))

    # The embedding width is the same for every sequence, so compute it once.
    dimensions = model_df.shape[1]
    vectorized_seqs = []
    seq_ids = []
    for seq_id, sequence in sequences.items():
        kmers = generate_kmers(str(sequence.seq), k=3)
        vectorized_seqs.append(prot_vectorization(kmers, model_df, dimensions))
        seq_ids.append(seq_id)

    # Bail out before any downstream step: stacking, standardizing and saving
    # all require at least one non-empty vector.
    if len(vectorized_seqs) == 0 or vectorized_seqs[0].size == 0:
        print("No sequences were vectorized. Check the vectorization process.")
        return

    # Example 3-mers from the first sequence
    first_record = next(iter(sequences.values()))
    example_kmers = generate_kmers(str(first_record.seq), k=3)
    print(example_kmers[:10])  # Print the first 10 3-mers

    # Check if these 3-mers are in the model
    for kmer in example_kmers[:10]:
        if kmer in model_df.index:
            print(f"{kmer} found in model")
        else:
            print(f"{kmer} NOT found in model")

    # Stack the per-sequence vectors into a 2D (sequences x dimensions) array.
    vectorized_seqs_np = np.stack(vectorized_seqs, axis=0)

    standardized_vectors = standardize_vectors(vectorized_seqs_np)

    # Save to CSV
    df = pd.DataFrame(standardized_vectors, index=seq_ids)
    df.to_csv('gyrase_vectorized.csv')


# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()


In [None]:
# ProtBERT could be applied here as well.
"""Hugging Face Transformers: a popular library that provides a wide range of pre-trained models, including BERT,
for various NLP tasks. While primarily focused on natural language, it can be adapted to biological sequences."""

# Next step: classification or clustering?

