# Domain Decoders Sequence Prediction


## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings(action='once')

## Import Data

In [12]:
data_path = "C:\\Users\\avane\\Desktop\\3-gram-final.csv"
sequence_path = "C:\\Users\\avane\\Desktop\\ecoli.fasta"

model_df = pd.read_csv(data_path, index_col=0, delimiter=',')
print("Model Head:")
print(model_df.head(10))
print("Size of the DataFrame:", model_df.shape)
unique_values = model_df.index.nunique()
print(f"Unique Values: {unique_values}")
sequences = SeqIO.to_dict(SeqIO.parse(sequence_path, "fasta"))
print("\nExample Protein Sequence:\n")
print(str(list(sequences.values())[0].seq))

Model Head:
     -0.202641  -0.44152558  0.7237159  -0.051630057  -0.1796543  -0.17303413  \
AAA                                                                             
ALA  -0.487939    -0.286591   0.723286      0.018458   -0.026013    -0.185017   
LLL  -0.209581    -0.049195   0.687492      0.043283   -0.085327    -0.166621   
LAA  -0.449757    -0.301040   0.650756     -0.038405   -0.143120    -0.177706   
AAL  -0.322143    -0.279880   0.750441      0.031342   -0.126243    -0.134607   
ALL  -0.377936    -0.210822   0.723277      0.042915   -0.042336    -0.184456   
LLA  -0.325802    -0.160612   0.716421      0.140574   -0.105720    -0.169147   
SSS   0.175785    -0.103099   0.445012     -0.116389   -0.366477    -0.205607   
LAL  -0.347601    -0.276936   0.759509      0.073802    0.040129    -0.204613   
EAL  -0.476662    -0.229644   0.596552     -0.157708   -0.179475    -0.120480   
AAG  -0.460274    -0.385322   0.771879      0.123707   -0.182858    -0.174014   

     -0.5336818

  and should_run_async(code)


## Standardization and Vectorization
* generate_kmers - Generate overlapping k-mers of given length from a Protein sequence
* prot_vectorization - Vectorize Protein k-mers using a model loaded from a CSV file
* standardize_vectors - Standardize the feature vectors
* vectorize_sequences - Vectorize sequences based on a model DataFrame
* check_kmers_in_model - Find k-mers in model dataframe
* get_standardize_vector_sequence - Returns Standardize vectorized sequences

In [3]:
def generate_kmers(prot_sequence, k=3):
    """Generate overlapping k-mers from a Protein sequence."""
    return [prot_sequence[i:i+k] for i in range(len(prot_sequence) - k + 1)]

def prot_vectorization(kmers, model_df, dimensions):
    """Vectorize Protein k-mers using a model loaded from a CSV file."""
    vector = np.zeros((len(kmers), dimensions))
    for i, kmer in enumerate(kmers):
        if kmer in model_df.index:
            vector[i] = model_df.loc[kmer].values
    return np.mean(vector, axis=0)

def standardize_vectors(vectors):
    """Standardize the feature vectors."""
    scaler = StandardScaler()
    standardized_vectors = scaler.fit_transform(vectors)
    return standardized_vectors

def vectorize_sequences(sequences, model_df, k):
    """Vectorize sequences based on a model DataFrame."""
    vectorized_seqs = []
    seq_ids = []
    
    for seq_id, sequence in sequences.items():
        kmers = generate_kmers(str(sequence.seq), k)
        dimensions = model_df.shape[1]
        vec = prot_vectorization(kmers, model_df, dimensions)
        vectorized_seqs.append(vec)
        seq_ids.append(seq_id)
    
    return vectorized_seqs, seq_ids

def check_kmers_in_model(kmers, model_df):
    """Check if k-mers are in the model DataFrame."""
    for kmer in kmers:
        if kmer in model_df.index:
            print(f"{kmer} found in model")
        else:
            print(f"{kmer} NOT found in model")
            
def get_standardize_vector_sequence(vectorized_seqs):
    """Standardize vectorized sequences."""
    vectorized_seqs_np = np.array(vectorized_seqs)
    if len(vectorized_seqs_np.shape) == 1:
        vectorized_seqs_np = np.stack(vectorized_seqs_np, axis=0)
    
    standardized_vectors = standardize_vectors(vectorized_seqs_np)
    return standardized_vectors

  and should_run_async(code)


In [4]:
#     # Example 3-mers from the first sequence
#     example_kmers = generate_kmers(str(list(sequences.values())[0].seq), k=3)
#     print(example_kmers[:10])

# Vectorize sequences
vectorized_seqs, seq_ids = vectorize_sequences(sequences, model_df, k=3)

if len(vectorized_seqs) == 0 or vectorized_seqs[0].size == 0:
    print("No sequences were vectorized. Check the vectorization process.")
else:
    # Standardize vectors
    standardized_vectors = get_standardize_vector_sequence(vectorized_seqs)
    # Save to CSV
    df = pd.DataFrame(standardized_vectors, index=seq_ids)
    df.to_csv('C:\\Users\\avane\\Desktop\\gyrase_vectorized1.csv')

In [8]:
standardized = pd.read_csv("C:\\Users\\avane\\Desktop\\gyrase_vectorized1.csv", index_col=0, delimiter=',')
print(standardized.head)

<bound method NDFrame.head of                         0    1    2    3    4    5    6    7    8    9  ...  \
sp|P0AES4|GYRA_ECOLI  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   

                       90   91   92   93   94   95   96   97   98   99  
sp|P0AES4|GYRA_ECOLI  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[1 rows x 100 columns]>


  and should_run_async(code)


In [9]:
#ProtBERT can be applied.
""" Hugging Face Transformers: A popular library that provides a wide range of pre-trained models, including BERT, 
for various NLP tasks. While primarily focused on natural language, it can be adapted for biological sequences."""

#then classification or clustering?



  and should_run_async(code)


' Hugging Face Transformers: A popular library that provides a wide range of pre-trained models, including BERT, \nfor various NLP tasks. While primarily focused on natural language, it can be adapted for biological sequences.'