# Domain Decoder: A Tool to Identify Fluoroquinolone Drug Binding Sites


## Import Libraries

In [25]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings(action='once')

## Import Data

In [26]:
data_path = "3-gram-final.csv"
sequence_path = "DNA_Gyrase_UniProt_CLEANED.fasta"

model_df = pd.read_csv(data_path, index_col=0, delimiter=',')
print("Model Head:")
print(model_df.head(10))
print("Size of the DataFrame:", model_df.shape)
unique_values = model_df.index.nunique()
print(f"Unique Values: {unique_values}")
sequences = SeqIO.to_dict(SeqIO.parse(sequence_path, "fasta"))
print("\nExample Protein Sequence:\n")
print(str(list(sequences.values())[0].seq))

Model Head:
     -0.202641  -0.44152558  0.7237159  -0.051630057  -0.1796543  -0.17303413  \
AAA                                                                             
ALA  -0.487939    -0.286591   0.723286      0.018458   -0.026013    -0.185017   
LLL  -0.209581    -0.049195   0.687492      0.043283   -0.085327    -0.166621   
LAA  -0.449757    -0.301040   0.650756     -0.038405   -0.143120    -0.177706   
AAL  -0.322143    -0.279880   0.750441      0.031342   -0.126243    -0.134607   
ALL  -0.377936    -0.210822   0.723277      0.042915   -0.042336    -0.184456   
LLA  -0.325802    -0.160612   0.716421      0.140574   -0.105720    -0.169147   
SSS   0.175785    -0.103099   0.445012     -0.116389   -0.366477    -0.205607   
LAL  -0.347601    -0.276936   0.759509      0.073802    0.040129    -0.204613   
EAL  -0.476662    -0.229644   0.596552     -0.157708   -0.179475    -0.120480   
AAG  -0.460274    -0.385322   0.771879      0.123707   -0.182858    -0.174014   

     -0.5336818

## Standardization and Vectorization
* generate_kmers - Generate overlapping k-mers of given length from a Protein sequence
* prot_vectorization - Vectorize Protein k-mers using a model loaded from a CSV file
* standardize_vectors - Standardize the feature vectors
* vectorize_sequences - Vectorize sequences based on a model DataFrame
* check_kmers_in_model - Find k-mers in model dataframe
* get_standardize_vector_sequence - Returns Standardize vectorized sequences

In [32]:
def generate_kmers(prot_sequence, k=3):
    """Generate overlapping k-mers from a Protein sequence."""
    return [prot_sequence[i:i+k] for i in range(len(prot_sequence) - k + 1)]

def prot_vectorization(kmers, model_df, dimensions):
    """Vectorize Protein k-mers using a model loaded from a CSV file."""
    vector = np.zeros((len(kmers), dimensions))
    for i, kmer in enumerate(kmers):
        if kmer in model_df.index:
            vector[i] = model_df.loc[kmer].values
    return np.mean(vector, axis=0)

def standardize_vectors(vectors):
    """Standardize the feature vectors."""
    scaler = StandardScaler()
    standardized_vectors = scaler.fit_transform(vectors)
    return standardized_vectors

def vectorize_sequences(sequences, model_df, k):
    """Vectorize sequences based on a model DataFrame."""
    vectorized_seqs = []
    seq_ids = []
    
    for seq_id, sequence in sequences.items():
        kmers = generate_kmers(str(sequence.seq), k)
        dimensions = model_df.shape[1]
        vec = prot_vectorization(kmers, model_df, dimensions)
        vectorized_seqs.append(vec)
        seq_ids.append(seq_id)
    
    return vectorized_seqs, seq_ids

def check_kmers_in_model(kmers, model_df):
    """Check if k-mers are in the model DataFrame."""
    for kmer in kmers:
        if kmer in model_df.index:
            print(f"{kmer} found in model")
        else:
            print(f"{kmer} NOT found in model")
            
def get_standardize_vector_sequence(vectorized_seqs):
    """Standardize vectorized sequences."""
    vectorized_seqs_np = np.array(vectorized_seqs)
    if len(vectorized_seqs_np.shape) == 1:
        vectorized_seqs_np = np.stack(vectorized_seqs_np, axis=0)
    
    standardized_vectors = standardize_vectors(vectorized_seqs_np)
    return standardized_vectors

In [33]:
#Reads in clustalW MSA and finds conserved regions and generates weights for the embeddings.
from Bio import AlignIO

def get_conservation_weights(alignment_file):
    """
    Parses the Clustal alignment file to determine conservation weights.
    """
    alignment = AlignIO.read(alignment_file, "clustal")
    conservation_line = alignment[-1].seq  # Assuming conservation info is in the last sequence
    
    weights = {}
    for i, symbol in enumerate(conservation_line):
        if symbol == '*':  # Fully conserved
            weights[i] = 3
        elif symbol == ':':  # Strongly conserved
            weights[i] = 2
        elif symbol == '.':  # Weaker conserved
            weights[i] = 1
    
    return weights

# Example usage
alignment_file = "DNA_Gyrase_MSA.txt"
weights = get_conservation_weights(alignment_file)


In [34]:
vectorized_seqs, seq_ids = vectorize_sequences(sequences, model_df, k=3)

if len(vectorized_seqs) == 0 or vectorized_seqs[0].size == 0:
    print("No sequences were vectorized. Check the vectorization process.")
else:
    # Standardize vectors
    standardized_vectors = get_standardize_vector_sequence(vectorized_seqs)
    # Save to CSV
    df = pd.DataFrame(standardized_vectors, index=seq_ids)
    df.to_csv('gyrase_vectorized1.csv')

In [43]:
# weighted embeddings for active site or regions which are conserved and thus play vital roles in the sequence function of DNA Gyrase
def weighted_prot_vectorization(kmers, model_df, dimensions, weights):
    """
    Vectorize Protein k-mers using a model loaded from a CSV file, applying conservation weights.
    """
    vector = np.zeros((len(kmers), dimensions))
    for i, kmer in enumerate(kmers):
        if kmer in model_df.index:
            # Apply conservation weight if the position is conserved; otherwise, use default weight of 1
            weight = np.mean([weights.get(pos, 1) for pos in range(i, i + len(kmer))])
            vector[i] = model_df.loc[kmer].values * weight
    return np.mean(vector, axis=0)

# Integration with existing code
vectorized_seqs, seq_ids = [], []

for seq_id, sequence in sequences.items():
    kmers = generate_kmers(str(sequence.seq), k=3)
    dimensions = model_df.shape[1]
    vec = weighted_prot_vectorization(kmers, model_df, dimensions, weights)  # Now using the modified function
    vectorized_seqs.append(vec)
    seq_ids.append(seq_id)

# Proceed with standardization and further analysis as before
standardized_vectors = get_standardize_vector_sequence(vectorized_seqs)



Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\Asus\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_13908\426250307.py", line 20, in <module>
    vec = weighted_prot_vectorization(kmers, model_df, dimensions, weights)  # Now using the modified function
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Asus\AppData\Local\Temp\ipykernel_13908\426250307.py", line -1, in weighted_prot_vectorization
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Asus\anaconda3\Lib\site-packages\IPython\core\interactiveshell.py", line 2120, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Asus\anaconda3\Lib\site-packages\IPython\core\ultratb.py

In [None]:
standardized = pd.read_csv("gyrase_vectorized1.csv", index_col=0, delimiter=',')
print(standardized.head)

In [None]:
#!/bin/bash/env python 
import sys,json,os 
import matplotlib.pyplot as plt 
from Bio.SeqUtils.ProtParam import ProteinAnalysis 
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import pandas as pd 
kmers = [] 
# Parse prot file
with open("gyrase_vectorized1.csv","r") as f:
    for line in f:
        line = line.strip() 
        cols = line.split(',') 
        kmer = cols[0]
        if 'X' in kmer or 'Z' in kmer or 'B' in kmer or '<' in kmer:
            continue 
        kmers.append(kmer)  



X = pd.read_csv("gyrase_vectorized1.csv",header=None)
print(X)

kmer_names = X.iloc[:,0].values
X = X.iloc[:, 1:len(X.columns)-1]
X = X.values 

# t-distributed Stochastic Neighbor Embedding. (Like PCA but based on similarity not covariance)
print('-- Fitting TSNE')
pca = TSNE(n_components=2)
X_trans = pca.fit_transform(X)


print("original shape:   ", X.shape)
print("transformed shape:", X_trans.shape)

plt.scatter(X_trans[:, 1], X_trans[:, 0], alpha=0.5,s=4)
df = pd.DataFrame(X_trans,index=kmer_names)
df.to_csv("3-gram-dbtx.model")
plt.show()


In [37]:
from transformers import BertTokenizer
from Bio import SeqIO

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)

# Function to tokenize a sequence
def tokenize_sequences(sequence_path):
    tokenized_sequences = []
    for record in SeqIO.parse(sequence_path, "fasta"):
        sequence = str(record.seq)
        # Tokenize sequence
        encoded_sequence = tokenizer.encode(sequence, add_special_tokens=True, truncation=True, max_length=512)
        tokenized_sequences.append(encoded_sequence)
    return tokenized_sequences

# Tokenize your protein sequences
sequence_path = "DNA Gyrase_Uniprot.fasta"
tokenized_sequences = tokenize_sequences(sequence_path)


FileNotFoundError: [Errno 2] No such file or directory: 'DNA Gyrase_Uniprot.fasta'

In [None]:
import pandas as pd

# Load the pre-computed vectors
vectors_path = "gyrase_vectorized1.csv"
vectors_df = pd.read_csv(vectors_path, index_col=0)

# Convert DataFrame to a tensor or array for use with PyTorch or other libraries
import torch
vectors_tensor = torch.tensor(vectors_df.values)


In [None]:
from torch.utils.data import Dataset, DataLoader

class ProteinDataset(Dataset):
    def __init__(self, tokenized_sequences, vectors_tensor):
        self.tokenized_sequences = tokenized_sequences
        self.vectors_tensor = vectors_tensor
        
    def __len__(self):
        return len(self.tokenized_sequences)
    
    def __getitem__(self, idx):
        item = {
            "input_ids": torch.tensor(self.tokenized_sequences[idx], dtype=torch.long),
            "vectors": self.vectors_tensor[idx]
        }
        return item


dataset = ProteinDataset(tokenized_sequences, vectors_tensor)

loader = DataLoader(dataset, batch_size=32, shuffle=True)


In [None]:
from transformers import BertModel, BertTokenizer
import torch
import torch.nn as nn

# Initialize ProtBert
tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
protbert = BertModel.from_pretrained("Rostlab/prot_bert")

class HybridModel(nn.Module):
    def __init__(self, bert_model, custom_embedding_dim, hidden_dim, output_dim):
        super(HybridModel, self).__init__()
        self.bert_model = bert_model
        self.concat_layer = nn.Linear(bert_model.config.hidden_size + custom_embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.classifier = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids, attention_mask, custom_embeddings):
        # Get embeddings from ProtBert
        with torch.no_grad():
            outputs = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)
            sequence_output = outputs.last_hidden_state[:, 0, :]  # Using the [CLS] token
        
        # Concatenate ProtBert embeddings with custom embeddings
        combined_features = torch.cat((sequence_output, custom_embeddings), dim=1)
        
        # Pass through additional layers
        x = self.concat_layer(combined_features)
        x = self.relu(x)
        x = self.classifier(x)
        return x
