## One-hot encoding

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from numpy import array

In [None]:
# Define the 20 standard amino acids as a single-feature column of shape (20, 1),
# the 2-D layout OneHotEncoder expects for fitting.
AMINO_ACIDS = np.array(list("ACDEFGHIKLMNPQRSTVWY")).reshape(-1, 1)

# Initialize OneHotEncoder with dense (non-sparse) output.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4 removed
# the old name, so try the current spelling first and fall back for older installs.
# handle_unknown='ignore' encodes any character outside AMINO_ACIDS as an all-zero row.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:  # scikit-learn < 1.2 only accepts the old keyword
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoder.fit(AMINO_ACIDS)  # Fit only on valid amino acids

# Function to one-hot encode a single sequence
def one_hot_encode_sequence(sequence):
    """
    One-hot encode a single protein sequence with the module-level ``encoder``.

    Parameters:
        sequence (str): Protein sequence; every character is treated as one residue.

    Returns:
        np.array: Matrix with one row per residue and one column per category
        the encoder was fitted on, i.e. (145, 20) for the 145-residue sequences
        used in this notebook. The function itself does not check the length.
    """
    # The encoder expects samples as rows of a single feature, so turn the
    # residues into a column vector of shape (len(sequence), 1).
    residue_column = np.array(list(sequence))[:, np.newaxis]
    return encoder.transform(residue_column)

# Load dataset (the CSV must have a 'Sequence' column; the name is case-sensitive)
df = pd.read_csv("ExampleData.csv")

# Verify all sequences are of length 145.
# Raise explicitly instead of using `assert`, which is skipped when Python runs
# with -O. `.str.len()` yields NaN for missing values, so those rows fail this
# check with a clear message instead of crashing inside `len`.
if not (df['Sequence'].str.len() == 145).all():
    raise ValueError("Not all sequences have length 145!")

# Apply one-hot encoding to all sequences; equal lengths let the per-sequence
# matrices stack into one 3-D array of shape (num_sequences, 145, 20).
encoded_sequences = np.array([one_hot_encode_sequence(seq) for seq in df['Sequence']])

# Reshape into a 2D array (if needed for ML models): one row per sequence,
# with the 145 per-position vectors laid out one after another.
encoded_sequences_flat = encoded_sequences.reshape(encoded_sequences.shape[0], -1)



In [None]:
# One-hot values are exactly 0 or 1, so write them as integers: the default
# '%.18e' format spends about 25 bytes on every cell. No header row is written.
np.savetxt("one_hot_encoded_proteins.csv", encoded_sequences_flat, fmt="%d", delimiter=",")

In [None]:
# The file was written by np.savetxt without a header row, so read it with
# header=None; with the default header=0 the first sequence's encoding would be
# consumed as column names and dropped from the data.
encoded = pd.read_csv("one_hot_encoded_proteins.csv", header=None)

In [None]:
# Preview the first 5 rows and the first 20 columns of the reloaded matrix
encoded.iloc[0:5, 0:20]

In [None]:
# Expected shape: (num_sequences, 145, 20)
print(np.shape(encoded_sequences))

In [None]:
# Biopython is the distribution that provides the `Bio` package imported below
#!pip install biopython
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [None]:
# Amino-acid composition table: one row per sequence, built from the dict that
# ProteinAnalysis.count_amino_acids() returns for each one.
pd.DataFrame(
    [ProteinAnalysis(protein).count_amino_acids() for protein in df['Sequence']]
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Draw only the first few sequences so individual rows stay distinguishable
num_sequences_to_plot = 15  # Adjust for better readability
subset_df = encoded.head(num_sequences_to_plot)

# Heatmap with one row per sequence and one column per flattened one-hot feature;
# column tick labels are suppressed because there are too many to read.
fig, ax = plt.subplots(figsize=(15, 6))
sns.heatmap(
    subset_df,
    cmap="viridis",
    cbar=True,
    xticklabels=False,
    yticklabels=True,
    ax=ax,
)

ax.set_xlabel("One-Hot Encoded Features (Amino Acid Positions)")
ax.set_ylabel("Protein Sequences")
ax.set_title("Heatmap of One-Hot Encoded Protein Sequences")

# Save before showing: plt.show() may leave an empty figure behind in some backends
fig.savefig('heatmap-one-hot.png')
plt.show()
