## Physicochemical Property-Based Encoding: BLOSUM

Refer to: https://github.com/not-a-feature/blosum
To install blosum: pip install blosum

In [None]:
import numpy as np
import pandas as pd

In [None]:
import blosum as bl
# Load the BLOSUM62 substitution matrix from the `blosum` package; the
# resulting object is reused by the encoding cells further down.
blosum62 = bl.BLOSUM(62)

In [None]:
# Sanity-check the matrix: look up the A -> Y substitution score using the
# nested ``matrix[first][second]`` indexing and show it as the cell output.
val = blosum62["A"]["Y"]
val

In [None]:
def encode_sequence_blosum(sequence, matrix):
    """
    Encode a protein sequence as per-residue BLOSUM self-substitution scores.

    Every residue ``aa`` is mapped to the score of substituting ``aa`` with
    itself (the diagonal entry of the matrix). Two matrix layouts are
    supported:

    * nested mappings indexed as ``matrix[aa][aa]`` (the layout used when
      the matrix is queried as ``blosum62["A"]["Y"]``)
    * flat mappings keyed by residue pairs, ``matrix[(aa, aa)]``

    Parameters:
        sequence (str): Protein sequence (one-letter residue codes)
        matrix (Mapping): BLOSUM substitution matrix in either layout above

    Returns:
        np.array: 1-D array holding one score per residue, in sequence
        order; residues that are missing from the matrix are encoded as 0
    """
    # Local import: keeps this cell runnable without touching the import cell.
    from collections.abc import Mapping

    encoding = []
    for aa in sequence:
        if (aa, aa) in matrix:
            # Flat layout keyed by (residue, residue) tuples.
            encoding.append(matrix[(aa, aa)])
            continue

        # Nested layout. Membership tests are used instead of direct
        # indexing so that a defaultdict-style matrix neither returns its
        # fallback score nor grows new keys for unknown residues.
        row = matrix[aa] if aa in matrix else None
        if isinstance(row, Mapping) and aa in row:
            encoding.append(row[aa])
        else:
            encoding.append(0)  # Default score for unknown amino acids

    return np.array(encoding)

# Example sequence: a short four-residue string used as a quick smoke test
sequence = "ACDE"

# Encode using BLOSUM62 (the encoder emits one value per residue)
encoded_sequence = encode_sequence_blosum(sequence, blosum62)

# Print the resulting vector so it can be checked by eye
print("Encoded Sequence:", encoded_sequence)


In [None]:
df = pd.read_csv("ExampleData.csv")

# Clean the raw text BEFORE encoding: stray NUL bytes and surrounding
# whitespace would otherwise be encoded as unknown residues (score 0) and
# change the sequence length. astype(str) also keeps non-string cells from
# breaking the per-residue iteration in the encoder.
df["Sequence"] = df["Sequence"].astype(str).apply(lambda x: x.replace("\x00", "").strip())

# Apply BLOSUM62 encoding to all (cleaned) sequences, once
df["encoded_sequence"] = df["Sequence"].apply(lambda seq: encode_sequence_blosum(seq, blosum62))

# Convert to a 2-D NumPy array for ML models. Sequences may differ in
# length, and np.array() cannot stack ragged rows, so every encoding is
# right-padded with 0 up to the longest sequence. When all sequences already
# have the same length the padding width is 0 and the values are unchanged.
max_len = max((len(enc) for enc in df["encoded_sequence"]), default=0)
X = np.array([np.pad(enc, (0, max_len - len(enc))) for enc in df["encoded_sequence"]])

print("Encoded dataset shape:", X.shape)  # (num_sequences, max_sequence_length)

# Save encoded sequences as a new dataset. The arrays are written as plain
# Python lists ("[4, 9, 6, 5]"): the printed form of an ndarray wraps long
# rows across lines and abbreviates arrays above the print threshold with
# "...", which would silently drop values in the CSV. The in-memory `df`
# keeps the ndarray column for the later cells.
output_file = "blosum_encoded_dataset.csv"
df.assign(
    encoded_sequence=df["encoded_sequence"].apply(lambda enc: enc.tolist())
).to_csv(output_file, index=False)


In [None]:
# The 20 standard one-letter amino-acid codes accepted by this filter
VALID_AA = set("ACDEFGHIKLMNPQRSTVWY")

# Keep only the rows whose sequence is made up entirely of those codes;
# issuperset(seq) is True exactly when every character of seq is in VALID_AA
df = df[df["Sequence"].apply(VALID_AA.issuperset)]

print("Filtered dataset shape:", df.shape)


In [None]:
import pandas as pd

# Load dataset
df = pd.read_csv("ExampleData.csv")

# Normalise the raw text before encoding: cast to str so non-string cells
# do not break per-residue iteration, and drop NUL bytes / surrounding
# whitespace so they are not encoded as unknown residues.
df["Sequence"] = df["Sequence"].astype(str).apply(lambda x: x.replace("\x00", "").strip())

# Apply BLOSUM62 encoding
df["encoded_sequence"] = df["Sequence"].apply(lambda seq: encode_sequence_blosum(seq, blosum62))

# Save encoded dataset. Arrays are serialised as plain Python lists because
# the printed form of an ndarray wraps long rows and abbreviates arrays
# above the print threshold with "...", which loses values in the CSV.
# `df` itself keeps the ndarray column for the plotting cells.
df.assign(
    encoded_sequence=df["encoded_sequence"].apply(lambda enc: enc.tolist())
).to_csv("blosum_encoded_fixed.csv", index=False)


In [None]:
# Read the exported CSV back and display it to inspect what was written.
# NOTE(review): after a CSV round trip the `encoded_sequence` column is
# presumably plain text rather than arrays - confirm before reusing df2.
df2=pd.read_csv("blosum_encoded_fixed.csv")
df2

In [None]:
import matplotlib.pyplot as plt

# Mean encoded score of each sequence, stored next to its encoding
df["avg_blosum"] = df["encoded_sequence"].apply(lambda scores: np.mean(scores))

# Histogram of the per-sequence means; axis labels and title are applied
# in a single call on the current Axes
plt.figure(figsize=(8, 5))
plt.hist(df["avg_blosum"], bins=30, color="blue", alpha=0.7)
plt.gca().set(
    xlabel="Average BLOSUM Score",
    ylabel="Number of Sequences",
    title="Distribution of Average BLOSUM Encoded Sequence Scores",
)
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Convert sequences into a matrix for visualization. Sequences can differ in
# length and np.array() cannot stack ragged rows into a 2-D array, so each
# encoding is right-padded with NaN up to the longest sequence. NaN (rather
# than 0) keeps the padding distinguishable from real scores; seaborn leaves
# cells with missing values blank in the heatmap.
max_len = max((len(enc) for enc in df["encoded_sequence"]), default=0)
blosum_matrix = np.array([
    np.pad(
        np.asarray(enc, dtype=float),
        (0, max_len - len(enc)),
        constant_values=np.nan,
    )
    for enc in df["encoded_sequence"]
])

# Plot heatmap: one row per sequence, one column per residue position
plt.figure(figsize=(12, 6))
sns.heatmap(blosum_matrix, cmap="viridis", cbar=True, xticklabels=False, yticklabels=False)

plt.xlabel("Amino Acid Position")
plt.ylabel("Protein Sequences")
plt.title("BLOSUM62 Encoded Protein Sequences - Heatmap")
plt.show()