## Physicochemical Property-Based Encoding: Z-Scales



In [1]:
import numpy as np
import pandas as pd
from numpy import array

In [None]:
# Z-scale lookup table: one-letter amino-acid code -> five descriptor values
# (Z1..Z5) for each of the 20 standard residues.
# Attributed by the original author to Hellberg et al., 1987.
# NOTE(review): confirm the values and the citation against the published table.
Z_SCALES = dict(
    A=[0.07, -1.73, 0.16, 0.18, -0.11],
    C=[1.26, -1.57, 0.38, -0.43, -0.21],
    D=[-0.89, 1.34, -0.30, 0.61, -0.21],
    E=[-1.68, 1.94, -0.27, 0.37, -0.23],
    F=[1.52, -1.14, 0.44, -0.99, 1.14],
    G=[-0.16, -2.46, -0.03, 0.23, 0.15],
    H=[0.49, 0.88, -0.12, 0.27, 0.23],
    I=[1.41, -0.84, 0.47, -1.10, 0.31],
    K=[-1.50, 2.05, 0.30, 1.14, -0.21],
    L=[1.14, -0.75, 0.40, -1.12, 0.26],
    M=[0.65, -0.49, 1.30, -0.76, 0.41],
    N=[-0.75, 1.98, -0.09, 0.14, -0.21],
    P=[-0.46, 0.27, 0.25, -0.20, 0.14],
    Q=[-0.73, 1.84, -0.15, 0.11, -0.21],
    R=[-1.95, 2.44, 0.28, 1.53, -0.21],
    S=[-0.26, 0.06, -0.11, 0.06, 0.06],
    T=[-0.30, -0.40, -0.04, -0.32, 0.17],
    V=[1.13, -0.67, 0.50, -1.09, 0.30],
    W=[1.85, 0.30, 0.79, -0.71, 2.55],
    Y=[0.94, 0.65, 0.15, -0.41, 1.61],
)

In [None]:
def encode_sequence_zscale(sequence, zscale_dict):
    """
    Convert a protein sequence into a numerical matrix using Z-Scale encoding.

    Each residue is looked up in ``zscale_dict``; residues that are not
    present (ambiguity codes, gaps, lowercase letters, ...) are encoded as a
    row of zeros.

    Parameters:
        sequence (str): Protein sequence (any iterable of residue codes).
        zscale_dict (dict): Mapping of amino acids to Z-Scale values. All
            value lists are expected to have the same length.

    Returns:
        np.array: Float array of shape (len(sequence), n_features), where
            n_features is the width of the entries in ``zscale_dict``
            (5 if the mapping is empty). An empty sequence yields an
            array of shape (0, n_features).
    """
    # Take the feature width from the table itself instead of hard-coding 5,
    # so the zero row for unknown residues always matches the known rows.
    n_features = len(next(iter(zscale_dict.values()))) if zscale_dict else 5
    unknown = [0.0] * n_features

    encoding = [zscale_dict.get(aa, unknown) for aa in sequence]

    # dtype=float keeps the result type independent of the sequence content;
    # the reshape guarantees a 2-D result even when the sequence is empty.
    return np.array(encoding, dtype=float).reshape(-1, n_features)

# Worked example: encode a three-residue peptide and print the resulting
# matrix (one row per residue, one column per Z-scale).
sequence = "ACD"
encoded_sequence = encode_sequence_zscale(sequence=sequence, zscale_dict=Z_SCALES)
print("Encoded Sequence (Z-Scale Features):\n", encoded_sequence)


In [None]:
# Read the input table; the CSV must provide a 'Sequence' column (capital S)
df = pd.read_csv("ExampleData.csv")
sequences = df["Sequence"].tolist()

# Z-scale encode every sequence. Each array is converted to nested Python
# lists before it is stored in the new column.
df["encoded_sequence"] = df["Sequence"].map(
    lambda seq: encode_sequence_zscale(seq, Z_SCALES).tolist()
)

# Write the table, including the encoded column, without the row index
df.to_csv("zscale_encoded_dataset.csv", index=False)

In [None]:
# Show the DataFrame (now including the 'encoded_sequence' column) as the cell output
df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import ast

# Make sure every entry of the column is a real nested list. Entries that are
# strings (presumably after a round trip through the saved CSV) are parsed;
# entries that are already lists pass through unchanged.
# SECURITY: this step previously used eval(), which would execute arbitrary
# code found in a cell. ast.literal_eval only accepts Python literals and
# raises ValueError/SyntaxError on anything else.
df["encoded_sequence"] = df["encoded_sequence"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Convert to a NumPy matrix.
# NOTE: all sequences must have the same length; ragged input cannot form the
# regular 3-D array that the slicing below relies on.
zscale_matrix = np.array(df["encoded_sequence"].tolist())

# Check matrix shape before plotting
print("Matrix Shape:", zscale_matrix.shape)  # Should be (num_sequences, sequence_length, 5)

# Select one Z-scale feature: index 0 on the last axis is Z1
zscale_feature_matrix = zscale_matrix[:, :, 0]  # -> (num_sequences, sequence_length)

# Plot heatmap: one row per sequence, one column per residue position
plt.figure(figsize=(12, 6))
sns.heatmap(zscale_feature_matrix, cmap="viridis", cbar=True, xticklabels=False, yticklabels=False)

plt.xlabel("Amino Acid Position")
plt.ylabel("Protein Sequences")
plt.title("Z-Scale Encoded Protein Sequences - Heatmap (Z1 Feature)")
plt.show()


In [None]:
# IGNORE

# Stack all encoded sequences into a single array for summary statistics
# (assumes equal-length sequences, giving a 3-D array - TODO confirm)
zscale_data = np.array(df["encoded_sequence"].tolist())

# Reduce over the first two axes so one value remains per Z-scale component
mean_values = zscale_data.mean(axis=(0, 1))
std_values = zscale_data.std(axis=(0, 1))

# Report mean and standard deviation for each labelled component
zscale_labels = [
    "Z1 (Hydrophobicity)",
    "Z2 (Size)",
    "Z3 (Polarity)",
    "Z4 (Electronic Effects)",
    "Z5 (Secondary Structure)",
]
for name, mu, sigma in zip(zscale_labels, mean_values, std_values):
    print(f"{name}: Mean = {mu:.2f}, Std = {sigma:.2f}")