## Physicochemical Property Based Encoding: AAindex

Reference paper: https://doi.org/10.1093/nar/28.1.374

Python package (GitHub): https://github.com/amckenna41/aaindex?tab=readme-ov-file

Description of all the properties: https://www.genome.jp/aaindex/AAindex/list_of_indices

To install aaindex, run: `!pip3 install aaindex --upgrade`


In [None]:
from aaindex import aaindex1
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Load the protein sequence dataset; the CSV must provide a 'Sequence' column.
protein_sequences_file = 'ExampleData.csv'  # Replace with your CSV file path
sequences_df = pd.read_csv(protein_sequences_file)

# Empty CSV cells are read by pandas as NaN (a float), which cannot be iterated
# residue by residue below, so rows without a sequence are dropped up front.
sequences = sequences_df['Sequence'].dropna().astype(str).tolist()

# AAindex1 accession codes of the physicochemical properties used as features.
selected_properties = ['CIDH920105', 'KYTJ820101', 'CHOP780201', 'GRAR740102', 'HOPT810101', 'ZIMJ680104', 'KARS160118', 'BUNA790103']

# Query the available accession codes once instead of on every loop iteration.
available_codes = set(aaindex1.record_codes())

# Map each available property to its {amino acid: value} table.
aa_properties = {
    prop: aaindex1[prop].values
    for prop in selected_properties
    if prop in available_codes
}

# Report skipped properties explicitly: later cells have one feature column per
# entry of `aa_properties`, so a silent drop is hard to trace afterwards.
missing_properties = [prop for prop in selected_properties if prop not in aa_properties]
if missing_properties:
    print(f"Warning: properties not found in AAindex1 and skipped: {missing_properties}")

# Build one feature vector per sequence: the mean value of each property over
# all residues of the sequence.
encoded_features = []
for sequence in sequences:
    # Residues absent from a property's table contribute 0 to the mean.
    # NOTE(review): non-standard residue letters therefore pull the mean
    # towards 0 - confirm this is the intended handling.
    sequence_features = [
        np.mean([prop_values.get(aa, 0) for aa in sequence])
        for prop_values in aa_properties.values()
    ]
    encoded_features.append(sequence_features)

In [None]:
# Persist the encoded features next to the sequences they were computed from.

# One "<accession>_mean" column per encoded property, in encoding order.
feature_columns = [prop + "_mean" for prop in aa_properties]

# Build the feature table, then place the sequences in the leading column.
encoded_df = pd.DataFrame(encoded_features, columns=feature_columns)
encoded_df.insert(0, 'sequence', sequences)

encoded_df.to_csv("aaindex_encoded.csv", index=False)

In [None]:
# Show the encoded feature table (rendered as the cell output when run in a notebook).
encoded_df

In [None]:
# Project the encoded feature matrix onto its first three principal components.
pca = PCA(n_components=3)
pca_features = pca.fit_transform(encoded_features)

# Tabulate the projected coordinates, one column per component.
component_names = ['PC1', 'PC2', 'PC3']
pca_df = pd.DataFrame(pca_features, columns=component_names)

# Optional min-max normalisation of PC1, left disabled:
#pca_df['PC1_norm'] = (pca_df['PC1'] - pca_df['PC1'].min()) / (pca_df['PC1'].max() - pca_df['PC1'].min())

# Single 3D axes on a 10x8 inch figure.
fig, ax = plt.subplots(figsize=(10, 8), subplot_kw={'projection': '3d'})

# 3D scatter of the three components; marker colour follows the PC3 value.
sc = ax.scatter(
    pca_df['PC1'],
    pca_df['PC2'],
    pca_df['PC3'],
    c=pca_df['PC3'],
    alpha=0.8,
    cmap='viridis',
)

ax.set(
    xlabel='Principal Component 1',
    ylabel='Principal Component 2',
    zlabel='Principal Component 3',
    title='3D PCA Scatter Plot of AAindex1 Properties',
)

fig.colorbar(sc, label='PCA Component Value')

# Write the image before plt.show(), as in the original cell order.
fig.savefig("visualized-pca.png")
plt.show()


# Export the projected coordinates.
pca_df.to_csv('PCA_AAindex_Properties.csv', index=False)

In [None]:
# Show the table of PCA coordinates (rendered as the cell output when run in a notebook).
pca_df

In [None]:
import seaborn as sns

In [None]:
# Component matrix of the fitted PCA: one row per principal component, one
# column per input feature.
pca_components = pca.components_

# Label the columns with the properties that were actually encoded
# (`aa_properties`), not with `selected_properties`: an accession missing from
# AAindex1 is skipped during encoding, and a longer label list would make the
# DataFrame construction fail with a shape mismatch.
loading_columns = list(aa_properties)

# Derive the row labels from the matrix so this cell keeps working if the
# number of PCA components is changed.
component_labels = [f"PC{i}" for i in range(1, pca_components.shape[0] + 1)]

# Table of property contributions (loadings) to each principal component.
pca_loadings = pd.DataFrame(pca_components, columns=loading_columns, index=component_labels)

print("PCA Loadings (Property contributions to each component):")
print(pca_loadings)


In [None]:
# Annotated heatmap of the PCA loadings table, saved as a PNG and then shown.
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(pca_loadings, annot=True, cmap='coolwarm')
heatmap_ax.set(
    title='PCA Loadings: Contribution of AAindex1 Properties to Principal Components',
    xlabel='Amino Acid Properties',
    ylabel='Principal Components',
)
# bbox_inches='tight' trims the surrounding whitespace in the saved image.
plt.savefig("aaindex-pca-comp.png", bbox_inches='tight')
plt.show()


In [None]:
# Plot cumulative explained variance to choose the number of components.
# The `pca` object used for the projection keeps only three components, so its
# variance ratios cannot show what additional components would add. A separate
# PCA with no component limit is fitted here purely for this diagnostic; `pca`
# itself is left untouched.
full_pca = PCA().fit(encoded_features)
explained_variance = full_pca.explained_variance_ratio_

component_counts = range(1, len(explained_variance) + 1)
plt.plot(component_counts, np.cumsum(explained_variance), marker='o')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('PCA Explained Variance')
plt.grid(True)
plt.show()


In [None]:
# IGNORE here onwards!
# Scratch cells trying out examples from the aaindex GitHub README.

# Fetch the complete AAindex1 record for one accession.
full_record = aaindex1['CHOP780206']
# Example content of full_record:
# {'category': 'sec_struct', 'correlation_coefficients': {},
#  'description': 'Normalized frequency of N-terminal non helical region (Chou-Fasman, 1978b)', 'notes': '', 'pmid': '364941',
#  'references': "Chou, P.Y. and Fasman, G.D. 'Prediction of the secondary structure of proteins from their amino acid sequence' Adv. Enzymol. 47, 45-148 (1978)",
#  'values': {'-': 0, 'A': 0.7, 'C': 0.65, 'D': 0.98, 'E': 1.04, 'F': 0.93, 'G': 1.41, 'H': 1.22, 'I': 0.78, 'K': 1.01, 'L': 0.85, 'M': 0.83, 'N': 1.42, 'P': 1.1, 'Q': 0.75, 'R': 0.34, 'S': 1.55, 'T': 1.09, 'V': 0.75, 'W': 0.62, 'Y': 0.99}}

# A single field of the record can be read with key access or attribute access;
# both forms are exercised here and the second assignment is the one kept.
record_values = full_record['values']
record_values = full_record.values

In [None]:
# Show the values table of the record (rendered as the cell output when run in a notebook).
record_values

In [None]:
# Look up the human-readable description of a property.

chop_record = aaindex1['CHOP780206']
# Key access and attribute access are both exercised; the second assignment is kept.
record_description = chop_record['description']
record_description = chop_record.description
# Expected text: 'Normalized frequency of N-terminal non helical region (Chou-Fasman, 1978b)'

record_description

In [None]:
# Show the result of aaindex1.num_records() as the cell output.
aaindex1.num_records()

In [None]:
from aaindex import aaindex2

In [None]:
# 'CHOP780206' is an AAindex1 accession (a per-residue index), so it cannot be
# looked up in AAindex2, which holds amino acid substitution matrices. Query an
# AAindex2 accession instead; HENS920102 is the BLOSUM62 matrix.
# NOTE(review): confirm this accession is present in the installed aaindex release.
full_record = aaindex2['HENS920102']

In [None]:
# List all available AAindex1 properties.
# This cell previously called `pyAAindex.load_aaindex()`, but no `pyAAindex`
# name is imported in the visible cells, so it raised NameError. The accession
# codes are taken from the `aaindex1` module imported at the top instead.
available_properties = list(aaindex1.record_codes())

# Print total number of properties and the first 10 as an example
print(f"Total AAIndex properties available: {len(available_properties)}")
print("First 10 properties:", available_properties[:10])
