Niklas E. Siedhoff<sup><em>1,§</em></sup>, Alexander-Maurice Illig<sup><em>1,§</em></sup>, Ulrich Schwaneberg<sup><em>1,2</em></sup>, Mehdi D. Davari<sup><em>1,*</em></sup>, <br>
PyPEF – an Integrated Framework for Data-driven Protein Engineering, <em>Journal of Chemical Information and Modeling</em> (2021) <br>
<sup><em>1</em></sup><sub>Institute of Biotechnology, RWTH Aachen University, Worringer Weg 3, 52074 Aachen, Germany</sub> <br>
<sup><em>2</em></sup><sub>DWI-Leibniz Institute for Interactive Materials, Forckenbeckstraße 50, 52074 Aachen, Germany</sub> <br>
<sup><em>*</em></sup><sub>Corresponding author</sub> <br>
<sup><em>§</em></sup><sub>Equal contribution</sub> <br>

In [None]:
from Aminoacid_Index import Database

import numpy as np
import os
from tqdm import tqdm
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as clst
import plotly.graph_objects as go

# Generating clusters

In order to cluster the AAindex database using correlation coefficients, amino acid indices that do not contain values for all 20 natural amino acids have to be excluded.

In [None]:
# Accession keys of all database entries flagged as complete
accessions = [key for key, entry in Database.items() if entry.is_complete]

# One row of values per retained entry; row i belongs to accessions[i]
accessions_data = np.array([Database[key].values for key in accessions])

In [None]:
def Find_Cluster_Member(data, holdout_index, threshold):
    """Grow a correlation cluster around one seed entry and save it to disk.

    The cluster starts with ``data[holdout_index]``. An entry joins as soon as
    its correlation coefficient (``np.corrcoef``) with at least one current
    member is larger than ``threshold``. The dataset is scanned repeatedly
    until a complete pass adds no new member, so entries that only correlate
    with members added late are still picked up.

    Two files are written to the directory ``cc_<threshold>`` (created if it
    does not exist): ``<holdout_index>_data.npy`` with the member data and
    ``<holdout_index>_indices.npy`` with the member indices.

    Parameters
    ----------
    data : indexable collection of equally long 1-D value sequences
        One entry per amino acid index.
    holdout_index : int
        Position in ``data`` of the entry that seeds the cluster.
    threshold : float
        Correlation coefficient that must be exceeded to join the cluster.

    Returns
    -------
    list
        Indices of the cluster members in the order they joined, starting
        with ``holdout_index``.
    """
    # first cluster member is the holdout
    cluster_data = [data[holdout_index]]
    cluster_indices = [holdout_index]
    # set mirror of cluster_indices for constant-time membership tests
    assigned = {holdout_index}

    # A pass that adds nothing leaves the cluster unchanged, so every further
    # pass would add nothing either; stopping there gives the same cluster as
    # always scanning len(data) times.
    cluster_grew = True
    while cluster_grew:
        cluster_grew = False

        for index, candidate in enumerate(data):
            if index in assigned:
                continue

            # one sufficiently correlated member is enough to join
            if any(np.corrcoef(member, candidate)[0, 1] > threshold
                   for member in cluster_data):
                cluster_indices.append(index)
                cluster_data.append(candidate)
                assigned.add(index)
                cluster_grew = True

    directory_name = "cc_" + str(threshold)
    # make the function usable on its own, without a prior mkdir by the caller
    os.makedirs(directory_name, exist_ok=True)

    # save cluster data and indices of members as numpy arrays
    name = str(holdout_index) + '_data.npy'
    np.save(os.path.join(directory_name, name), cluster_data)

    name = str(holdout_index) + '_indices.npy'
    np.save(os.path.join(directory_name, name), cluster_indices)

    return cluster_indices


def Correlation_Clustering(data, threshold):
    """Partition all entries of ``data`` into correlation clusters.

    Entries are visited in order. Every entry that is not yet part of a
    cluster seeds a new one via ``Find_Cluster_Member``, which also writes the
    per-cluster files. Finally the member indices of *all* clusters are saved
    to ``cc_<threshold>/cc_<threshold>_results.npy``.

    The results file holds a 1-D object array with one list of member indices
    per cluster (clusters differ in size), so it has to be read with
    ``np.load(path, allow_pickle=True)``.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with one row per amino acid index.
    threshold : float
        Correlation threshold handed to ``Find_Cluster_Member``; it also
        determines the name of the output directory.
    """
    # make directory for storing numpy arrays; an existing directory is fine,
    # any other failure (e.g. missing permissions) is reported right here
    directory_name = "cc_" + str(threshold)
    os.makedirs(directory_name, exist_ok=True)

    number_of_complete_entries = data.shape[0]

    cluster_indices = []
    # set instead of list: membership is tested once per entry
    indices_assigned_to_cluster = set()

    for holdout_index in tqdm(np.arange(number_of_complete_entries)):

        # entries that already belong to a cluster do not seed a new one
        if holdout_index in indices_assigned_to_cluster:
            continue

        members = Find_Cluster_Member(data, holdout_index, threshold)
        indices_assigned_to_cluster.update(members)
        cluster_indices.append([int(member) for member in members])

    # Save the members of every cluster, not only of the one found last.
    # The object array is filled element by element so that clusters of equal
    # size are not merged into a 2-D array.
    results = np.empty(len(cluster_indices), dtype=object)
    for position, members in enumerate(cluster_indices):
        results[position] = members

    name = "cc_" + str(threshold) + "_results.npy"
    np.save(os.path.join(directory_name, name), results)

Run the clustering with a chosen correlation threshold.

In [None]:
# Correlation coefficient an index has to exceed to join a cluster;
# it also determines the name of the output directory ("cc_<threshold>")
threshold = 0.95

In [None]:
# Cluster all complete indices; the resulting .npy files are written to "cc_<threshold>"
Correlation_Clustering(accessions_data, threshold)

# Analysis of the obtained clusters

In order to identify which properties have been clustered together, we look at the data descriptions of the members of each cluster.

In [None]:
def Count_Words(words):
    """Count case-insensitive word frequencies.

    Every word is lower-cased before counting. The returned dict is ordered
    by descending count; words with equal counts keep the order in which they
    first appeared.
    """
    tally = {}
    for raw_word in words:
        key = raw_word.lower()
        tally[key] = tally.get(key, 0) + 1

    # sorted() is stable, so ties stay in first-seen order
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)

def Anaylze_Clusters(directory_name, accessions, minimum_population):
    """Collect accessions and descriptions of all sufficiently large clusters.

    Every file in ``directory_name`` whose name ends with ``_indices.npy`` is
    read as the member indices of one cluster. Clusters with fewer than
    ``minimum_population`` members are skipped.

    Parameters
    ----------
    directory_name : str
        Directory holding the ``*_indices.npy`` files.
    accessions : sequence
        Accession keys; the stored member indices refer to positions in it.
    minimum_population : int
        Smallest cluster size that is reported.

    Returns
    -------
    list
        One ``[file name, member accessions, member data descriptions]`` entry
        per reported cluster, in the order given by ``os.listdir``.
    """
    suffix = "_indices.npy"
    summary = []

    for filename in os.listdir(directory_name):
        if not filename.endswith(suffix):
            continue

        member_indices = np.load(os.path.join(directory_name, filename))
        if len(member_indices) < minimum_population:
            continue

        member_accessions = [accessions[position] for position in member_indices]
        member_descriptions = [
            Database[accession].data_description for accession in member_accessions
        ]
        summary.append([filename, member_accessions, member_descriptions])

    return summary

def Find_Heading(directory_name, accessions, minimum_population=7, show_word_counts=False):
    """Print the data descriptions of every sufficiently large cluster.

    For each cluster returned by ``Anaylze_Clusters`` the data descriptions of
    its members are printed one per line, followed by the name of the cluster
    file and two blank lines.

    Parameters
    ----------
    directory_name : str
        Directory holding the ``*_indices.npy`` files.
    accessions : sequence
        Accession keys the stored member indices refer to.
    minimum_population : int, optional
        Smallest cluster size that is printed.
    show_word_counts : bool, optional
        If True, the word frequencies (``Count_Words``) over all descriptions
        of a cluster are printed next to the file name. The default keeps the
        previous output.
    """
    clusters = Anaylze_Clusters(directory_name, accessions, minimum_population)
    for file, _, descriptions in clusters:
        for desc in descriptions:
            print(desc)

        if show_word_counts:
            # split only when requested; the list was previously built and discarded
            words = [word for desc in descriptions for word in desc.rstrip().split(' ')]
            print(file, Count_Words(words), 2*'\n')
        else:
            print(file, 2*'\n')

In [None]:
# Print the member descriptions of all clusters with at least six members
Find_Heading("cc_" + str(threshold), accessions, minimum_population=6)

In [None]:
# Heading per cluster, keyed by the number that prefixes the cluster's
# "<number>_indices.npy" / "<number>_data.npy" files
headings = {
    "120": "Partition energies",
    "26": "Polarity",
    "32": "Size",
    "47": "Composition",
    "69": "Alpha helix",
    "71": "Beta structure",
    "66": "Surface area",
}

In [None]:
def Rearange_Correlation_Matrix(correlation_matrix):
    """Reorder a correlation matrix so that similar rows end up adjacent.

    The rows are clustered hierarchically (complete linkage on their pairwise
    distances) and the tree is cut at half of the largest pairwise distance.
    Rows and columns are then permuted so that entries are sorted by the
    resulting flat cluster label.

    Returns
    -------
    tuple
        ``(reordered_matrix, order)`` where ``order`` is the permutation that
        was applied to both rows and columns.
    """
    pairwise = clst.distance.pdist(correlation_matrix)
    tree = clst.linkage(pairwise, method='complete')

    cutoff = np.max(pairwise) * 0.5
    flat_labels = clst.fcluster(tree, cutoff, criterion='distance')
    order = np.argsort(flat_labels)

    # permute rows first, then columns
    reordered = correlation_matrix[order, :]
    reordered = reordered[:, order]

    return (reordered, order)

def Visualize_Correlation_Matrix(data, indices, accessions, save=False):
    """Show the reordered correlation matrix of ``data`` as a heat map.

    The correlation matrix of the rows of ``data`` is reordered with
    ``Rearange_Correlation_Matrix`` and drawn with the accessions as tick
    labels, permuted in the same way.

    Parameters
    ----------
    data : array-like
        One row of values per cluster member.
    indices :
        Not read by this function; the ordering is recomputed from ``data``.
    accessions : sequence
        Label per row of ``data``.
    save : bool, optional
        If True, the figure is additionally written to ``Mat.png``.
    """
    matrix, order = Rearange_Correlation_Matrix(np.corrcoef(data))
    tick_labels = np.array(accessions)[order]

    axes = plt.gca()
    image = axes.imshow(matrix, cmap='Blues', vmin=-1, vmax=1)

    colorbar = axes.figure.colorbar(image, ax=axes)
    colorbar.ax.set_ylabel('Correlation', rotation=-90, va="bottom")

    # one tick per matrix entry on both axes
    n_rows, n_columns = matrix.shape
    axes.set_xticks(np.arange(n_rows))
    axes.set_yticks(np.arange(n_columns))
    axes.set_xticklabels(tick_labels)
    axes.set_yticklabels(tick_labels)

    axes.tick_params(top=True, bottom=False, labeltop=False, labelbottom=True)
    plt.setp(axes.get_xticklabels(), rotation=90, ha="right", rotation_mode="anchor")

    figure = plt.gcf()
    figure.set_size_inches(12, 10)
    if save:
        figure.savefig('Mat.png', dpi=500)
    plt.show()

In [None]:
# Derive the directory from the threshold chosen above (as the other cells do)
# instead of hard-coding it, so the cell keeps working when the threshold changes
directory_name = "cc_" + str(threshold)

# Draw the correlation matrix of every cluster that was given a heading
for k, v in headings.items():
    data_file = os.path.join(directory_name, k + "_data.npy")
    data = np.load(data_file)

    indices_file = os.path.join(directory_name, k + "_indices.npy")
    indices = np.load(indices_file)

    print(v)
    # label the matrix with the accessions of the cluster members
    Visualize_Correlation_Matrix(data, indices, np.array(accessions)[indices])

# Generating compatible textfiles

After generating the clusters, text files similar to those from the AAindex database are produced.

In [None]:
def Apply_PCA(data):
    """Reduce the columns of ``data`` to their first principal component.

    ``data`` is transposed, so each column of ``data`` is treated as one
    sample. The samples are passed through ``Normalizer`` and projected onto a
    single principal component.

    Returns
    -------
    numpy.ndarray
        1-D array with one score per column of ``data``.
    """
    samples = Normalizer().transform(data.T)
    scores = PCA(n_components=1).fit_transform(samples)
    return scores.flatten()

def Generate_Text_File(cluster, headings, directory_name):
    """Write the first principal component of a cluster as an AAindex-like text file.

    The cluster data is loaded from ``<directory_name>/<id>_data.npy``, where
    ``<id>`` is the part of the cluster's file name before the first
    underscore, and reduced with ``Apply_PCA``. The file ``<heading>.txt`` is
    written to the current working directory and contains

    * an ``H`` line with the heading,
    * a ``D`` line listing the member accessions,
    * the ``I`` column header, and
    * two rows with ten values each, formatted with three decimals and
      right-aligned in columns of width eight.

    Parameters
    ----------
    cluster : sequence
        Entry as produced by ``Anaylze_Clusters``: ``cluster[0]`` is the name
        of the indices file, ``cluster[1]`` the member accessions.
    headings : dict
        Maps the cluster id (str) to its heading.
    directory_name : str
        Directory holding the ``*_data.npy`` files.

    Raises
    ------
    KeyError
        If ``headings`` has no entry for the cluster id.
    """
    cluster_id = cluster[0].split('_')[0]
    data = np.load(os.path.join(directory_name, cluster_id + '_data.npy'))
    data_pca = Apply_PCA(data)

    # two rows, matching the two amino acids named per column in the I line
    rows = np.split(data_pca, 2)

    heading = headings[cluster_id]
    members = cluster[1]

    header_line = "H " + heading + "\n"
    description_line = "D " + " ".join(members) + "\n"
    index_line = "I    A/L     R/K     N/M     D/F     C/P     Q/S     E/T     G/W     H/Y     I/V\n"

    columns_per_row = 10
    line_width = len(index_line) - 1

    filename = heading + ".txt"

    with open(filename, "w") as f:
        f.write(header_line)
        f.write(description_line)
        f.write(index_line)

        for row in rows:
            # Fixed-width formatting right-aligns each value below its column
            # header. A value too wide for its column widens the line instead
            # of overwriting the neighbouring column or the line ending.
            fields = "".join("%8.3f" % value for value in row[:columns_per_row])
            f.write(fields.ljust(line_width) + "\n")

In [None]:
# Collect [file name, member accessions, member descriptions] for all clusters with at least six members
clusters = Anaylze_Clusters("cc_" + str(threshold), accessions, minimum_population=6)

In [None]:
# Write one text file per cluster. The data directory is derived from the
# threshold here, so this cell does not depend on a variable left behind by
# another cell.
for cluster in clusters:
    Generate_Text_File(cluster, headings, "cc_" + str(threshold))

In [None]:
# One-letter codes in the column order of the generated text files
x_string = list("ARNDCQEGHILKMFPSTWYV")

# Derive the directory from the chosen threshold instead of hard-coding it
directory_name = "cc_" + str(threshold)
for k, v in headings.items():
    data_file = os.path.join(directory_name, k + "_data.npy")
    data = np.load(data_file)

    # not needed for the plots; kept so the notebook variable matches `data`
    indices_file = os.path.join(directory_name, k + "_indices.npy")
    indices = np.load(indices_file)

    # sort the amino acids by their score on the first principal component
    pca_data = Apply_PCA(data)
    argsort = np.argsort(pca_data)
    pca_data = pca_data[argsort]
    annotations = np.array(x_string)[argsort]

    # alternate the labels above and below the axis so neighbours do not overlap
    x_top = list(pca_data[1::2])
    text_top = list(annotations[1::2])

    x_bottom = list(pca_data[::2])
    text_bottom = list(annotations[::2])

    fig = go.Figure()
    # y has as many zeros as there are points, whatever the number of entries
    fig.add_trace(go.Scatter(x=x_top, y=np.zeros(len(x_top)),
                             mode='markers+text', marker_size=10,
                             marker_symbol='triangle-up',
                             text=text_top, textposition="top center"
                            ))

    fig.add_trace(go.Scatter(x=x_bottom, y=np.zeros(len(x_bottom)),
                             mode='markers+text', marker_size=10,
                             marker_symbol='triangle-down',
                             text=text_bottom, textposition="bottom center"
                            ))
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=False,
                     zeroline=True, zerolinecolor='black', zerolinewidth=3,
                     showticklabels=False)
    fig.update_layout(height=250, plot_bgcolor='white', title=v, xaxis_title="PC 1")
    fig.show()

    # same data as a line plot: amino acids on x, score on y
    plt.figure()
    plt.scatter(annotations, pca_data)
    plt.plot(annotations, pca_data, linestyle='--')
    plt.xlabel('Amino acid')
    plt.ylabel('PC 1')
    plt.title(v)
    plt.show()

In [None]:
# Gather the standardised values of every member of every reported cluster.
# X holds one row per member, Xlabel the position of its cluster in `clusters`.
clusters = Anaylze_Clusters("cc_" + str(threshold), accessions, minimum_population=6)
Xlabel = []
X = []
for i, cluster in enumerate(clusters):
    for accession_number in cluster[1]:
        vals = np.array(Database[accession_number].values)
        # z-score using the sample standard deviation (ddof=1)
        X.append((vals - vals.mean()) / vals.std(ddof=1))
        Xlabel.append(i)

In [None]:
# Project all cluster members onto the first two principal components
X = np.array(X)
X_normalized = Normalizer().transform(X)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_normalized)

In [None]:
# Split the projected points into one array per cluster label.
# Xlabel is ordered, so a change of label marks the start of the next group.
X_pca_structured = []

i0 = 0
temp = []
for i, x in zip(Xlabel, X_pca):
    if i == i0:
        temp.append(x)
    else:
        X_pca_structured.append(np.array(temp))
        temp = [x]
        i0 = i

# The loop only stores a group once the next label starts, so the group
# collected last has to be stored here; otherwise the final cluster is lost.
if temp:
    X_pca_structured.append(np.array(temp))

In [None]:
plt.rc('font', size=12)
plt.rc('axes', titlesize=14)
plt.rc('axes', labelsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rc('legend', fontsize=11)

# Group i of X_pca_structured stems from clusters[i]. Look the heading up by
# the id in the cluster's file name rather than relying on the dict order
# matching the directory listing; fall back to the file name if no heading
# was assigned.
cluster_labels = [headings.get(entry[0].split('_')[0], entry[0]) for entry in clusters]

fig = plt.figure(figsize=(6,5))
for x, h in zip(X_pca_structured, cluster_labels):
    plt.scatter(x[:,0], x[:,1], label=h, s=25, edgecolor='black', linewidth=0.4)
plt.legend(ncol=1, loc=1)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.ylim(-0.9, 1.2)

# finalise the layout and save before showing, so that the displayed and the
# saved figure are identical
fig.tight_layout()
fig.savefig('Clustering.png', dpi=500)
plt.show()

In [None]:
# Display the cluster summaries: [file name, member accessions, member descriptions]
clusters

**Done!**