In [None]:
# # import json
# import proteinnetworks
# import numpy as np
# # import os
# import pandas as pd
# # import palettable
# from scipy.cluster import hierarchy
# from collections import defaultdict
# from matplotlib.colors import rgb2hex, colorConverter

import os
import subprocess

import proteinnetworks
import sklearn.metrics
from IPython.display import Image, display
from sklearn import svm


# %matplotlib inline

# import subprocess
# import os
# from IPython.display import Image
# from IPython.display import display

import h5py
import numpy as np
import pandas as pd
import seaborn as sns

from palettable.colorbrewer.qualitative import Dark2_6


In [None]:
# Database handle from the proteinnetworks package; the PyMOL plotting helper further
# down calls db.extractPDBFile(...) on it to fetch PDB records.
db = proteinnetworks.database.Database()

# SOAPXX

Now repeat this analysis with the SOAPXX platform

In [None]:

# SOAPXX output file holding the structure-vs-structure kernel matrix and its labels.
hdf5_path = "./sampledDataSet/output/samples_residues.hdf5"

# `Dataset.value` is deprecated and was removed in h5py 3.0; indexing with an empty
# tuple reads the whole dataset into memory in every h5py version. The context
# manager closes the file even if one of the reads raises.
with h5py.File(hdf5_path, "r") as f:
    kernels = f["kernel"]["kernel_mat"][()]
    labels = f["labels"]["label_mat"][()]

In [None]:
# Keep only the second element of every label row, decoded from bytes to str.
# NOTE(review): this rebinds `labels`, so the cell is not idempotent -- running it a
# second time without re-running the load cell fails, because str has no .decode().
labels = [x[1].decode() for x in labels]

In [None]:
# Wrap the kernel matrix in a DataFrame whose columns carry the decoded labels
# (rows keep the default integer index), then preview the first rows.
kernel_df = pd.DataFrame(kernels, columns=labels)
kernel_df.head()

In [None]:
# Build one annotation colour per structure from the suffix of its label:
# "...heme" -> first palette colour, "...nucleo" -> second palette colour.
palette = Dark2_6.hex_colors
colours = []
for label in kernel_df.columns:
    if label.endswith("heme"):
        colours.append(palette[0])
    elif label.endswith("nucleo"):
        colours.append(palette[1])
    else:
        # Fail loudly and say which label was unexpected, rather than raising a bare ValueError.
        raise ValueError(
            f"Unrecognised label {label!r}: expected a name ending in 'heme' or 'nucleo'"
        )

In [None]:
# Clustered heatmap of the kernel matrix with the per-structure class colours drawn
# alongside the rows, saved to disk as a 300 dpi PNG.
# NOTE(review): `colours` was built in column order; using it as row_colors presumes
# rows and columns of the kernel matrix share the same ordering -- confirm.
g = sns.clustermap(kernel_df, yticklabels="auto", figsize=(15,15), row_colors=colours)
g.savefig("bindingsites.png", dpi=300)
# scopg.savefig("AandBSCOPClasses.png", dpi=300)


# SVM stuff

- Run SVM using this kernel
- Extract the relevant params from the SVM (what are they?)
- Get the pij and kij from SOAPXX

Each atom's contribution to the classifier is given by:

\begin{equation*}
\delta_{z_j, B} = \sum_{A} \alpha_{A} y_{A}  \sum_{i \in A}  P_{ij} k_{ij}(A,B) + \frac{\beta}{|B|}
\end{equation*}

This is the contribution of an individual atomic environment j in structure B to the decision.

- $ \alpha_{A} y_{A}$ are the SVM coefficients, optimised using sklearn.
- $\beta$ is the decision threshold.
- These are extracted from the SVM classifier


- $P_{ij}$ is the permutation matrix mapping environments in A to environments in B.
- $k_{ij}(A,B)$ is the SOAP kernel between atomic environments $i \in A$ and $j \in B$
- These are extracted from SOAPXX




In [None]:
def shuffledCopies(a, b, seed=None):
    """Return copies of `a` and `b` shuffled with one shared random permutation.

    Parameters
    ----------
    a : array-like, 1-D
        Per-item values (e.g. class labels), one entry per row/column of `b`.
    b : array-like, 2-D square
        Pairwise matrix (e.g. a kernel). The same permutation is applied to its
        rows and its columns, so entry (i, j) of the result still relates
        item i and item j of the shuffled `a`.
    seed : int or None, optional
        When given, the permutation is drawn from a private RandomState seeded
        with this value, which makes the shuffle reproducible. When None
        (the default) NumPy's global RNG is used, as before.

    Returns
    -------
    (ndarray, ndarray)
        The shuffled copy of `a` and the shuffled copy of `b`; the inputs are
        left untouched.
    """
    # Accept plain lists as well as arrays; fancy indexing below needs ndarrays.
    a = np.asarray(a)
    b = np.asarray(b)
    assert len(a) == len(b), "a and b must have the same length"
    assert b.ndim == 2 and b.shape[0] == b.shape[1], "b must be a square 2-D matrix"

    if seed is None:
        p = np.random.permutation(len(a))
    else:
        p = np.random.RandomState(seed).permutation(len(a))

    # np.ix_ permutes rows and columns in one step: result[i, j] == b[p[i], p[j]].
    return a[p], b[np.ix_(p, p)]

In [None]:
# Derive the two-class labels from the colour list: palette[0] marks "heme",
# every other colour is treated as "nucleo".
# NOTE(review): this rebinds `labels`, which previously held the decoded
# per-structure tags, to a different kind of value.
labels = ["heme" if x == palette[0] else "nucleo" for x in colours]


In [None]:
# Seed NumPy's global RNG so the shuffle -- and with it the train/test split and the
# fitted SVM in the following cells -- is identical on every Restart & Run All.
np.random.seed(0)
kernelLabels, kernelData = shuffledCopies(np.asarray(labels), kernels)
# kernelLabels, kernelData = (np.asarray(scopClassLabels), soapxxScopData)


In [None]:
# Use the first two thirds of the shuffled structures for training, the rest for testing.
trainSize = int(len(kernelData)*2/3)
# With a precomputed kernel, fit() takes the train-vs-train kernel block and predict()
# takes the block of test structures (rows) against training structures (columns).
x_train = kernelData[:trainSize, :trainSize]
x_test = kernelData[trainSize:, :trainSize]
y_train = kernelLabels[:trainSize]
y_test = kernelLabels[trainSize:]
# max_iter must be an integer: scikit-learn releases that validate estimator
# parameters reject the float 1e9.
clf = svm.SVC(kernel="precomputed", verbose=False, max_iter=10**9, C=1)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# classification_report returns a formatted string, so print() is what renders it properly.
scores = sklearn.metrics.classification_report(y_test, y_pred) #  output_dict=True)
print(scores)


In [None]:
def getAtomicContributionGivenAtomAndStructure(clf, atomIndex, structureIndex,
                                               dataDir="./soapxxSVM/dataBackup3"):
    r"""
    Given an atom index j and a structure index B, get the contribution to the decision function.

    \delta_{z_j, B} = \sum_{A} \alpha_{A} y_{A}  \sum_{i \in A}  P_{ij} k_{ij}(A,B) + \frac{\beta}{|B|}

    Parameters
    ----------
    clf : fitted classifier
        Must expose `dual_coef_` (first row is used as the alpha_A * y_A terms),
        `support_` (indices of the structures A) and `intercept_` (beta).
    atomIndex : int
        Row j to read from the per-pair matrices.
    structureIndex : int
        Index B used in the per-pair file names.
    dataDir : str, optional
        Directory holding the `tempScop_tagged_perms_{B}_{A}.dat.npy` and
        `tempScop_tagged_kerns_{B}_{A}.dat.npy` files. Defaults to the location
        that was previously hard-coded.

    Returns
    -------
    ndarray
        The contribution, with the same shape as `clf.intercept_`.

    Raises
    ------
    ValueError
        If `clf` has no support vectors, because |B| is read from the loaded matrices.
    """
    dz = 0.0
    sizeOfB = None
    # Loop over the support structures A.
    # NOTE(review): `support_` holds row indices into the matrix passed to fit(). If
    # that matrix was shuffled or truncated relative to the order used when the
    # per-pair files were written, these indices name the wrong files -- confirm
    # the two orderings agree (and which of B/A comes first in the file name).
    for dualcoef, supportIndex in zip(clf.dual_coef_[0], clf.support_):
        pairTag = f"{structureIndex}_{supportIndex}"
        p_base = np.load(f"{dataDir}/tempScop_tagged_perms_{pairTag}.dat.npy")
        k_base = np.load(f"{dataDir}/tempScop_tagged_kerns_{pairTag}.dat.npy")
        # Row `atomIndex` of each matrix; presumably one entry per atom of A -- TODO confirm.
        dz += dualcoef * np.sum(p_base[atomIndex] * k_base[atomIndex])
        # Number of rows of the per-pair matrix is taken as |B|.
        sizeOfB = len(p_base)

    if sizeOfB is None:
        raise ValueError("clf has no support vectors, so the size of structure B is unknown")

    # Spread the decision threshold evenly over the atoms of B.
    return dz + clf.intercept_ / sizeOfB


# Pymol Viz


This all needs pipelining properly, but now we have the dzs giving the contribution of each residue to the SVM. 

Carl makes a density field $ \rho_B(r) = \sum_{j \in B} \delta_{z_j, B} N \left( r_j, \sigma_j \right)$, i.e. a bunch of atom-centred Gaussians of width $\sigma = 0.5$ Å. 

I think I'll just colour residues using the b-factor then spectrum it.

In [None]:
# Read one structure file and slice out characters 22-25 of every line (whitespace
# trimmed) as that line's residue sequence number.
with open(f"tempScop/{scopPaths[10]}.pdb") as flines:
    data = flines.readlines()

residueSequenceNumbers = [record[22:26].strip() for record in data]

In [None]:
def plotAtomicContributionsGivenDeltaZMappingAndPDBRef(deltaZmapping, pdbRef, chainRef):
    """
    Given the PDB reference, chain reference and a list of (residueNumber, deltaZ) tuples,
    Plot the deltaZs onto the structure.

    The ATOM records of chain `chainRef` are fetched through the module-level `db`,
    written to a temporary PDB file, and coloured in PyMOL by writing each deltaZ
    into the B-factor of its residue. PyMOL saves `{pdbRef}.pse` and `{pdbRef}.png`
    in the working directory, and the PNG is displayed inline.

    Parameters
    ----------
    deltaZmapping : iterable of (residueNumber, deltaZ)
        Each deltaZ must be indexable; its first element is the value plotted.
    pdbRef : str
        PDB identifier passed to `db.extractPDBFile` and used to name the outputs.
    chainRef : str
        Single-character chain identifier compared against column 22 of each ATOM record.

    Raises
    ------
    ValueError
        If `deltaZmapping` is empty.
    subprocess.CalledProcessError
        If the `pymol` executable exits with a non-zero status.
    """
    deltaZmapping = list(deltaZmapping)
    if not deltaZmapping:
        raise ValueError("deltaZmapping is empty: there is nothing to plot")

    PDBData = db.extractPDBFile(pdbRef)
    chainAtomLines = [line.strip() for line in PDBData
                      if line[:4] == "ATOM"
                      and line[21] == chainRef
                      ]

    # Make the spectrum symmetric about zero when the values straddle it, so that
    # white in blue_white_red corresponds to a contribution of zero.
    values = [dz[0] for _, dz in deltaZmapping]
    lowestValue = min(values)
    highestValue = max(values)
    if lowestValue < 0 and highestValue > 0:
        bound = max(-lowestValue, highestValue)
        lowestValue, highestValue = -bound, bound

    pymolScript = f"load temp.pdb, {pdbRef}\n"
    # Residues without a deltaZ keep this sentinel B-factor.
    pymolScript += f"alter {pdbRef}, b=-1\n"

    for resi, dz in deltaZmapping: # might not work if the residue ids are off
        pymolScript += f"alter resi {resi}, b={dz[0]}\n"

    pymolScript += f"""
    #formatting
    bg_color white
    hide all
    #show sticks
    show cartoon
    spectrum b, blue_white_red, minimum={lowestValue}, maximum={highestValue}
    set opaque_background=0
    set antialias = on
    set line_smooth = 1
    set depth_cue = 1
    set specular = 1
    set surface_quality = 1
    set stick_quality = 15
    set sphere_quality = 2
    set ray_trace_fog = 0.8
    set light = (-0.2,0,-1)

    set ray_shadows, 0
    set surface_mode, 1
    set cartoon_side_chain_helper,on
    zoom
    rebuild
    """
    pymolScript += f"save {pdbRef}.pse \n"
    pymolScript += f"""
    set ray_trace_mode = 1
    png {pdbRef}.png, width=10cm, dpi=300, ray=1
    """

    try:
        with open("temp.pdb", "w") as flines:
            flines.write("\n".join(chainAtomLines))
        with open("temp.pml", mode='w') as flines:
            flines.write(pymolScript)

        # Run without the GUI; check=True surfaces a failed render here instead of
        # later as a missing (or stale) PNG.
        subprocess.run(["pymol", "-c", "temp.pml"], check=True)
    finally:
        # Always remove the scratch files, including when PyMOL fails.
        for tempPath in ("temp.pml", "temp.pdb"):
            if os.path.exists(tempPath):
                os.remove(tempPath)

    display(Image(f"{pdbRef}.png"))
#     os.remove(f"{pdbRef}.png")


In [None]:
def _readCountBeforeTag(xyzPath, tag):
    """Return, as an int, the line immediately before the first line of `xyzPath` that contains `tag`."""
    previousLine = None
    with open(xyzPath) as flines:
        for line in flines:
            if tag in line:
                if previousLine is None:
                    raise ValueError(f"{tag!r} is on the first line of {xyzPath}; no count line precedes it")
                # int() tolerates the surrounding whitespace and trailing newline.
                return int(previousLine)
            previousLine = line
    raise ValueError(f"{tag!r} not found in {xyzPath}")


def plotAtomicContributionsGivenIndexOnSCOPPaths(indexNumber):
    """
    Given an index to SCOPPaths (the list of what label corresponds to what row of the kernel matrix),
    find the atomic contributions, then plot them.

    Reads the module-level `scopPaths` and `clf`. The number of contributions to
    compute is the integer on the line preceding the structure's tag in
    soapxxSVM/dataBackup3/tempScop_tagged.xyz. This is found with a plain substring
    scan in Python rather than a shell `grep`, so the function does not depend on
    IPython shell escapes or on `grep` being installed.

    Raises
    ------
    ValueError
        If the structure's tag cannot be located in the xyz file.
    """
    structureTag = scopPaths[indexNumber]
    pdbRef, chainRef, *_ = structureTag.split("_")
    print(pdbRef, chainRef)

    sizeOfStructure = _readCountBeforeTag("soapxxSVM/dataBackup3/tempScop_tagged.xyz", structureTag)
    dzs = [
        getAtomicContributionGivenAtomAndStructure(clf, atomIndex, indexNumber)
        for atomIndex in range(sizeOfStructure)
    ]

    # One residue number per line of the structure file (characters 22-25).
    # NOTE(review): zip() below silently truncates if the number of lines and the
    # number of contributions differ -- confirm they are meant to line up one-to-one.
    with open(f"tempScop/{structureTag}.pdb") as flines:
        residueSequenceNumbers = [line[22:26].strip() for line in flines]

    plotAtomicContributionsGivenDeltaZMappingAndPDBRef(list(zip(residueSequenceNumbers, dzs)), pdbRef, chainRef)


In [None]:
# Spot-check: render the contributions for the structure at index 10 of scopPaths.
plotAtomicContributionsGivenIndexOnSCOPPaths(10)

In [None]:
# Spot-check: render the contributions for the structure at index 1 of scopPaths.
plotAtomicContributionsGivenIndexOnSCOPPaths(1)

In [None]:
# Render every structure in scopPaths; each call writes {pdbRef}.pse and {pdbRef}.png,
# so this also regenerates the figures produced by the two spot-check cells.
for i in range(len(scopPaths)):
    plotAtomicContributionsGivenIndexOnSCOPPaths(i)

In [None]:
# Print an HTML fragment per structure: a heading naming the PDB entry and chain,
# an <img> tag pointing at its rendered PNG, then a blank separator line.
for i, path in enumerate(scopPaths):
    pdbRef, chainRef, *_ = path.split("_")
    heading = f"<h2>pdbRef: {pdbRef}, chainRef: {chainRef} </h2>"
    image = f"<img src=\"svmPngs/{pdbRef}.png\">"
    print(heading, image, "", sep="\n")
