# generateAtomicSimilarityVector

Using the "onevsall" pipeline we can feed in an xyz file containing a set of proteins, and do a one-vs-all comparison of a given protein in that set.

We then need to generate the mean atomic similarities for that protein.

In [18]:
import quippy
import ase
import numpy as np
import h5py
import matplotlib.pyplot as plt
import os
import collections 

%matplotlib notebook

In [3]:
# Row of the label matrix identifying the protein to compare against all others.
index = 150
dataFile = h5py.File("../samples2.xyz.hdf5", "r")
# Read the whole label matrix into memory. `dataset[()]` is the supported
# spelling of the deprecated `dataset.value` property (removed in h5py 3).
labels = dataFile["labels"]["label_mat"][()]
# Column 1 of a label row holds the protein's tag (e.g. "3kexA_nucleo").
label = labels[index][1]
print(label)


3kexA_nucleo




In [4]:
def setBFactorAndOutputPDBFile(atomObject, similarities, pdbPath):
    """
    Store per-atom similarities as the b-factor of an Atoms object and write it out as a PDB file
    (for PyMoL visualisation, which has to be done locally).

    atomObject   -- Atoms-like object exposing an `arrays` dict with a "positions" entry;
                    its "bfactor" array is overwritten in place.
    similarities -- one value per atom, in the same order as the positions.
    pdbPath      -- destination path of the PDB file.

    Raises AssertionError if the number of similarities differs from the number of atoms.
    """
    n_atoms = len(atomObject.arrays["positions"])
    # Compare the two lengths directly: taking len() of the comparison result
    # instead would be truthy for any non-empty input and never catch a mismatch.
    assert n_atoms == len(similarities), \
        "got {0} similarities for {1} atoms".format(len(similarities), n_atoms)
    atomObject.arrays["bfactor"] = np.asarray(similarities)
    ase.io.write(pdbPath, atomObject, format="proteindatabank")

In [11]:
# Write PDB files whose b-factors are the mean per-atom similarity of one
# protein to every heme-tagged and every non-heme-tagged protein.
def generatePDBfileGivingAverageSimilarity(index, file_root, xyz_file_name):
    """
    Average the atomic similarities of protein `index` against each class and save them as PDB b-factors.

    For every row i of the module-level `labels`, the matrices
    "<file_root>/basematrices/<index>_<i>_kerns.npy" and "..._perms.npy" are
    multiplied element-wise and summed over axis 1, giving one similarity per
    atom of protein `index`. These vectors are averaged separately over the
    proteins whose tag contains "heme" and over all the others, scaled so the
    maximum is 1, and written to
    "<file_root>/averageSim/<tag>_vshemes.pdb" and "..._vsnucleos.pdb".

    index         -- row of `labels` identifying the protein of interest.
    file_root     -- directory holding "basematrices/"; "averageSim/" is created inside it.
    xyz_file_name -- xyz file from which the protein's atoms are read.

    Raises ValueError if either class is empty or if no protein in the xyz
    file carries the expected tag; AssertionError if the similarity vectors do
    not have one entry per atom.
    """
    # Derive the tag from `index` rather than reading the module-level `label`,
    # which silently goes stale if this is called with a different index.
    label = labels[index][1]

    hemes = []
    nucleos = []
    for i, tag in enumerate(labels):
        kerns = np.load("{0}/basematrices/{1}_{2}_kerns.npy".format(file_root, index, i))
        perms = np.load("{0}/basematrices/{1}_{2}_perms.npy".format(file_root, index, i))
        # The similarity between each atom of the test protein and the target protein
        atomic_similarity = np.sum(kerns * perms, axis=1)
        if "heme" in tag[1]:
            hemes.append(atomic_similarity)
        else:
            nucleos.append(atomic_similarity)

    # The mean of an empty list is a scalar nan, which would only surface
    # later as an obscure failure; report the real problem here instead.
    if not hemes or not nucleos:
        raise ValueError("Need at least one heme and one non-heme protein in labels")

    heme_means = np.mean(np.asarray(hemes), axis=0)
    nucleo_means = np.mean(np.asarray(nucleos), axis=0)

    atom_reader = quippy.AtomsReader(xyz_file_name, format="xyz")
    # Crude linear scan, but quick enough for now
    for temp_protein in atom_reader:
        if temp_protein.params["tag"] == label:
            protein = temp_protein
            break
    else:
        raise ValueError("Protein not found! (tag {0})".format(label))

    n_atoms = len(protein.arrays["positions"])
    assert len(heme_means) == n_atoms
    assert len(nucleo_means) == n_atoms

    # NOTE: an earlier variant zeroed everything below the 20th-largest value
    # to highlight only the most similar atoms; it is currently disabled.

    # scale to 1, as otherwise the b-factors round out
    heme_means /= heme_means.max()
    nucleo_means /= nucleo_means.max()

    out_dir = "{0}/averageSim".format(file_root)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    setBFactorAndOutputPDBFile(protein, heme_means, "{0}/{1}_vshemes.pdb".format(out_dir, label))
    setBFactorAndOutputPDBFile(protein, nucleo_means, "{0}/{1}_vsnucleos.pdb".format(out_dir, label))
    
    

In [12]:
# Collect, for every row of `labels`, the element-wise product of the kerns and
# perms matrices comparing protein `index` with protein i (no sum over axis 1
# here, so the full matrices are kept).
# NOTE: the paths are relative to the current working directory (there is no
# file_root prefix), so this only works when run from inside a run directory.
arrays = []
for i, tag in enumerate(labels):  # `tag` is not used in the loop body
    kerns = np.load("basematrices/{0}_{1}_kerns.npy".format(index,i))
    perms = np.load("basematrices/{0}_{1}_perms.npy".format(index,i))
    atomic_similarity = kerns*perms
    arrays.append(atomic_similarity)

IOError: [Errno 2] No such file or directory: 'basematrices/150_0_kerns.npy'

In [70]:
# Key each similarity matrix by the tag (column 1) of the protein it was
# computed against; duplicate tags would overwrite one another.
array_dict = dict(zip([x[1] for x in labels], arrays))
# Name the archive after `index` (savez appends ".npz") so the file always
# matches the protein the matrices were loaded for, instead of a hard-coded "150".
np.savez_compressed(file=str(index), **array_dict)

## Parameter Sweep


In [80]:
# Write the mean-similarity PDB files for the run stored under "xi3sigma05".
# file_root is the directory whose basematrices/ are read and in which
# averageSim/ is created.
# NOTE(review): the name presumably encodes kernel settings (xi=3, sigma=0.5) — confirm.
file_root = "xi3sigma05"
xyz_file_name = "samples2.xyz"
generatePDBfileGivingAverageSimilarity(index, file_root, xyz_file_name)

3kexA_nucleo


In [81]:
# Write the mean-similarity PDB files for the run stored under "xi3sigma1".
# NOTE(review): the name presumably encodes kernel settings (xi=3, sigma=1) — confirm.
file_root = "xi3sigma1"
xyz_file_name = "samples2.xyz"
generatePDBfileGivingAverageSimilarity(index, file_root, xyz_file_name)

3kexA_nucleo


In [82]:
# Write the mean-similarity PDB files for the run stored under "xi3sigma1N5".
# NOTE(review): meaning of the "N5" suffix is not visible here — confirm.
file_root = "xi3sigma1N5"
xyz_file_name = "samples2.xyz"
generatePDBfileGivingAverageSimilarity(index, file_root, xyz_file_name)

3kexA_nucleo


In [83]:
# Write the mean-similarity PDB files for the run stored under "xi5sigma1".
# NOTE(review): the name presumably encodes kernel settings (xi=5, sigma=1) — confirm.
file_root = "xi5sigma1"
xyz_file_name = "samples2.xyz"
generatePDBfileGivingAverageSimilarity(index, file_root, xyz_file_name)

3kexA_nucleo


In [6]:
!pwd

/rds/user/wpg23/hpc-work/soapxxDir/compareAtomicSimilarities/onevsall


In [13]:
# Write the mean-similarity PDB files for the run stored under "r10xi5sigma1".
# NOTE(review): "r10" presumably denotes a cutoff radius setting — confirm.
file_root = "r10xi5sigma1"
xyz_file_name = "samples2.xyz"
generatePDBfileGivingAverageSimilarity(index, file_root, xyz_file_name)

In [16]:
# The entry point below used to do everything inline; loading, protein lookup,
# top-N counting and directory creation now live in small private helpers.
def _atomic_similarity_to(file_root, index, other):
    """Return the per-atom similarity of protein `index` against protein `other`."""
    kerns = np.load("{0}/basematrices/{1}_{2}_kerns.npy".format(file_root, index, other))
    perms = np.load("{0}/basematrices/{1}_{2}_perms.npy".format(file_root, index, other))
    # Summing the element-wise product over axis 1 leaves one value per atom
    # of the test protein.
    return np.sum(kerns * perms, axis=1)


def _find_protein_by_tag(xyz_file_name, wanted_tag):
    """Return the first entry of the xyz file whose "tag" param equals `wanted_tag`."""
    # Crude linear scan, but quick enough for now
    for candidate in quippy.AtomsReader(xyz_file_name, format="xyz"):
        if candidate.params["tag"] == wanted_tag:
            return candidate
    raise ValueError("Protein not found! (tag {0})".format(wanted_tag))


def _count_top_ranked_atoms(ranked_rows, n_atoms, top_n=10):
    """Count, per atom, in how many rows it is among the `top_n` most similar atoms."""
    counts = np.zeros(n_atoms)
    for ranked in ranked_rows:
        # Each row is an ascending argsort, so its last `top_n` entries are the
        # most similar atoms and are all distinct: a plain indexed increment
        # therefore adds exactly one per atom per row.
        counts[ranked[-top_n:]] += 1
    return counts


def _ensure_directory(path):
    """Create `path` if it does not exist yet."""
    if not os.path.exists(path):
        os.mkdir(path)


def generatePDBfileGivingAverageAndCounterSimilarity(index, file_root, xyz_file_name):
    """
    Write two pairs of PDB files for protein `index`, one per scoring scheme.

    For every row i of the module-level `labels`, a per-atom similarity vector
    is computed from "<file_root>/basematrices/<index>_<i>_{kerns,perms}.npy".
    The vectors are split into proteins whose tag contains "heme" and all the
    others, and two scores are written as b-factors:

    * averageSim: the class mean of the vectors, scaled so the maximum is 1;
    * counterSim: for each atom, the number of proteins of the class for which
      that atom is among the 10 most similar atoms.

    Output goes to "<file_root>/averageSim/<tag>_vshemes.pdb",
    "..._vsnucleos.pdb" and the same two names under "<file_root>/counterSim/".

    index         -- row of `labels` identifying the protein of interest.
    file_root     -- directory holding "basematrices/"; output directories are created inside it.
    xyz_file_name -- xyz file from which the protein's atoms are read.

    Raises ValueError if either class is empty or if no protein in the xyz
    file carries the expected tag; AssertionError if the similarity vectors do
    not have one entry per atom.
    """
    # Derive the tag from `index` rather than reading the module-level `label`,
    # which silently goes stale if this is called with a different index.
    label = labels[index][1]

    hemes = []
    nucleos = []
    for other, tag in enumerate(labels):
        similarity = _atomic_similarity_to(file_root, index, other)
        if "heme" in tag[1]:
            hemes.append(similarity)
        else:
            nucleos.append(similarity)

    # The mean of an empty list is a scalar nan, which would only surface
    # later as an obscure failure; report the real problem here instead.
    if not hemes or not nucleos:
        raise ValueError("Need at least one heme and one non-heme protein in labels")

    heme_means = np.mean(np.asarray(hemes), axis=0)
    nucleo_means = np.mean(np.asarray(nucleos), axis=0)

    protein = _find_protein_by_tag(xyz_file_name, label)
    n_atoms = len(protein.arrays["positions"])
    assert len(heme_means) == n_atoms
    assert len(nucleo_means) == n_atoms

    # NOTE: an earlier variant zeroed everything below the 20th-largest value
    # to highlight only the most similar atoms; it is currently disabled.

    # scale to 1, as otherwise the b-factors round out
    heme_means /= heme_means.max()
    nucleo_means /= nucleo_means.max()

    average_dir = "{0}/averageSim".format(file_root)
    _ensure_directory(average_dir)
    setBFactorAndOutputPDBFile(protein, heme_means, "{0}/{1}_vshemes.pdb".format(average_dir, label))
    setBFactorAndOutputPDBFile(protein, nucleo_means, "{0}/{1}_vsnucleos.pdb".format(average_dir, label))

    # Rank the atoms of each comparison from least to most similar.
    heme_ranks = [np.argsort(similarity) for similarity in hemes]
    nucleo_ranks = [np.argsort(similarity) for similarity in nucleos]
    counter_heme_similarities = _count_top_ranked_atoms(heme_ranks, n_atoms)
    counter_nucleo_similarities = _count_top_ranked_atoms(nucleo_ranks, n_atoms)

    counter_dir = "{0}/counterSim".format(file_root)
    _ensure_directory(counter_dir)
    setBFactorAndOutputPDBFile(protein, counter_heme_similarities, "{0}/{1}_vshemes.pdb".format(counter_dir, label))
    setBFactorAndOutputPDBFile(protein, counter_nucleo_similarities, "{0}/{1}_vsnucleos.pdb".format(counter_dir, label))
    
    

In [19]:
# Write both the mean-similarity (averageSim/) and the top-10 count
# (counterSim/) PDB files for the run stored under "r10xi5sigma1".
file_root = "r10xi5sigma1"
xyz_file_name = "samples2.xyz"
generatePDBfileGivingAverageAndCounterSimilarity(index, file_root, xyz_file_name)