# contiguityScore

We have the data file giving the mean similarity (and counter similarity) for each atom in each protein in our dataset.

Here we try to extract the signal from the noise.

We have options:
- Average the score over nearby atoms, using an exponentially decaying weight
- Count the number of nearby atoms with a non-zero count (only works for the count)
- Count, spatially weighting

In [None]:
import h5py
import quippy
import json
import numpy as np
import ase.io
import os
import collections 
import subprocess
import matplotlib.pyplot as plt

import scipy.spatial
import pandas as pd
import seaborn as sns

In [None]:
# Load the label matrix that accompanies the reparsed protein structures.
path_to_hdf5_file = "./reparsed/r10xi2sigma1/reparsed_proteinstructures.xyz.hdf5"
# ``Dataset.value`` is deprecated and was removed in h5py 3.0; ``dataset[()]``
# reads the whole dataset on every h5py version.  The context manager closes
# the file once the labels have been copied into memory.
with h5py.File(path_to_hdf5_file, "r") as dataFile:
    labels = dataFile["labels"]["label_mat"][()]

In [None]:
def _count_top_ranked_atoms(sorted_index_rows, n_atoms, n_top):
    """Count, per atom, how many rows rank it among their ``n_top`` largest entries."""
    counter = collections.Counter()
    for row in sorted_index_rows:
        # rows are argsort output (ascending), so the tail holds the top hits
        counter.update(row[-n_top:])
    counts = np.zeros(n_atoms)
    for atom_index, count in counter.items():
        counts[atom_index] = count
    return counts


def generateAverageAndCounterSimilarityAnyHetATM(path_to_matrices, path_to_xyz_file, labels, index):
    """
    Compute per-atom similarity scores for one protein and each atom's distance to its ligand.

    The ligand identity is ignored: every non-water HETATM record on the
    protein's chain is treated as "the binder".

    Parameters:
        path_to_matrices: directory holding the "<index>_<i>_kernsandperms.npy" files.
        path_to_xyz_file: xyz file read with quippy; searched for the structure whose
            "tag" parameter equals the label of the test protein.
        labels: sequence of label rows; element [1] of a row is the tag, of the form
            PDBREFCHAINREF_heme or PDBREFCHAINREF_nucleo.
        index: row of ``labels`` identifying the test protein.

    Returns:
        dict of lists with keys "heme_means", "nucleo_means", "heme_counts",
        "nucleo_counts", "distance_from_nearest_ligand_atom" and
        "distance_from_centroid".  The asserts below require the means and
        counts to have one entry per atom of the test protein.

    Raises:
        ValueError: the protein is not in the xyz file, or no ligand atoms were found.
        IOError: the PDB file could not be read (e.g. the download failed).
        AssertionError: the similarity arrays do not match the structure.

    Side effects: prints the label, may download "<pdb_ref>.pdb" with wget,
    and deletes both that file and the scratch file "temp.pdb" afterwards.
    """
    n_most_common = 10
    label = labels[index][1]
    print(label)

    hemes = []
    nucleos = []
    heme_indices = []
    nucleo_indices = []
    for i, tag in enumerate(labels):
        # The similarity between each atom of the test protein and the target protein
        atomic_similarity = np.load("{0}/{1}_{2}_kernsandperms.npy".format(path_to_matrices, index, i))
        indices = np.argsort(atomic_similarity)
        if "heme" in tag[1]:
            hemes.append(atomic_similarity)
            heme_indices.append(indices)
        else:
            nucleos.append(atomic_similarity)
            nucleo_indices.append(indices)

    heme_means = np.mean(np.asarray(hemes), axis=0)
    nucleo_means = np.mean(np.asarray(nucleos), axis=0)

    # scale to 1, as otherwise the b-factors round out
    heme_means /= heme_means.max()
    nucleo_means /= nucleo_means.max()

    atom_reader = quippy.AtomsReader(path_to_xyz_file, format="xyz")
    # Crude, but quick enough for now
    for temp_protein in atom_reader:
        if temp_protein.params["tag"] == label:
            protein = temp_protein
            break
    else:
        print("Protein not found!")
        raise ValueError("Protein {} not found in {}".format(label, path_to_xyz_file))

    # Read the positions for the test protein
    atomic_positions = protein.arrays["positions"]
    n_atoms = len(atomic_positions)

    # Check that the arrays are the correct length, and referencing the correct protein.
    # Kept as asserts: a caller in this notebook catches AssertionError to skip
    # proteins whose matrices do not match the structure.
    assert protein.params["tag"] == label
    assert len(heme_means) == n_atoms
    assert len(nucleo_means) == n_atoms

    heme_counts = _count_top_ranked_atoms(heme_indices, n_atoms, n_most_common)
    nucleo_counts = _count_top_ranked_atoms(nucleo_indices, n_atoms, n_most_common)

    assert len(nucleo_counts) == len(nucleo_means)
    assert len(heme_counts) == len(heme_means)

    # We have the count-based and average-based similarity scores.
    # Now we need to obtain each atom's distance from the ligand.

    # Get the ligand atoms (these aren't in the xyz file, as they're stripped in preprocessing)
    # labels take the form PDBREFCHAINREF_heme or PDBREFCHAINREF_nucleo
    pdb_ref = label[:4]
    chain_ref = label[4]
    pdb_path = "{}.pdb".format(pdb_ref)
    if not os.path.exists(pdb_path):
        subprocess.call(["wget", "http://www.rcsb.org/pdb/files/{}.pdb".format(pdb_ref)])
    try:
        with open(pdb_path) as flines:
            pdb_data = flines.readlines()
    except IOError:
        # Re-raise: carrying on would only fail later with an unbound pdb_data.
        print("IO Error")
        raise
    # The PDB file is only needed for the lines just read; remove it as before.
    os.remove(pdb_path)

    # Keep every non-water HETATM record that sits on the protein's chain.
    ligand_data = [
        line.strip()
        for line in pdb_data
        if line.startswith("HETATM")
        and line[17:20].strip() != "HOH"
        and line[21:22] == chain_ref
    ]

    if not ligand_data:
        print("No ligand found")
        # Without ligand atoms the nearest-atom distance below is undefined.
        raise ValueError("No ligand atoms found for {}".format(label))

    # Round-trip through a scratch file so ase can parse the records.
    with open("temp.pdb", "w") as outflines:
        outflines.write("\n".join(ligand_data))
    try:
        ligand = ase.io.read("temp.pdb", format="proteindatabank")
    finally:
        os.remove("temp.pdb")

    ligand_positions = ligand.arrays["positions"]
    ligand_centroid = np.mean(ligand_positions, axis=0)
    distance_from_centroid = np.linalg.norm(atomic_positions - ligand_centroid, axis=1)

    # get the distance to the nearest ligand atom instead
    distance_from_nearest_ligand_atom = [
        np.min(np.linalg.norm(position - ligand_positions, axis=1))
        for position in atomic_positions
    ]

    output_dict = {"heme_means": list(heme_means),
                   "nucleo_means": list(nucleo_means),
                   "nucleo_counts": list(nucleo_counts),
                   "heme_counts": list(heme_counts),
                   "distance_from_nearest_ligand_atom": list(distance_from_nearest_ligand_atom),
                   "distance_from_centroid": list(distance_from_centroid)}
    return output_dict

### Probably k = 10 makes the most physical sense. But let's add the spatial weighting first

In [None]:
def get_spatial_average(distance_matrix, score_vector, sigma=10, k=1000):
    """
    Smooth a per-atom score by averaging it over each atom's nearest neighbours.

    For every atom i the k nearest atoms (by row i of ``distance_matrix``, which
    includes i itself when its self-distance is the row minimum) contribute
    their score with weight exp(-distance / sigma); the result is the weighted
    mean of those scores.

    Parameters:
        distance_matrix: square (n, n) array-like of pairwise distances.
        score_vector: length-n sequence of scores, one per atom.
        sigma: decay length of the exponential weight.
        k: number of nearest atoms to include; if k exceeds n, all atoms are used.

    Returns:
        list of n weighted averages, in atom order.
    """
    distances = np.asarray(distance_matrix, dtype=float)
    scores = np.asarray(score_vector, dtype=float)

    # argsort each row to get a list of which points are closest for each atom
    k_nearest_atoms = np.argsort(distances, axis=1)[:, 0:k]

    # Pick out, for each row i, the distances to that row's k nearest atoms.
    row_index = np.arange(distances.shape[0])[:, np.newaxis]
    k_nearest_distances = distances[row_index, k_nearest_atoms]

    weights = np.exp(-k_nearest_distances / sigma)
    weighted_scores = weights * scores[k_nearest_atoms]
    spatial_averages = weighted_scores.sum(axis=1) / weights.sum(axis=1)

    assert len(spatial_averages) == len(scores)
    return list(spatial_averages)


In [None]:
def setBFactorAndOutputPDBFile(atomObject, similarities, pdbPath):
    """
    Given an Atoms object and the similarities for each atom, set the b-factor for the Atoms as the similarities then
    output the Atoms as a PDB for PyMoL viz (which I'll need to do locally)

    Parameters:
        atomObject: Atoms object; its ``arrays["bfactor"]`` entry is overwritten in place.
        similarities: one value per atom, in the same order as ``arrays["positions"]``.
        pdbPath: path of the PDB file to write.

    Raises:
        AssertionError: the number of similarities differs from the number of atoms.
    """
    n_atoms = len(atomObject.arrays["positions"])
    # The original wrote len(positions == len(similarities)): the comparison sat
    # inside len(), so the check passed for any non-empty structure.
    assert n_atoms == len(similarities), (
        "Got {} similarity values for {} atoms".format(len(similarities), n_atoms))
    atomObject.arrays["bfactor"] = np.asarray(similarities)
    ase.io.write(pdbPath, atomObject, format="proteindatabank")
    

In [None]:
# Plot the new spatially averaged stuff on the proteins
# Plot the average and std for each class, for the third protein
def setBFactorsGivenDAT(path_to_xyz_file, path_to_dat, labels):
    """
    Write four b-factor-coloured PDB files for every protein that has a .dat file.

    Each structure in ``path_to_xyz_file`` is matched to "<path_to_dat>/<tag>.dat".
    The first line of that file is skipped as a header; columns 0-3 are read as
    heme means, nucleo means, heme counts and nucleo counts.  Proteins whose
    .dat file cannot be opened are skipped.  ``labels`` is accepted but not used.
    """
    for protein in quippy.AtomsReader(path_to_xyz_file, format="xyz"):
        label = protein.params["tag"]
        print(label)
        try:
            with open("{}/{}.dat".format(path_to_dat, label)) as flines:
                # drop the header row straight away
                rows = [line.strip().split() for line in flines][1:]
        except IOError:
            continue

        # One list per column, built column by column.
        heme_means, nucleo_means, heme_counts, nucleo_counts = [
            [float(row[column]) for row in rows] for column in range(4)
        ]

        n_atoms = len(protein.arrays["positions"])
        assert len(heme_means) == n_atoms
        assert len(nucleo_means) == n_atoms

        outputs = (
            (heme_means, "{}/{}_heme_means.pdb"),
            (nucleo_means, "{}/{}_nucleo_means.pdb"),
            (heme_counts, "{}/{}_heme_counts.pdb"),
            (nucleo_counts, "{}/{}_nucleo_counts.pdb"),
        )
        for values, template in outputs:
            setBFactorAndOutputPDBFile(protein, values, template.format(path_to_dat, label))

In [None]:
def get_number_of_non_zero_counts(distance_matrix, count_vector, k=10):
    """
    For each atom, count how many of its k nearest atoms have a non-zero count.

    Neighbours are taken from an argsort of each row of ``distance_matrix``;
    the return value is a list with one integer per atom.
    """
    counts = np.asarray(count_vector)

    # each row of the argsort lists atoms from nearest to farthest
    neighbour_order = np.argsort(distance_matrix, axis=1)

    non_zero_counts = [
        np.count_nonzero(counts[neighbours])
        for neighbours in neighbour_order[:, :k]
    ]

    assert len(non_zero_counts) == len(counts)
    return non_zero_counts


In [None]:
def setBFactorsGivenDATNonZeroCounts(path_to_xyz_file, path_to_dat, labels):
    """
    Write heme/nucleo count PDB files for every protein that has a .dat file.

    Each structure in ``path_to_xyz_file`` is matched to "<path_to_dat>/<tag>.dat".
    The first line is skipped as a header; column 0 is read as the heme counts and
    column 1 as the nucleo counts.  Proteins whose .dat file cannot be opened are
    skipped.  ``labels`` is accepted but not used.

    NOTE(review): the .dat files written elsewhere in this notebook put the
    averaged means in columns 0-1; presumably this expects a counts-only
    .dat layout -- confirm against the files actually passed in.
    """
    for protein in quippy.AtomsReader(path_to_xyz_file, format="xyz"):
        label = protein.params["tag"]
        print(label)
        try:
            with open("{}/{}.dat".format(path_to_dat, label)) as flines:
                # drop the header row straight away
                rows = [line.strip().split() for line in flines][1:]
        except IOError:
            continue

        # One list per column, built column by column.
        heme_counts, nucleo_counts = [
            [float(row[column]) for row in rows] for column in range(2)
        ]

        n_atoms = len(protein.arrays["positions"])
        assert len(heme_counts) == n_atoms
        assert len(nucleo_counts) == n_atoms

        outputs = (
            (heme_counts, "{}/{}_heme_counts.pdb"),
            (nucleo_counts, "{}/{}_nucleo_counts.pdb"),
        )
        for values, template in outputs:
            setBFactorAndOutputPDBFile(protein, values, template.format(path_to_dat, label))

In [None]:
# Scatter the spatially averaged count scores against distance to the ligand
# for the first three proteins of the reparsed dataset.
path_to_hdf5_file = "./reparsed/r10xi2sigma1/reparsed_proteinstructures.xyz.hdf5"
path_to_xyz_file = "./reparsed/r10xi2sigma1/reparsed_proteinstructures.xyz"
# ``Dataset.value`` was removed in h5py 3.0; ``[()]`` reads the whole dataset on
# every version.  The context manager closes the file once the labels are in memory.
with h5py.File(path_to_hdf5_file, "r") as dataFile:
    labels = dataFile["labels"]["label_mat"][()]
atom_reader = quippy.AtomsReader(path_to_xyz_file, format="xyz")

for index, label in enumerate(labels[:3]):
    label = label[1]
    output_dict = generateAverageAndCounterSimilarityAnyHetATM("./reparsed/r10xi2sigma1/basematrices",
                                                               path_to_xyz_file,
                                                               labels,
                                                               index)

    heme_means = output_dict["heme_means"]
    nucleo_means = output_dict["nucleo_means"]
    heme_counts = output_dict["heme_counts"]
    nucleo_counts = output_dict["nucleo_counts"]
    distance_from_nearest_ligand_atom = output_dict["distance_from_nearest_ligand_atom"]
    distance_from_centroid = output_dict["distance_from_centroid"]

    # Get the spatial average of all the above quantities
    # Crude, but quick enough for now
    for temp_protein in atom_reader:
        if temp_protein.params["tag"] == label:
            protein = temp_protein
            break
    else:
        # Without this guard a missing tag would silently reuse the protein
        # left over from the previous iteration.
        raise ValueError("Protein {} not found in {}".format(label, path_to_xyz_file))
    positions = protein.arrays["positions"]

    # Full pairwise distance matrix between the protein's atoms.
    D = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(positions))

    spatial_average_heme_counts = get_spatial_average(D, heme_counts, sigma=10)
    spatial_average_nucleo_counts = get_spatial_average(D, nucleo_counts, sigma=10)
    # The averaged means are only consumed by the disabled .dat output code below.
    spatial_average_heme_means = get_spatial_average(D, heme_means, sigma=10)
    spatial_average_nucleo_means = get_spatial_average(D, nucleo_means, sigma=10)

    hemes = [[x, y, "hemes"] for x, y in zip(distance_from_nearest_ligand_atom, spatial_average_heme_counts)]
    nucleos = [[x, y, "nucleos"] for x, y in zip(distance_from_nearest_ligand_atom, spatial_average_nucleo_counts)]
    df = pd.DataFrame(hemes + nucleos, columns=["distance", "similarity", "class"])

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.scatterplot(x="distance", y="similarity", hue="class", data=df, ax=ax)
    plt.show()
    plt.close(fig)
    #     headers = ["heme_means_averaged", "nucleo_means_averaged", "heme_counts_averaged", "nucleo_counts_averaged",
#               "nearest_distance", "centroid_distance"]
#     output_data = zip(spatial_average_heme_means, spatial_average_nucleo_means, spatial_average_heme_counts, spatial_average_nucleo_counts,
#                        distance_from_nearest_ligand_atom, distance_from_centroid)

#     if not os.path.exists("outputs"):
#         os.mkdir("outputs")
#     with open("outputs/{}.dat".format(label), "w") as flines:
#         flines.write(" ".join(headers) + "\n")
#         for line in output_data:
#             line = [round(x,4) for x in line]
#             flines.write(" ".join(map(str,line)) + "\n")


In [None]:
# Scatter the number of non-zero-count neighbours (k=10) against distance to the
# ligand for the first three proteins of the filtered dataset.
path_to_hdf5_file = "./filtered/r10xi2sigma1/samples_filtered.xyz.hdf5"
path_to_xyz_file = "./filtered/r10xi2sigma1/samples_filtered.xyz"
# ``Dataset.value`` was removed in h5py 3.0; ``[()]`` reads the whole dataset on
# every version.  The context manager closes the file once the labels are in memory.
with h5py.File(path_to_hdf5_file, "r") as dataFile:
    labels = dataFile["labels"]["label_mat"][()]
atom_reader = quippy.AtomsReader(path_to_xyz_file, format="xyz")

for index, label in enumerate(labels[:3]):
    label = label[1]
    output_dict = generateAverageAndCounterSimilarityAnyHetATM("./filtered/r10xi2sigma1/basematrices",
                                                               path_to_xyz_file,
                                                               labels,
                                                               index)

    heme_counts = output_dict["heme_counts"]
    nucleo_counts = output_dict["nucleo_counts"]
    distance_from_nearest_ligand_atom = output_dict["distance_from_nearest_ligand_atom"]
    distance_from_centroid = output_dict["distance_from_centroid"]

    # Crude, but quick enough for now
    for temp_protein in atom_reader:
        if temp_protein.params["tag"] == label:
            protein = temp_protein
            break
    else:
        # Without this guard a missing tag would silently reuse the protein
        # left over from the previous iteration.
        raise ValueError("Protein {} not found in {}".format(label, path_to_xyz_file))
    positions = protein.arrays["positions"]

    # Full pairwise distance matrix between the protein's atoms.
    D = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(positions))
    non_zero_heme_counts = get_number_of_non_zero_counts(D, heme_counts, k=10)
    non_zero_nucleo_counts = get_number_of_non_zero_counts(D, nucleo_counts, k=10)

    # The counts are integers, so add a little vertical jitter by hand to
    # separate overlapping points.
    hemes = [[x, y + np.random.random() * 0.05, "hemes"]
             for x, y in zip(distance_from_nearest_ligand_atom, non_zero_heme_counts)]
    nucleos = [[x, y + np.random.random() * 0.05, "nucleos"]
               for x, y in zip(distance_from_nearest_ligand_atom, non_zero_nucleo_counts)]
    df = pd.DataFrame(hemes + nucleos, columns=["distance", "similarity", "class"])
    fig, ax = plt.subplots(figsize=(10, 10))
    # scatterplot's own y_jitter option is documented as non-functional, so it is
    # no longer passed; the manual jitter above does the work.
    sns.scatterplot(x="distance", y="similarity", hue="class", data=df, alpha=0.5, ax=ax)
    plt.savefig("{}.png".format(label))
    plt.show()
    plt.close(fig)

    

In [None]:
# Spatially average every score over the filtered dataset (get_spatial_average's
# default k), plot the counts, and write the .dat / PDB outputs.
path_to_hdf5_file = "./filtered/r10xi2sigma1/samples_filtered.xyz.hdf5"
path_to_xyz_file = "./filtered/r10xi2sigma1/samples_filtered.xyz"
# ``Dataset.value`` was removed in h5py 3.0; ``[()]`` reads the whole dataset on
# every version.  The context manager closes the file once the labels are in memory.
with h5py.File(path_to_hdf5_file, "r") as dataFile:
    labels = dataFile["labels"]["label_mat"][()]
atom_reader = quippy.AtomsReader(path_to_xyz_file, format="xyz")

for index, label in enumerate(labels):
    label = label[1]
    output_dict = generateAverageAndCounterSimilarityAnyHetATM("./filtered/r10xi2sigma1/basematrices",
                                                               path_to_xyz_file,
                                                               labels,
                                                               index)

    heme_means = output_dict["heme_means"]
    nucleo_means = output_dict["nucleo_means"]
    heme_counts = output_dict["heme_counts"]
    nucleo_counts = output_dict["nucleo_counts"]
    distance_from_nearest_ligand_atom = output_dict["distance_from_nearest_ligand_atom"]
    distance_from_centroid = output_dict["distance_from_centroid"]

    # Get the spatial average of all the above quantities
    # Crude, but quick enough for now
    for temp_protein in atom_reader:
        if temp_protein.params["tag"] == label:
            protein = temp_protein
            break
    else:
        # Without this guard a missing tag would silently reuse the protein
        # left over from the previous iteration.
        raise ValueError("Protein {} not found in {}".format(label, path_to_xyz_file))
    positions = protein.arrays["positions"]

    # Full pairwise distance matrix between the protein's atoms.
    D = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(positions))

    spatial_average_heme_counts = get_spatial_average(D, heme_counts, sigma=10)
    spatial_average_nucleo_counts = get_spatial_average(D, nucleo_counts, sigma=10)
    spatial_average_heme_means = get_spatial_average(D, heme_means, sigma=10)
    spatial_average_nucleo_means = get_spatial_average(D, nucleo_means, sigma=10)

    hemes = [[x, y, "hemes"] for x, y in zip(distance_from_nearest_ligand_atom, spatial_average_heme_counts)]
    nucleos = [[x, y, "nucleos"] for x, y in zip(distance_from_nearest_ligand_atom, spatial_average_nucleo_counts)]
    df = pd.DataFrame(hemes + nucleos, columns=["distance", "similarity", "class"])

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.scatterplot(x="distance", y="similarity", hue="class", data=df, ax=ax)
    plt.show()
    plt.close(fig)

    # One row per atom; the column order must match ``headers``.
    headers = ["heme_means_averaged", "nucleo_means_averaged", "heme_counts_averaged", "nucleo_counts_averaged",
               "nearest_distance", "centroid_distance"]
    output_data = zip(spatial_average_heme_means, spatial_average_nucleo_means,
                      spatial_average_heme_counts, spatial_average_nucleo_counts,
                      distance_from_nearest_ligand_atom, distance_from_centroid)

    if not os.path.exists("outputs"):
        os.mkdir("outputs")
    with open("outputs/{}.dat".format(label), "w") as flines:
        flines.write(" ".join(headers) + "\n")
        for row in output_data:
            row = [round(x, 4) for x in row]
            flines.write(" ".join(map(str, row)) + "\n")

    setBFactorAndOutputPDBFile(protein, spatial_average_heme_means, "outputs/{}_heme_means.pdb".format(label))
    setBFactorAndOutputPDBFile(protein, spatial_average_nucleo_means, "outputs/{}_nucleo_means.pdb".format(label))
    setBFactorAndOutputPDBFile(protein, spatial_average_heme_counts, "outputs/{}_heme_counts.pdb".format(label))
    setBFactorAndOutputPDBFile(protein, spatial_average_nucleo_counts, "outputs/{}_nucleo_counts.pdb".format(label))


In [None]:
!ls outputs

In [None]:
!mv outputs outputs_filtered_spatial_averaging_k1000/

## 1000 is too much, it's all washed out

In [None]:
# Spatially average every score over the filtered dataset with k=100, plot the
# counts, and write the .dat / PDB outputs.
path_to_hdf5_file = "./filtered/r10xi2sigma1/samples_filtered.xyz.hdf5"
path_to_xyz_file = "./filtered/r10xi2sigma1/samples_filtered.xyz"
# ``Dataset.value`` was removed in h5py 3.0; ``[()]`` reads the whole dataset on
# every version.  The context manager closes the file once the labels are in memory.
with h5py.File(path_to_hdf5_file, "r") as dataFile:
    labels = dataFile["labels"]["label_mat"][()]
atom_reader = quippy.AtomsReader(path_to_xyz_file, format="xyz")

for index, label in enumerate(labels):
    label = label[1]
    output_dict = generateAverageAndCounterSimilarityAnyHetATM("./filtered/r10xi2sigma1/basematrices",
                                                               path_to_xyz_file,
                                                               labels,
                                                               index)

    heme_means = output_dict["heme_means"]
    nucleo_means = output_dict["nucleo_means"]
    heme_counts = output_dict["heme_counts"]
    nucleo_counts = output_dict["nucleo_counts"]
    distance_from_nearest_ligand_atom = output_dict["distance_from_nearest_ligand_atom"]
    distance_from_centroid = output_dict["distance_from_centroid"]

    # Get the spatial average of all the above quantities
    # Crude, but quick enough for now
    for temp_protein in atom_reader:
        if temp_protein.params["tag"] == label:
            protein = temp_protein
            break
    else:
        # Without this guard a missing tag would silently reuse the protein
        # left over from the previous iteration.
        raise ValueError("Protein {} not found in {}".format(label, path_to_xyz_file))
    positions = protein.arrays["positions"]

    # Full pairwise distance matrix between the protein's atoms.
    D = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(positions))

    spatial_average_heme_counts = get_spatial_average(D, heme_counts, sigma=10, k=100)
    spatial_average_nucleo_counts = get_spatial_average(D, nucleo_counts, sigma=10, k=100)
    spatial_average_heme_means = get_spatial_average(D, heme_means, sigma=10, k=100)
    spatial_average_nucleo_means = get_spatial_average(D, nucleo_means, sigma=10, k=100)

    hemes = [[x, y, "hemes"] for x, y in zip(distance_from_nearest_ligand_atom, spatial_average_heme_counts)]
    nucleos = [[x, y, "nucleos"] for x, y in zip(distance_from_nearest_ligand_atom, spatial_average_nucleo_counts)]
    df = pd.DataFrame(hemes + nucleos, columns=["distance", "similarity", "class"])

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.scatterplot(x="distance", y="similarity", hue="class", data=df, ax=ax)
    plt.show()
    plt.close(fig)

    # One row per atom; the column order must match ``headers``.
    headers = ["heme_means_averaged", "nucleo_means_averaged", "heme_counts_averaged", "nucleo_counts_averaged",
               "nearest_distance", "centroid_distance"]
    output_data = zip(spatial_average_heme_means, spatial_average_nucleo_means,
                      spatial_average_heme_counts, spatial_average_nucleo_counts,
                      distance_from_nearest_ligand_atom, distance_from_centroid)

    if not os.path.exists("outputs"):
        os.mkdir("outputs")
    with open("outputs/{}.dat".format(label), "w") as flines:
        flines.write(" ".join(headers) + "\n")
        for row in output_data:
            row = [round(x, 4) for x in row]
            flines.write(" ".join(map(str, row)) + "\n")

    setBFactorAndOutputPDBFile(protein, spatial_average_heme_means, "outputs/{}_heme_means.pdb".format(label))
    setBFactorAndOutputPDBFile(protein, spatial_average_nucleo_means, "outputs/{}_nucleo_means.pdb".format(label))
    setBFactorAndOutputPDBFile(protein, spatial_average_heme_counts, "outputs/{}_heme_counts.pdb".format(label))
    setBFactorAndOutputPDBFile(protein, spatial_average_nucleo_counts, "outputs/{}_nucleo_counts.pdb".format(label))


In [None]:
!ls outputs/

In [None]:
!mv outputs outputs_filtered_spatial_averaging_k100/

### 100 is too much, it's all washed out

In [None]:
# Spatially average every score over the filtered dataset with k=50, plot the
# counts, and write the .dat / PDB outputs.
path_to_hdf5_file = "./filtered/r10xi2sigma1/samples_filtered.xyz.hdf5"
path_to_xyz_file = "./filtered/r10xi2sigma1/samples_filtered.xyz"
# ``Dataset.value`` was removed in h5py 3.0; ``[()]`` reads the whole dataset on
# every version.  The context manager closes the file once the labels are in memory.
with h5py.File(path_to_hdf5_file, "r") as dataFile:
    labels = dataFile["labels"]["label_mat"][()]
atom_reader = quippy.AtomsReader(path_to_xyz_file, format="xyz")

for index, label in enumerate(labels):
    label = label[1]
    output_dict = generateAverageAndCounterSimilarityAnyHetATM("./filtered/r10xi2sigma1/basematrices",
                                                               path_to_xyz_file,
                                                               labels,
                                                               index)

    heme_means = output_dict["heme_means"]
    nucleo_means = output_dict["nucleo_means"]
    heme_counts = output_dict["heme_counts"]
    nucleo_counts = output_dict["nucleo_counts"]
    distance_from_nearest_ligand_atom = output_dict["distance_from_nearest_ligand_atom"]
    distance_from_centroid = output_dict["distance_from_centroid"]

    # Get the spatial average of all the above quantities
    # Crude, but quick enough for now
    for temp_protein in atom_reader:
        if temp_protein.params["tag"] == label:
            protein = temp_protein
            break
    else:
        # Without this guard a missing tag would silently reuse the protein
        # left over from the previous iteration.
        raise ValueError("Protein {} not found in {}".format(label, path_to_xyz_file))
    positions = protein.arrays["positions"]

    # Full pairwise distance matrix between the protein's atoms.
    D = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(positions))

    spatial_average_heme_counts = get_spatial_average(D, heme_counts, sigma=10, k=50)
    spatial_average_nucleo_counts = get_spatial_average(D, nucleo_counts, sigma=10, k=50)
    spatial_average_heme_means = get_spatial_average(D, heme_means, sigma=10, k=50)
    spatial_average_nucleo_means = get_spatial_average(D, nucleo_means, sigma=10, k=50)

    hemes = [[x, y, "hemes"] for x, y in zip(distance_from_nearest_ligand_atom, spatial_average_heme_counts)]
    nucleos = [[x, y, "nucleos"] for x, y in zip(distance_from_nearest_ligand_atom, spatial_average_nucleo_counts)]
    df = pd.DataFrame(hemes + nucleos, columns=["distance", "similarity", "class"])

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.scatterplot(x="distance", y="similarity", hue="class", data=df, ax=ax)
    plt.show()
    plt.close(fig)

    # One row per atom; the column order must match ``headers``.
    headers = ["heme_means_averaged", "nucleo_means_averaged", "heme_counts_averaged", "nucleo_counts_averaged",
               "nearest_distance", "centroid_distance"]
    output_data = zip(spatial_average_heme_means, spatial_average_nucleo_means,
                      spatial_average_heme_counts, spatial_average_nucleo_counts,
                      distance_from_nearest_ligand_atom, distance_from_centroid)

    if not os.path.exists("outputs"):
        os.mkdir("outputs")
    with open("outputs/{}.dat".format(label), "w") as flines:
        flines.write(" ".join(headers) + "\n")
        for row in output_data:
            row = [round(x, 4) for x in row]
            flines.write(" ".join(map(str, row)) + "\n")

    setBFactorAndOutputPDBFile(protein, spatial_average_heme_means, "outputs/{}_heme_means.pdb".format(label))
    setBFactorAndOutputPDBFile(protein, spatial_average_nucleo_means, "outputs/{}_nucleo_means.pdb".format(label))
    setBFactorAndOutputPDBFile(protein, spatial_average_heme_counts, "outputs/{}_heme_counts.pdb".format(label))
    setBFactorAndOutputPDBFile(protein, spatial_average_nucleo_counts, "outputs/{}_nucleo_counts.pdb".format(label))


In [None]:
def get_spatial_average_bool(distance_matrix, score_vector, sigma=10, k=1000):
    """
    Distance-weighted fraction of each atom's nearest neighbours with a non-zero score.

    Only makes sense for the "counts" - this is the distance-weighted version of
    the non-zero counts.  For every atom i the k nearest atoms (by row i of
    ``distance_matrix``) contribute 1 if their score is truthy and 0 otherwise,
    with weight exp(-distance / sigma); the result is the weighted mean.

    Parameters:
        distance_matrix: square (n, n) array-like of pairwise distances.
        score_vector: length-n sequence of scores; only their truthiness is used.
        sigma: decay length of the exponential weight.
        k: number of nearest atoms to include; if k exceeds n, all atoms are used.

    Returns:
        list of n weighted averages, in atom order.
    """
    distances = np.asarray(distance_matrix, dtype=float)
    # Collapse every score to 0.0 / 1.0 before averaging.
    is_non_zero = np.asarray(score_vector).astype(bool).astype(float)

    # argsort each row to get a list of which points are closest for each atom
    k_nearest_atoms = np.argsort(distances, axis=1)[:, 0:k]

    # Pick out, for each row i, the distances to that row's k nearest atoms.
    row_index = np.arange(distances.shape[0])[:, np.newaxis]
    k_nearest_distances = distances[row_index, k_nearest_atoms]

    weights = np.exp(-k_nearest_distances / sigma)
    weighted_flags = weights * is_non_zero[k_nearest_atoms]
    spatial_averages = weighted_flags.sum(axis=1) / weights.sum(axis=1)

    assert len(spatial_averages) == len(is_non_zero)
    return list(spatial_averages)


In [None]:
!mv outputs outputs_filtered_spatial_averaging_k50

In [None]:
# Boolean (non-zero) spatial averaging over the filtered dataset with k=50; plot
# the counts and write the .dat / PDB outputs.
path_to_hdf5_file = "./filtered/r10xi2sigma1/samples_filtered.xyz.hdf5"
path_to_xyz_file = "./filtered/r10xi2sigma1/samples_filtered.xyz"
# ``Dataset.value`` was removed in h5py 3.0; ``[()]`` reads the whole dataset on
# every version.  The context manager closes the file once the labels are in memory.
with h5py.File(path_to_hdf5_file, "r") as dataFile:
    labels = dataFile["labels"]["label_mat"][()]
atom_reader = quippy.AtomsReader(path_to_xyz_file, format="xyz")

for index, label in enumerate(labels):
    label = label[1]
    output_dict = generateAverageAndCounterSimilarityAnyHetATM("./filtered/r10xi2sigma1/basematrices",
                                                               path_to_xyz_file,
                                                               labels,
                                                               index)

    heme_means = output_dict["heme_means"]
    nucleo_means = output_dict["nucleo_means"]
    heme_counts = output_dict["heme_counts"]
    nucleo_counts = output_dict["nucleo_counts"]
    distance_from_nearest_ligand_atom = output_dict["distance_from_nearest_ligand_atom"]
    distance_from_centroid = output_dict["distance_from_centroid"]

    # Get the spatial average of all the above quantities
    # Crude, but quick enough for now
    for temp_protein in atom_reader:
        if temp_protein.params["tag"] == label:
            protein = temp_protein
            break
    else:
        # Without this guard a missing tag would silently reuse the protein
        # left over from the previous iteration.
        raise ValueError("Protein {} not found in {}".format(label, path_to_xyz_file))
    positions = protein.arrays["positions"]

    # Full pairwise distance matrix between the protein's atoms.
    D = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(positions))

    spatial_average_heme_counts = get_spatial_average_bool(D, heme_counts, sigma=10, k=50)
    spatial_average_nucleo_counts = get_spatial_average_bool(D, nucleo_counts, sigma=10, k=50)
    # NOTE(review): the boolean average is meant for counts; the means are
    # presumably non-zero almost everywhere, so these two come out close to 1.
    # Kept so the output files retain the same columns -- confirm this is wanted.
    spatial_average_heme_means = get_spatial_average_bool(D, heme_means, sigma=10, k=50)
    spatial_average_nucleo_means = get_spatial_average_bool(D, nucleo_means, sigma=10, k=50)

    hemes = [[x, y, "hemes"] for x, y in zip(distance_from_nearest_ligand_atom, spatial_average_heme_counts)]
    nucleos = [[x, y, "nucleos"] for x, y in zip(distance_from_nearest_ligand_atom, spatial_average_nucleo_counts)]
    df = pd.DataFrame(hemes + nucleos, columns=["distance", "similarity", "class"])

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.scatterplot(x="distance", y="similarity", hue="class", data=df, ax=ax)
    plt.show()
    plt.close(fig)

    # One row per atom; the column order must match ``headers``.
    headers = ["heme_means_averaged", "nucleo_means_averaged", "heme_counts_averaged", "nucleo_counts_averaged",
               "nearest_distance", "centroid_distance"]
    output_data = zip(spatial_average_heme_means, spatial_average_nucleo_means,
                      spatial_average_heme_counts, spatial_average_nucleo_counts,
                      distance_from_nearest_ligand_atom, distance_from_centroid)

    if not os.path.exists("outputs"):
        os.mkdir("outputs")
    with open("outputs/{}.dat".format(label), "w") as flines:
        flines.write(" ".join(headers) + "\n")
        for row in output_data:
            row = [round(x, 4) for x in row]
            flines.write(" ".join(map(str, row)) + "\n")

    setBFactorAndOutputPDBFile(protein, spatial_average_heme_means, "outputs/{}_heme_means.pdb".format(label))
    setBFactorAndOutputPDBFile(protein, spatial_average_nucleo_means, "outputs/{}_nucleo_means.pdb".format(label))
    setBFactorAndOutputPDBFile(protein, spatial_average_heme_counts, "outputs/{}_heme_counts.pdb".format(label))
    setBFactorAndOutputPDBFile(protein, spatial_average_nucleo_counts, "outputs/{}_nucleo_counts.pdb".format(label))


In [None]:
!mv outputs outputs_filtered_boolspatial_k50

# Re-done with new PDB files

We found that the old XYZ files were missing atoms

In [None]:
# Spatially average every score over the reparsed dataset with k=50, plot the
# counts, and write the .dat / PDB outputs.
path_to_hdf5_file = "./reparsed/r10xi2sigma1/reparsed_proteinstructures.xyz.hdf5"
path_to_xyz_file = "./reparsed/r10xi2sigma1/reparsed_proteinstructures.xyz"
# ``Dataset.value`` was removed in h5py 3.0; ``[()]`` reads the whole dataset on
# every version.  The context manager closes the file once the labels are in memory.
with h5py.File(path_to_hdf5_file, "r") as dataFile:
    labels = dataFile["labels"]["label_mat"][()]
atom_reader = quippy.AtomsReader(path_to_xyz_file, format="xyz")

for index, label in enumerate(labels):
    label = label[1]
    try:
        output_dict = generateAverageAndCounterSimilarityAnyHetATM("./reparsed/r10xi2sigma1/basematrices",
                                                                   path_to_xyz_file,
                                                                   labels,
                                                                   index)
    except AssertionError:
        # The scoring function asserts that its arrays match the structure;
        # report the skipped protein instead of dropping it silently.
        print("Skipping {}: failed a consistency check".format(label))
        continue
    heme_means = output_dict["heme_means"]
    nucleo_means = output_dict["nucleo_means"]
    heme_counts = output_dict["heme_counts"]
    nucleo_counts = output_dict["nucleo_counts"]
    distance_from_nearest_ligand_atom = output_dict["distance_from_nearest_ligand_atom"]
    distance_from_centroid = output_dict["distance_from_centroid"]

    # Get the spatial average of all the above quantities
    # Crude, but quick enough for now
    for temp_protein in atom_reader:
        if temp_protein.params["tag"] == label:
            protein = temp_protein
            break
    else:
        # Without this guard a missing tag would silently reuse the protein
        # left over from the previous iteration.
        raise ValueError("Protein {} not found in {}".format(label, path_to_xyz_file))
    positions = protein.arrays["positions"]

    # Full pairwise distance matrix between the protein's atoms.
    D = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(positions))

    spatial_average_heme_counts = get_spatial_average(D, heme_counts, sigma=10, k=50)
    spatial_average_nucleo_counts = get_spatial_average(D, nucleo_counts, sigma=10, k=50)
    spatial_average_heme_means = get_spatial_average(D, heme_means, sigma=10, k=50)
    spatial_average_nucleo_means = get_spatial_average(D, nucleo_means, sigma=10, k=50)

    hemes = [[x, y, "hemes"] for x, y in zip(distance_from_nearest_ligand_atom, spatial_average_heme_counts)]
    nucleos = [[x, y, "nucleos"] for x, y in zip(distance_from_nearest_ligand_atom, spatial_average_nucleo_counts)]
    df = pd.DataFrame(hemes + nucleos, columns=["distance", "similarity", "class"])

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.scatterplot(x="distance", y="similarity", hue="class", data=df, ax=ax)
    plt.show()
    plt.close(fig)

    # One row per atom; the column order must match ``headers``.
    headers = ["heme_means_averaged", "nucleo_means_averaged", "heme_counts_averaged", "nucleo_counts_averaged",
               "nearest_distance", "centroid_distance"]
    output_data = zip(spatial_average_heme_means, spatial_average_nucleo_means,
                      spatial_average_heme_counts, spatial_average_nucleo_counts,
                      distance_from_nearest_ligand_atom, distance_from_centroid)

    if not os.path.exists("outputs"):
        os.mkdir("outputs")
    with open("outputs/{}.dat".format(label), "w") as flines:
        flines.write(" ".join(headers) + "\n")
        for row in output_data:
            row = [round(x, 4) for x in row]
            flines.write(" ".join(map(str, row)) + "\n")

    setBFactorAndOutputPDBFile(protein, spatial_average_heme_means, "outputs/{}_heme_means.pdb".format(label))
    setBFactorAndOutputPDBFile(protein, spatial_average_nucleo_means, "outputs/{}_nucleo_means.pdb".format(label))
    setBFactorAndOutputPDBFile(protein, spatial_average_heme_counts, "outputs/{}_heme_counts.pdb".format(label))
    setBFactorAndOutputPDBFile(protein, spatial_average_nucleo_counts, "outputs/{}_nucleo_counts.pdb".format(label))


!mv outputs outputs_reparsed_spatial_k50