In [3]:
"""
Use the level0, level1, level2 of the CATH hierarchy data to explore
how the SOAP descriptors change for different functional groupings
"""

%matplotlib inline
import ast

import ase
import ase.io
import matplotlib.pyplot as plt
import numpy as np
import palettable
import quippy
import requests
import seaborn as sns

def plotKernelMatrix(inputArray):
    """Render a kernel matrix as a heatmap and display it.

    Parameters
    ----------
    inputArray : array-like
        Table of values convertible to float (numbers or numeric strings,
        e.g. the whitespace-split rows of a kernel file).

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # numpy, seaborn, pyplot and palettable come from the notebook's import
    # cell; re-importing them on every call was redundant.
    data = np.array(inputArray, dtype=float)
    cmap = palettable.cubehelix.purple_16.mpl_colormap
    # The colour scale is fixed to the range [0, 1].
    sns.heatmap(data, vmin=0, vmax=1, cmap=cmap)
    plt.show()

In [4]:
# Read the level-1 kernel file, skipping its first line, and show the heatmap.
with open("testproteins/level1-n8-l6-c5.0-g0.5_rematch-1.0.k") as flines:
    next(flines, None)
    # split() without arguments already ignores leading/trailing whitespace
    data = [line.split() for line in flines]

plotKernelMatrix(data)

In [22]:
# quickly rattle off the level 0 values
with open("testproteins/level1.dat") as flines:
    inputProteins = eval(flines.read())
    print(inputProteins)

listOfAtoms = []
for proteinId in inputProteins:
    # Pull the pdb file
    pdb = proteinId[:4]
    chain = proteinId[4]
    print(pdb)
    url = "http://www.rcsb.org/pdb/files/{}.pdb".format(pdb)
    data = requests.get(url).text.split("\n")
    newData = []
    # Trim so it's single-chain
    for line in data:
        if line[:4] != "ATOM" or (line[:4] == "ATOM" and line[21] == chain):
            newData.append(line)
    with open("temp.pdb", 'w') as outflines:
        outflines.write("\n".join(newData))        
    #Create an Atoms object
    protein = quippy.Atoms(ase.io.read("temp.pdb", format='proteindatabank'))
    listOfAtoms.append(protein)

# Make the AtomsList
listOfAtoms = quippy.AtomsList(listOfAtoms)
# Write an xyz file
listOfAtoms.write("temp.xyz")
# run glosim
!python /usr/local/src/glosim/glosim.py /root/temp.xyz --kernel rematch --prefix level0
# save output?

In [5]:
# How best to compare? Histogram?
# Read the level-0 kernel file (first line skipped) and show it as a heatmap.
with open("level0-n8-l6-c5.0-g0.5_rematch-1.0.k") as flines:
    next(flines, None)
    # split() without arguments already ignores leading/trailing whitespace
    data = [line.split() for line in flines]

plotKernelMatrix(data)

In [6]:

def plotKernelMatrixValues(inputArray):
    """Plot a kernel density estimate of the values above the diagonal.

    The input is converted to a float array; the entries strictly above the
    main diagonal are flattened and passed to ``sns.kdeplot``. The x axis is
    limited to [0, 1] and labelled "Similarity".

    Returns the axes object produced by ``sns.kdeplot`` so the caller can
    add a title before showing the figure.
    """
    sns.set()
    matrix = np.array(inputArray, dtype=float)
    # k=1 starts one above the main diagonal, so the diagonal is excluded
    rowIdx, colIdx = np.triu_indices(len(matrix), k=1)
    upperValues = matrix[rowIdx, colIdx]
    ax = sns.kdeplot(upperValues)
    ax.set_xlim([0,1])
    ax.set_xlabel("Similarity")
    return ax
    

In [7]:
# Density of the level-1 similarity values (first line of the file skipped).
with open("testproteins/level1-n8-l6-c5.0-g0.5_rematch-1.0.k") as flines:
    next(flines, None)
    # split() without arguments already ignores leading/trailing whitespace
    data = [line.split() for line in flines]

ax = plotKernelMatrixValues(data)
ax.set_title("GLOSIM histogram level 1")
plt.show()

In [8]:
# strip infs
def stripInfs(inputArray):
    """Remove rows whose first entry is not finite, plus the matching columns.

    Parameters
    ----------
    inputArray : array-like
        Two-dimensional table of values convertible to float (numbers or
        numeric strings).

    Returns
    -------
    numpy.ndarray
        Float array with every row ``i`` whose first element is inf or NaN
        removed, together with column ``i``. An empty input is returned as
        an empty array.

    Notes
    -----
    Only the first column is inspected, as in the original implementation.
    All offending indices are computed before anything is deleted: deleting
    inside a loop over the original array shifted the later indices, which
    removed the wrong rows or raised IndexError when more than one row was
    affected.
    """
    data = np.array(inputArray, dtype=float)
    if data.size == 0:
        return data
    bad = np.flatnonzero(~np.isfinite(data[:, 0]))
    data = np.delete(data, bad, axis=0)
    # Guard the column delete so a non-square input cannot index past the end
    data = np.delete(data, bad[bad < data.shape[1]], axis=1)
    return data

In [19]:

# Overlay the similarity densities of CATH levels 0 and 1 on a single axes.
sns.set()
sns.set_context("poster")

fig, ax = plt.subplots(figsize=(10,10))

kernelPathTemplate = "testproteins/level{}-n8-l6-c5.0-g0.5_rematch-1.0.k"
for i in (0, 1):
    with open(kernelPathTemplate.format(i)) as flines:
        # the first line of each kernel file is skipped
        next(flines, None)
        data = stripInfs([line.split() for line in flines])
    # entries strictly above the diagonal, as a 1d array
    listOfElements = data[np.triu_indices(len(data), k=1)]
    sns.kdeplot(listOfElements, ax=ax, label="Level {}".format(i))

ax.set_xlim([0,1])
ax.set_xlabel("Similarity")
ax.set_title("GLOSIM similarity for two CATH levels")
plt.show()