In [None]:
"""
allToAllGLOSIM.ipynb
Given a list of pdbrefs and chainrefs, perform the community detection on the relevant files. 
Write out a directory of pdb fragments (that code already exists somewhere). 
^^ Do this outside the container, as proteinnetworks is in Python 3
Remove everything that isn't an alpha-C.
Write the whole set of fragments to a single xyz file.
Run glosim on the xyz file.
Run the similarities, perform hierarchical clustering
Check the match to SCOP.
"""
import quippy
import ase
import palettable
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from ase.atoms import Atoms as AseAtoms
%matplotlib inline


In [None]:
# Load every fragment listed in scopFragments.txt as a quippy Atoms object.
# ase.io is a submodule: import it explicitly instead of relying on `import ase`
# (or another package) having loaded it as a side effect.
import ase.io

with open("scopFragments.txt") as flines:
    # One fragment file name per line, resolved relative to scopFragments/.
    # Blank lines are skipped so a stray empty line cannot turn into the bare
    # directory path "scopFragments/".
    scopPaths = ["scopFragments/" + line.strip() for line in flines if line.strip()]

# Each file is parsed by ASE as a PDB file and then wrapped in quippy.Atoms.
proteins = [
    quippy.Atoms(ase.io.read(proteinPath, format='proteindatabank'))
    for proteinPath in scopPaths
]


In [None]:
# Bundle all loaded fragments into one quippy AtomsList and write them out as a
# single file named scopFamily2.xyz in the current working directory.
scopFamily = quippy.AtomsList(proteins)
scopFamily.write("scopFamily2.xyz")


In [None]:
# Run glosim.py (rematch kernel) on the scopFamily2.xyz file.
# NOTE(review): the input is given as an absolute path under /root/allToAllGLOSIMCheck,
# while the xyz file is written and the .k results are read with relative paths —
# this presumably only lines up when the notebook runs from that directory; confirm.
# The option values (n10, l10, c15.0, g1.5, rematch-0.01) reappear in the name of the
# .k file read later, so changing them here means changing that file name as well.
!python /usr/local/src/glosim/glosim.py --kernel rematch -n 10 -l 10 -c 15 -g 1.5 --gamma 0.01 --np 4 /root/allToAllGLOSIMCheck/scopFamily2.xyz # Choose parameters carefully

In [None]:
def plotKernelMatrix(inputArray):
    """Print a kernel matrix and show it as a heatmap.

    The input is converted to a float NumPy array, printed in full, and drawn
    with seaborn using the palettable cubehelix "purple_16" colormap on a
    colour scale fixed to the range 0..1.

    Parameters
    ----------
    inputArray : array-like
        Values convertible to a float array (passed to ``np.array``).

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    kernelValues = np.array(inputArray, dtype=float)
    print(kernelValues)

    # Fixed limits keep the colours comparable between different matrices.
    heatmapOptions = dict(
        vmin=0,
        vmax=1,
        cmap=palettable.cubehelix.purple_16.mpl_colormap,
    )
    sns.heatmap(kernelValues, **heatmapOptions)
    plt.show()
    


In [None]:
# Read the kernel file for "scopFamily": the first line is skipped (presumably a
# header — TODO confirm) and every remaining line is split on whitespace into
# one row of floats.
with open("scopFamily-n10-l10-c15.0-g1.5_rematch-0.01.k") as flines:
    glosimData = np.asarray(
        [row.split() for row in flines.readlines()[1:]],
        dtype=float,
    )

# np.nan_to_num replaces NaN entries with 0.0 so the heatmap has no holes.
glosimData = np.nan_to_num(glosimData)
plotKernelMatrix(glosimData)


In [None]:
# Read the kernel file for "scopFamily2", dropping its first line (presumably a
# header — TODO confirm).
with open("scopFamily2-n10-l10-c15.0-g1.5_rematch-0.01.k") as flines:
    glosimData2 = flines.readlines()[1:]

# Whitespace-separated fields on each remaining line become one matrix row.
glosimData2 = [row.split() for row in glosimData2]

# Convert to floats and replace NaN entries (np.nan_to_num maps them to 0.0).
glosimData2 = np.nan_to_num(np.asarray(glosimData2, dtype=float))
plotKernelMatrix(glosimData2)


In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [None]:

# Cluster map of the unlabelled kernel matrix with seaborn's default settings;
# the returned grid object is kept in g.
g = sns.clustermap(data=glosimData2)

In [None]:
# Label the kernel matrix with the fragment names so plots and tables are readable.
with open("scopFragments.txt") as flines:
    # The label is the text before the first "." of each entry. Blank lines are
    # skipped so they cannot produce empty labels.
    proteinPaths = [line.strip().split(".")[0] for line in flines if line.strip()]

# Fail early, with a readable message, if the labels and the matrix are out of
# step (for example a kernel file computed from a different fragment list).
if glosimData2.shape != (len(proteinPaths), len(proteinPaths)):
    raise ValueError(
        "Kernel matrix has shape %s but scopFragments.txt lists %d fragments"
        % (glosimData2.shape, len(proteinPaths))
    )

# Row position -> label lookup, kept because later cells may still refer to it.
dictSwap = dict(enumerate(proteinPaths))

# Use the same labels on both axes directly instead of renaming the index in place.
df = pd.DataFrame(glosimData2, index=proteinPaths, columns=proteinPaths)
df.head()


In [None]:
# Cluster map of the labelled DataFrame on a 15x15 figure; g is rebound to this
# grid, so later uses of g refer to the labelled plot.
g = sns.clustermap(
    data=df,
    figsize=(15, 15),
    yticklabels="auto",
)

In [None]:
# Save the current cluster map in both a PDF and a PNG version at 300 dpi.
for outputPath in ("scopClustering.pdf", "scopClustering.png"):
    g.savefig(outputPath, dpi=300)

In [None]:
# Display the linkage stored on the column dendrogram of the cluster map g.
g.dendrogram_col.linkage

In [None]:
from scipy.cluster import hierarchy

# Average-linkage hierarchical clustering of the rows of df. The metric is
# spelled out here; 'euclidean' is the value scipy uses when none is given.
row_linkage = hierarchy.linkage(df, method='average', metric='euclidean')


In [None]:
# Print the linkage matrix returned by hierarchy.linkage for inspection.
print(row_linkage)

In [None]:
# Draw the raw linkage matrix as a heatmap using seaborn's default settings.
sns.heatmap(row_linkage)

In [None]:
import sys

# Cut the hierarchy into flat clusters: one cluster id per row of df, using
# threshold t=1 with fcluster's default criterion.
fcluster = hierarchy.fcluster(row_linkage, t=1)

# Print the complete label array without "..." truncation. threshold=np.nan is
# rejected by current NumPy (ValueError); sys.maxsize is the supported way to
# ask for an untruncated representation and also works on older releases.
np.set_printoptions(threshold=sys.maxsize)
print(fcluster)

In [None]:
# NOTE(review): looks like leftover scratch code — under Python 3 this prints an
# empty line and the cell evaluates to [None]; confirm whether it can be removed.
[print()]