# Import DLChem and relevant packages

In [1]:
'''
The following is the Master program which runs all routines
concerned with analyzing SchNet/QM9 data

NOTE: the program (especially the labelling part) is designed
for Linux systems. It may work on Windows, but you have to ensure
that the first part of the label stays the same: ../../../

Examples are shown below for how to 1) train SchNet, 2) run inputs
through a trained SchNet and extract layers, 3) label the QM9 dataset
per atom according to the functional group it is in, 4) run the PCA
and t-SNE routines on data, and more
'''

from numpy import genfromtxt
import pandas as pd
import numpy as np

from schnetpack.datasets import QM9
from schnetpack import AtomsData

from DLChem import labeldataset

from DLChem import dataanalysis

from DLChem.utils import utils

# Load the dataset 

In [None]:
# Open the QM9 database that every later cell reads through `qm9data`.
# NOTE(review): with download=False the file at this relative path
# presumably has to exist already -- confirm against the schnetpack docs.
qm9data = QM9(
    '../../data/datasets/QM9/qm9.db',
    download=False,
    remove_uncharacterized=True,
)

# Label functional groups (F.G.) with reference to an element

In [None]:
## Example: running the labelling code for a single element
dataset_name = 'qm9'
element = 'H'
# NOTE(review): number_data is presumably how many entries get labelled;
# confirm in labeldataset.label.
number_data = 2950

# The output directory and file name are both derived from the settings above.
label_dir = '../../data/label%s/%s/' % (dataset_name, element)
label_name = '%slabel%s.csv' % (dataset_name, element + str(number_data))

labeldataset.label(
    qm9data,
    label_dir,
    label_name,
    number_data,
    element,
)


# Label file dictionary

In [None]:
# Look up the labels stored for one element in an existing label file.
element = 'O'
label_file = '../../data/labelqm9/O/qm9labelO5000.csv'

# Kept as a bare expression at the end of the cell so the notebook
# displays whatever get_labels returns.
utils.get_labels(label_file, element)

# PCA on data

In [None]:
# Example: running the PCA routine on a stored representation file
# (the path is given without a file extension).
scale_data = False
n_components = 30
data_file_path = '../../data/schnet/all/repqm9energy10000-30all5000'

# NOTE(review): judging by the names, the four results are the projected
# data, eigenvalues, eigenvectors and covariance matrix -- confirm in
# dataanalysis.pca.
(x_pca, eig, ev, cov) = dataanalysis.pca(
    data_file_path,
    n_components,
    scale_data,
)


# t-SNE on data

In [None]:
# Example: running the t-SNE routine on the same representation file.
data_file_path = '../../data/schnet/all/repqm9energy10000-30all5000'
# NOTE(review): `perp` is presumably the t-SNE perplexity and `dimension`
# the size of the embedding -- confirm in dataanalysis.tsne.
perp = 300
dimension = 2

x_tsne = dataanalysis.tsne(
    data_file_path,
    dimension,
    perp,
)

In [None]:
from numpy import savetxt

# Write the t-SNE result as CSV; the output name is the input path with
# 'tsne100.csv' appended. Relies on data_file_path and x_tsne from the
# t-SNE cell.
# NOTE(review): the '100' in the file name does not match perp = 300 used
# in the t-SNE cell -- confirm which value the suffix is meant to record.
savetxt(
    data_file_path + 'tsne100.csv',
    x_tsne,
    delimiter=',',
)

# Plotting

In [None]:
# Plot a stored 2-D projection together with its per-atom labels.
element = 'all'
data_type = 'pca'
dimension = 2
colors, markers = utils.colmark(element)

# Input CSVs and the image output directory all follow the element name.
data_file_path = '../../data/schnet/%s/repqm9energy10000-30%s5000%s.csv' % (
    element, element, data_type)
print(data_file_path)
label_path = '../../data/labelqm9/%s/qm9label%s5000.csv' % (element, element)
image_dir = '../../data/labelqm9/%s/' % element

# Axis titles are the projection name followed by the component number.
xlabel = '%s1' % data_type
ylabel = '%s2' % data_type

# Display switches handed straight to plotwlabels.
legend, col, mark = False, True, False

utils.plotwlabels(
    data_file_path, label_path, dimension, element, colors, markers,
    mark, col, image_dir, xlabel, ylabel, legend,
)


# Pearson correlation

In [None]:
# Run the PCA routine on the hydrogen representation file and show `eig`.
# NOTE(review): unlike the earlier PCA cell, this path has no element
# sub-directory under data/schnet/ -- confirm the file location.
scale_data = False
n_components = 30
data_file_path = '../../data/schnet/repqm9energy10000-30H5000'

(x_pca, eig, ev, cov) = dataanalysis.pca(
    data_file_path,
    n_components,
    scale_data,
)
print(eig)

In [None]:
from numpy import savetxt

# Settings used only by the commented-out neighbor extraction further down.
element = 'H'
label_file = '../../data/labelqm9/H/qm9labelH5000.csv'

# For scaled covariance matrix, you must use the scaled data!
data_file = '../../data/schnet/repqm9energy10000-30H5000pca.csv'
ae_data_file = '../../data/schnet/repqm9energy10000-30H5000ae.csv'

# NOTE(review): neighbor.csv was presumably produced by the two
# commented-out lines below; uncomment them to regenerate it.
#neighbor = utils.extractneighborae(element,label_file)
#savetxt('../../data/labelqm9/H/neighbor.csv',neighbor,delimiter=',')
neighbor_data_file = '../../data/labelqm9/H/neighbor.csv'

# Both arguments are CSV file paths here.
pearson = dataanalysis.pearson(
    data_file,
    neighbor_data_file,
)


In [None]:

# Show the result stored in `pearson` by the preceding cell.
print(pearson)

In [None]:
from numpy import savetxt

# Append one extra column to the hydrogen representation matrix: for each
# row, the U0 atom reference energy chosen by that row's 'Target' label.
# The widened matrix is printed and written back out as CSV.

#For scaled covariance matrix, you must use the scaled data!
data_file = '../../data/schnet/repqm9energy10000-30H1000noae.csv'
data = genfromtxt(data_file,delimiter=',')

# U0 atom reference values; the prints below read index 1, 6 and 8 as
# hydrogen, carbon and oxygen, i.e. the table is indexed by atomic number.
atomrefs = qm9data.get_atomref(QM9.U0)
u0_refs = atomrefs[QM9.U0]
print('U0 of hydrogen:', '{:.2f}'.format(u0_refs[1][0]), 'eV')
print('U0 of carbon:', '{:.2f}'.format(u0_refs[6][0]), 'eV')
print('U0 of oxygen:', '{:.2f}'.format(u0_refs[8][0]), 'eV')

label_file = '../../data/labelqm9/H/qm9labelH1000.csv'
label_read = pd.read_csv(label_file, delimiter=',')
label = label_read['Target']

# One label is needed per data row. Fail with a clear message instead of a
# pandas KeyError part-way through when the label file is too short.
n_rows = len(data)
if len(label) < n_rows:
    raise ValueError(
        'label file %s has %d rows but the data file has %d'
        % (label_file, len(label), n_rows)
    )
# Positional array of the first n_rows labels, so the lookup does not
# depend on the Series index.
label_values = np.asarray(label)[:n_rows]

# (inclusive lower label, exclusive upper label, atomref index).
# NOTE(review): index 7 is presumably nitrogen, and the ranges presumably
# group hydrogen labels by the element of the neighboring atom -- confirm
# against the labelling scheme.
label_ranges = (
    (0, 22, 6),
    (22, 29, 7),
    (29, 33, 8),
)

# Rows whose label falls outside every range keep the initial 0.0.
neighbor_ae_vector = np.zeros((n_rows, 1))
for lower, upper, atomic_number in label_ranges:
    in_range = (label_values >= lower) & (label_values < upper)
    neighbor_ae_vector[in_range] = u0_refs[atomic_number][0]

data = np.hstack((data,neighbor_ae_vector))
print(data)

savetxt('../../data/schnet/repqm9energy10000-30H1000nae.csv',data,delimiter=',')

In [None]:
# Run the PCA routine on the file written by the previous cell (the
# representation with the extra neighbor atom-energy column).
# NOTE(review): 31 components presumably = 30 representation columns plus
# the one appended column -- confirm the column count of the input file.
scale_data = False
n_components = 31
data_file_path = '../../data/schnet/repqm9energy10000-30H1000nae'

(x_pca, eig, ev, cov) = dataanalysis.pca(
    data_file_path,
    n_components,
    scale_data,
)



In [None]:
# Pearson routine applied to the `cov` returned by the latest PCA call and
# the `data` matrix built in the neighbor-column cell.
# NOTE(review): the earlier Pearson cell passes two CSV file paths, while
# this call passes in-memory objects -- confirm dataanalysis.pearson
# accepts both forms.
pearson = dataanalysis.pearson(cov,data)

In [None]:
# Print column index 30 of `pearson` for every row.
# NOTE(review): index 30 is presumably the appended neighbor atom-energy
# column (the 31st) -- confirm against the layout of `data`.
print(pearson[0:,30])
