In [None]:
import math
import os
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
import scipy.misc
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from utils import *

# Molecule data
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys
from rdkit.Chem.Fingerprints import FingerprintMols

## Data extraction
def _parse_section(section_text):
    """Parse one '##' section into a dict mapping each key to a list of values.

    The section is split on '#'; the first piece (the section title) is
    skipped. Every remaining piece is read as ``key = value``: the key is the
    text before the first '=' and the value is the text after the first '= ',
    both with trailing whitespace removed. Repeated keys accumulate their
    values in order of appearance.
    """
    table = {}
    for entry in section_text.split('#')[1:]:
        key = entry.split('=')[0].rstrip()
        value = entry.split('= ')[1].rstrip()
        table.setdefault(key, []).append(value)
    return table


def data_path_parse():
    """Read 'Path_and_File.txt' and return its three configuration sections.

    The file (looked up relative to the current working directory) is split
    on '##'. The text before the first '##' is ignored; the next three
    sections are parsed, in order, as paths, files and data sets.

    Returns:
        A tuple ``(paths, files, data_sets)`` of dicts, each mapping a key
        string to the list of values given for that key.

    Raises:
        OSError: if the file cannot be opened.
        IndexError: if the file has fewer than three '##' sections, or an
            entry does not contain '= '.
    """
    # The context manager guarantees the handle is closed even if parsing fails.
    with open('Path_and_File.txt', 'r') as f:
        sections = f.read().split('##')

    paths = _parse_section(sections[1])
    files = _parse_section(sections[2])
    data_sets = _parse_section(sections[3])
    return paths, files, data_sets

def morgan_fps(arg):
    """Compute a Morgan fingerprint as a float32 vector of 0.0/1.0 values.

    The inputs are packed into one indexable argument so the function can be
    handed to a ``map``-style call: ``arg[0]`` is the molecule and ``arg[1]``
    is the number of bits in the fingerprint. The radius is fixed at 6.
    """
    molecule = arg[0]
    n_bits = arg[1]
    bit_vect = AllChem.GetMorganFingerprintAsBitVect(molecule, 6, nBits=n_bits)
    # One array element per character of the '0'/'1' bit string.
    bit_chars = list(bit_vect.ToBitString())
    return np.array(bit_chars).astype('float32')
    
def maccs_fps(mol):
    """Return the MACCS keys fingerprint of ``mol`` as a float32 0.0/1.0 vector."""
    keys = MACCSkeys.GenMACCSKeys(mol)
    # One array element per character of the '0'/'1' bit string.
    bit_chars = list(keys.ToBitString())
    return np.array(bit_chars).astype('float32')

def topological_fps(mol):
    """Return a path-based (topological) fingerprint of ``mol`` as float32 0.0/1.0.

    The fingerprint length is taken from the module-level name ``fps_size``,
    which must be defined before this function is called (it is not set in
    this function).
    """
    options = {
        'minPath': 1,
        'maxPath': 7,
        'fpSize': fps_size,
        'bitsPerHash': 2,
        'useHs': False,
        'tgtDensity': 0.0,
        'minSize': 128,
    }
    bit_vect = FingerprintMols.FingerprintMol(mol, **options)
    # One array element per character of the '0'/'1' bit string.
    bit_chars = list(bit_vect.ToBitString())
    return np.array(bit_chars).astype('float32')
    
def _one_hot_bins(values, n_bins=10):
    """Bin ``values`` into ``n_bins`` equal-width classes and one-hot encode them.

    The bin index is computed from an untouched copy of the data, so a label
    that has already been assigned can never be mistaken for a raw value and
    re-binned. The maximum value is clipped into the last bin, which keeps
    every index inside the ``n_bins``-wide one-hot vector.

    Returns a list with one float32 array of length ``n_bins`` per value.
    """
    values = np.asarray(values, dtype=np.float32)
    if not np.all(np.isfinite(values)):
        raise ValueError('label values must be finite numbers')
    lowest = values.min()
    span = values.max() - lowest
    if span > 0:
        bin_idx = np.floor((values - lowest) / span * n_bins).astype(np.int64)
        bin_idx = np.clip(bin_idx, 0, n_bins - 1)
    else:
        # All values identical: a single class, avoiding a division by zero.
        bin_idx = np.zeros(len(values), dtype=np.int64)
    one_hot = np.zeros((len(values), n_bins), dtype=np.float32)
    one_hot[np.arange(len(values)), bin_idx] = 1
    return list(one_hot)


def _to_signed(bits):
    """Map a 0/1 array to -1/1 by subtracting 1 wherever the entry is 0."""
    return bits - np.equal(bits, 0)


def process_data_np(ds_name = 'RO3_Test_data', pool_size = 4):
    """Load a data set and build train/test splits of three fingerprint sets.

    Reads ``<ds_name>.csv`` from the directory stored in the module-level
    ``paths['Path to training set'][0]`` (``paths`` is not defined in this
    function; presumably it is the first dict returned by
    ``data_path_parse()`` -- verify against the caller). The CSV must provide
    a ``smiles`` column and a numeric ``value`` column.

    Features produced per molecule:
      * fps1: 2048-bit Morgan and topological fingerprints stacked on a
        trailing axis, shape (N, size, 2), values in {-1, 1}. Both
        fingerprints must therefore have the same length.
      * fps2: 512-bit Morgan fingerprint, shape (N, 512), values in {-1, 1}.
      * fps3: MACCS keys fingerprint, values in {0, 1}.
    Labels are the ``value`` column binned into 10 equal-width classes and
    one-hot encoded.

    Args:
        ds_name: CSV file name without the '.csv' extension.
        pool_size: number of worker threads used to compute fingerprints.

    Returns:
        dict with keys ``fps{k}_train_np``, ``fps{k}_test_np``,
        ``labels{k}_train_np`` and ``labels{k}_test_np`` for k in 1, 2, 3.
        All three splits use test_size=0.20 and random_state=42.

    Raises:
        ValueError: if a SMILES string cannot be parsed or a label is not
            finite.
    """
    data_dir = paths['Path to training set'][0]
    df = pd.read_csv(os.path.join(data_dir, ds_name + '.csv'))

    labels = _one_hot_bins(df.value)
    n = [2048, 512]

    # The context manager shuts the worker threads down when we are done.
    with ThreadPool(pool_size) as pool:
        mol = pool.map(Chem.MolFromSmiles, list(df.smiles))

        # Fail early and clearly: dropping rows here would misalign the labels.
        bad_rows = [i for i, m in enumerate(mol) if m is None]
        if bad_rows:
            raise ValueError(
                'could not parse %d SMILES string(s); first row indices: %s'
                % (len(bad_rows), bad_rows[:10])
            )

        # morgan_fps expects one (molecule, n_bits) pair per call.
        fps_morgan1 = pool.map(morgan_fps, [(m, n[0]) for m in mol])
        fps_morgan2 = pool.map(morgan_fps, [(m, n[1]) for m in mol])
        fps_top = pool.map(topological_fps, mol)
        fps_maccs = pool.map(maccs_fps, mol)

    # Stack on a new trailing axis so sample i keeps its own two fingerprints;
    # reshaping a (2, N, size) array to (N, size, 2) would interleave samples.
    fps1 = np.stack(
        [np.array(fps_morgan1, dtype=np.float32),
         np.array(fps_top, dtype=np.float32)],
        axis=-1,
    )
    fps1 = _to_signed(fps1)  # from [0,1] -> [-1,1]
    fps2 = _to_signed(np.array(fps_morgan2, dtype=np.float32))
    fps3 = np.array(fps_maccs, dtype=np.float32)

    fps_dict = {}
    for tag, features in (('1', fps1), ('2', fps2), ('3', fps3)):
        x_train, x_test, y_train, y_test = train_test_split(features,
                                                            labels,
                                                            test_size=0.20,
                                                            random_state=42)
        fps_dict['fps' + tag + '_train_np'] = x_train
        fps_dict['fps' + tag + '_test_np'] = x_test
        fps_dict['labels' + tag + '_train_np'] = y_train
        fps_dict['labels' + tag + '_test_np'] = y_test

    return fps_dict

def batch_gen(fps, label, batch_size = 32, FLAG = True):
    """Group ``fps`` and ``label`` into one epoch of equally sized batches.

    The number of batches is ``floor(len(fps) / batch_size)``.

    Args:
        fps: indexable collection of samples.
        label: indexable collection of labels, indexed in step with ``fps``.
        batch_size: number of samples in each batch.
        FLAG: if true, sample indices are drawn at random (with replacement)
            from ``range(len(label))``. If false, indices
            ``0 .. batch_size * n_batches - 1`` are each used exactly once,
            batch ``i`` holding indices ``i, i + n_batches, i + 2 * n_batches,
            ...``; any samples beyond that range are left out.

    Returns:
        dict with keys 'fps' and 'label', each a list of batches, where a
        batch is a list of ``batch_size`` items.
    """
    batches_per_epoch = math.floor(len(fps) / batch_size)
    index_shape = (batch_size, batches_per_epoch)

    if FLAG:
        index_table = np.random.randint(0, len(label), index_shape)
    else:
        index_table = np.arange(batch_size * batches_per_epoch).reshape(index_shape)

    fps_batches = []
    label_batches = []
    # Each column of the index table is one batch; iterate columns via .T.
    for column in index_table.T:
        fps_batches.append([fps[k] for k in column])
        label_batches.append([label[k] for k in column])

    return {'fps': fps_batches, 'label': label_batches}