In [86]:
import pandas as pd
import numpy as np
import cudf
from RdKit_tools import calc_fingerprints,calc_similarity_matrix
from tqdm import tqdm
import time

import os

In [101]:
COL_NAMES=['UID','UNIPROT_ID','COMPOUND_SMILES','PROTEIN_SEQUENCE','CLF_LABEL','Data_Source']
PRT_ROOT = '/gxr/tongyuang/data/CPI/GLX4.1.0/sub_data/Glx_Pubchem_Uniprot_Classification_Data_79980610_proteins/'
DATA_ROOT = '/gxr/tongyuang/data/CPI/GLX4.1.0/sub_data/Glx4.1.0_clf_Pubchem_Uniprot_Classification_Data_79980610/'

In [124]:
output_file_name = './sequence_statistics.csv'
output_df = pd.read_csv(output_file_name)


In [130]:
output_df['sim_mean'] = [-1]*len(output_df)
output_df['sim_max'] = [-1]*len(output_df)
output_df['sim_var'] = [-1]*len(output_df)
output_df.to_csv(output_file_name,index=False)

In [3]:
def gen_similarity_matrix(smlies_list):
    '''
    input: smiles_list
    output: similarity_matrix
    '''
    return calc_similarity_matrix(calc_fingerprints(smlies_list))

In [121]:
def index_to_name(subdata_index):
    '''
    input: int
        0->00
        19->19
    output: str
    '''
    if subdata_index<10:
        return '0'+str(int(subdata_index))
    else:
        return str(int(subdata_index))

def get_smiles_list(prt_idx=0,
                    prt_root= PRT_ROOT,
                    data_root = DATA_ROOT,
                    col_names = COL_NAMES,
                   suffix='.npy'):
    '''
    get smiles list by protein index
    '''
    filename = str(prt_idx)+suffix
    cur_prt_statistics = np.load(prt_root+filename,allow_pickle=True)
    
    subdata_index_list = list(set(cur_prt_statistics[:,0]))
    #print(subdata_index_list)
    
    output_smiles_list = []
    
    for subdata_index in tqdm(subdata_index_list):
        
        subdata_index = int(subdata_index)
        
        rows = cur_prt_statistics[cur_prt_statistics[:,0]==subdata_index][:,1]
        subdata_name = index_to_name(subdata_index)
        
        #tic = time.time()
        #timeArray = time.localtime(tic)
        #print('start loading subdata {} at :'.format(subdata_name)+time.strftime("%Y-%m-%d %H:%M:%S", timeArray))

        if subdata_name=='00':
            cur_df = pd.read_csv(data_root+'part_{}.csv'.format(subdata_name))
        else:
            cur_df = pd.read_csv(data_root+'part_{}.csv'.format(subdata_name),names=col_names)
            
        sub_cur_df = cur_df.iloc[rows]
            
        #print(sub_cur_df.columns)
        smiles_list = list(sub_cur_df['COMPOUND_SMILES'].values)
            
        output_smiles_list += smiles_list
        #toc = time.time()
        #timeArray = time.localtime(toc)
        #print('end loading subdata {} at :'.format(subdata_name)+time.strftime("%Y-%m-%d %H:%M:%S", timeArray))
        #print('Time cost:{:.2f} s'.format(toc-tic))
            #print('Time cost:{:.2f} s'.format(toc-tic))

    
    #print('done')
    return list(set(output_smiles_list))
            

In [119]:
def cal_silimarity_metrics(smiles_list):
    
    fplist = calc_fingerprints(smiles_list)

    sim_matrix = calc_similarity_matrix(fplist)
    
    sim_triup = np.triu(sim_matrix, 1).flatten()
    sim_list = []
    for value in sim_triup:
        if value>0:
            sim_list.append(value)
    
    return {'sim_matrix':sim_matrix,
            'sim_list':sim_list,
            'mean_sim':np.mean(sim_list),
            'max_sim':np.max(sim_list),
            'var_sim':np.var(sim_list)
    }

In [4]:
output_file_name = './sequence_statistics.csv'
output_df = pd.read_csv(output_file_name)


In [17]:
allpos_subdf = output_df[output_df['pos_percent']==1]
allpos_subdf = allpos_subdf[allpos_subdf['length']>2]
allpos_subdf = allpos_subdf.sort_values(by='length')

allneg_subdf = output_df[output_df['pos_percent']==0]
allneg_subdf = allneg_subdf[allneg_subdf['length']>2]
allneg_subdf = allpos_subdf.sort_values(by='length')

In [123]:
allpos_index_list = list(allpos_subdf['idx'].values)


#os.environ['CUDA_VISIBLE_DEVICES']='3,4'

for i in range(3):
    
    smiles_list = get_smiles_list(allpos_index_list[-i-1])
    
    sim_dict = cal_silimarity_metrics(smiles_list)
    print('compound number:{}'.format(len(smiles_list)))
    print('Similarity: mean:{}, max:{}, var:{}'.format(sim_dict['mean_sim'],sim_dict['max_sim'],sim_dict['var_sim']))

100%|██████████| 3/3 [00:12<00:00,  4.27s/it]
  0%|          | 0/3 [00:00<?, ?it/s]

compound number:406
Similarity: mean:0.4521925809658525, max:1.0, var:0.02512432579900244


100%|██████████| 3/3 [00:12<00:00,  4.12s/it]
  0%|          | 0/3 [00:00<?, ?it/s]

compound number:397
Similarity: mean:0.4745511332515871, max:1.0, var:0.017174734600136767


100%|██████████| 3/3 [00:12<00:00,  4.16s/it]


compound number:393
Similarity: mean:0.47465869865463367, max:1.0, var:0.017111709598718723


In [142]:
output_df.iloc[0,7] = 2

output_df

Unnamed: 0,sequence,idx,length,num_pos_lbl,num_neg_lbl,pos_percent,sim_mean,sim_max,sim_var
0,MKFLLVLVLLVSLQVSACGAAPMNESEFAEWYLSRFFDYQGDRIPM...,2378,6,6,0,1.000000,3,2,-1
1,MSARGPAIGIDLGTTYSCVGVFQHGKVEIIANDQGNRTTPSYVAFT...,4117,29,9,20,0.310345,-1,-1,-1
2,MTSTGKDGGAQHAQYVGPYRLEKTLGKGQTGLVKLGVHCVTCQKVA...,939,136,36,100,0.264706,-1,-1,-1
3,MSVTEEDLCHHMKVVVRVRPENTKEKAAGFHKVVHVVDKHILVFDP...,2290,533,520,13,0.975610,-1,-1,-1
4,MEDFVRQCFNPMIVELAEKTMKEYGEDLKIETNKFAAICTHLEVCF...,7515,91,10,81,0.109890,-1,-1,-1
...,...,...,...,...,...,...,...,...,...
7639,MGSAFERVVRRVVQELDHGGEFIPVTSLQSSTGFQPYCLVVRKPSS...,7184,3,3,0,1.000000,-1,-1,-1
7640,MDNGEQDAGFRLAPMSPQEIKPDISLLNENNTSSYSPKPGSPNPFA...,3427,1,1,0,1.000000,-1,-1,-1
7641,MLGAVEGPRWKQAEDIRDIYDFRDVLGTGAFSEVILAEDKRTQKLV...,3092,233,41,192,0.175966,-1,-1,-1
7642,MVGRRALIVLAHSERTSFNYAMKEAAAAALKKKGWEVVESDLYAMN...,202,349,197,152,0.564470,-1,-1,-1


In [143]:
prt_index_list = list(output_df['idx'].values)

for i, idx in enumerate(prt_index_list):

    smiles_list = get_smiles_list(prt_idx=idx)
    
    sim_dict = cal_silimarity_metrics(smiles_list)
    
    output_df.iloc[i,6] = sim_dict['mean_sim']
    output_df.iloc[i,7] = sim_dict['max_sim']
    output_df.iloc[i,8] = sim_dict['var_sim']
    
    if i%20 == 19:
        print('finished {:d} / {:d}'.format(i+1,len(prt_index_list)))
        output_df.to_csv(output_file_name,index=False)
    
    
    

IndentationError: expected an indented block (<ipython-input-143-5602348f781b>, line 16)