In [1]:
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
sns.despine()
from scipy import interp
from statistics import mean
import matplotlib.pyplot as plt
from collections import OrderedDict

import sys
sys.path.append('../../scripts')
from helper_functions import *

### Load LIG dataset generated from CATH FunFams

In [2]:
raw_feature_data = pd.read_csv('LIG_feature_table.csv')
raw_feature_data.shape

(451839, 171)

In [3]:
raw_feature_data.columns.tolist()

#### NOTE: For Metal-binding Funsite Predictor, restrict dataset to metal ligands only

In [4]:
#### For Metal-binding Funsite Predictor
#raw_feature_data = raw_feature_data[raw_feature_data['annotation_BIOLIP_ligand'].isin(['ZN', 'CA', 'MG', 'MN', 'CU', 'K', 'FE', 'FE2', 'CO', 'NA', 'IOD', 'XE', 'NI', 'NO_LIGAND'])]
#raw_feature_data.shape

In [5]:
raw_feature_data = raw_feature_data.drop(['Unnamed: 170'], axis =1)
raw_feature_data['domain'], raw_feature_data['domain_residue'] = raw_feature_data['residue_string'].str.split('_', 1).str
raw_feature_data['dompdbchain_res'] = raw_feature_data['residue_string'].str[:5] + '_' + raw_feature_data['domain_residue']

raw_feature_data = raw_feature_data[~raw_feature_data['annotation_BIOLIP_ligand'].isin(['NUC', 'III', 'UUU'])]

raw_feature_data['dssp_type'] = raw_feature_data['dssp_type'].fillna("NO_PRED")
raw_feature_data['surface_residue_rsa'] = (raw_feature_data['rsa_allatoms'] >= 10).astype(int)
raw_feature_data['highly_conserved'] = (raw_feature_data['scons'] >= 0.7).astype(int)
raw_feature_data['cleft_residue'] = (raw_feature_data['cleft_num'] > 0).astype(int)
raw_feature_data['hydrophobic_aa'] = (raw_feature_data['hydrophobicity'] >= 0.48).astype(int)
raw_feature_data['polar_aa'] = (raw_feature_data['polarity'] >= 10).astype(int)
raw_feature_data['res_bfactor_n'] = raw_feature_data['res_bfactor_n'].astype(float)
raw_feature_data['entwop_score_ff'] = raw_feature_data['entwop_score_ff'].astype(float)
raw_feature_data['entwop_score_psiblast'] = raw_feature_data['entwop_score_psiblast'].astype(float)
mindist = raw_feature_data[['min_dist_to_cleft_1','min_dist_to_cleft_2','min_dist_to_cleft_3']].min(axis=1)
raw_feature_data = raw_feature_data.assign(min_dist_to_cleft123=mindist)
raw_feature_data = pd.get_dummies(raw_feature_data, columns=['residue_aa', 'dssp_type'])
# Remove any duplicate samples. 
raw_feature_data = raw_feature_data.drop_duplicates()
# Remove NA rows
raw_feature_data = raw_feature_data.dropna()
# Count no. of domains in whole dataset
raw_feature_data.groupby(['domain']).size().shape[0]

2863

In [6]:
# Get list of LIG domains
lig_domains = raw_feature_data['domain'].unique().tolist()

In [7]:
# Only use domains at NR sequence identity <= 60% 

nr_lig_domains = []

# open file and read the content in a list
with open('NR60_lig_domains.txt', 'r') as filehandle:  
    for line in filehandle:
        # remove linebreak which is the last character of the string
        currentPlace = line[:-1]

        # add item to the list
        nr_lig_domains.append(currentPlace)
        
raw_feature_data=raw_feature_data[raw_feature_data['domain'].isin(nr_lig_domains)]
raw_feature_data.groupby(['domain']).size().shape[0]

2863

### Generate LIG benchmark and validation datasets

In [8]:
concavity_df = pd.read_table('Concavity_results.txt', names = ['pdb', 'chain', 'pdbchain', 'pdbchain_res', 'res', 'aa','concavity_score'])
concavity_df = concavity_df.drop(['pdb', 'chain', 'pdbchain', 'res', 'aa'], axis =1)
concavity_df.head(2)

  """Entry point for launching an IPython kernel.


Unnamed: 0,pdbchain_res,concavity_score
0,1a5uC_1,0.0
1,1a5uC_2,0.0


In [9]:
raw_feature_data_concavity =raw_feature_data.merge(concavity_df, left_on='dompdbchain_res', right_on='pdbchain_res', how='inner')
site_doms_concavity = raw_feature_data_concavity.domain.unique()
validation_set = raw_feature_data_concavity

raw_feature_data = raw_feature_data[~raw_feature_data['domain'].isin(site_doms_concavity)]

In [10]:
def preprocess_site_nonsite_df(site_data, nonsite_data, ratio):
        
    # Get the list of domains that have site annotations
    positive_sample_num = site_data.shape[0]
    print('#Postive samples:',positive_sample_num)
    
    # make a subset dataframe containing only NOSITE (negative) residues
    negative_sample_num = nonsite_data.shape[0]
    print('#Negative samples:',negative_sample_num)
    
    print ('Use these samples for training the model:')
    use_sample_num = positive_sample_num * ratio
        
    if (use_sample_num > negative_sample_num):
        use_sample_num = negative_sample_num
    
    total_samples = use_sample_num + positive_sample_num
    
    print ('- Used negative samples:',use_sample_num)
    print ('- Total samples:',total_samples)
    
    dom_groups_df = nonsite_data.groupby(['domain']).size()
    dom_group_num = dom_groups_df.shape[0]
    print ('- No. of groups of samples:',dom_group_num)
    
    sample_size = round(use_sample_num/dom_group_num)

    print ('- Min. sample size in no_sites:', sample_size)
    
    nonsite_data = nonsite_data.groupby(['domain']).filter(lambda x: len(x) > sample_size)
    nonsite_data_randomsubset=nonsite_data.groupby('domain').apply(lambda x: x.sample(n=sample_size, random_state=10)).reset_index(drop=True)
    
    # COMBINE selected csa and non-csa data for the desired dataset ratio
    frames = [site_data, nonsite_data_randomsubset]
    concatenated_feature_data = pd.concat(frames)
    dataset_sample_num = concatenated_feature_data.shape[0]
    feature_data_ML = concatenated_feature_data.set_index('index').sample(n=dataset_sample_num, random_state=10)
    feature_data_ML.index.name = None
    
    return(feature_data_ML)

In [11]:
# Training dataset
training_data=raw_feature_data
SITE_data = training_data[(training_data.annotation_BIOLIP == 1) ]
site_doms = SITE_data.domain.unique()
NOSITE_data = training_data[(training_data.annotation_BIOLIP == 0 ) & training_data.domain.isin(site_doms)] 
NOSITE_data['index']=NOSITE_data['residue_string']

training_data_ML = preprocess_site_nonsite_df(SITE_data, NOSITE_data, 6)
training_data_ML = training_data_ML.drop_duplicates()
print(len(training_data_ML.domain.unique()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


#Postive samples: 13870
#Negative samples: 310967
Use these samples for training the model:
- Used negative samples: 83220
- Total samples: 97090
- No. of groups of samples: 2026
- Min. sample size in no_sites: 41


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




2026


In [12]:
SITE_data_val = validation_set[(validation_set.annotation_BIOLIP == 1) ]#| (raw_feature_data.annotation_3DID_INTRACHAIN == 1
site_doms_val = SITE_data_val.domain.unique()

NOSITE_data_val = validation_set[(validation_set.annotation_BIOLIP == 0 ) & (validation_set.domain.isin(site_doms_val)) ]
NOSITE_data_val['index']=NOSITE_data_val['residue_string']

validation_data_ML = preprocess_site_nonsite_df(SITE_data_val, NOSITE_data_val, 6)
validation_data_ML =validation_data_ML.drop_duplicates()
print(len(validation_data_ML.domain.unique()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


#Postive samples: 5634
#Negative samples: 103821
Use these samples for training the model:
- Used negative samples: 33804
- Total samples: 39438
- No. of groups of samples: 800
- Min. sample size in no_sites: 42


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




800


In [13]:
training_data_ML.to_csv('LIG_training_dataset.csv', index=False)

In [14]:
validation_data_ML.to_csv('LIG_validation_dataset.csv', index=False)