In [1]:
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
sns.despine()
from scipy import interp
from statistics import mean
import matplotlib.pyplot as plt
from collections import OrderedDict

import sys
sys.path.append('../../scripts')
from helper_functions import *

### Load PPI dataset generated from CATH FunFams

In [2]:
raw_feature_data = pd.read_csv('PPI_feature_table.csv')
raw_feature_data.shape

(574283, 171)

In [3]:
raw_feature_data.columns.tolist()

In [4]:
# feature engineering
raw_feature_data = raw_feature_data.drop(['Unnamed: 170'], axis =1)
raw_feature_data['domain'], raw_feature_data['domain_residue'] = raw_feature_data['residue_string'].str.split('_', 1).str
raw_feature_data['dssp_type'] = raw_feature_data['dssp_type'].fillna("NO_PRED")
raw_feature_data['surface_residue_rsa'] = (raw_feature_data['rsa_allatoms'] >= 25).astype(int)
raw_feature_data['surface_residue_rsa10'] = (raw_feature_data['rsa_allatoms'] >= 10).astype(int)
raw_feature_data['highly_conserved'] = (raw_feature_data['scons'] >= 0.7).astype(int)
raw_feature_data['cleft_residue'] = (raw_feature_data['cleft_num'] > 0).astype(int)
raw_feature_data['hydrophobic_aa'] = (raw_feature_data['hydrophobicity'] >= 0.48).astype(int)
raw_feature_data['polar_aa'] = (raw_feature_data['polarity'] >= 10).astype(int)
raw_feature_data['res_bfactor_n'] = raw_feature_data['res_bfactor_n'].astype(float)
raw_feature_data['entwop_score_ff'] = raw_feature_data['entwop_score_ff'].astype(float)
raw_feature_data['entwop_score_psiblast'] = raw_feature_data['entwop_score_psiblast'].astype(float)
mindist = raw_feature_data[['min_dist_to_cleft_1','min_dist_to_cleft_2','min_dist_to_cleft_3']].min(axis=1)
raw_feature_data = raw_feature_data.assign(min_dist_to_cleft123=mindist)
raw_feature_data = pd.get_dummies(raw_feature_data, columns=['residue_aa', 'dssp_type'])
# Remove any duplicate samples. 
raw_feature_data = raw_feature_data.drop_duplicates()
# Remove NA rows
raw_feature_data = raw_feature_data.dropna()
# Count no. of domains in whole dataset
raw_feature_data.groupby(['domain']).size().shape[0]

3518

In [5]:
# Get list of PPI domains
ppi_domains = raw_feature_data['domain'].unique().tolist()

# Only use domains at NR sequence identity <= 60% 
nr_ppi_domains = []

# open file and read the content in a list
with open('NR60_ppi_domains.txt', 'r') as filehandle:  
    for line in filehandle:
        # remove linebreak which is the last character of the string
        currentPlace = line[:-1]

        # add item to the list
        nr_ppi_domains.append(currentPlace)
        

raw_feature_data=raw_feature_data[raw_feature_data['domain'].isin(nr_ppi_domains)]
raw_feature_data.groupby(['domain']).size().shape[0]

3502

### Generate PPI benchmark and validation datasets

In [6]:
validation_set_metappisp = pd.read_csv('Meta-ppisp.results.tsv', names=['val_dom', 'val_res', 'val_dom_res', 'cons_ppisp', 'pinup', 'promate', 'meta_ppisp', 'meta_predict'], sep = '\t')
validation_set_metappisp['cons_ppisp'] = validation_set_metappisp['cons_ppisp'].fillna(0)
validation_set_metappisp['pinup'] = validation_set_metappisp['pinup'].fillna(0)
validation_set_metappisp['promate'] = validation_set_metappisp['promate'].fillna(0)
validation_set_metappisp['meta_ppisp'] = validation_set_metappisp['meta_ppisp'].fillna(0)
validation_set_metappisp['meta_predict'] = validation_set_metappisp['meta_predict'].fillna(0)
validation_set_metappisp.sort_values('meta_ppisp').head(5)

Unnamed: 0,val_dom,val_res,val_dom_res,cons_ppisp,pinup,promate,meta_ppisp,meta_predict
106624,1lvaA02,505,1lvaA02_505,0.009,0.0,0.0,-0.17,0
30572,1dleA02,186B,1dleA02_186B,0.009,0.0,0.0,-0.167,0
19029,1cc5A00,87,1cc5A00_87,0.008,0.0,0.0,-0.166,0
30574,1dleA02,186D,1dleA02_186D,0.003,0.0,0.0,-0.157,0
30612,1dleA02,220I,1dleA02_220I,0.215,0.0,0.0,-0.152,0


In [7]:
validation_set_metappisp=validation_set_metappisp.dropna()
validation_set_metappisp.groupby(['val_dom']).size().shape[0]
validation_domlist = validation_set_metappisp.val_dom.unique().tolist()

In [8]:
training_data=raw_feature_data[~raw_feature_data['domain'].isin(validation_domlist)]
training_data.groupby(['domain']).size().shape[0]

2746

In [9]:
validation_data=raw_feature_data[raw_feature_data['domain'].isin(validation_domlist)]
validation_data.groupby(['domain']).size().shape[0]

756

In [10]:
validation_data_ppisp = validation_data.set_index('residue_string').join(validation_set_metappisp.set_index('val_dom_res'), how = 'inner')
validation_data_ppisp = validation_data_ppisp.reset_index()
validation_data_ppisp.head(5)

Unnamed: 0,index,A_pssm_ff,A_pssm_psiblast,A_wop_ff,A_wop_psiblast,C_pssm_ff,C_pssm_psiblast,C_wop_ff,C_wop_psiblast,D_pssm_ff,...,dssp_type_H,dssp_type_NO_PRED,dssp_type_T,val_dom,val_res,cons_ppisp,pinup,promate,meta_ppisp,meta_predict
0,12asA00_10,-2.0,-2.0,1.0,0.0,-4.0,-4.0,0.0,0.0,-1.0,...,1,0,0,12asA00,10,0.012,0.49,0.292,0.332,0
1,12asA00_100,-5.0,-3.0,0.0,0.0,-7.0,-5.0,0.0,0.0,-5.0,...,0,1,0,12asA00,100,0.243,0.09,0.316,0.136,0
2,12asA00_101,1.0,-2.0,12.0,0.0,-5.0,-4.0,0.0,0.0,-4.0,...,0,0,1,12asA00,101,0.798,0.19,0.308,0.275,0
3,12asA00_102,-5.0,-3.0,0.0,1.0,1.0,-4.0,3.0,0.0,8.0,...,0,0,1,12asA00,102,0.346,0.09,0.296,0.151,0
4,12asA00_103,-4.0,-2.0,0.0,0.0,-6.0,-5.0,0.0,0.0,1.0,...,0,1,0,12asA00,103,0.055,0.09,0.319,0.157,0


In [11]:
SITE_data = training_data[(training_data.annotation_IBIS_PPI_INTERCHAIN == 1) ]#
site_doms = SITE_data.domain.unique()
NOSITE_data = training_data[(training_data.annotation_IBIS_PPI_INTERCHAIN == 0) & training_data.domain.isin(site_doms)]
SITE_data['index']=SITE_data['residue_string']
NOSITE_data['index']=NOSITE_data['residue_string']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [12]:
def preprocess_site_nonsite_df(site_data, nonsite_data, ratio):
        
    # Get the list of domains that have site annotations
    positive_sample_num = site_data.shape[0]
    print('#Postive samples:',positive_sample_num)
    
    # make a subset dataframe containing only NOSITE (negative) residues
    negative_sample_num = nonsite_data.shape[0]
    print('#Negative samples:',negative_sample_num)
    
    print ('Use these samples for training the model:')
    use_sample_num = positive_sample_num * ratio
        
    if (use_sample_num > negative_sample_num):
        use_sample_num = negative_sample_num
    
    total_samples = use_sample_num + positive_sample_num
    
    print ('- Used negative samples:',use_sample_num)
    print ('- Total samples:',total_samples)
    
    dom_groups_df = nonsite_data.groupby(['domain']).size()
    dom_group_num = dom_groups_df.shape[0]
    print ('- No. of groups of samples:',dom_group_num)
    
    sample_size = round(use_sample_num/dom_group_num)

    print ('- Min. sample size in no_sites:', sample_size)
    
    nonsite_data = nonsite_data.groupby(['domain']).filter(lambda x: len(x) > sample_size)
    nonsite_data_randomsubset=nonsite_data.groupby('domain').apply(lambda x: x.sample(n=sample_size, random_state=10)).reset_index(drop=True)
    
    # COMBINE selected csa and non-csa data for the desired dataset ratio
    frames = [site_data, nonsite_data_randomsubset]
    concatenated_feature_data = pd.concat(frames)
    dataset_sample_num = concatenated_feature_data.shape[0]
    feature_data_ML = concatenated_feature_data.set_index('index').sample(n=dataset_sample_num, random_state=10)
    feature_data_ML.index.name = None
    
    return(feature_data_ML)

In [13]:
training_data_ML = preprocess_site_nonsite_df(SITE_data, NOSITE_data, 4)

#Postive samples: 61653
#Negative samples: 238156
Use these samples for training the model:
- Used negative samples: 238156
- Total samples: 299809
- No. of groups of samples: 1850
- Min. sample size in no_sites: 129


In [14]:
training_data_ML = training_data_ML.drop_duplicates()
training_data_ML.shape

(159822, 201)

In [15]:
SITE_data_val = validation_data_ppisp[(validation_data_ppisp.annotation_IBIS_PPI_INTERCHAIN == 1) ]
site_doms_val = SITE_data_val.domain.unique()
NOSITE_data_val = validation_data_ppisp[(validation_data_ppisp.annotation_IBIS_PPI_INTERCHAIN == 0 ) & (validation_data_ppisp.domain.isin(site_doms_val)) ]

In [16]:
validation_data_ML = preprocess_site_nonsite_df(SITE_data_val, NOSITE_data_val, 4)

#Postive samples: 16278
#Negative samples: 59992
Use these samples for training the model:
- Used negative samples: 59992
- Total samples: 76270
- No. of groups of samples: 489
- Min. sample size in no_sites: 123


In [17]:
validation_data_ML.groupby(['domain']).size().shape[0]

489

In [18]:
validation_data_ML =validation_data_ML.drop_duplicates()
validation_data_ML.shape

(40140, 207)

In [19]:
training_data_ML.to_csv('PPI_training_dataset.csv', index=False)
validation_data_ML.to_csv('PPI_validation_dataset.csv', index=False)