In [6]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB, CategoricalNB, ComplementNB, BernoulliNB

In [7]:
# Combine all PDBs into a single dataframe
dfs = []
for filename in os.listdir('data/features_ring'):
    dfs.append(pd.read_csv('data/features_ring/' + filename, sep='\t'))
df = pd.concat(dfs)
df

Unnamed: 0,pdb_id,s_ch,s_resi,s_ins,s_resn,s_ss8,s_rsa,s_up,s_down,s_phi,...,t_down,t_phi,t_psi,t_ss3,t_a1,t_a2,t_a3,t_a4,t_a5,Interaction
0,2ia7,A,43,,T,-,0.070,18.0,10.0,-1.960,...,15.0,-2.133,2.294,H,-1.343,0.465,-0.862,-1.020,-0.255,HBOND
1,2ia7,A,100,,E,T,0.340,13.0,5.0,-1.992,...,7.0,-1.707,0.184,H,1.357,-1.453,1.477,0.113,-0.837,HBOND
2,2ia7,A,101,,A,G,0.396,5.0,13.0,-1.028,...,12.0,1.326,0.494,L,-0.384,1.652,1.330,1.045,2.064,HBOND
3,2ia7,A,94,,V,E,0.070,16.0,14.0,-2.293,...,20.0,-1.540,2.026,H,1.050,0.302,-3.656,-0.259,-3.242,
4,2ia7,A,70,,T,H,0.099,12.0,10.0,-1.011,...,15.0,-1.051,-0.793,H,-1.239,-0.547,2.131,0.393,0.816,HBOND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
422,1pjx,A,286,,E,E,0.000,25.0,21.0,-2.710,...,20.0,-2.317,3.017,H,-0.595,0.009,0.672,-2.128,-0.184,VDW
423,1pjx,A,217,,W,E,0.048,20.0,9.0,-1.497,...,17.0,1.241,-2.621,L,-0.384,1.652,1.330,1.045,2.064,
424,1pjx,A,18,,P,T,0.390,11.0,21.0,-1.045,...,25.0,-2.135,1.339,H,-0.591,-1.302,-0.733,1.570,-0.146,
425,1pjx,A,47,,E,E,0.191,16.0,23.0,-2.287,...,11.0,-2.654,2.382,H,-1.343,0.465,-0.862,-1.020,-0.255,


In [8]:
# Remove all rows with NaN in at least one column
# including rows with missing class (they could be false negatives)
df.dropna(inplace=True)

# Define ground truth values
y = df['Interaction'].astype('category')
y

0      HBOND
1      HBOND
2      HBOND
4      HBOND
5        VDW
       ...  
416      VDW
417      VDW
418      VDW
421    HBOND
422      VDW
Name: Interaction, Length: 454193, dtype: category
Categories (6, object): ['HBOND', 'IONIC', 'PICATION', 'PIPISTACK', 'SSBOND', 'VDW']

In [9]:
# Define training features
X = df[['s_rsa', 's_up', 's_down', 's_phi', 's_psi', 's_a1', 's_a2', 's_a3', 's_a4', 's_a5', 
        't_rsa', 't_up', 't_down', 't_phi', 't_psi', 't_a1', 't_a2', 't_a3', 't_a4', 't_a5']]

# Calculate percentiles and transform into categories
X = X.rank(pct=True).round(1).astype('category') 
X

Unnamed: 0,s_rsa,s_up,s_down,s_phi,s_psi,s_a1,s_a2,s_a3,s_a4,s_a5,t_rsa,t_up,t_down,t_phi,t_psi,t_a1,t_a2,t_a3,t_a4,t_a5
0,0.4,0.6,0.1,0.2,0.7,0.5,0.7,0.9,0.7,0.8,0.1,0.7,0.4,0.1,0.8,0.0,0.8,0.3,0.1,0.4
1,0.8,0.4,0.0,0.2,0.6,0.9,0.1,0.7,0.5,0.4,0.9,0.1,0.1,0.3,0.6,0.8,0.1,0.7,0.5,0.4
2,0.8,0.1,0.2,0.8,0.4,0.4,0.1,0.4,1.0,0.5,0.7,0.0,0.3,1.0,0.6,0.5,1.0,0.6,0.7,0.9
4,0.5,0.3,0.1,0.9,0.4,0.5,0.7,0.9,0.7,0.8,0.3,0.7,0.4,0.8,0.1,0.1,0.4,0.8,0.5,0.6
5,0.2,0.6,0.1,0.7,0.1,0.1,0.4,0.8,0.5,0.6,0.4,0.9,0.1,0.1,0.9,0.1,0.4,0.8,0.5,0.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416,0.4,0.4,0.9,0.3,0.9,0.3,0.3,0.8,0.2,0.6,0.6,0.7,0.6,0.6,0.8,1.0,0.4,0.5,0.3,0.9
417,0.5,0.2,0.6,1.0,0.6,0.7,0.8,0.6,0.4,0.7,0.6,0.7,0.6,0.6,0.8,1.0,0.4,0.5,0.3,0.9
418,0.9,0.2,0.2,0.2,0.8,0.5,0.7,0.9,0.7,0.8,0.5,0.6,0.7,0.1,1.0,0.3,0.7,0.5,0.0,0.4
421,0.8,0.0,0.8,0.4,0.8,0.8,0.7,0.1,0.3,0.0,0.9,0.0,0.5,0.0,1.0,0.8,0.1,0.7,0.5,0.4


In [10]:
# Split the dataset to define training and testing examples
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

### Test different versions of Naive Bayes

In [11]:
nb = GaussianNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 45420 points : 17854


In [12]:
nb = MultinomialNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 45420 points : 15403


In [13]:
nb = ComplementNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 45420 points : 16986


In [14]:
nb = BernoulliNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 45420 points : 15138


In [15]:
nb = CategoricalNB()
y_pred = nb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

Number of mislabeled points out of a total 45420 points : 15491
