### importing

In [1]:
import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, matthews_corrcoef
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

### Methods

In [2]:
def printMetrics(y_test,y_pred):
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print('Accuracy score:', accuracy_score(y_test, y_pred), '\t|\tSpecificity:', tn/(tn+fp),'\nSensitivity:', tp/(tp+fn), '\t|\tMCC:', matthews_corrcoef(y_test, y_pred),'\n')

### pfeatures for dynamic fasta file

In [3]:
def pFeatures(sequence):
    sequence = sequence.strip('\n')
    use_list = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']
    result = []
    for i in use_list:
        x = list(sequence).count(i)/len(sequence)*100
        result.append(x)
    return result

### Loading and reading files

In [4]:
f = open('data/4/pos.txt', 'r')
lines = f.readlines()
pos = np.array(pFeatures(lines[0]))
if (len(lines) > 1):
    for i in range(1, len(lines)):
        pos = np.vstack((pos, np.array(pFeatures(lines[i]))))


In [5]:
f = open('data/4/neg.txt', 'r')
lines = f.readlines()
neg = np.array(pFeatures(lines[0]))
if (len(lines) > 1):
    for i in range(1, len(lines)):
        neg = np.vstack((neg, np.array(pFeatures(lines[i]))))


### pos and neg samples

In [6]:
pos,neg= np.insert(pos,20,1,axis=1), np.insert(neg,20,0,axis=1)

In [7]:
data = np.concatenate((pos,neg), axis=0)

In [8]:
X, y = data[:,0:19], data[:,20]

### splitting

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=29)

## Models

### SVM

In [10]:
print ('\nSVM model Results')
svc_model = SVC(gamma='auto', kernel='linear')
svc_model.fit(X_train, y_train)
printMetrics(y_test, svc_model.predict(X_test))


SVM model Results
Accuracy score: 0.9414141414141414 	|	Specificity: 0.9933774834437086 
Sensitivity: 0.38095238095238093 	|	MCC: 0.5429351340016677 



### Artificial Neural Network(ANN)

In [11]:
print ('\nANN model Results')
mlp_model = MLPClassifier(max_iter=400)
mlp_model.fit(X_train, y_train)
printMetrics(y_test,mlp_model.predict(X_test))


ANN model Results
Accuracy score: 0.9595959595959596 	|	Specificity: 0.9845474613686535 
Sensitivity: 0.6904761904761905 	|	MCC: 0.7243277821112698 



### Random Forest

In [12]:
print ('\nRandom Forest model Results')
rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
rfc.fit(X_train, y_train)
printMetrics(y_test,rfc.predict(X_test))


Random Forest model Results
Accuracy score: 0.9656565656565657 	|	Specificity: 1.0 
Sensitivity: 0.5952380952380952 	|	MCC: 0.7574352821772499 

