#### Data

In [13]:
import pandas as pd
import numpy as np      
from matplotlib import pylab as plt     
import json
import random
import os

from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from scipy import stats
from sklearn.model_selection import cross_val_score

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
# set train-test-split ratio
TRAIN_SPLIT = .80   
VAL_SPLIT = 0.20    

SEED = 108

In [4]:
# Read dataset'pna_dataset.csv'

df = pd.read_csv('./pna_dataset.csv')

for columns in df.columns[1:-2]:
    
    for idx in df.index[:178]:
        
        df.loc[idx,columns] = 0
        
        for i,char in enumerate(list(df.iloc[idx]["sequence"])):
            
            if str(df.iloc[idx]["sequence"])[i:i+len(columns)] == columns[::-1]:
                
                df.loc[idx,columns] += 1

for idx in df.index[179:]:
    df.drop(axis=0,index=idx,inplace=True)

# Write into file
df.to_csv('pna_dataset.csv', index=False)

df

Unnamed: 0,No.,A,T,C,G,AA,AT,AG,AC,TT,...,GG,GA,GT,GC,CC,CA,CT,CG,sequence,score
0,Secquence 2,3.0,8.0,1.0,1.0,0.0,3.0,0.0,0.0,3.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,TTATTTATGTACT,1
1,Secquence 3,2.0,10.0,1.0,0.0,0.0,2.0,0.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,TCTTTTTATTTAT,1
2,Secquence 4,3.0,8.0,2.0,0.0,1.0,1.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,AATCTTTTATTTC,1
3,Secquence 6,2.0,7.0,3.0,1.0,1.0,0.0,0.0,1.0,4.0,...,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,GTTCCAATTCTTT,1
4,Secquence 8,1.0,6.0,4.0,2.0,0.0,0.0,0.0,1.0,2.0,...,1.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,CTTCTCACTTGGT,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,Secquence 195,7.0,0.0,5.0,1.0,4.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,1.0,0.0,3.0,0.0,1.0,CACAAAAACGCAC,-1
101,Secquence 202,2.0,5.0,0.0,6.0,0.0,0.0,1.0,0.0,1.0,...,2.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,AGTTGTGGTGGAT,-1
102,Secquence 203,2.0,6.0,0.0,5.0,0.0,0.0,1.0,0.0,2.0,...,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,AGTTGTTGTGGAT,-1
103,Secquence 204,3.0,4.0,0.0,6.0,0.0,1.0,2.0,0.0,1.0,...,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,TAGTTGGAGTGGA,-1


#### Data PreProcessing

In [5]:
X_sequences = []
X = []
Y = []

for idx in df.index:
    X_sequences += [idx]
    X += [np.array(df[['A', 'T', 'C', 'G', 'AA', 'AT', 'AG', 'AC', 'TT', 'TA', 'TG',
       'TC', 'GG', 'GA', 'GT', 'GC', 'CC', 'CA', 'CT', 'CG']].loc[idx].tolist()).reshape(-1)]
    Y += [df['score'].loc[idx]]

# Convert into numpy arrays
X_sequences = np.array(X_sequences)
X = np.array(X)
Y = np.array(Y)

In [6]:
from sklearn.model_selection import train_test_split

X_1 = X
Y_1 = Y

X_train,X_test,Y_train,Y_test = train_test_split(X_1,Y_1,test_size=0.2)

#### Model

In [8]:
# model1: Decision Tree
from sklearn.tree import DecisionTreeClassifier


dt_clf = DecisionTreeClassifier(max_depth=3, min_samples_leaf=1)
dt_clf = dt_clf.fit(X_train, Y_train)
dt_score = dt_clf.score(X_test, Y_test)

scores = cross_val_score(dt_clf, X_1, Y_1, cv=5, scoring='accuracy')
print(i,"%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

12 0.66 accuracy with a standard deviation of 0.06


In [9]:
# model2: random forest

from sklearn.ensemble import RandomForestClassifier
best_score = 0.5
best_std = 0
for i in range(1,10):
    for j in range(1,10):
        ram_clf = RandomForestClassifier(n_estimators=300, max_depth = i, min_samples_leaf=j)
        scores = cross_val_score(ram_clf, X_1, Y_1, cv=5, scoring='accuracy')
        
        if best_score < scores.mean():
            best_score = scores.mean()
            best_std = scores.std()

print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.78 accuracy with a standard deviation of 0.05


In [10]:
# model3: Ridge Classification

from sklearn.linear_model import RidgeClassifier

clf = RidgeClassifier().fit(X_train, Y_train)

scores = cross_val_score(clf, X_1, Y_1, cv=5, scoring='accuracy')
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.76 accuracy with a standard deviation of 0.09


In [11]:
# model4: SGD Classifier

from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier()
sgd_clf.fit(X_train, Y_train)

scores = cross_val_score(sgd_clf, X_1, Y_1, cv=5, scoring='accuracy')
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))


0.71 accuracy with a standard deviation of 0.07


In [16]:
# model5: SVM

from sklearn import svm
from sklearn.model_selection import cross_val_score

svm_clf = svm.SVC(C=10, decision_function_shape='ovr')
svm_clf.fit(X_train, Y_train)
scores = cross_val_score(svm_clf, X, Y, cv=5, scoring='accuracy')
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.76 accuracy with a standard deviation of 0.05


In [17]:
# model6: KNN 
from sklearn.neighbors import KNeighborsClassifier

transfer = StandardScaler()
X_train = transfer.fit_transform(X_train)
X_test = transfer.transform(X_test)

estimator = KNeighborsClassifier(n_neighbors=3)
estimator.fit(X_train,Y_train)

y_predict = estimator.predict(X_test)

score = estimator.score(X_test, Y_test)
print("accuracy ", score)   

accuracy  0.7142857142857143


In [18]:
# Optimization of KNN Parameters

best_score=0.0
best_k=-1
best_weight ='distance'
best_p=0
for j in range(1,6):
    for i in range(1,11): 

            knn_clf = KNeighborsClassifier(n_neighbors=i, weights=best_weight, p=j)
            knn_clf.fit(X_train, Y_train)
            score = knn_clf.score(X_test, Y_test)
            
            if best_score < score:
                best_score=score
                best_k = i
                best_p = j              

print('when best_weight = {}：'.format (best_weight))
print('best_score ={}'.format (best_score))
print('best_k = {}'. format(best_k))
print('best_p = {}'. format(best_p))

when best_weight = distance：
best_score =0.7619047619047619
best_k = 1
best_p = 1


In [20]:
for i in range (1,30):
    for j in range(1,10):
        knn_clf = KNeighborsClassifier(n_neighbors=i, weights=best_weight, p=j)
        scores = cross_val_score(knn_clf, X_1, Y_1, cv=5, scoring='accuracy')
        print(i,j,"%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

1 1 0.71 accuracy with a standard deviation of 0.04
1 2 0.70 accuracy with a standard deviation of 0.04
1 3 0.72 accuracy with a standard deviation of 0.07
1 4 0.72 accuracy with a standard deviation of 0.07
1 5 0.72 accuracy with a standard deviation of 0.07
1 6 0.72 accuracy with a standard deviation of 0.07
1 7 0.72 accuracy with a standard deviation of 0.07
1 8 0.72 accuracy with a standard deviation of 0.07
1 9 0.72 accuracy with a standard deviation of 0.07
2 1 0.71 accuracy with a standard deviation of 0.08
2 2 0.71 accuracy with a standard deviation of 0.07
2 3 0.71 accuracy with a standard deviation of 0.04
2 4 0.71 accuracy with a standard deviation of 0.04
2 5 0.71 accuracy with a standard deviation of 0.04
2 6 0.71 accuracy with a standard deviation of 0.04
2 7 0.71 accuracy with a standard deviation of 0.04
2 8 0.71 accuracy with a standard deviation of 0.04
2 9 0.71 accuracy with a standard deviation of 0.04
3 1 0.75 accuracy with a standard deviation of 0.05
3 2 0.79 acc

18 5 0.73 accuracy with a standard deviation of 0.07
18 6 0.73 accuracy with a standard deviation of 0.07
18 7 0.73 accuracy with a standard deviation of 0.07
18 8 0.74 accuracy with a standard deviation of 0.08
18 9 0.74 accuracy with a standard deviation of 0.08
19 1 0.75 accuracy with a standard deviation of 0.06
19 2 0.74 accuracy with a standard deviation of 0.09
19 3 0.74 accuracy with a standard deviation of 0.09
19 4 0.74 accuracy with a standard deviation of 0.07
19 5 0.73 accuracy with a standard deviation of 0.08
19 6 0.72 accuracy with a standard deviation of 0.08
19 7 0.72 accuracy with a standard deviation of 0.08
19 8 0.72 accuracy with a standard deviation of 0.08
19 9 0.72 accuracy with a standard deviation of 0.08
20 1 0.78 accuracy with a standard deviation of 0.05
20 2 0.76 accuracy with a standard deviation of 0.07
20 3 0.75 accuracy with a standard deviation of 0.06
20 4 0.74 accuracy with a standard deviation of 0.07
20 5 0.73 accuracy with a standard deviation o

In [21]:
# model7: Naive Bayes

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
y_pred = gnb.fit(X_train, Y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d"% (X_test.shape[0], (Y_test != y_pred).sum()))

scores = cross_val_score(gnb, X_1, Y_1, cv=5, scoring='accuracy')
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

Number of mislabeled points out of a total 21 points : 5
0.79 accuracy with a standard deviation of 0.07


In [22]:
# model8: Multi-layer Perceptron
from sklearn.neural_network import MLPClassifier

MLP_clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(300, 5), random_state=1)
MLP_clf.fit(X_train, Y_train)
scores = cross_val_score(MLP_clf, X, Y, cv=5, scoring='accuracy')
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.81 accuracy with a standard deviation of 0.05


In [23]:
# model9: Logistic Regression
from sklearn.linear_model import LogisticRegression as LR

LoRe_clf = LR(penalty="l2", solver="liblinear", C=0.45, max_iter=1000, multi_class="auto")

scores = cross_val_score(LoRe_clf, X_1, Y_1, cv=5, scoring='accuracy')
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))


0.84 accuracy with a standard deviation of 0.06
