#### Data

In [1]:
import pandas as pd
import numpy as np 

import os
import random

from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import cross_val_score

In [2]:
# Seed every RNG the notebook relies on. `random.seed` only affects Python's
# `random` module; scikit-learn estimators created without `random_state`
# draw from NumPy's global generator, so that one must be seeded as well.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# read dataset

# Number of leading CSV rows used for modelling. The same bound is used for
# truncation and for feature generation, so every retained row gets features
# (previously rows [:178] were computed but rows [:179] were kept).
N_ROWS = 179


def count_kmer(sequence, kmer):
    """Return how many times ``kmer`` occurs in ``sequence``, counting overlaps.

    Parameters
    ----------
    sequence : str
        The string to scan.
    kmer : str
        The substring to look for.

    Returns
    -------
    int
        Number of start positions at which ``sequence`` contains ``kmer``.
        Overlapping hits are counted separately (``"AAA"`` holds ``"AA"`` twice),
        which is why ``str.count`` is not used.
    """
    return sum(sequence.startswith(kmer, start) for start in range(len(sequence)))


pna_raw = pd.read_csv('./pna_dataset_5.csv')

# Keep only the modelling rows, as a fresh frame so the raw data stays intact
# and re-running this cell always yields the same `df`.
df = pna_raw.iloc[:N_ROWS].copy()

assert df["sequence"].map(lambda seq: isinstance(seq, str)).all(), \
    "every retained row needs a string in the 'sequence' column"

# generate complete input

# Feature columns are taken to be everything between the first column and the
# last two -- assumes the CSV layout is [sequence, <k-mer columns>, <2 others>];
# TODO confirm against the dataset.
for column in df.columns[1:-2]:
    # The column name is matched *reversed* (column 'AT' counts 'TA' in the
    # stored sequence). NOTE(review): presumably the sequences are written in
    # the opposite orientation to the column names -- confirm this is intended.
    motif = column[::-1]
    df[column] = [count_kmer(seq, motif) for seq in df["sequence"]]


#### Data PreProcessing

In [4]:
# Translate pandas data to numpy array

# Model inputs: single-nucleotide and dinucleotide count columns of `df`.
FEATURE_COLUMNS = ['A', 'T', 'C', 'G', 'AA', 'AT', 'AG', 'AC', 'TT', 'TA', 'TG',
                   'TC', 'GG', 'GA', 'GT', 'GC', 'CC', 'CA', 'CT', 'CG']

# Convert in one vectorised step instead of re-selecting the 20-column frame
# and appending to Python lists once per row.
X = df[FEATURE_COLUMNS].to_numpy()
# copy=True keeps `y` independent of `df`, as the list-built array was.
y = df['score'].to_numpy(copy=True)

# One feature row and one label per sample.
assert X.shape == (len(df), len(FEATURE_COLUMNS))
assert y.shape == (len(df),)

#### Model Selection

In [5]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier

# random_state pins the tree's internal feature shuffling; without it the
# reported accuracy changes from run to run (seeding Python's `random`
# module does not affect scikit-learn).
dt_clf = DecisionTreeClassifier(max_depth=5, min_samples_leaf=3, random_state=42)
scores = cross_val_score(dt_clf, X, y, cv=5, scoring='accuracy')

print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.75 accuracy with a standard deviation of 0.14


In [7]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

# random_state fixes the bootstrap samples and per-split feature draws so the
# cross-validated accuracy is reproducible (seeding Python's `random` module
# does not affect scikit-learn).
ram_clf = RandomForestClassifier(n_estimators=300, max_depth=9, min_samples_leaf=9,
                                 random_state=42)
scores = cross_val_score(ram_clf, X, y, cv=5, scoring='accuracy')

print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.77 accuracy with a standard deviation of 0.05


In [8]:
# Ridge Classification

from sklearn.linear_model import RidgeClassifier

# Ridge classifier with default settings, scored by cross-validated accuracy (cv=5).
clf = RidgeClassifier()
scores = cross_val_score(estimator=clf, X=X, y=y, scoring='accuracy', cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (np.mean(scores), np.std(scores)))

0.76 accuracy with a standard deviation of 0.09


In [9]:
# SGD Classifier

from sklearn.linear_model import SGDClassifier

# SGD shuffles the training data each epoch; random_state makes that shuffle,
# and therefore the reported accuracy, reproducible (seeding Python's
# `random` module does not affect scikit-learn).
sgd_clf = SGDClassifier(random_state=42)
scores = cross_val_score(sgd_clf, X, y, cv=5, scoring='accuracy')
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.78 accuracy with a standard deviation of 0.11


In [10]:
# SVM

from sklearn import svm

# Support-vector classifier with C=0.4 and a one-vs-rest decision function,
# scored by cross-validated accuracy (cv=5).
svm_clf = svm.SVC(
    decision_function_shape='ovr',
    C=0.4,
)
scores = cross_val_score(estimator=svm_clf, X=X, y=y, scoring='accuracy', cv=5)

print("%0.2f accuracy with a standard deviation of %0.2f" % (np.mean(scores), np.std(scores)))

0.76 accuracy with a standard deviation of 0.04


In [11]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

# 11 nearest neighbours, distance-weighted votes, Minkowski exponent p=4;
# scored by cross-validated accuracy (cv=5).
knn_clf = KNeighborsClassifier(
    n_neighbors=11,
    p=4,
    weights='distance',
)
scores = cross_val_score(estimator=knn_clf, X=X, y=y, scoring='accuracy', cv=5)

print("%0.2f accuracy with a standard deviation of %0.2f" % (np.mean(scores), np.std(scores)))


0.79 accuracy with a standard deviation of 0.07


In [12]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, scored by cross-validated accuracy (cv=5).
gnb = GaussianNB()

scores = cross_val_score(estimator=gnb, X=X, y=y, scoring='accuracy', cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (np.mean(scores), np.std(scores)))

0.79 accuracy with a standard deviation of 0.07


In [13]:
# MLP
from sklearn.neural_network import MLPClassifier

# Two hidden layers (300 and 8 units) trained with L-BFGS; the seed is fixed
# via random_state=1. Scored by cross-validated accuracy (cv=5).
MLP_clf = MLPClassifier(
    hidden_layer_sizes=(300, 8),
    solver='lbfgs',
    alpha=1e-5,
    random_state=1,
)
scores = cross_val_score(estimator=MLP_clf, X=X, y=y, scoring='accuracy', cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (np.mean(scores), np.std(scores)))

0.77 accuracy with a standard deviation of 0.05


In [16]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression as LR

# L2-penalised logistic regression fitted with liblinear. `multi_class` is left
# at its default: passing it explicitly is deprecated in recent scikit-learn
# releases and raises a FutureWarning, and "auto" is what the default already
# selects.
LoRe_clf = LR(penalty="l2", solver="liblinear", C=0.45, max_iter=1000)
scores = cross_val_score(LoRe_clf, X, y, cv=5, scoring='accuracy')
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.84 accuracy with a standard deviation of 0.06




In [17]:
# QDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Quadratic discriminant analysis with default settings, scored by
# cross-validated accuracy (cv=5).
model = QuadraticDiscriminantAnalysis()
scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (np.mean(scores), np.std(scores)))

0.69 accuracy with a standard deviation of 0.10




In [18]:
# LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Linear discriminant analysis scored by cross-validated accuracy (cv=5).
# n_components is kept as configured; per the scikit-learn docs it only
# affects `transform`, not the predictions scored here.
lda = LinearDiscriminantAnalysis(n_components=1)
scores = cross_val_score(estimator=lda, X=X, y=y, scoring='accuracy', cv=5)
print("%0.2f accuracy with a standard deviation of %0.2f" % (np.mean(scores), np.std(scores)))

0.77 accuracy with a standard deviation of 0.07
