In [None]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import RFE

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that the
# "/content/drive/My Drive/..." .npy paths used by the loading cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def datasetLoad(posFile, negFile):
  """Load two ``.npy`` files and stack their arrays along axis 0.

  Args:
    posFile: path of the first ``.npy`` file; its rows come first in the result.
    negFile: path of the second ``.npy`` file; its rows follow.

  Returns:
    A single array holding the contents of ``posFile`` followed by the
    contents of ``negFile``.
  """
  first_part = np.load(posFile)
  second_part = np.load(negFile)
  return np.concatenate([first_part, second_part], axis=0)


In [None]:
# One matrix per descriptor. Every call stacks the "positive 5289" file on top
# of the matching "negative 5289" file, so positive rows always come first.
kmer_nac_x = datasetLoad(
    "/content/drive/My Drive/npy File All/positive 5289/Kmer 1.npy",
    "/content/drive/My Drive/npy File All/negative 5289/Kmer 1.npy",
)
kmer_dnc_x = datasetLoad(
    "/content/drive/My Drive/npy File All/positive 5289/Kmer 2.npy",
    "/content/drive/My Drive/npy File All/negative 5289/Kmer 2.npy",
)
kmer_tnc_x = datasetLoad(
    "/content/drive/My Drive/npy File All/positive 5289/Kmer 3.npy",
    "/content/drive/My Drive/npy File All/negative 5289/Kmer 3.npy",
)
enac_x = datasetLoad(
    "/content/drive/My Drive/npy File All/positive 5289/ENAC 5.npy",
    "/content/drive/My Drive/npy File All/negative 5289/ENAC 5.npy",
)
pseknc_x = datasetLoad(
    "/content/drive/My Drive/npy File All/positive 5289/PseKNC.npy",
    "/content/drive/My Drive/npy File All/negative 5289/PseKNC.npy",
)
pcpsednc_x = datasetLoad(
    "/content/drive/My Drive/npy File All/positive 5289/PCPseDNC.npy",
    "/content/drive/My Drive/npy File All/negative 5289/PCPseDNC.npy",
)

# Labels follow the same row order: 5289 ones first, then 5289 zeros.
first_half_labels = np.tile(1, 5289)
second_half_labels = np.tile(0, 5289)
y_train = np.concatenate([first_half_labels, second_half_labels])

In [None]:
# Print the shape of each descriptor matrix, then of the label vector,
# one per line and in loading order.
for loaded in (kmer_nac_x, kmer_dnc_x, kmer_tnc_x, enac_x, pseknc_x, pcpsednc_x, y_train):
  print(loaded.shape)

(10578, 4)
(10578, 16)
(10578, 64)
(10578, 148)
(10578, 66)
(10578, 18)
(10578,)


In [None]:
# Join the six descriptor matrices column-wise in a single call. The column
# order (nac, dnc, tnc, enac, pseknc, pcpsednc) is what the hard-coded index
# ranges in the feature-selection cells depend on.
descriptor_blocks = (kmer_nac_x, kmer_dnc_x, kmer_tnc_x, enac_x, pseknc_x, pcpsednc_x)
x_train = np.concatenate(descriptor_blocks, axis=1)

print(x_train.shape)

(10578, 316)


In [None]:
# Per-descriptor counters for the k=64 and k=32 selections, all starting at 0.
# Each dict is built separately so the two never share state.
descriptor_names = ("nac", "dnc", "tnc", "enac", "pseknc", "pcpsednc")
feature_group_64 = dict.fromkeys(descriptor_names, 0)
feature_group_32 = dict.fromkeys(descriptor_names, 0)

In [None]:
#64 feature
# Score every column of x_train with the chi-squared statistic and keep the best 64.
X_64_best= SelectKBest(chi2, k=64).fit(x_train, y_train)
mask = X_64_best.get_support()  # boolean mask, True for the selected columns

# Half-open [start, stop) column ranges of each descriptor inside the
# concatenated x_train (4 + 16 + 64 + 148 + 66 + 18 = 316 columns).
group_slices = {
    "nac": (0, 4),
    "dnc": (4, 20),
    "tnc": (20, 84),
    "enac": (84, 232),
    "pseknc": (232, 298),
    "pcpsednc": (298, 316),
}

# Assign the counts instead of incrementing them: the dict is initialised in a
# different cell, so incrementing would double-count whenever this cell is re-run.
for group, (start, stop) in group_slices.items():
  feature_group_64[group] = int(mask[start:stop].sum())

print(feature_group_64)
# Descriptors ordered from most to fewest selected features.
sorted_feature_group_64 = sorted(feature_group_64.items(), key=lambda x: x[1], reverse=True)
print(sorted_feature_group_64)


{'nac': 1, 'dnc': 7, 'tnc': 21, 'enac': 12, 'pseknc': 16, 'pcpsednc': 7}
[('tnc', 21), ('pseknc', 16), ('enac', 12), ('dnc', 7), ('pcpsednc', 7), ('nac', 1)]


In [None]:
#32 feature
# Score every column of x_train with the chi-squared statistic and keep the best 32.
X_32_best= SelectKBest(chi2, k=32).fit(x_train, y_train)
mask = X_32_best.get_support()  # boolean mask, True for the selected columns

# Half-open [start, stop) column ranges of each descriptor inside the
# concatenated x_train (4 + 16 + 64 + 148 + 66 + 18 = 316 columns).
group_slices = {
    "nac": (0, 4),
    "dnc": (4, 20),
    "tnc": (20, 84),
    "enac": (84, 232),
    "pseknc": (232, 298),
    "pcpsednc": (298, 316),
}

# Assign the counts instead of incrementing them: the dict is initialised in a
# different cell, so incrementing would double-count whenever this cell is re-run.
for group, (start, stop) in group_slices.items():
  feature_group_32[group] = int(mask[start:stop].sum())

print(feature_group_32)
# Descriptors ordered from most to fewest selected features.
sorted_feature_group_32 = sorted(feature_group_32.items(), key=lambda x: x[1], reverse=True)
print(sorted_feature_group_32)


{'nac': 0, 'dnc': 4, 'tnc': 8, 'enac': 9, 'pseknc': 7, 'pcpsednc': 4}
[('enac', 9), ('tnc', 8), ('pseknc', 7), ('dnc', 4), ('pcpsednc', 4), ('nac', 0)]


In [None]:
#part 2
# Rebuild the working matrix from only the PseKNC and PCPseDNC columns, in
# that order; y_train is reused unchanged.
part2_blocks = [pseknc_x, pcpsednc_x]
x_train = np.concatenate(part2_blocks, axis=1)
print(x_train.shape, y_train.shape, sep="\n")

(10578, 84)
(10578,)


In [None]:
# Zeroed pseknc / pcpsednc counters, one independent dict per RFE target size
# (28, 16 and 12 features).
feature_group_28, feature_group_16, feature_group_12 = (
    {"pseknc": 0, "pcpsednc": 0} for _ in range(3)
)

In [None]:
#28 feature
# Recursive feature elimination with a random-forest estimator, removing one
# feature per iteration (step=1) until 28 remain.
estimator = RandomForestClassifier(random_state = 42)
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 raises a
# TypeError when it is given positionally.
selector = RFE(estimator, n_features_to_select=28, step=1)
selector = selector.fit(x_train, y_train)
rfe_mask = selector.get_support()  # boolean mask, True for the kept columns

# Half-open [start, stop) column ranges inside x_train = [pseknc (66) | pcpsednc (18)].
group_slices = {"pseknc": (0, 66), "pcpsednc": (66, 84)}

# Assign the counts instead of incrementing them: the dict is initialised in a
# different cell, so incrementing would double-count whenever this cell is re-run.
for group, (start, stop) in group_slices.items():
  feature_group_28[group] = int(rfe_mask[start:stop].sum())

print(feature_group_28)
# Descriptors ordered from most to fewest selected features.
sorted_feature_group_28 = sorted(feature_group_28.items(), key=lambda x: x[1], reverse=True)
print(sorted_feature_group_28)

{'pseknc': 19, 'pcpsednc': 9}
[('pseknc', 19), ('pcpsednc', 9)]


In [None]:
#16 feature
# Recursive feature elimination with a random-forest estimator, removing one
# feature per iteration (step=1) until 16 remain.
estimator = RandomForestClassifier(random_state = 42)
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 raises a
# TypeError when it is given positionally.
selector = RFE(estimator, n_features_to_select=16, step=1)
selector = selector.fit(x_train, y_train)
rfe_mask = selector.get_support()  # boolean mask, True for the kept columns

# Half-open [start, stop) column ranges inside x_train = [pseknc (66) | pcpsednc (18)].
group_slices = {"pseknc": (0, 66), "pcpsednc": (66, 84)}

# Assign the counts instead of incrementing them: the dict is initialised in a
# different cell, so incrementing would double-count whenever this cell is re-run.
for group, (start, stop) in group_slices.items():
  feature_group_16[group] = int(rfe_mask[start:stop].sum())

print(feature_group_16)
# Descriptors ordered from most to fewest selected features.
sorted_feature_group_16 = sorted(feature_group_16.items(), key=lambda x: x[1], reverse=True)
print(sorted_feature_group_16)

{'pseknc': 11, 'pcpsednc': 5}
[('pseknc', 11), ('pcpsednc', 5)]


In [None]:
#12 feature
# Recursive feature elimination with a random-forest estimator, removing one
# feature per iteration (step=1) until 12 remain.
estimator = RandomForestClassifier(random_state = 42)
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 raises a
# TypeError when it is given positionally.
selector = RFE(estimator, n_features_to_select=12, step=1)
selector = selector.fit(x_train, y_train)
rfe_mask = selector.get_support()  # boolean mask, True for the kept columns

# Half-open [start, stop) column ranges inside x_train = [pseknc (66) | pcpsednc (18)].
group_slices = {"pseknc": (0, 66), "pcpsednc": (66, 84)}

# Assign the counts instead of incrementing them: the dict is initialised in a
# different cell, so incrementing would double-count whenever this cell is re-run.
for group, (start, stop) in group_slices.items():
  feature_group_12[group] = int(rfe_mask[start:stop].sum())

print(feature_group_12)
# Descriptors ordered from most to fewest selected features.
sorted_feature_group_12 = sorted(feature_group_12.items(), key=lambda x: x[1], reverse=True)
print(sorted_feature_group_12)

{'pseknc': 8, 'pcpsednc': 4}
[('pseknc', 8), ('pcpsednc', 4)]


In [None]:
#SVM cross validation

# Train on the PCPseDNC descriptor only.
x_train = pcpsednc_x
# NOTE(review): x_test is loaded from the very same two files that pcpsednc_x
# was built from, so the "test" accuracy below is measured on rows the model
# has (mostly) been trained on. Point these paths at a held-out set if one exists.
x_test = datasetLoad("/content/drive/My Drive/npy File All/positive 5289/PCPseDNC.npy", "/content/drive/My Drive/npy File All/negative 5289/PCPseDNC.npy")
# Labels must mirror datasetLoad's row order: 5289 positives (1) followed by
# 5289 negatives (0). Labelling both halves as 1 pins accuracy near 0.5.
y_test = np.concatenate((np.tile(1, 5289), np.tile(0, 5289)))

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)


(10578, 18)
(10578,)
(10578, 18)
(10578,)


In [None]:
# Fix the global NumPy seed and build the shuffled 5-fold splitter that the
# cross-validation loop iterates over; both use the same seed value.
seed = 40
np.random.seed(seed)
kf = KFold(n_splits=5, shuffle=True, random_state=seed)

In [None]:
# Sweep the SVC parameter C over 1, 11, ..., 91. For each value, iterate over
# the folds produced by kf, record train / validation / test accuracy per fold,
# and keep the mean test accuracy in test_avg_list.
test_avg_list = []

for c in range(1,100,10):
  print(f'for c = {c}:')

  clf = SVC(C=c,  random_state = seed)
  train_accuracy_list, val_accuracy_list, test_accuracy_list = [], [], []
  test_avg = 0

  for train_index, val_index in kf.split(x_train):
    X_train, Y_train = x_train[train_index], y_train[train_index]
    X_val, Y_val = x_train[val_index], y_train[val_index]

    # The same estimator object is refitted on every fold.
    clf.fit(X_train, Y_train)

    # Accuracy on the held-out fold.
    y_val_pred = clf.predict(X_val)
    val_accuracy = accuracy_score(Y_val, y_val_pred)
    val_accuracy_list.append(val_accuracy)

    # Accuracy on the rows the fold model was fitted on.
    y_train_pred = clf.predict(X_train)
    train_accuracy = accuracy_score(Y_train, y_train_pred)
    train_accuracy_list.append(train_accuracy)

    # Accuracy on x_test / y_test, accumulated for the per-C average.
    y_test_pred = clf.predict(x_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_accuracy_list.append(test_accuracy)
    test_avg = test_avg + test_accuracy

  test_avg = test_avg/(len(test_accuracy_list))
  test_avg_list.append(test_avg)

  # One report line per fold, then the average for this C.
  for fold_train, fold_val, fold_test in zip(train_accuracy_list, val_accuracy_list, test_accuracy_list):
    print("train_accuracy : ", fold_train, "\tVal accuracy : ", fold_val, "\tTest accuracy : ", fold_test)
  print(f'Avg test accuracy for that c is = {test_avg}')
  print()

for c = 1:
train_accuracy :  0.9671472465138266 	Val accuracy :  0.9603024574669187 	Test accuracy :  0.5005672149744753
train_accuracy :  0.966438194280312 	Val accuracy :  0.9593572778827977 	Test accuracy :  0.5030251465305351
train_accuracy :  0.9690380524698653 	Val accuracy :  0.9551039697542533 	Test accuracy :  0.5017961807525052
train_accuracy :  0.967741935483871 	Val accuracy :  0.957919621749409 	Test accuracy :  0.5013235016071091
train_accuracy :  0.9702233250620348 	Val accuracy :  0.9460992907801419 	Test accuracy :  0.5028360748723766
Avg test accuracy for that c is = 0.5019096237474003

for c = 11:
train_accuracy :  0.9884188135192626 	Val accuracy :  0.9603024574669187 	Test accuracy :  0.5009453582907922
train_accuracy :  0.9867643583077287 	Val accuracy :  0.9669187145557656 	Test accuracy :  0.501701644923426
train_accuracy :  0.9884188135192626 	Val accuracy :  0.9579395085066162 	Test accuracy :  0.5014180374361883
train_accuracy :  0.9878293749261491 	Val accur