In [15]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import RFE

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
def datasetLoad(posFile, negFile):
  x_train = np.concatenate((np.load(posFile), np.load(negFile)), axis = 0)

  return x_train


In [18]:
#location
kmer_nac_x = datasetLoad("/content/drive/My Drive/npy File All/positive 5289/Kmer 1.npy","/content/drive/My Drive/npy File All/negative 5289/Kmer 1.npy")
kmer_dnc_x = datasetLoad("/content/drive/My Drive/npy File All/positive 5289/Kmer 2.npy","/content/drive/My Drive/npy File All/negative 5289/Kmer 2.npy")
kmer_tnc_x = datasetLoad("/content/drive/My Drive/npy File All/positive 5289/Kmer 3.npy","/content/drive/My Drive/npy File All/negative 5289/Kmer 3.npy")
enac_x = datasetLoad("/content/drive/My Drive/npy File All/positive 5289/ENAC 5.npy","/content/drive/My Drive/npy File All/negative 5289/ENAC 5.npy")
pseknc_x = datasetLoad("/content/drive/My Drive/npy File All/positive 5289/PseKNC.npy","/content/drive/My Drive/npy File All/negative 5289/PseKNC.npy") 
pcpsednc_x = datasetLoad("/content/drive/My Drive/npy File All/positive 5289/PCPseDNC.npy","/content/drive/My Drive/npy File All/negative 5289/PCPseDNC.npy") 

y_train = np.concatenate((np.tile(1, 5289), np.tile(0, 5289)))

In [19]:
print(kmer_nac_x.shape)
print(kmer_dnc_x.shape)
print(kmer_tnc_x.shape)
print(enac_x.shape)
print(pseknc_x.shape)
print(pcpsednc_x.shape)
print(y_train.shape)

(10578, 4)
(10578, 16)
(10578, 64)
(10578, 148)
(10578, 66)
(10578, 18)
(10578,)


In [20]:
x_train = np.concatenate((kmer_nac_x, kmer_dnc_x), axis = 1)
x_train = np.concatenate((x_train, kmer_tnc_x), axis = 1)
x_train = np.concatenate((x_train, enac_x), axis = 1)
x_train = np.concatenate((x_train, pseknc_x), axis = 1)
x_train = np.concatenate((x_train, pcpsednc_x), axis = 1)

print(x_train.shape)

(10578, 316)


In [21]:
feature_group_64 = {
    "nac": 0,
    "dnc": 0,
    "tnc": 0,
    "enac": 0,
    "pseknc": 0,
    "pcpsednc": 0
}
feature_group_32 = {
    "nac": 0,
    "dnc": 0,
    "tnc": 0,
    "enac": 0,
    "pseknc": 0,
    "pcpsednc": 0
}

In [22]:
#64 feature
X_64_best= SelectKBest(chi2, k=64).fit(x_train, y_train)
mask = X_64_best.get_support()

count = 0
for x in mask:
  if x:
    if count >= 0 and count <= 3:
      feature_group_64["nac"] = feature_group_64["nac"] + 1
    elif count >= 4 and count <= 19:
      feature_group_64["dnc"] = feature_group_64["dnc"] + 1
    elif count >= 20 and count <= 83:
      feature_group_64["tnc"] = feature_group_64["tnc"] + 1
    elif count >= 84 and count <= 231:
      feature_group_64["enac"] = feature_group_64["enac"] + 1
    elif count >= 232 and count <= 297:
      feature_group_64["pseknc"] = feature_group_64["pseknc"] + 1
    elif count >= 298 and count <= 315:
      feature_group_64["pcpsednc"] = feature_group_64["pcpsednc"] + 1
  count = count + 1


sorted_feature_group_64 = sorted(feature_group_64.items(), key=lambda x: x[1], reverse=True)

for i in sorted_feature_group_64:
  print(i)


('tnc', 21)
('pseknc', 16)
('enac', 12)
('dnc', 7)
('pcpsednc', 7)
('nac', 1)


In [23]:
#32 feature 
X_32_best= SelectKBest(chi2, k=32).fit(x_train, y_train)
mask = X_32_best.get_support()

count = 0
for x in mask:
  if x:
    if count >= 0 and count <= 3:
      feature_group_32["nac"] = feature_group_32["nac"] + 1
    elif count >= 4 and count <= 19:
      feature_group_32["dnc"] = feature_group_32["dnc"] + 1
    elif count >= 20 and count <= 83:
      feature_group_32["tnc"] = feature_group_32["tnc"] + 1
    elif count >= 84 and count <= 231:
      feature_group_32["enac"] = feature_group_32["enac"] + 1
    elif count >= 232 and count <= 297:
      feature_group_32["pseknc"] = feature_group_32["pseknc"] + 1
    elif count >= 298 and count <= 315:
      feature_group_32["pcpsednc"] = feature_group_32["pcpsednc"] + 1
  count = count + 1

#print(feature_group_32)
sorted_feature_group_32 = sorted(feature_group_32.items(), key=lambda x: x[1], reverse=True)
#print(sorted_feature_group_32)
for i in sorted_feature_group_32:
  print(i)


('enac', 9)
('tnc', 8)
('pseknc', 7)
('dnc', 4)
('pcpsednc', 4)
('nac', 0)


In [24]:
#part 2
x_train = np.concatenate((kmer_tnc_x, pseknc_x), axis = 1)
print(x_train.shape)
print(y_train.shape)

(10578, 130)
(10578,)


In [25]:
feature_group_44 = {
    "tnc": 0,
    "pseknc": 0
}
feature_group_16 = {
    "tnc": 0,
    "pseknc": 0
}
feature_group_12 = {
    "tnc": 0,
    "pseknc": 0
}

In [26]:
#28 feature 
estimator = RandomForestClassifier(random_state = 42)
selector = RFE(estimator, 44, step=1)
selector = selector.fit(x_train, y_train)
rfe_mask = selector.get_support() #list of booleans for selected


arr1 =[]
arr2 =[]
count = 0
for x in rfe_mask:
  if x:
    if count >= 0 and count <= 63:
      feature_group_44["tnc"] = feature_group_44["tnc"] + 1
      #print(x)
      arr1.append(count)
    elif count >= 64 and count <= 129:
      feature_group_44["pseknc"] = feature_group_44["pseknc"] + 1
      arr2.append(count)
  count = count + 1

print(feature_group_44)
sorted_feature_group_44 = sorted(feature_group_44.items(), key=lambda x: x[1], reverse=True)
print(sorted_feature_group_44)
#print(arr1,'\n',arr2)

{'tnc': 23, 'pseknc': 21}
[('tnc', 23), ('pseknc', 21)]


In [None]:
#16 feature 
estimator = RandomForestClassifier(random_state = 42)
selector = RFE(estimator, 16, step=1)
selector = selector.fit(x_train, y_train)
rfe_mask = selector.get_support() #list of booleans for selected

count = 0
for x in rfe_mask:
  if x:
    if count >= 0 and count <= 65:
      feature_group_16["pseknc"] = feature_group_16["pseknc"] + 1
    elif count >= 66 and count <= 83:
      feature_group_16["pcpsednc"] = feature_group_16["pcpsednc"] + 1
  count = count + 1

print(feature_group_16)
sorted_feature_group_16 = sorted(feature_group_16.items(), key=lambda x: x[1], reverse=True)
print(sorted_feature_group_16)

{'pseknc': 11, 'pcpsednc': 5}
[('pseknc', 11), ('pcpsednc', 5)]


In [None]:
#12 feature 
estimator = RandomForestClassifier(random_state = 42)
selector = RFE(estimator, 12, step=1)
selector = selector.fit(x_train, y_train)
rfe_mask = selector.get_support() #list of booleans for selected

count = 0
for x in rfe_mask:
  if x:
    if count >= 0 and count <= 65:
      feature_group_12["pseknc"] = feature_group_12["pseknc"] + 1
    elif count >= 66 and count <= 83:
      feature_group_12["pcpsednc"] = feature_group_12["pcpsednc"] + 1
  count = count + 1

print(feature_group_12)
sorted_feature_group_12 = sorted(feature_group_12.items(), key=lambda x: x[1], reverse=True)
print(sorted_feature_group_12)

{'pseknc': 8, 'pcpsednc': 4}
[('pseknc', 8), ('pcpsednc', 4)]


In [27]:
#SVM cross validation

x_train =pseknc_x
x_test = datasetLoad("/content/drive/My Drive/npy File All/positive 5289/PseKNC.npy", "/content/drive/My Drive/npy File All/negative 5289/PseKNC.npy")
print(x_test)
y_test = np.concatenate((np.tile(1, 5289), np.tile(1, 5289)))

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)


[[0.         0.         0.         ... 0.70128421 0.18145461 0.26031037]
 [0.         0.         0.         ... 0.53658254 0.09725188 0.17105845]
 [0.         0.         0.4865679  ... 0.         0.22981686 0.27482245]
 ...
 [0.         0.52174178 0.11727713 ... 0.         0.29982455 0.25865951]
 [0.         0.16980248 0.22900922 ... 0.         0.39920247 0.21083936]
 [0.3755807  0.18565346 0.25038713 ... 0.11897301 0.21776931 0.18726563]]
(10578, 66)
(10578,)
(10578, 66)
(10578,)


In [28]:
seed = 40
np.random.seed(seed)
#elm
kf = KFold(n_splits=5, random_state=seed, shuffle=True)

In [29]:
test_avg_list = []

for c in range(1,1000,50):
  test_avg = 0

  print(f'for c = {c}:')

  clf = SVC(C=c,  random_state = seed)
  val_accuracy_list = []
  train_accuracy_list = []
  test_accuracy_list = []

  for train_index, val_index in kf.split(x_train):

    X_train, X_val = x_train[train_index], x_train[val_index]
    Y_train, Y_val = y_train[train_index], y_train[val_index]
    
    clf.fit(X_train, Y_train)

    y_val_pred = clf.predict(X_val)
    val_accuracy = accuracy_score(Y_val, y_val_pred)
    val_accuracy_list.append(val_accuracy)

    y_train_pred = clf.predict(X_train)
    train_accuracy = accuracy_score(Y_train, y_train_pred)
    train_accuracy_list.append(train_accuracy)

    y_test_pred = clf.predict(x_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_avg = test_avg + test_accuracy
    test_accuracy_list.append(test_accuracy)

  test_avg = test_avg/(len(test_accuracy_list))
  test_avg_list.append(test_avg)

  for i in range(0, len(train_accuracy_list)):
    print("train_accuracy : ", train_accuracy_list[i], "\tVal accuracy : ", val_accuracy_list[i], "\tTest accuracy : ", test_accuracy_list[i])
  print(f'Avg test accuracy for that c is = {test_avg}')
  print()

for c = 1:
train_accuracy :  0.9994091231387379 	Val accuracy :  0.9933837429111532 	Test accuracy :  0.4999054641709208
train_accuracy :  0.9991727723942331 	Val accuracy :  0.9924385633270322 	Test accuracy :  0.4999054641709208
train_accuracy :  0.9989364216497282 	Val accuracy :  0.9938563327032136 	Test accuracy :  0.5
train_accuracy :  0.99940919295758 	Val accuracy :  0.9914893617021276 	Test accuracy :  0.4993382491964454
train_accuracy :  0.9989365473236441 	Val accuracy :  0.9914893617021276 	Test accuracy :  0.5002836074872377
Avg test accuracy for that c is = 0.49988655700510504

for c = 51:
train_accuracy :  1.0 	Val accuracy :  0.9933837429111532 	Test accuracy :  0.5
train_accuracy :  1.0 	Val accuracy :  0.995274102079395 	Test accuracy :  0.49962185668368314
train_accuracy :  1.0 	Val accuracy :  0.9938563327032136 	Test accuracy :  0.5000945358290793
train_accuracy :  1.0 	Val accuracy :  0.9929078014184397 	Test accuracy :  0.4993382491964454
train_accuracy :  1.0 	V