<a href="https://colab.research.google.com/github/Shujaat123/ProteinSeq_Sparse_Representation_Classification_ToolBox/blob/main/ACP_SRC_Toolbox_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**A comprehensive toolbox for protein sequence classification**

In [1]:
# Environment setup for Google Colab: install the one extra dependency, mount
# Google Drive and switch into the toolbox folder before importing its helpers.
!pip install wget
import os
from google.colab import drive
# Root folder of the toolbox on the mounted Drive; the trailing slash matters
# because a later cell appends the experiment name to this string directly.
drive_path = '/content/drive/MyDrive/ACP_SRC_TOOLBOX/'
drive.mount('/content/drive')
# Must run after the mount, otherwise the folder does not exist yet.
os.chdir(drive_path)
os.getcwd() 

# Wildcard import of the project helpers. Later cells use names such as np,
# StratifiedKFold, Normalizer, KMeansSMOTE, PCA, KernelPCA, savemat, Test_SRC
# and Calculate_Stats without importing them, so they presumably all come from
# here -- TODO confirm and replace with explicit imports.
# NOTE(review): the package is presumably resolved relative to the working
# directory set above, so this import has to stay below os.chdir().
from utilities.utils import *

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
File already exist.


In [2]:
# ---------------------------------------------------------------------------
# Experiment configuration.
#
# The experiment name is parsed by the training cell and must contain three
# tokens: the dataset (ACP740 / ACP240 / ACP344), the dimension reduction
# (LinearPCA / KernelPCA) and the sparse solver (BASIS_PURSUIT /
# ORTHOGONAL_MATCHPURSUIT / MATCHING_PURSUIT). Enable exactly one line.
# ---------------------------------------------------------------------------
# experiment = 'ACP740_LinearPCA_BASIS_PURSUIT'
# experiment = 'ACP740_LinearPCA_ORTHOGONAL_MATCHPURSUIT'
# experiment = 'ACP740_LinearPCA_MATCHING_PURSUIT'

# experiment = 'ACP240_KernelPCA_BASIS_PURSUIT'
# experiment = 'ACP240_KernelPCA_ORTHOGONAL_MATCHPURSUIT'
# experiment = 'ACP240_KernelPCA_MATCHING_PURSUIT'

# experiment = 'ACP240_LinearPCA_BASIS_PURSUIT'
# experiment = 'ACP240_LinearPCA_ORTHOGONAL_MATCHPURSUIT'
# experiment = 'ACP240_LinearPCA_MATCHING_PURSUIT'

experiment = 'ACP344_KernelPCA_MATCHING_PURSUIT'


gaps = [8] # K in the composition of K-spaced Amino-Acid-Pairs (CKSAAP); only gaps[0] is used
num_folds = 10 # number of cross-validation folds --> for ACP740 and ACP240 set it to 5
normalization = True  # boolean switch: rescale each sample with Normalizer() (its default norm is 'l2')
oversampling = True   # boolean switch: balance the training fold with KMeansSMOTE
train_stats = False   # boolean switch: also compute statistics on the training fold

# Numbers of principal components to sweep; keep the list matching the dataset.
# pc_list=[10,20,30,40,50,60,70,80,90,100,110,150,175,200,225,250,300,350,400,450,500,550,600] #-->for ACP740 
# pc_list=[10,20,30,40,50,60,70,80,90,100,110,150,175,200] #-->for ACP240 
pc_list=[10,20,30,40,50,60,70,80,90,100,110,150,175,200,225,250,300] #-->for ACP344

# Every experiment writes its results into its own sub-folder of the toolbox.
# os.path.join keeps the path valid even if drive_path has no trailing slash.
exp_path = os.path.join(drive_path, experiment)
os.makedirs(exp_path, exist_ok=True)
os.chdir(exp_path)
os.getcwd()

'/content/drive/MyDrive/ACP_SRC_TOOLBOX/ACP344_KernelPCA_MATCHING_PURSUIT'

In [None]:
# ---------------------------------------------------------------------------
# Sparse-representation classification (SRC) experiment.
#
# 1. Decode dataset / dimension reduction / solver from `experiment`.
# 2. For every number of principal components in `pc_list`, run a stratified
#    K-fold cross validation: normalise -> oversample -> reduce -> classify.
# 3. Save the per-fold statistics of each setting into one .mat file in the
#    current working directory.
# ---------------------------------------------------------------------------

# --- 1. Decode the experiment name -----------------------------------------
# Each chain ends in an explicit error so that a typo in `experiment` fails
# here instead of surfacing later as a NameError.
if 'ACP740' in experiment:
    dataset = 'acp740'
    [DataX, LabelY] = Convert_Seq2CKSAAP(prepare_feature(dataset=dataset), gap=gaps[0])
elif 'ACP240' in experiment:
    dataset = 'acp240'
    [DataX, LabelY] = Convert_Seq2CKSAAP(prepare_feature(dataset=dataset), gap=gaps[0])
elif 'ACP344' in experiment:
    dataset = 'acp344'
    [DataX, LabelY] = Convert_Seq2CKSAAP(prepare_feature_344(), gap=gaps[0])
else:
    raise ValueError("experiment must name a dataset (ACP740, ACP240 or ACP344): " + repr(experiment))

if 'LinearPCA' in experiment:
    dimension_reduction = 'PCA'
elif 'KernelPCA' in experiment:
    dimension_reduction = 'KPCA'
else:
    raise ValueError("experiment must name a dimension reduction (LinearPCA or KernelPCA): " + repr(experiment))

if 'BASIS_PURSUIT' in experiment:
    solver_method = 'BP'
elif 'ORTHOGONAL_MATCHPURSUIT' in experiment:
    solver_method = 'OP'
elif 'MATCHING_PURSUIT' in experiment:
    solver_method = 'MP'
else:
    raise ValueError("experiment must name a solver (BASIS_PURSUIT, ORTHOGONAL_MATCHPURSUIT or MATCHING_PURSUIT): " + repr(experiment))

# Keyword arguments shared by every Test_SRC call of this run.
src_options = dict(solver=solver_method, verbose=0, x0=None, ATinvAAT=None,
                   nnz=None, positive=True, tol=1E-4, niter=100, biter=32)

# --- 2. Cross-validated sweep over the number of components -----------------
kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=2)
# LabelY is indexed as a 2-D array with one column per class; the folds are
# stratified on the index of the largest entry in each row.
fold_labels = np.argmax(LabelY, axis=1)

for num_pc in pc_list:
    print("For {} number of components(features)".format(num_pc))
    classification_stats = []
    train_classification_stats = []
    time_stats = []

    for cross_fold_ing, (train_index, test_index) in enumerate(kf.split(DataX, fold_labels)):
        print('Fold # ', cross_fold_ing)

        # Loading dataset
        print('Loading dataset')
        X_train, X_test = DataX[train_index], DataX[test_index]
        y_train, y_test = LabelY[train_index], LabelY[test_index]
        # NOTE(review): the recorded ACP344 run reports 138 ACP / 206 non-ACP
        # samples but about 185 "pos" rows per training fold, so column 0 looks
        # like the non-ACP indicator. If so, these labels and the
        # sensitivity/specificity below are swapped -- confirm against
        # Convert_Seq2CKSAAP before changing anything.
        print('num pos train:', sum(y_train[:,0]==1), 'num neg train:', sum(y_train[:,0]==0))
        # From here on the target is the first label column only.
        y_train = y_train[:,0]
        y_test = y_test[:,0]

        ## pre-processing
        if normalization:
            normalizer = Normalizer().fit(X_train)
            X_train = normalizer.transform(X_train)
            X_test = normalizer.transform(X_test)

        ## Oversampling (training fold only, so the test fold stays untouched)
        if oversampling:
            oversampler = KMeansSMOTE(random_state=42)
            X_train, y_train = oversampler.fit_resample(X_train, y_train)
            print('After Resampling \n','num pos train:', sum(y_train==1), 'num neg train:', sum(y_train==0))

        ## Dimension reduction
        if dimension_reduction == 'PCA':
            print('Using PCA with ' + str(num_pc) + ' number of PCs')
            transformer = PCA(n_components=num_pc)
        elif dimension_reduction == 'KPCA':
            print('Using Kernel-PCA with ' + str(num_pc) + ' number of PCs')
            # NOTE(review): no kernel is passed, so the library default is
            # used; if this is scikit-learn's KernelPCA that default is
            # 'linear' -- confirm this is the intended kernel.
            transformer = KernelPCA(n_components=num_pc)

        # Fit on the training fold only, then project both folds. A plain
        # fit() is enough: the output of fit_transform() was thrown away and
        # the projection recomputed by transform() anyway.
        transformer.fit(X_train)
        X_train = transformer.transform(X_train)
        X_test = transformer.transform(X_test)
        # Test_SRC is fed the transposed matrices (one column per sample).
        X_train = np.transpose(X_train)
        X_test = np.transpose(X_test)

        print('Using '+solver_method+' solver method')
        y_test_pred, y_test_score, elp_time = Test_SRC(X_train, y_train, X_test, y_test, **src_options)

        if train_stats:
            # Previously `y_train_pred` was never assigned, so enabling
            # train_stats raised a NameError. Obtain it by classifying the
            # training samples against the training dictionary.
            # NOTE(review): every training sample is itself part of the
            # dictionary, so these scores are presumably optimistic.
            y_train_pred, _, _ = Test_SRC(X_train, y_train, X_train, y_train, **src_options)
            tr_acc, tr_sen, tr_spe, tr_f1, tr_mcc, tr_bacc, tr_yi = Calculate_Stats(y_train, y_train_pred)
            train_classification_stats.append([tr_acc, tr_sen, tr_spe, tr_f1, tr_mcc, tr_bacc, tr_yi])

        t_acc, t_sen, t_spe, t_f1, t_mcc, t_bacc, t_yi = Calculate_Stats(y_test, y_test_pred)

        ### AUC ROC CURVE
        r_auc = roc_auc_score(y_test, y_test_score)
        # The curve points are not stored; they are kept available for an
        # optional plot such as:
        # plt.plot(r_fpr, r_tpr, linestyle='--', label='ACP_SRC (AUROC = %0.3f)' % r_auc)
        r_fpr, r_tpr, _ = roc_curve(y_test, y_test_score)

        classification_stats.append([t_acc, t_sen, t_spe, t_f1, t_mcc, t_bacc, t_yi, r_auc])
        # elp_time is iterable (one entry per item); flatten it across folds.
        time_stats.extend(elp_time)

    # --- 3. Persist the statistics of this setting ---------------------------
    Class_Statistics = np.asarray(classification_stats)
    Time_Statistics = np.asarray(time_stats)

    results = {'Time_Statistics': Time_Statistics, 'Class_Statistics': Class_Statistics}
    if train_stats:
        # Extra key, written only when training statistics were requested.
        results['Train_Class_Statistics'] = np.asarray(train_classification_stats)

    filename = 'ACP_KSRC_STATS_CKSAAP_GAP' + str(gaps[0]) + '_Solver_' + solver_method + '_PC' + str(num_pc) + '.mat'
    savemat(filename, results)
    print('SAVING... '+ experiment + '/' + filename)


# of ACP samples 138
# of non-ACP samples 206
For 10 number of components(features)
Fold #  0
Loading dataset
num pos train: 185 num neg train: 124
After Resampling 
 num pos train: 185 num neg train: 186
Using Kernel-PCA with 10 number of PCs
Using MP solver method
Fold #  1
Loading dataset
num pos train: 185 num neg train: 124
After Resampling 
 num pos train: 185 num neg train: 186
Using Kernel-PCA with 10 number of PCs
Using MP solver method
Fold #  2
Loading dataset
num pos train: 185 num neg train: 124
After Resampling 
 num pos train: 185 num neg train: 187
Using Kernel-PCA with 10 number of PCs
Using MP solver method
Fold #  3
Loading dataset
num pos train: 185 num neg train: 124
After Resampling 
 num pos train: 185 num neg train: 187
Using Kernel-PCA with 10 number of PCs
Using MP solver method
Fold #  4
Loading dataset
num pos train: 186 num neg train: 124
After Resampling 
 num pos train: 186 num neg train: 188
Using Kernel-PCA with 10 number of PCs
Using MP solver method
F