In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
from feature_extraction.CTFeatureExtraction import CTFeatureExtraction
from feature_extraction.QSOFeatureExtraction import QSOFeatureExtraction
from feature_extraction.GTPCFeatureExtraction import GTPCFeatureExtraction
from feature_extraction.GDPCFeatureExtraction import GDPCFeatureExtraction
from feature_extraction.CTDFeatureExtraction import CTDFeatureExtraction
from feature_extraction.CKSAAPFeatureExtraction import CKSAAPFeatureExtraction
from feature_extraction.AAIFeatureExtraction import AAIFeatureExtraction
from feature_extraction.DDEFeatureExtraction import DDEFeatureExtraction
from feature_extraction.DPCFeatureExtraction import DPCFeatureExtraction
from feature_extraction.KAACFeatureExtraction import KAACFeatureExtraction

# Load the dataset
data = pd.read_excel('../data/Final_2Sm_modified_with_sequences.xlsx')

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit the encoder to the folding_type column and transform it to numeric labels
data['folding_type'] = label_encoder.fit_transform(data['folding_type'])

# Extract labels for model training
labels = data['folding_type'].values

# Initialize the feature extraction objects
kaac_extractor = KAACFeatureExtraction()
dpc_extractor = DPCFeatureExtraction()
dde_extractor = DDEFeatureExtraction()
aai_extractor = AAIFeatureExtraction()
cksaap_extractor = CKSAAPFeatureExtraction()
ctd_extractor = CTDFeatureExtraction()
gdpc_extractor = GDPCFeatureExtraction()
gtpc_extractor = GTPCFeatureExtraction()
qso_extractor = QSOFeatureExtraction()
ct_extractor = CTFeatureExtraction()

# Extract features using feature extracting methods
kaac_features = np.array([kaac_extractor.calculate_kaac_features(seq) for seq in data['sequence']])
dpc_features = np.array([dpc_extractor.calculate_dpc_features(seq) for seq in data['sequence']])
dde_features = np.array([dde_extractor.calculate_dde_features(seq) for seq in data['sequence']])
aai_features = np.array([aai_extractor.calculate_aai_features(seq) for seq in data['sequence']])
cksaap_features = np.array([cksaap_extractor.calculate_cksaap_features(seq) for seq in data['sequence']])
ctd_features = np.array([ctd_extractor.calculate_ctd_features(seq) for seq in data['sequence']])
gdpc_features = np.array([gdpc_extractor.calculate_gdpc_features(seq) for seq in data['sequence']])
gtpc_features = np.array([gtpc_extractor.calculate_gtpc_features(seq) for seq in data['sequence']])
qso_features = np.array([qso_extractor.calculate_qso_features(seq) for seq in data['sequence']])
ct_features = np.array([ct_extractor.calculate_ct_features(seq) for seq in data['sequence']])

# Combine the extracted features
combined_features = np.concatenate((kaac_features, dpc_features, dde_features), axis=1)

In [3]:
combined_features.shape

(141, 821)

In [6]:
from sklearn.model_selection import LeaveOneOut

# SVM with Leave-One-Out Cross-Validation (LOOCV)
loo = LeaveOneOut()
y_true, y_pred = [], []
for train_index, test_index in loo.split(combined_features):
    X_train, X_test = combined_features[train_index], combined_features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # Scale the features for the current split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf = SVC(kernel='linear')
    clf.fit(X_train_scaled, y_train)
    y_pred.append(clf.predict(X_test_scaled)[0])
    y_true.append(y_test[0])

In [7]:
from feature_extraction.ClassificationMatrix import ClassificationMatrix

# Calculate and display the confusion matrix
cm = ClassificationMatrix(y_true, y_pred, 'KAAC')
cm.evaluate()

Confusion Matrix: $KAAC
[[70 19]
 [25 27]]

Accuracy (ACC): 0.69
Matthews Correlation Coefficient (MCC): 0.31

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.79      0.76        89
           1       0.59      0.52      0.55        52

    accuracy                           0.69       141
   macro avg       0.66      0.65      0.66       141
weighted avg       0.68      0.69      0.68       141



# Kernel SVM with hyperparameter tuning & validate with LOOCV

In [4]:
from sklearn.model_selection import GridSearchCV, LeaveOneOut

# Scale the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(combined_features)

# Define the parameter grid for grid search
param_grid = {
    'C': [2**i for i in range(-15, 16, 2)],
    'gamma': [2**i for i in range(-15, 4, 2)],
    'kernel': ['rbf']
}

# Create an SVM classifier
clf = SVC()

# Perform grid search with leave-one-out cross-validation
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10, n_jobs=-1, verbose=2)
grid_search.fit(scaled_features, labels)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters: ", best_params)
print("Best score: ", best_score)

# Get the best classifier
best_clf = grid_search.best_estimator_

# Perform leave-one-out cross-validation with the best classifier
loo = LeaveOneOut()
y_true, y_pred = [], []
for train_index, test_index in loo.split(scaled_features):
    X_train, X_test = scaled_features[train_index], scaled_features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    best_clf.fit(X_train, y_train)
    y_pred.append(best_clf.predict(X_test)[0])
    y_true.append(y_test[0])

Fitting 141 folds for each of 160 candidates, totalling 22560 fits
Best parameters:  {'C': 8, 'gamma': 0.00048828125, 'kernel': 'rbf'}
Best score:  0.7375886524822695


In [6]:
from feature_extraction.ClassificationMatrix import ClassificationMatrix

# Calculate and display the confusion matrix
cm = ClassificationMatrix(y_true, y_pred, 'KAAC')
cm.evaluate()

Confusion Matrix: $KAAC
[[75 14]
 [23 29]]

Accuracy (ACC): 0.74
Matthews Correlation Coefficient (MCC): 0.42

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.84      0.80        89
           1       0.67      0.56      0.61        52

    accuracy                           0.74       141
   macro avg       0.72      0.70      0.71       141
weighted avg       0.73      0.74      0.73       141



# RandomForest RF with hyperparameter tuning & validate with LOOCV

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, LeaveOneOut

# Scale the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(combined_features)

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': np.arange(50, 1001, 10),  # mtree: 50 to 1000 with step size of 10
    'max_features': np.arange(1, 16, 1),      # ntry: 1 to 15 with step size of 1
    'min_samples_split': np.arange(2, 11, 1)  # msplit: 2 to 10 with step size of 1
}

# Create a Random Forest classifier
rf_clf = RandomForestClassifier(random_state=42)

# Perform grid search with leave-one-out cross-validation
grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(scaled_features, labels)

# Get the best classifier
best_rf_clf = grid_search.best_estimator_

# Perform leave-one-out cross-validation with the best classifier
loo = LeaveOneOut()
y_true, y_pred = [], []
for train_index, test_index in loo.split(scaled_features):
    X_train, X_test = scaled_features[train_index], scaled_features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    best_rf_clf.fit(X_train, y_train)
    y_pred.append(best_rf_clf.predict(X_test)[0])
    y_true.append(y_test[0])

Fitting 3 folds for each of 12960 candidates, totalling 38880 fits


In [5]:
from feature_extraction.ClassificationMatrix import ClassificationMatrix

# Calculate and display the confusion matrix
cm = ClassificationMatrix(y_true, y_pred, 'KAAC')
cm.evaluate()

Confusion Matrix: $KAAC
[[83  6]
 [21 31]]

Accuracy (ACC): 0.81
Matthews Correlation Coefficient (MCC): 0.58

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.93      0.86        89
           1       0.84      0.60      0.70        52

    accuracy                           0.81       141
   macro avg       0.82      0.76      0.78       141
weighted avg       0.81      0.81      0.80       141

