# Mounting Google Drive to Colab

This will enable us to read files directly from our Google Drive and also save generated files directly to our Google Drive.

An alternative way is to click the file icon in the left panel and then click the Drive icon to mount it in Colab.

In [None]:
## Mount Google Drive at /content/gdrive/ so the later cells can use paths under that folder.
## force_remount=True is passed so the mount is redone even if Drive is already mounted.
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

# Importing Pandas Library

In [None]:
import pandas as pd

# Reading the Extracted Features from Google Drive

In [None]:
## Load the three extracted-feature tables for the cleaved octapeptides.
## Each path is written as folder + file name; Python joins adjacent string
## literals, so the paths are unchanged.
amino_acid_binary_profile_cleaved = pd.read_csv(
    "/content/gdrive/MyDrive/Machine Learning/Projects/PMI_and_Azeez/Extracted Features/"
    "Amino acid Binary Profile (AABP)(cleaved).csv"
)
bond_composition_cleaved = pd.read_csv(
    "/content/gdrive/MyDrive/Machine Learning/Projects/PMI_and_Azeez/Extracted Features/"
    "Bond Composition (cleaved).csv"
)
physicochemical_properties_cleaved = pd.read_csv(
    "/content/gdrive/MyDrive/Machine Learning/Projects/PMI_and_Azeez/Extracted Features/"
    "Physico-Chemical Properties (cleaved).csv"
)

## Load the same three feature tables for the uncleaved octapeptides.
amino_acid_binary_profile_uncleaved = pd.read_csv(
    "/content/gdrive/MyDrive/Machine Learning/Projects/PMI_and_Azeez/Extracted Features/"
    "Amino acid Binary Profile (AABP)(uncleaved).csv"
)
bond_composition_uncleaved = pd.read_csv(
    "/content/gdrive/MyDrive/Machine Learning/Projects/PMI_and_Azeez/Extracted Features/"
    "Bond Composition (uncleaved).csv"
)
physicochemical_properties_uncleaved = pd.read_csv(
    "/content/gdrive/MyDrive/Machine Learning/Projects/PMI_and_Azeez/Extracted Features/"
    "Physico-Chemical Properties (uncleaved).csv"
)

# Combining the datasets

In [None]:
## Place the three cleaved feature tables side by side in one table.
## The "PCP_SM" column is removed from the physicochemical table first.
## NOTE(review): the uncleaved table is combined without dropping "PCP_SM" -
## confirm the uncleaved file really has no such column.
cleaved_dataset = pd.concat(
    [
        amino_acid_binary_profile_cleaved,
        bond_composition_cleaved,
        physicochemical_properties_cleaved.drop(columns="PCP_SM"),
    ],
    axis="columns",
)
cleaved_dataset

In [None]:
## Generating the labels for cleaved dataset
## One "cleaved" label per row of cleaved_dataset. The Series is sized and
## indexed from the dataset itself instead of a hard-coded row count, so the
## labels stay aligned with the rows if the input files change length.
cleaved_label = pd.Series("cleaved", index=cleaved_dataset.index, name="cleavage Status")

In [None]:
## Append the label column to the right of the cleaved feature table
cleaved_dataset_labelled = pd.concat(
    objs=[cleaved_dataset, cleaved_label],
    axis="columns",
)
cleaved_dataset_labelled

In [None]:
## Place the three uncleaved feature tables side by side in one table.
## No column is dropped here, unlike the cleaved table.
uncleaved_dataset = pd.concat(
    [
        amino_acid_binary_profile_uncleaved,
        bond_composition_uncleaved,
        physicochemical_properties_uncleaved,
    ],
    axis="columns",
)
uncleaved_dataset

In [None]:
## Generating the labels for non-cleaved dataset
## One "uncleaved" label per row of uncleaved_dataset. The Series is sized and
## indexed from the dataset itself instead of a hard-coded row count, so the
## labels stay aligned with the rows if the input files change length.
uncleaved_label = pd.Series("uncleaved", index=uncleaved_dataset.index, name="cleavage Status")

In [None]:
## Append the label column to the right of the uncleaved feature table
uncleaved_dataset_labelled = pd.concat(
    objs=[uncleaved_dataset, uncleaved_label],
    axis="columns",
)
uncleaved_dataset_labelled

# Combining the cleaved_dataset_labelled and uncleaved_dataset_labelled

In [None]:
## Stack the labelled cleaved rows on top of the labelled uncleaved rows.
## Each part keeps its own row index (no re-numbering is requested).
combined_extracted_octapeptide_features = pd.concat(
    objs=[cleaved_dataset_labelled, uncleaved_dataset_labelled],
    axis="index",
)
combined_extracted_octapeptide_features

# Feature Selection of our combined_extracted_octapeptide_features

In [None]:
## VarianceThreshold is used to remove low-variance features from the feature variables
from sklearn.feature_selection import VarianceThreshold

In [None]:
## Feature matrix X: every column except the class label and the "ID" column
X = combined_extracted_octapeptide_features.drop(columns=["cleavage Status", "ID"])
X

In [None]:
## Fit VarianceThreshold with its default settings and keep only the features it retains
selector = VarianceThreshold()
result = selector.fit_transform(X)

## Label the retained columns with their original feature names (get_support()
## is the boolean mask of kept features) so the result shows which features
## survived instead of anonymous 0..n-1 column numbers.
df = pd.DataFrame(result, columns=X.columns[selector.get_support()])
df

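**Observe that there was no change in the dimension of the dataset, indicating that no feature has zero variance (with its default threshold, VarianceThreshold only removes features that are constant across all samples; it does not detect duplicated features)**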

# Saving the combined_extracted_octapeptide_features to Drive

In [None]:
## Save the full combined table, then a second copy without the "ID" column.
## Each path is written as folder + file name; Python joins adjacent string
## literals, so the paths are unchanged.
combined_extracted_octapeptide_features.to_csv(
    "/content/gdrive/MyDrive/Machine Learning/Projects/PMI_and_Azeez/Manuscript and Data/"
    "combined_extracted_octapeptide_features.csv"
)
combined_extracted_octapeptide_features.drop(columns="ID").to_csv(
    "/content/gdrive/MyDrive/Machine Learning/Projects/PMI_and_Azeez/Manuscript and Data/"
    "new_combined_extracted_octapeptide_features.csv"
)

# Importing the necessary ML algorithms and metrics from scikit-learn

In [None]:
## Evaluation metrics and plotting helpers from scikit-learn
from sklearn.metrics import zero_one_loss
from sklearn.metrics import jaccard_score # Jaccard index TP / (TP + FP + FN); this is not the same quantity as accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
## NOTE(review): plot_roc_curve and plot_confusion_matrix were deprecated in
## scikit-learn 1.0 and removed in 1.2, so these two imports need an older release.
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import hamming_loss
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import recall_score

from sklearn.model_selection import train_test_split

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.svm import LinearSVC as LSVM
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.naive_bayes import CategoricalNB as NBC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.neural_network import MLPClassifier as MLPC
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression as LR

# Assigning features to X and cleavage status to y, then splitting the dataset into train and test subsets

In [None]:
## Target y: an independent copy of the label column
y = combined_extracted_octapeptide_features["cleavage Status"].copy()
## Features X: every column except the label and the "ID" column
X = combined_extracted_octapeptide_features.drop(columns=["cleavage Status", "ID"])

In [None]:
## Turn the text labels into integers: "cleaved" becomes 1 and "uncleaved" becomes 0
y = y.map(dict(cleaved=1, uncleaved=0))

In [None]:
## Hold out 25% of the rows for testing. The split is stratified on y and
## uses a fixed random_state so it can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.25,
)

# Model Training and Performance Evaluation

In [None]:
## Instantiating the classifier classes
lsvm = LSVM(random_state=0, tol=1e-5)
lda = LDA()
gbc = GBC(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
knn = KNN(n_neighbors=4)
## NOTE(review): CategoricalNB treats every feature as categorical - confirm
## that this suits the feature columns used here.
nbc = NBC()
dtc = DTC()
mlpc = MLPC()
p = Perceptron()
lr = LR(random_state=0)

## Each classifier paired with the name used for its row in the results table
clf_list = [
    (lsvm, "LinearSVC"),
    (lda, "LinearDiscriminantAnalysis"),
    (gbc, "GradientBoostingClassifier"),
    (knn, "KNeighborsClassifier"),
    (nbc, "CategoricalNB"),
    (dtc, "DecisionTreeClassifier"),
    (mlpc, "MLPClassifier"),
    (p, "Perceptron"),
    (lr, "LogisticRegression"),
]

## Importing the defaultdict of collections library
from collections import defaultdict as dd


def _score_label(metric):
    """Return the results-table column name for a metric function.

    Underscores become spaces and the word "score" is removed, e.g.
    ``balanced_accuracy_score`` -> ``"Balanced accuracy"``. The result is
    stripped so that column names carry no trailing whitespace.
    """
    return metric.__name__.replace("_", " ").replace("score", " ").strip().capitalize()


def _positive_class_scores(clf, features):
    """Return continuous scores for the positive class (label 1).

    Uses ``predict_proba`` when the classifier provides it and falls back to
    ``decision_function`` otherwise (LinearSVC and Perceptron only offer the
    latter).
    """
    if hasattr(clf, "predict_proba"):
        return clf.predict_proba(features)[:, 1]
    return clf.decision_function(features)


## Column name -> list of values, one entry per classifier
scores = dd(list)

for clf, name in clf_list:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    scores["Classifier"].append(name)

    for metric in [balanced_accuracy_score, matthews_corrcoef]:
        scores[_score_label(metric)].append(metric(y_test, y_pred))

    ## NOTE(review): with a two-class target, average="micro" pools both
    ## classes, which makes precision, recall and F1 all equal to accuracy -
    ## confirm this is the intended way to report them.
    for metric in [precision_score, recall_score, f1_score, roc_auc_score, jaccard_score]:
        if metric is roc_auc_score:
            ## ROC AUC needs continuous scores; hard 0/1 predictions would
            ## collapse the curve to a single point and disagree with the
            ## AUC shown on the ROC plot.
            value = roc_auc_score(y_test, _positive_class_scores(clf, X_test))
        else:
            value = metric(y_test, y_pred, average="micro")
        scores[_score_label(metric)].append(value)

## Build the results table once, after every classifier has been scored
score_df = pd.DataFrame(scores).set_index("Classifier")

## The files keep full precision; rounding is applied only to the displayed table
score_df.to_csv("/content/gdrive/MyDrive/Machine Learning/Projects/PMI_and_Azeez/Manuscript and Data/Models_Performance_Metrics.csv")
score_df.to_excel("/content/gdrive/MyDrive/Machine Learning/Projects/PMI_and_Azeez/Manuscript and Data/Models_Performance_Metrics.xlsx")
score_df.round(decimals=2)

# Importing the necessary data visualization libraries

In [None]:
## Plotting libraries used by the figures below
import matplotlib.pyplot as plt
import seaborn as sns
## NOTE(review): IPython.display.set_matplotlib_formats is deprecated in recent
## IPython releases - confirm it is still available in the target environment.
from IPython.display import set_matplotlib_formats
## Request SVG as the format for figures displayed in the notebook
set_matplotlib_formats('svg')

# Combined Confusion Matrix Plot

In [None]:
## The classifiers are plotted exactly as fitted in the training step.
## Re-fitting them here would re-randomise the models created without a
## random_state (DTC() and MLPC()), so the matrices could disagree with the
## saved performance metrics.
classifiers = [lsvm, lda, gbc, knn, nbc, dtc, mlpc, p, lr]

## One 3x3 grid: one confusion matrix per classifier
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(7.5, 7.5))

for cls, ax in zip(classifiers, axes.flatten()):
    plot_confusion_matrix(cls,
                          X_test,
                          y_test,
                          ax=ax,
                          cmap='Greys_r',
                          colorbar=False)
    ## Title each panel with the classifier's class name
    ax.set_title(type(cls).__name__)
    ax.grid(False)

plt.tight_layout()
plt.savefig('/content/gdrive/MyDrive/Machine Learning/Projects/PMI_and_Azeez/Manuscript and Data/New_Combined_Confusion_Matrix.jpg', dpi=1200)
plt.show()

# All in One ROC Curves

In [None]:
## Draw every classifier's ROC curve on one shared 5x5-inch axes.
## The axes is created explicitly and handed to each call: without an `ax`
## argument plot_roc_curve creates its own new figure, which previously left
## the figsize=(5, 5) figure empty and its size unused.
roc_fig, roc_ax = plt.subplots(figsize=(5, 5))

for model in [lsvm, lda, gbc, knn, nbc, dtc, mlpc, p, lr]:
    plot_roc_curve(model, X_test, y_test, ax=roc_ax)

roc_ax.grid(False)
roc_fig.savefig('/content/gdrive/MyDrive/Machine Learning/Projects/PMI_and_Azeez/Manuscript and Data/New_All_In_One_ROC_Curves.jpg', dpi=1200)
plt.show()

# Plot of Feature Importance for the Gradient Boosting Classifier

In [None]:
## Retrieving feature importance from the fitted gradient boosting classifier
importance = pd.Series(gbc.feature_importances_, name="Gini")

## Retrieving feature names
feature_names = pd.Series(X.columns, name="Feature")

## Combining the feature names and importance values into a DataFrame.
## The column names come from each Series' own name; the former `names=`
## argument only labels index levels created through `keys` and had no effect.
df = pd.concat([feature_names, importance], axis=1)
df

In [None]:
## Horizontal bar chart of the 20 features with the largest importance values
df_sorted = df.sort_values(by="Gini", ascending=False).head(20)
plt.figure(figsize=(7.5, 7))
sns.set_theme(style="whitegrid")
ax = sns.barplot(data=df_sorted, x="Gini", y="Feature", palette="Greys_r")

## Relabel the x axis and switch the grid off before saving
ax.set_xlabel("Feature Importance")
ax.grid(False)
plt.savefig('/content/gdrive/MyDrive/Machine Learning/Projects/PMI_and_Azeez/Manuscript and Data/New_Plot_of_feature_importance_for_GB_classifier.jpg', dpi=1200)
plt.show()