### This notebook classifies the samples based on their spectra, using the absorbance at each wavenumber as a feature.

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from ast import literal_eval


A function for examining model metrics

In [2]:
def calculate_metrics(y_test, y_pred):
    """Print a summary of classification performance.

    Three sections are written to stdout, in this order: the overall
    accuracy, the per-class classification report (precision, recall and
    F1-score), and the confusion matrix.

    Parameters
    ----------
    y_test
        Ground-truth labels; passed as the first argument to each metric.
    y_pred
        Predicted labels; passed as the second argument to each metric.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Headline number first.
    print(f"Overall Accuracy: {accuracy_score(y_test, y_pred)}\n")

    # Each remaining section is computed, then printed under its heading.
    sections = (
        ("\nClassification Report:", classification_report),
        ("\nConfusion Matrix:", confusion_matrix),
    )
    for heading, metric in sections:
        result = metric(y_test, y_pred)
        print(heading)
        print(result)


Read in the dataset.

In [3]:
# Load the raw-spectrum CSV into `df`.
# NOTE(review): the next cell rebinds `df` to a different CSV, so under
# "Restart & Run All" this frame is discarded without being used — confirm
# whether this load is still needed.
df = pd.read_csv("../../data/exosomes.raw_spectrum_1.csv")

In [4]:
# Load the dataset that the following cells work with; judging by the file
# name it holds the scaled, noise-removed spectra (TODO confirm).
# This rebinds `df`, replacing the frame loaded in the previous cell.
df = pd.read_csv("../../data/scaled_and_noise_removal.csv")

In [5]:
# Preview the first rows. The frame appears to be in long format: one row per
# (SpecID, WaveNumber) reading, with its Absorbance and the sample Status.
df.head()

Unnamed: 0.1,Unnamed: 0,SpecID,Seq,WaveNumber,SurID,Status,Absorbance
0,0,201210-1-00,0,200.0,201210-1,Normal,1.0
1,1,201210-1-00,1,200.68336,201210-1,Normal,0.998435
2,2,201210-1-00,2,201.36674,201210-1,Normal,0.99688
3,3,201210-1-00,3,202.05011,201210-1,Normal,0.995333
4,4,201210-1-00,4,202.73349,201210-1,Normal,0.993794


Pivot the DataFrame to create a column for each wavenumber.

In [6]:
# Reshape from long to wide: one row per SpecID, one column per WaveNumber,
# each cell holding the Absorbance. SpecID is restored as an ordinary column
# and the leftover 'WaveNumber' label on the column axis is cleared.
pivoted_df = (
    df.pivot(index='SpecID', columns='WaveNumber', values='Absorbance')
    .reset_index()
    .rename_axis(columns=None)
)

Add the statuses back.

In [7]:
# One (SpecID, Status) pair per spectrum, taken from the long-format frame.
statuses = df[['SpecID', 'Status']].drop_duplicates()

# validate='one_to_one' makes the merge raise pandas.errors.MergeError if any
# SpecID maps to more than one Status. Without it, such a spectrum would be
# silently duplicated (once per Status) and the SpecID index set below would
# no longer be unique.
pivoted_df = pd.merge(pivoted_df, statuses, on='SpecID', validate='one_to_one')

# Use SpecID as the index so that only the absorbance columns and the Status
# label remain as data columns.
pivoted_df = pivoted_df.set_index('SpecID')

In [8]:
# Preview the wide frame: SpecID index, one column per WaveNumber, and the
# Status label as the last column.
pivoted_df.head()

Unnamed: 0_level_0,200.0,200.68336,201.36674,202.05011,202.73349,203.41685,204.10023,204.7836,205.46696,206.15034,...,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,1.0,0.998435,0.99688,0.995333,0.993794,0.992259,0.990735,0.989228,0.987741,0.986277,...,0.40445,0.404232,0.404023,0.403821,0.403623,0.403425,0.403231,0.40304,0.402852,Normal
201210-1-01,1.0,0.997784,0.995571,0.993362,0.991156,0.988951,0.986752,0.984558,0.982372,0.980197,...,0.423986,0.42421,0.424442,0.42468,0.424923,0.425165,0.425411,0.425659,0.425911,Normal
201210-1-02,1.0,0.997002,0.994016,0.991044,0.988084,0.985128,0.982189,0.979273,0.976385,0.973533,...,0.568881,0.570079,0.571272,0.572461,0.573646,0.574831,0.576013,0.577192,0.578369,Normal
201210-1-03,0.741209,0.758076,0.774771,0.791293,0.807643,0.823983,0.840069,0.85579,0.871042,0.885727,...,0.195789,0.195683,0.195578,0.195474,0.19537,0.195266,0.195161,0.195057,0.194953,Normal
201210-1-04,1.0,0.998114,0.996231,0.99435,0.99247,0.990592,0.988716,0.986844,0.984978,0.983121,...,0.496956,0.496816,0.496688,0.496571,0.49646,0.496351,0.496247,0.496147,0.496054,Normal


##### PyCaret

In [9]:
import pandas as pd
from pycaret.classification import *

Set up the PyCaret environment

In [10]:
# Initialise the PyCaret classification experiment on the wide frame, with
# 'Status' as the target and a fixed session_id (presumably so that the
# train/test split and model results are repeatable — see PyCaret docs).
# NOTE(review): SpecIDs such as 201210-1-00 ... 201210-1-04 share the surface
# 201210-1, so a random split may put spectra from the same surface in both
# the train and the test set — confirm whether a grouped split is needed.
caret = setup(data=pivoted_df, target='Status', session_id=1234)

Unnamed: 0,Description,Value
0,Session id,1234
1,Target,Status
2,Target type,Multiclass
3,Target mapping,"Hyperglycemia: 0, Hypoglycemia: 1, Normal: 2"
4,Original data shape,"(3045, 2636)"
5,Transformed data shape,"(3045, 2636)"
6,Transformed train set shape,"(2131, 2636)"
7,Transformed test set shape,"(914, 2636)"
8,Numeric features,2635
9,Preprocess,True


Compare models

In [11]:
# Display a leaderboard of candidate classifiers (see the table in the output).
# The return value is not assigned, so nothing from this comparison is kept
# for later cells.
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9409,0.9909,0.9409,0.9416,0.9409,0.9112,0.9115,0.373
rf,Random Forest Classifier,0.9217,0.9861,0.9217,0.9225,0.9217,0.8822,0.8826,0.805
knn,K Neighbors Classifier,0.8761,0.9651,0.8761,0.8808,0.8763,0.814,0.8159,0.223
gbc,Gradient Boosting Classifier,0.8508,0.9582,0.8508,0.852,0.8507,0.7757,0.7764,56.541
dt,Decision Tree Classifier,0.8104,0.8576,0.8104,0.8109,0.81,0.7148,0.7154,0.807
lda,Linear Discriminant Analysis,0.6969,0.8341,0.6969,0.7007,0.6969,0.544,0.5455,4.747
ada,Ada Boost Classifier,0.6922,0.8297,0.6922,0.6947,0.6908,0.5373,0.5393,3.897
lr,Logistic Regression,0.6678,0.8321,0.6678,0.6695,0.6652,0.4987,0.5011,9.839
ridge,Ridge Classifier,0.6617,0.0,0.6617,0.6662,0.658,0.4886,0.4924,0.156
svm,SVM - Linear Kernel,0.5242,0.0,0.5242,0.6532,0.4742,0.2841,0.3428,0.691


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

Create the model

In [None]:
# Create the Random Forest classifier ('rf'); the output lists its metrics
# for each of the folds.
# NOTE(review): in the recorded comparison Extra Trees ('et') ranked above
# 'rf' — confirm that 'rf' is the intended choice.
model = create_model('rf')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8505,0.9549,0.8505,0.8511,0.8501,0.7756,0.7762
1,0.8404,0.9458,0.8404,0.8415,0.8402,0.7602,0.7609
2,0.7887,0.9239,0.7887,0.7913,0.7889,0.683,0.684
3,0.892,0.9643,0.892,0.8922,0.8917,0.8374,0.8377
4,0.8404,0.9509,0.8404,0.8397,0.8398,0.7597,0.7599
5,0.8357,0.9588,0.8357,0.8408,0.8343,0.7521,0.7559
6,0.7981,0.9396,0.7981,0.8035,0.7993,0.6971,0.6984
7,0.8263,0.9498,0.8263,0.8286,0.8271,0.7392,0.7395
8,0.7887,0.9286,0.7887,0.7891,0.7884,0.6826,0.683
9,0.784,0.9252,0.784,0.7913,0.784,0.6762,0.6792


In [None]:
# Search for better hyperparameters for `model`.
# In the recorded run the log states that the original model scored better
# and was returned, so `tuned_model` there is the untuned model while the
# displayed fold metrics belong to the tuned candidate.
tuned_model = tune_model(model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8224,0.9358,0.8224,0.8346,0.8246,0.7343,0.7379
1,0.7934,0.9137,0.7934,0.7947,0.7935,0.6895,0.69
2,0.7418,0.9089,0.7418,0.753,0.7421,0.6141,0.6187
3,0.7981,0.9281,0.7981,0.8042,0.7988,0.6973,0.6993
4,0.7934,0.9275,0.7934,0.7986,0.7949,0.6902,0.6911
5,0.7559,0.9339,0.7559,0.7645,0.7567,0.6337,0.6364
6,0.77,0.9212,0.77,0.7795,0.77,0.6549,0.6586
7,0.7887,0.9373,0.7887,0.808,0.7924,0.6838,0.6886
8,0.784,0.9076,0.784,0.789,0.7842,0.6765,0.6786
9,0.6995,0.8987,0.6995,0.7238,0.6993,0.5506,0.5591


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [None]:
# Show PyCaret's interactive evaluation widget (a 'Plot Type' selector) for
# the model returned by the tuning step.
evaluate_model(tuned_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…