### This notebook classifies the samples based on their full raw spectra (the absorbance at every wavenumber).

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from ast import literal_eval


A function for examining model metrics

In [2]:
def calculate_metrics(y_test, y_pred):
    """Print a summary of classification performance.

    Writes three sections to standard output, in this order: the overall
    accuracy, the per-class classification report (precision, recall and
    F1-score), and the confusion matrix.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned element-wise with ``y_test``.

    Returns:
        None. All results are printed rather than returned.
    """
    # Overall accuracy across all classes.
    overall_accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {overall_accuracy}\n")

    # Per-class breakdown. The report is computed before its header is
    # printed, so a failure here leaves no dangling section title.
    per_class_report = classification_report(y_test, y_pred)
    print("\nClassification Report:", per_class_report, sep="\n")

    # Confusion matrix, emitted the same way: header line, then the matrix.
    matrix = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:", matrix, sep="\n")


Read in the dataset.

In [3]:
# Load the raw spectra in long format: the preview below shows one row per
# (SpecID, WaveNumber) reading, with the sample's Status repeated on each row.
df = pd.read_csv("../../data/exosomes.raw_spectrum_1.csv")

In [4]:
# Preview the first rows to check the column layout of the raw table.
df.head()

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
0,201210-1-00,0,200.0,2709.3699,201210-1,Normal
1,201210-1-00,1,200.68336,2697.1318,201210-1,Normal
2,201210-1-00,2,201.36674,2696.0413,201210-1,Normal
3,201210-1-00,3,202.05011,2678.5925,201210-1,Normal
4,201210-1-00,4,202.73349,2670.8928,201210-1,Normal


Pivot the DataFrame to create a column for each wavenumber

In [5]:
# Reshape long -> wide: one row per spectrum (SpecID), one column per
# WaveNumber, each cell holding the Absorbance measured at that wavenumber.
# SpecID is restored as an ordinary column, and the leftover 'WaveNumber'
# label on the column axis is cleared so the header is a plain list of names.
pivoted_df = (
    df.pivot(index='SpecID', columns='WaveNumber', values='Absorbance')
      .reset_index()
      .rename_axis(columns=None)
)

Add the statuses back.

In [6]:
# Collapse the per-reading labels to one (SpecID, Status) row per spectrum.
# Status is expected to be constant within a SpecID.
statuses = df[['SpecID', 'Status']].drop_duplicates()

# Attach the label to each wide-format spectrum. validate='one_to_one' makes
# the merge raise a MergeError if any SpecID maps to more than one Status,
# instead of silently duplicating that spectrum's row in the training table.
pivoted_df = pd.merge(pivoted_df, statuses, on='SpecID', validate='one_to_one')

# Move SpecID into the index so that only the absorbance columns and Status
# remain as regular columns.
pivoted_df = pivoted_df.set_index('SpecID')

In [7]:
# Preview the wide table: one row per SpecID, one column per wavenumber,
# plus the Status label as the last column.
pivoted_df.head()

Unnamed: 0_level_0,200.0,200.68336,201.36674,202.05011,202.73349,203.41685,204.10023,204.7836,205.46696,206.15034,...,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,2709.3699,2697.1318,2696.0413,2678.5925,2670.8928,2652.5435,2646.3245,2690.324,2620.3228,2609.0132,...,1100.5006,1088.7416,1092.1083,1104.9304,1084.1281,1076.9363,1089.0814,1092.8083,1086.699,Normal
201210-1-01,2979.3169,2985.707,2970.1677,2947.095,2941.8743,2942.4648,2939.9595,2938.4509,2930.9204,2915.7979,...,1246.2748,1270.4456,1272.1703,1271.8768,1270.0718,1283.9667,1286.9803,1276.4037,1268.0922,Normal
201210-1-02,3702.5627,3592.4902,3640.8423,3593.415,3583.656,3583.479,3554.3279,3507.1514,3494.4998,3490.437,...,2028.6669,2046.851,2094.8308,2067.8396,2043.0687,2063.5925,2086.6956,2064.7766,2064.2126,Normal
201210-1-03,8129.5938,8222.3184,8370.2803,8534.415,8684.1543,8805.7393,8964.5283,9220.3066,9257.7461,9399.7734,...,1682.3824,1694.845,1710.276,1714.6768,1746.4635,1705.4204,1703.1569,1705.2943,1687.048,Normal
201210-1-04,3468.5203,3463.0237,3475.2666,3468.5999,3451.7124,3439.6379,3439.1538,3444.8345,3446.624,3438.2632,...,1725.4711,1722.2786,1757.0481,1745.6029,1728.0017,1750.2548,1747.0122,1756.1727,1747.9722,Normal


##### PyCaret

In [8]:
import pandas as pd
from pycaret.classification import *

Set up the PyCaret environment

In [9]:
# Initialise the PyCaret experiment on the wide table, with Status as the
# target column. session_id is fixed at 1234 — presumably to seed the run so
# results are repeatable; confirm against the PyCaret documentation.
caret = setup(data=pivoted_df, target='Status', session_id=1234)

Unnamed: 0,Description,Value
0,Session id,1234
1,Target,Status
2,Target type,Multiclass
3,Target mapping,"Hyperglycemia: 0, Hypoglycemia: 1, Normal: 2"
4,Original data shape,"(3045, 2636)"
5,Transformed data shape,"(3045, 2636)"
6,Transformed train set shape,"(2131, 2636)"
7,Transformed test set shape,"(914, 2636)"
8,Numeric features,2635
9,Preprocess,True


Compare models

In [10]:
# Run PyCaret's model comparison and display the resulting leaderboard.
# The return value is not kept; only the displayed table is used here.
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.848,0.9571,0.848,0.8495,0.848,0.7715,0.7722,0.366
rf,Random Forest Classifier,0.8245,0.9442,0.8245,0.8269,0.8244,0.7363,0.7375,0.856
xgboost,Extreme Gradient Boosting,0.8212,0.9387,0.8212,0.8225,0.821,0.7312,0.732,16.21
lightgbm,Light Gradient Boosting Machine,0.8151,0.9385,0.8151,0.8173,0.815,0.722,0.7231,20.145
knn,K Neighbors Classifier,0.8118,0.9376,0.8118,0.8135,0.8118,0.7174,0.7181,0.259
gbc,Gradient Boosting Classifier,0.764,0.9062,0.764,0.7678,0.7638,0.6451,0.6467,59.605
dt,Decision Tree Classifier,0.7297,0.7973,0.7297,0.732,0.7295,0.5937,0.5948,0.543
lr,Logistic Regression,0.71,0.8303,0.71,0.7114,0.7085,0.5631,0.5647,6.369
ridge,Ridge Classifier,0.6133,0.0,0.6133,0.643,0.599,0.4121,0.4325,0.783
ada,Ada Boost Classifier,0.6101,0.7913,0.6101,0.6137,0.6093,0.4146,0.4162,4.137


Create the model

In [11]:
# Build a Random Forest ('rf') and display its per-fold scores.
# NOTE(review): the comparison table above ranks Extra Trees ('et') ahead of
# 'rf' — confirm that choosing 'rf' here is intentional.
model = create_model('rf')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8505,0.9549,0.8505,0.8511,0.8501,0.7756,0.7762
1,0.8404,0.9458,0.8404,0.8415,0.8402,0.7602,0.7609
2,0.7887,0.9239,0.7887,0.7913,0.7889,0.683,0.684
3,0.892,0.9643,0.892,0.8922,0.8917,0.8374,0.8377
4,0.8404,0.9509,0.8404,0.8397,0.8398,0.7597,0.7599
5,0.8357,0.9588,0.8357,0.8408,0.8343,0.7521,0.7559
6,0.7981,0.9396,0.7981,0.8035,0.7993,0.6971,0.6984
7,0.8263,0.9498,0.8263,0.8286,0.8271,0.7392,0.7395
8,0.7887,0.9286,0.7887,0.7891,0.7884,0.6826,0.683
9,0.784,0.9252,0.784,0.7913,0.784,0.6762,0.6792


In [12]:
# Hyperparameter-tune the model created above. The log below reports
# 10 candidates evaluated over 10 folds (100 fits).
tuned_model = tune_model(model)

Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [None]:
# Open PyCaret's evaluation view for the tuned model.
# NOTE(review): this cell has no execution count, so it had not been run when
# the notebook was saved — re-run after the tuning cell completes.
evaluate_model(tuned_model)