<a href="https://colab.research.google.com/github/TheorChemGroup/Glucose_Biosensor/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries import

In [None]:
!pip install pycaret

In [3]:
# Data vizualization
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.io as pio

pio.templates.default = "plotly_white"

In [4]:
# Base libraries
import pandas as pd
import pickle
import numpy as np
from typing import List, Tuple
import warnings
warnings.filterwarnings("ignore")

In [5]:
# ML libraries
import xgboost as xgb
import lightgbm as lgb
from pycaret.classification import ClassificationExperiment
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import balanced_accuracy_score, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

# Data preparation



In [6]:
# Loading data exported from excel
with open("experiments_mypars.pickle", "rb") as f:
    data = pickle.load(f)

In [7]:
# Unpacking data
tmp = list(map(lambda x: np.array(x), [
    *data[0],
    *data[1],
    *data[2],
    *data[3],
]))

labels = np.concatenate([
    np.zeros(len(data[0])),
    np.ones(len(data[1])),
    np.ones(len(data[2])) * 2,
    np.ones(len(data[3])) * 3,
])

In [8]:
# Polynomial fit function
def extract_features(x):
    p20 = np.poly1d(np.polyfit(range(x.shape[0]), x.reshape(-1), 20))
    return np.array(p20(np.linspace(0, x.shape[0], 30)))

In [9]:
X = []
y = []
curve_max = 0  # We need to find max curve value to normalize all curves from 0 to 1

durations = [] # Experiment duration
min_ = []      # Minimum experiment value
max_ = []      # Maximum experiment value

for i in range(len(tmp)):
    # Get polynomial fitted curve
    v = extract_features(tmp[i])

    curve_max = max(curve_max, v.max())

    durations.append(tmp[i].shape[0] / 10)
    min_.append(v.min())
    max_.append(v.max())

    # Offsetting curve start point to 0
    v += abs(v[0])

    X.append(v)
    y.append(labels[i])

X /= curve_max # Curve normalization

x = []
for i in range(len(X)):
    x.append(np.append(X[i], [
        min_[i],
        max_[i],
        durations[i],
        np.mean(X[i]),
        np.std(X[i]),
        np.quantile(X[i], .01),
        np.quantile(X[i], .99)
    ]))

X = np.stack(x)
y = np.array(y)

In [10]:
fig = make_subplots(rows=2, cols=2, subplot_titles=["Glucose", "Glutathione", "Ascorbic acid", "Cysteine"])

prev = 0
next = len(data[0])

for exp in X[prev:prev + next]:
    fig.add_trace(
        go.Scatter(x=list(range(31)), y=exp[:30], showlegend=False), row=1, col=1
    )

prev += len(data[0])
next = len(data[1])

for exp in X[prev:prev + next]:
    fig.add_trace(
        go.Scatter(x=list(range(31)), y=exp[:30], showlegend=False), row=1, col=2
    )

prev += len(data[1])
next = len(data[2])

for exp in X[prev:prev + next]:
    fig.add_trace(
        go.Scatter(x=list(range(31)), y=exp[:30], showlegend=False), row=2, col=1
    )

prev += len(data[2])
next = len(data[3])

for exp in X[prev:prev + next]:
    fig.add_trace(
        go.Scatter(x=list(range(31)), y=exp[:30], showlegend=False), row=2, col=2
    )

fig.show()

In [11]:
df = pd.DataFrame(X)
df["target"] = y
df.columns = list(map(str, range(30))) + ["min", "max", "duration", "mean", "std", "q1", "q2"] + ["target"]

In [12]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,min,max,duration,mean,std,q1,q2,target
0,0.0,5.8e-05,7.5e-05,0.000131,0.000352,0.00075,0.001225,0.001619,0.001899,0.002214,...,0.006666,0.0067,-8e-06,-7e-06,91.0,0.003888,0.002399,1.7e-05,0.00669,0.0
1,0.0,0.000204,0.00022,0.000686,0.00128,0.001644,0.001782,0.001875,0.001992,0.00206,...,0.004351,0.004115,-8e-06,-7e-06,131.0,0.002764,0.001364,5.9e-05,0.004333,0.0
2,0.0,0.000103,0.000157,0.00062,0.001016,0.001025,0.00098,0.001101,0.00125,0.001258,...,0.007165,0.007126,-9e-06,-8e-06,100.0,0.003876,0.002886,3e-05,0.007221,0.0
3,0.0,-0.000209,-0.000368,-0.000215,5e-05,1.6e-05,-2.8e-05,0.000345,0.00122,0.002369,...,0.008246,0.007715,-9e-06,-8e-06,48.1,0.004076,0.003101,-0.000323,0.008726,0.0
4,0.0,0.00021,0.000242,0.000706,0.001223,0.001786,0.002347,0.002794,0.003097,0.003314,...,0.00599,0.006178,-9e-06,-8e-06,110.7,0.00395,0.00195,6.1e-05,0.006136,0.0


In [13]:
# Scaling data
scaler = MinMaxScaler()
df_norm = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df["duration"] = df_norm["duration"]

# ML models training

In [14]:
# Running AutoML pipeline to find best model
s = ClassificationExperiment()
s.setup(df, target="target", session_id=12345)
s.add_metric("balanced_accuracy", "Balanced Accuracy", balanced_accuracy_score, greater_is_better=True)

best = s.compare_models(cross_validation=True)

Unnamed: 0,Description,Value
0,Session id,12345
1,Target,target
2,Target type,Multiclass
3,Original data shape,"(462, 38)"
4,Transformed data shape,"(462, 38)"
5,Transformed train set shape,"(323, 38)"
6,Transformed test set shape,"(139, 38)"
7,Numeric features,37
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,Balanced Accuracy,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.8543,0.9592,0.8543,0.8621,0.8524,0.8053,0.8091,0.8529,1.12
xgboost,Extreme Gradient Boosting,0.8421,0.9519,0.8421,0.8624,0.8399,0.7894,0.7962,0.8416,0.404
gbc,Gradient Boosting Classifier,0.8363,0.959,0.8363,0.8493,0.8335,0.7813,0.7865,0.8341,1.47
rf,Random Forest Classifier,0.8143,0.9459,0.8143,0.8393,0.8088,0.7518,0.7607,0.811,0.628
et,Extra Trees Classifier,0.8112,0.9511,0.8112,0.8326,0.8057,0.7475,0.7553,0.8082,0.199
dt,Decision Tree Classifier,0.7552,0.8366,0.7552,0.764,0.7546,0.6733,0.6764,0.7547,0.139
knn,K Neighbors Classifier,0.6286,0.8535,0.6286,0.6509,0.6287,0.5044,0.5098,0.6273,0.082
lda,Linear Discriminant Analysis,0.6198,0.8013,0.6198,0.6502,0.6133,0.4921,0.5016,0.618,0.051
lr,Logistic Regression,0.5794,0.7736,0.5794,0.5819,0.5369,0.4387,0.4643,0.5772,1.432
ridge,Ridge Classifier,0.5419,0.0,0.5419,0.45,0.4677,0.3878,0.4269,0.5372,0.06


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(["target"], axis=1), df["target"], test_size=.25, shuffle=True, random_state=12345)

In [16]:
lgbm = lgb.LGBMClassifier(
    application="multiclass",
    metric="multi_logloss",
    importance_type="split",
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    max_depth=-1,
    n_estimators=100,
    subsample=1.0,
    subsample_for_bin=200000,
    objective=None,
    n_jobs=-1,
    random_state=12345,
    verbose=-1
)
lgbm.fit(X_train, y_train)
predictions = lgbm.predict(X_test)

In [17]:
fig = go.Figure()

fig.add_trace(
    go.Scatter(x=list(range(len(predictions))), y=y_test.reset_index(drop=True), mode="lines", name="target")
)
fig.add_trace(
    go.Scatter(x=list(range(len(predictions))), y=predictions, mode="lines", name="predictions", line=dict(dash="dash"))
)

fig.update_layout(
    title_text="Target and predictions",
    title_x=.5,
    xaxis=dict(
        title_text="Samples",
    ),
    yaxis=dict(
        title_text="Classes",
        tickvals=[0, 1, 2, 3],
        ticktext=["Glucose", "Glutathione", "Ascorbic acid", "Cysteine"],
    )
)

fig.show()

In [18]:
acc = accuracy_score(y_test, predictions)
bal_acc = balanced_accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions, average="macro")
print(f"""Final metrics:

Accuracy = {acc}
Balanced Accuracy = {bal_acc}
F1 Score = {f1}
""")

Final metrics:

Accuracy = 0.8189655172413793
Balanced Accuracy = 0.8303327720666431
F1 Score = 0.8241128824686447

