Import Libraries

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from scipy.signal import find_peaks
from scipy.signal import peak_widths

Read the spectral data

In [2]:
spectra_df = pd.read_csv("../../data/exosomes.raw_spectrum_1.csv")

#### First we will look at the full wavelength.

Create a field for each wavelength.

In [3]:
wavelength_df = spectra_df.pivot(index='SpecID', columns='WaveNumber', values='Absorbance').reset_index()
wavelength_df.columns.name = None

Add the statuses back.

In [4]:
statuses = spectra_df[['SpecID', 'Status']].drop_duplicates()
wavelength_df = pd.merge(wavelength_df, statuses, on='SpecID')
wavelength_df = wavelength_df.set_index('SpecID')

In [5]:
wavelength_df.head()

Unnamed: 0_level_0,200.0,200.68336,201.36674,202.05011,202.73349,203.41685,204.10023,204.7836,205.46696,206.15034,...,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,2709.3699,2697.1318,2696.0413,2678.5925,2670.8928,2652.5435,2646.3245,2690.324,2620.3228,2609.0132,...,1100.5006,1088.7416,1092.1083,1104.9304,1084.1281,1076.9363,1089.0814,1092.8083,1086.699,Normal
201210-1-01,2979.3169,2985.707,2970.1677,2947.095,2941.8743,2942.4648,2939.9595,2938.4509,2930.9204,2915.7979,...,1246.2748,1270.4456,1272.1703,1271.8768,1270.0718,1283.9667,1286.9803,1276.4037,1268.0922,Normal
201210-1-02,3702.5627,3592.4902,3640.8423,3593.415,3583.656,3583.479,3554.3279,3507.1514,3494.4998,3490.437,...,2028.6669,2046.851,2094.8308,2067.8396,2043.0687,2063.5925,2086.6956,2064.7766,2064.2126,Normal
201210-1-03,8129.5938,8222.3184,8370.2803,8534.415,8684.1543,8805.7393,8964.5283,9220.3066,9257.7461,9399.7734,...,1682.3824,1694.845,1710.276,1714.6768,1746.4635,1705.4204,1703.1569,1705.2943,1687.048,Normal
201210-1-04,3468.5203,3463.0237,3475.2666,3468.5999,3451.7124,3439.6379,3439.1538,3444.8345,3446.624,3438.2632,...,1725.4711,1722.2786,1757.0481,1745.6029,1728.0017,1750.2548,1747.0122,1756.1727,1747.9722,Normal


##### 1. Training a Random Forest and Extra Trees Classifer on the whole spectrum.

In [6]:
def calculate_metrics(y_test, y_pred):

    # Calculate overall accuracy
    overall_accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy: {overall_accuracy}\n")

    # Calculate precision, recall, and F1-score for each class
    report = classification_report(y_test, y_pred)
    print("\nClassification Report:")
    print(report)

    # Show the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("\nConfusion Matrix:")
    print(conf_matrix)

In [7]:
def get_feature_importances(model, X):

    # Get feature importances
    feature_importances = model.feature_importances_

    # Creating a DataFrame to display feature importances
    feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

    # Sorting the DataFrame by importance in descending order
    feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

    # Show the top 10 most important features
    top_10_features = feature_importance_df.head(10)

    return top_10_features

In [8]:
# Splitting the dataframe into features (X) and target variable (y)
X = wavelength_df.drop(['Status'], axis=1)
y = wavelength_df['Status']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [9]:
# Initialising and fitting the Random Forest classifier
rf = RandomForestClassifier(random_state=1234)
rf.fit(X_train, y_train)

In [10]:
# Initialising and fitting the Extra Trees classifier
et = ExtraTreesClassifier(random_state=1234)
et.fit(X_train, y_train)

In [11]:
# Making predictions on the test set
y_pred = rf.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8587848932676518


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.84      0.84      0.84       203
 Hypoglycemia       0.88      0.83      0.86       200
       Normal       0.86      0.90      0.88       206

     accuracy                           0.86       609
    macro avg       0.86      0.86      0.86       609
 weighted avg       0.86      0.86      0.86       609


Confusion Matrix:
[[171  16  16]
 [ 18 167  15]
 [ 14   7 185]]


In [12]:
# Making predictions on the test set
y_pred = et.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8850574712643678


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.86      0.88      0.87       203
 Hypoglycemia       0.90      0.88      0.89       200
       Normal       0.89      0.90      0.89       206

     accuracy                           0.89       609
    macro avg       0.89      0.88      0.89       609
 weighted avg       0.89      0.89      0.89       609


Confusion Matrix:
[[178  12  13]
 [ 14 176  10]
 [ 14   7 185]]


Show feature importances for the two models.

In [13]:
get_feature_importances(rf, X)

Unnamed: 0,Feature,Importance
2578,1961.7312,0.00385
2624,1993.1663,0.003585
2537,1933.713,0.003413
2588,1968.5649,0.003077
2622,1991.7996,0.002976
2615,1987.016,0.002895
2589,1969.2483,0.002645
2632,1998.6333,0.002511
2557,1947.3804,0.002456
2558,1948.0637,0.002421


In [14]:
get_feature_importances(et, X)

Unnamed: 0,Feature,Importance
2625,1993.8496,0.002194
2629,1996.5831,0.001845
2415,1850.3417,0.001841
153,304.55582,0.001767
2261,1745.1025,0.00161
2581,1963.7814,0.001605
116,279.27106,0.001565
2602,1978.1321,0.001542
2618,1989.066,0.001535
2617,1988.3827,0.001478


#### 2. Using Peak Statistics

This uses statistical properties of the peaks in each sample to be used as additional features.

In [15]:
peaks = []
widths = []
df = spectra_df.copy()

# Find the index and width of each peak
for _, group in df.groupby('SpecID'):

    #peak_index, _ = find_peaks(x=group['Absorbance'], distance=152, prominence=42, width=6)
    peak_index, _ = find_peaks(x=group['Absorbance'])
    #peak_index, _ = find_peaks(x=group['Absorbance'], prominence=75)

    # Calculate the widths of each peak
    widths += list(peak_widths(group['Absorbance'], peaks=peak_index, rel_height=0.5)[0])

    # Find the index of the peak within the full dataframe
    peaks += list(group.iloc[peak_index].index.values)

peaks_df = df.iloc[peaks]

In [16]:
print(len(peaks))
print(len(widths))

2114865
2114865


In [17]:
peaks_df['PeakWidths'] = widths

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['PeakWidths'] = widths


In [18]:
peaks_df.head()

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,PeakWidths
7,201210-1-00,7,204.7836,2690.324,201210-1,Normal,0.814277
10,201210-1-00,10,206.83371,2639.7104,201210-1,Normal,1.441874
13,201210-1-00,13,208.88382,2642.4243,201210-1,Normal,2.022615
16,201210-1-00,16,210.93394,2614.3574,201210-1,Normal,0.718095
21,201210-1-00,21,214.3508,2618.0491,201210-1,Normal,0.93965


Standard Deviation of the Absorbances appears to have a positive affect on accuracy.

In [19]:
# Create a new DataFrame for the summary statistics
peak_stats = peaks_df.groupby('SpecID').agg({'Absorbance': ['mean', 'std', 'count', 'max', 'min'],
                                             'PeakWidths': ['mean', 'std', 'max', 'min']}).reset_index()

# Flatten the multi-level columns and customize the names
peak_stats.columns = ['SpecID',
                      'PeakAbsorbance_mean', 'PeakAbsorbance_std', 'PeakAbsorbance_count', 'PeakAbsorbance_max', 'PeakAbsorbance_min',
                      'PeakWidths_mean', 'PeakWidths_std', 'PeakWidths_max', 'PeakWidths_min']

In [20]:
peak_stats = pd.merge(peak_stats, statuses, on='SpecID')
peak_stats = peak_stats.set_index('SpecID')
peak_stats = peak_stats.fillna(False)

In [21]:
peak_stats.head()

Unnamed: 0_level_0,PeakAbsorbance_mean,PeakAbsorbance_std,PeakAbsorbance_count,PeakAbsorbance_max,PeakAbsorbance_min,PeakWidths_mean,PeakWidths_std,PeakWidths_max,PeakWidths_min,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
201210-1-00,1588.522535,296.171266,774,2690.324,1092.8083,2.116394,4.798122,107.305604,0.500097,Normal
201210-1-01,1767.102055,321.993283,775,2985.707,1259.2762,2.122257,3.383461,54.616659,0.500664,Normal
201210-1-02,2188.622821,277.853989,745,3696.4109,1878.2166,1.990689,2.86513,36.809791,0.503868,Normal
201210-1-03,2552.458041,935.040474,687,10350.545,1705.2943,2.214918,4.89446,74.637545,0.502425,Normal
201210-1-04,2222.388623,280.358726,764,3475.2666,1725.4711,2.319962,8.282523,217.370659,0.50146,Normal


In [22]:
# Splitting the dataframe into features (X) and target variable (y)
X = peak_stats.drop(['Status'], axis=1)
y = peak_stats['Status']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [23]:
# Initialising and fitting the Random Forest classifier
rf = RandomForestClassifier(random_state=1234)
rf.fit(X_train, y_train)

# Initialising and fitting the Extra Trees classifier
et = ExtraTreesClassifier(random_state=1234)
et.fit(X_train, y_train)

In [24]:
# Making predictions on the test set
y_pred = rf.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8045977011494253


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.76      0.78      0.77       203
 Hypoglycemia       0.79      0.81      0.80       200
       Normal       0.86      0.82      0.84       206

     accuracy                           0.80       609
    macro avg       0.81      0.80      0.80       609
 weighted avg       0.81      0.80      0.81       609


Confusion Matrix:
[[159  27  17]
 [ 28 162  10]
 [ 21  16 169]]


In [25]:
# Making predictions on the test set
y_pred = et.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.819376026272578


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.76      0.81      0.78       203
 Hypoglycemia       0.84      0.83      0.83       200
       Normal       0.87      0.82      0.84       206

     accuracy                           0.82       609
    macro avg       0.82      0.82      0.82       609
 weighted avg       0.82      0.82      0.82       609


Confusion Matrix:
[[165  20  18]
 [ 27 166   7]
 [ 26  12 168]]


In [26]:
get_feature_importances(rf, X)

Unnamed: 0,Feature,Importance
4,PeakAbsorbance_min,0.1654
7,PeakWidths_max,0.136875
0,PeakAbsorbance_mean,0.128962
1,PeakAbsorbance_std,0.127743
3,PeakAbsorbance_max,0.125259
6,PeakWidths_std,0.11396
5,PeakWidths_mean,0.079414
2,PeakAbsorbance_count,0.077281
8,PeakWidths_min,0.045107


In [27]:
get_feature_importances(et, X)

Unnamed: 0,Feature,Importance
4,PeakAbsorbance_min,0.151218
0,PeakAbsorbance_mean,0.138936
7,PeakWidths_max,0.13595
3,PeakAbsorbance_max,0.124833
1,PeakAbsorbance_std,0.122275
6,PeakWidths_std,0.103758
5,PeakWidths_mean,0.089758
2,PeakAbsorbance_count,0.082795
8,PeakWidths_min,0.050477


Combine these peak statistics with the full wavelength

In [28]:
stats_and_spectrum = peak_stats.merge(wavelength_df.drop(columns='Status'), on='SpecID')

In [45]:
stats_and_spectrum.head()

Unnamed: 0_level_0,PeakAbsorbance_mean,PeakAbsorbance_std,PeakAbsorbance_count,PeakAbsorbance_max,PeakAbsorbance_min,PeakWidths_mean,PeakWidths_std,PeakWidths_max,PeakWidths_min,Status,...,1993.8496,1994.5331,1995.2164,1995.8998,1996.5831,1997.2665,1997.9498,1998.6333,1999.3167,2000.0
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,1588.522535,296.171266,774,2690.324,1092.8083,2.116394,4.798122,107.305604,0.500097,Normal,...,1060.3231,1100.5006,1088.7416,1092.1083,1104.9304,1084.1281,1076.9363,1089.0814,1092.8083,1086.699
201210-1-01,1767.102055,321.993283,775,2985.707,1259.2762,2.122257,3.383461,54.616659,0.500664,Normal,...,1253.5012,1246.2748,1270.4456,1272.1703,1271.8768,1270.0718,1283.9667,1286.9803,1276.4037,1268.0922
201210-1-02,2188.622821,277.853989,745,3696.4109,1878.2166,1.990689,2.86513,36.809791,0.503868,Normal,...,2066.4561,2028.6669,2046.851,2094.8308,2067.8396,2043.0687,2063.5925,2086.6956,2064.7766,2064.2126
201210-1-03,2552.458041,935.040474,687,10350.545,1705.2943,2.214918,4.89446,74.637545,0.502425,Normal,...,1718.3978,1682.3824,1694.845,1710.276,1714.6768,1746.4635,1705.4204,1703.1569,1705.2943,1687.048
201210-1-04,2222.388623,280.358726,764,3475.2666,1725.4711,2.319962,8.282523,217.370659,0.50146,Normal,...,1697.4792,1725.4711,1722.2786,1757.0481,1745.6029,1728.0017,1750.2548,1747.0122,1756.1727,1747.9722


In [30]:
# Splitting the dataframe into features (X) and target variable (y)
X = stats_and_spectrum.drop(['Status'], axis=1)
X.columns = X.columns.astype(str)
y = stats_and_spectrum['Status']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [31]:
# Initialising and fitting the Random Forest classifier
rf = RandomForestClassifier(random_state=1234)
rf.fit(X_train, y_train)

# Initialising and fitting the Extra Trees classifier
et = ExtraTreesClassifier(random_state=1234)
et.fit(X_train, y_train)

In [32]:
# Making predictions on the test set
y_pred = et.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8883415435139573


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.85      0.90      0.88       203
 Hypoglycemia       0.91      0.87      0.89       200
       Normal       0.91      0.89      0.90       206

     accuracy                           0.89       609
    macro avg       0.89      0.89      0.89       609
 weighted avg       0.89      0.89      0.89       609


Confusion Matrix:
[[183  11   9]
 [ 16 174  10]
 [ 16   6 184]]


In [33]:
get_feature_importances(rf, X)

Unnamed: 0,Feature,Importance
7,PeakWidths_max,0.006369
2620,1984.2825,0.00354
2639,1997.2665,0.003424
6,PeakWidths_std,0.003407
2641,1998.6333,0.003089
2568,1948.7472,0.00306
5,PeakWidths_mean,0.00298
2546,1933.713,0.002956
2496,1899.5444,0.002886
2633,1993.1663,0.00278


In [34]:
get_feature_importances(et, X)

Unnamed: 0,Feature,Importance
7,PeakWidths_max,0.003086
6,PeakWidths_std,0.002261
2544,1932.3462,0.002159
2602,1971.9818,0.001781
2307,1770.3872,0.001768
2366,1810.7062,0.001762
2587,1961.7312,0.00172
2384,1823.0068,0.001685
2524,1918.6788,0.001669
145,292.93851,0.001581


#### 3. Creating a uniform Peak Featureset

Find the peaks in each spectra, and add their properties to the featureset.

In [35]:
peaks = []
df = spectra_df.copy()

for _, group in df.groupby('SpecID'):
    # peak_index, _ = find_peaks(x=group['Absorbance'], distance=152, prominence=42, width=6)
    peak_index, _ = find_peaks(x=group['Absorbance'])
    peaks += list(group.iloc[peak_index].index.values)

peaks_df = df.iloc[peaks]

In [36]:
peaks_df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status
7,201210-1-00,7,204.78360,2690.32400,201210-1,Normal
10,201210-1-00,10,206.83371,2639.71040,201210-1,Normal
13,201210-1-00,13,208.88382,2642.42430,201210-1,Normal
16,201210-1-00,16,210.93394,2614.35740,201210-1,Normal
21,201210-1-00,21,214.35080,2618.04910,201210-1,Normal
...,...,...,...,...,...,...
8010384,210526-3-49,2619,1989.74940,849.27698,210526-3,Hyperglycemia
8010388,210526-3-49,2623,1992.48290,855.50647,210526-3,Hyperglycemia
8010390,210526-3-49,2625,1993.84960,845.08130,210526-3,Hyperglycemia
8010393,210526-3-49,2628,1995.89980,828.59833,210526-3,Hyperglycemia


Assign Peaks to bins

In [37]:
# Define a function to calculate the bin for a given wavenumber with a specified bin size
def calculate_bin_interval(wavenumber, bin_size):
    bin_start = int((wavenumber - 200) / bin_size) * bin_size + 200
    bin_end = bin_start + bin_size
    return f"{bin_start}-{bin_end}"

# Set the bin size
bin_size = 10

# Add a "Bin" column to the DataFrame
peaks_df['Bin'] = peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  peaks_df['Bin'] = peaks_df['WaveNumber'].apply(lambda x: calculate_bin_interval(x, bin_size))


In [38]:
peaks_df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,Bin
7,201210-1-00,7,204.78360,2690.32400,201210-1,Normal,200-210
10,201210-1-00,10,206.83371,2639.71040,201210-1,Normal,200-210
13,201210-1-00,13,208.88382,2642.42430,201210-1,Normal,200-210
16,201210-1-00,16,210.93394,2614.35740,201210-1,Normal,210-220
21,201210-1-00,21,214.35080,2618.04910,201210-1,Normal,210-220
...,...,...,...,...,...,...,...
8010384,210526-3-49,2619,1989.74940,849.27698,210526-3,Hyperglycemia,1980-1990
8010388,210526-3-49,2623,1992.48290,855.50647,210526-3,Hyperglycemia,1990-2000
8010390,210526-3-49,2625,1993.84960,845.08130,210526-3,Hyperglycemia,1990-2000
8010393,210526-3-49,2628,1995.89980,828.59833,210526-3,Hyperglycemia,1990-2000


In [39]:
df = peaks_df.pivot_table(index='SpecID', columns='Bin', values='Absorbance', aggfunc='mean')
df.columns.name = None
statuses = peaks_df[['SpecID', 'Status']].drop_duplicates()
df = pd.merge(df, statuses, on='SpecID')
df = df.set_index('SpecID')
df = df.fillna(False)

In [40]:
df

Unnamed: 0_level_0,1000-1010,1010-1020,1020-1030,1030-1040,1040-1050,1050-1060,1060-1070,1070-1080,1080-1090,1090-1100,...,910-920,920-930,930-940,940-950,950-960,960-970,970-980,980-990,990-1000,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-00,1693.45675,1674.1428,1658.6709,1649.839425,1641.03985,1650.618625,1641.4121,1621.339875,1616.32506,1591.674767,...,1668.68786,1659.2033,1638.25235,1622.253975,1627.269833,1642.58165,1654.41948,1658.63238,1685.70838,Normal
201210-1-01,1860.082833,1845.12722,1840.979,1807.84405,1794.851775,1794.859943,1799.110925,1798.528967,1781.39976,1773.66475,...,1910.29405,1864.50745,1868.396075,1875.0776,1827.766367,1806.22562,1825.485925,1832.2086,1857.2329,Normal
201210-1-02,2246.86006,2226.59865,2208.098725,2272.379567,2281.618725,2205.830033,2182.0413,2164.2968,2155.755425,2133.36806,...,3687.3463,False,2408.860533,2224.79735,2193.997375,2201.1132,2225.60488,2218.89692,2239.302225,Normal
201210-1-03,2809.53615,2787.164333,2758.620875,2687.77348,2662.2238,2662.135629,2684.591867,2657.278425,2630.02075,2604.9416,...,3341.013033,3021.5737,2837.545233,2729.568425,2756.28044,2804.566925,2841.916525,2837.45025,2824.87862,Normal
201210-1-04,2216.192633,2191.652775,2174.003433,2167.024025,2172.59116,2173.849425,2184.44795,2174.513267,2155.9906,2171.2856,...,2131.093167,2132.228475,2125.01946,2106.155683,2123.428233,2142.63122,2162.22435,2177.376675,2180.17042,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210526-3-45,1669.288333,1646.7969,1639.84995,1643.30955,1615.88386,1616.139833,1604.6681,1602.66978,1605.65185,1593.5533,...,1624.58542,1616.645425,1611.423767,1610.79036,1619.5953,1639.7764,1638.540967,1655.304167,1675.2074,Hyperglycemia
210526-3-46,1623.018625,1603.8486,1576.643725,1587.44422,1568.5995,1564.0794,1563.34615,1554.47334,1547.20054,1551.059425,...,1584.92164,1572.1011,1559.703117,1567.535717,1574.8359,1580.11835,1590.274925,1611.720367,1615.09065,Hyperglycemia
210526-3-47,1579.450075,1552.97875,1540.5387,1542.30204,1523.2681,1521.26915,1512.5935,1510.3509,1519.7343,1514.3911,...,1545.83735,1562.19452,1524.26345,1519.369,1524.010675,1542.12045,1549.2851,1552.190025,1559.628725,Hyperglycemia
210526-3-48,1517.896617,1505.78876,1494.2506,1485.739333,1473.081,1470.212525,1473.6226,1464.199067,1462.1744,1455.31212,...,1491.01858,1482.802167,1477.675025,1474.5137,1488.2378,1490.76832,1500.0474,1505.63645,1518.0224,Hyperglycemia


In [41]:
# Splitting the dataframe into features (X) and target variable (y)
X = df.drop(['Status'], axis=1)
y = df['Status']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

In [42]:
# Initialising and fitting the Random Forest classifier
rf = RandomForestClassifier(random_state=1234)
rf.fit(X_train, y_train)

# Initialising and fitting the Extra Trees classifier
et = ExtraTreesClassifier(random_state=1234)
et.fit(X_train, y_train)

In [43]:
# Making predictions on the test set
y_pred = rf.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8325123152709359


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.82      0.79      0.80       203
 Hypoglycemia       0.84      0.81      0.83       200
       Normal       0.84      0.89      0.87       206

     accuracy                           0.83       609
    macro avg       0.83      0.83      0.83       609
 weighted avg       0.83      0.83      0.83       609


Confusion Matrix:
[[160  24  19]
 [ 21 163  16]
 [ 15   7 184]]


In [44]:
# Making predictions on the test set
y_pred = et.predict(X_test)

# Calculating and printing evaluation metrics
calculate_metrics(y_test, y_pred)

Overall Accuracy: 0.8407224958949097


Classification Report:
               precision    recall  f1-score   support

Hyperglycemia       0.82      0.83      0.82       203
 Hypoglycemia       0.86      0.81      0.84       200
       Normal       0.85      0.88      0.86       206

     accuracy                           0.84       609
    macro avg       0.84      0.84      0.84       609
 weighted avg       0.84      0.84      0.84       609


Confusion Matrix:
[[168  19  16]
 [ 21 163  16]
 [ 17   8 181]]
