#### Machine Learning

Let's test different spectral cleaning parameters by building a model pipeline.

Import Libraries

In [46]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate, GroupKFold, cross_val_score
from Spectra_Preparation_Functions import *
from mlxtend.feature_selection import SequentialFeatureSelector
import matplotlib.pyplot as plt

Read the spectral data

In [47]:
# Load the long-format spectra table (one row per SpecID / WaveNumber reading).
# The two commented-out lines are alternative input files tried earlier;
# only the PageRank-scored file below is actually read.
#df = pd.read_csv("../data/exosomes.raw_spectrum_1.csv")
# df = pd.read_csv("../data/raw_df_outliers_removed.csv")
df = pd.read_csv("../data/400-1800_with_raw_scaled_surface_pagerank.csv")

In [48]:
# Display the loaded frame (pandas truncates the middle rows in the rendered output).
df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,PageRank
0,201210-1-00,293,400.22778,1765.6628,201210-1,Normal,0.610024
1,201210-1-00,294,400.91116,1774.7809,201210-1,Normal,0.610024
2,201210-1-00,295,401.59454,1769.0302,201210-1,Normal,0.610024
3,201210-1-00,296,402.27789,1756.4220,201210-1,Normal,0.610024
4,201210-1-00,297,402.96127,1758.8690,201210-1,Normal,0.610024
...,...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,1617.3926,210526-3,Hyperglycemia,1.201164
6239201,210526-3-09,2338,1797.72200,1633.0911,210526-3,Hyperglycemia,1.201164
6239202,210526-3-09,2339,1798.40550,1633.3076,210526-3,Hyperglycemia,1.201164
6239203,210526-3-09,2340,1799.08890,1641.8665,210526-3,Hyperglycemia,1.201164


In [49]:
# Spectral-cleaning parameters used by the cells below.
# Rows are kept only when their PageRank value is strictly greater than this cutoff.
pagerank_cutoff = 0.824056910854271
# Passed to despike_group as `ma` — presumably a moving-average window length; confirm in Spectra_Preparation_Functions.
despike_ma = 92
# Passed to despike_group as `threshold` — presumably the spike-detection threshold; confirm.
despike_threshold = 3.5
# Passed to asls_baseline_correction as `lam` — presumably the smoothness penalty of the baseline fit; confirm.
lam = 10 ** 11
# Passed to asls_baseline_correction as `p` — presumably the asymmetry weight of the baseline fit; confirm.
p = 0.1

In [50]:
# Keep only the rows whose PageRank value is strictly above the cutoff.
# The explicit .copy() makes clean_df an independent frame rather than a slice
# of df, so the column assignments in the preprocessing cell further down do
# not raise SettingWithCopyWarning.
clean_df = df[df['PageRank'] > pagerank_cutoff].copy()
clean_df

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,PageRank
4098,201210-1-02,293,400.22778,2182.6694,201210-1,Normal,0.961147
4099,201210-1-02,294,400.91116,2149.6565,201210-1,Normal,0.961147
4100,201210-1-02,295,401.59454,2146.0227,201210-1,Normal,0.961147
4101,201210-1-02,296,402.27789,2159.3459,201210-1,Normal,0.961147
4102,201210-1-02,297,402.96127,2167.2910,201210-1,Normal,0.961147
...,...,...,...,...,...,...,...
6239200,210526-3-09,2337,1797.03870,1617.3926,210526-3,Hyperglycemia,1.201164
6239201,210526-3-09,2338,1797.72200,1633.0911,210526-3,Hyperglycemia,1.201164
6239202,210526-3-09,2339,1798.40550,1633.3076,210526-3,Hyperglycemia,1.201164
6239203,210526-3-09,2340,1799.08890,1641.8665,210526-3,Hyperglycemia,1.201164


In [51]:
# Number of distinct spectra (SpecID values) remaining after the PageRank filter.
clean_df['SpecID'].nunique()

1939

In [52]:
# Per-spectrum preprocessing chain: every helper is applied separately to each
# SpecID group, and each step reads the column written by the previous one.
# The helpers come from Spectra_Preparation_Functions; their keyword arguments
# are forwarded by transform() unchanged.

# 1) Despike the raw absorbance.
clean_df['Despiked_Absorbance'] = (
    clean_df.groupby('SpecID')['Absorbance']
    .transform(despike_group, ma=despike_ma, threshold=despike_threshold)
)

# 2) Estimate a baseline from the despiked signal.
clean_df['Baseline'] = (
    clean_df.groupby('SpecID')['Despiked_Absorbance']
    .transform(asls_baseline_correction, lam=lam, p=p)
)

# 3) Subtract the baseline from the despiked signal, row by row.
clean_df['Baseline_Corrected_Absorbance'] = clean_df['Despiked_Absorbance'].sub(clean_df['Baseline'])

# 4) Normalise each baseline-corrected spectrum with svn_normalise.
clean_df['Scaled_Absorbance'] = (
    clean_df.groupby('SpecID')['Baseline_Corrected_Absorbance']
    .transform(svn_normalise)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['Despiked_Absorbance'] = clean_df.groupby('SpecID')['Absorbance'].transform(lambda x: despike_group(x, ma=despike_ma, threshold=despike_threshold))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df['Baseline'] = clean_df.groupby('SpecID')['Despiked_Absorbance'].transform(lambda x: asls_baseline_correction(x, lam=lam, p=p))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.

In [53]:
# Preview the first rows, including the four columns added by the preprocessing step.
clean_df.head()

Unnamed: 0,SpecID,Seq,WaveNumber,Absorbance,SurID,Status,PageRank,Despiked_Absorbance,Baseline,Baseline_Corrected_Absorbance,Scaled_Absorbance
4098,201210-1-02,293,400.22778,2182.6694,201210-1,Normal,0.961147,2182.6694,2047.983637,134.685763,0.387154
4099,201210-1-02,294,400.91116,2149.6565,201210-1,Normal,0.961147,2149.6565,2048.003637,101.652863,0.143959
4100,201210-1-02,295,401.59454,2146.0227,201210-1,Normal,0.961147,2146.0227,2048.023636,97.999064,0.117058
4101,201210-1-02,296,402.27789,2159.3459,201210-1,Normal,0.961147,2159.3459,2048.043636,111.302264,0.215
4102,201210-1-02,297,402.96127,2167.291,201210-1,Normal,0.961147,2167.291,2048.063635,119.227365,0.273346


In [54]:
# Reshape the long table with prepare_wavelength_df (from Spectra_Preparation_Functions),
# using the Scaled_Absorbance column as the values.
# Judging by the rendered output, the result has one row per SpecID, one column
# per WaveNumber, plus SurID and Status — confirm against the helper's source.
wavelength_df = prepare_wavelength_df(clean_df, 'Scaled_Absorbance')
wavelength_df.head()

Unnamed: 0_level_0,400.22778,400.91116,401.59454,402.27789,402.96127,403.64465,404.32803,405.01138,405.69476,406.37814,...,1794.9886,1795.672,1796.3553,1797.0387,1797.722,1798.4055,1799.0889,1799.7722,SurID,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-02,0.387154,0.143959,0.117058,0.215,0.273346,0.226781,0.113781,0.029284,0.092672,0.061608,...,-1.318869,-1.224532,-1.171024,-1.295672,-1.318849,-1.180542,-1.333518,-1.355698,201210-1,Normal
201210-1-03,-0.322233,-0.377119,-0.406082,-0.362514,-0.284758,-0.255112,-0.285655,-0.236506,-0.305067,-0.411116,...,-1.65062,-1.536285,-1.614261,-1.582751,-1.595934,-1.554886,-1.475848,-1.616807,201210-1,Normal
201210-1-07,1.590973,1.729138,1.678946,1.716794,1.719294,1.650259,1.792391,1.972,2.141552,1.896857,...,-1.010676,-1.051011,-1.020404,-1.013684,-0.74418,-0.718906,-0.897704,-0.844931,201210-1,Normal
201210-1-09,1.840965,1.828747,1.789689,1.859896,1.951399,1.976493,1.969735,1.866808,1.681478,1.714805,...,-0.481731,-0.588972,-0.636404,-0.766483,-0.517136,-0.55925,-0.367209,-0.763921,201210-1,Normal
201210-1-10,1.147777,1.255007,0.853108,0.916829,1.121588,0.965771,0.850365,1.085054,0.643614,0.581263,...,-0.778024,-1.015345,-1.074649,-1.074681,-0.87409,-0.919638,-0.980606,-0.778525,201210-1,Normal


In [55]:
# NOTE(review): repeats the preview already shown by the previous cell; this cell can likely be removed.
wavelength_df.head()

Unnamed: 0_level_0,400.22778,400.91116,401.59454,402.27789,402.96127,403.64465,404.32803,405.01138,405.69476,406.37814,...,1794.9886,1795.672,1796.3553,1797.0387,1797.722,1798.4055,1799.0889,1799.7722,SurID,Status
SpecID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
201210-1-02,0.387154,0.143959,0.117058,0.215,0.273346,0.226781,0.113781,0.029284,0.092672,0.061608,...,-1.318869,-1.224532,-1.171024,-1.295672,-1.318849,-1.180542,-1.333518,-1.355698,201210-1,Normal
201210-1-03,-0.322233,-0.377119,-0.406082,-0.362514,-0.284758,-0.255112,-0.285655,-0.236506,-0.305067,-0.411116,...,-1.65062,-1.536285,-1.614261,-1.582751,-1.595934,-1.554886,-1.475848,-1.616807,201210-1,Normal
201210-1-07,1.590973,1.729138,1.678946,1.716794,1.719294,1.650259,1.792391,1.972,2.141552,1.896857,...,-1.010676,-1.051011,-1.020404,-1.013684,-0.74418,-0.718906,-0.897704,-0.844931,201210-1,Normal
201210-1-09,1.840965,1.828747,1.789689,1.859896,1.951399,1.976493,1.969735,1.866808,1.681478,1.714805,...,-0.481731,-0.588972,-0.636404,-0.766483,-0.517136,-0.55925,-0.367209,-0.763921,201210-1,Normal
201210-1-10,1.147777,1.255007,0.853108,0.916829,1.121588,0.965771,0.850365,1.085054,0.643614,0.581263,...,-0.778024,-1.015345,-1.074649,-1.074681,-0.87409,-0.919638,-0.980606,-0.778525,201210-1,Normal


In [56]:
def evaluate_random_forest(df):
    """Cross-validate a fixed Random Forest configuration and print its accuracy.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Status' column (the target), a 'SurID' column (the
        cross-validation group label) and feature columns; every column other
        than these two is used as a feature.

    Returns
    -------
    None
        The mean and standard deviation of the fold accuracies are printed.
    """
    # Separate target, group labels and features.
    target = df['Status']
    surface_ids = df['SurID']
    features = df.drop(columns=['Status', 'SurID'])

    # Fixed hyper-parameters; random_state pins the forest's randomness.
    rf = RandomForestClassifier(
        n_estimators=189,
        criterion='entropy',
        max_depth=None,
        min_samples_split=3,
        random_state=1234,
    )

    # GroupKFold keeps all rows sharing a SurID inside the same fold.
    splitter = GroupKFold(n_splits=10)

    scores = cross_val_score(
        rf,
        features,
        target,
        groups=surface_ids,
        cv=splitter,
        scoring='accuracy',
        n_jobs=-1,
    )

    print(f'{rf.__class__.__name__} Cross-Validation Accuracy: {np.mean(scores):.4f} +/- {np.std(scores):.4f}')

In [57]:
# Baseline: grouped 10-fold accuracy using every wavenumber column as a feature.
evaluate_random_forest(wavelength_df)

RandomForestClassifier Cross-Validation Accuracy: 0.7020 +/- 0.1172


##### **Greedy Feature Selection**

In [58]:
# Split the wide table: Status is the target, SurID labels the CV groups,
# and every remaining column is a candidate feature.
y = wavelength_df['Status']
groups = wavelength_df['SurID']
X = wavelength_df.drop(columns=['Status', 'SurID'])

# Same Random Forest configuration as evaluate_random_forest.
rf = RandomForestClassifier(
    n_estimators=189,
    criterion='entropy',
    max_depth=None,
    min_samples_split=3,
    random_state=1234,
)

# Grouped 10-fold split: rows sharing a SurID never straddle train and test.
cv = GroupKFold(n_splits=10)

# Forward sequential selection; with a (min, max) tuple the selector returns
# the best-scoring subset of between 1 and 50 features.
sfs = SequentialFeatureSelector(
    rf,
    k_features=(1, 50),
    forward=True,
    verbose=2,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)

# Run the search (long-running: the log shows ~14 minutes for the first feature alone).
sfs = sfs.fit(X, y, groups=groups)

# Despite its name, this is the slice of X restricted to the chosen columns.
chosen_columns = list(sfs.k_feature_names_)
selected_feature_names = X[chosen_columns]

# One-column table listing the labels of the selected features.
selected_features_df = pd.DataFrame(
    selected_feature_names.columns,
    columns=['Selected Features'],
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   55.9s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed:  9.7min
[Parallel(n_jobs=-1)]: Done 1961 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done 2049 out of 2049 | elapsed: 13.9min finished

[2024-04-16 00:24:27] Features: 1/50 -- score: 0.4770263532594635[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   47.5s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: 

In [59]:
# NOTE(review): interactive documentation lookup left over from development;
# it only prints the class help text and can likely be removed from the final notebook.
help(SequentialFeatureSelector)

Help on class SequentialFeatureSelector in module mlxtend.feature_selection.sequential_feature_selector:

class SequentialFeatureSelector(mlxtend.utils.base_compostion._BaseXComposition, sklearn.base.MetaEstimatorMixin)
 |  SequentialFeatureSelector(estimator, k_features=1, forward=True, floating=False, verbose=0, scoring=None, cv=5, n_jobs=1, pre_dispatch='2*n_jobs', clone_estimator=True, fixed_features=None, feature_groups=None)
 |  
 |  Sequential Feature Selection for Classification and Regression.
 |  
 |  Parameters
 |  ----------
 |  estimator : scikit-learn classifier or regressor
 |  k_features : int or tuple or str (default: 1)
 |      Number of features to select,
 |      where k_features < the full feature set.
 |      New in 0.4.2: A tuple containing a min and max value can be provided,
 |          and the SFS will consider return any feature combination between
 |          min and max that scored highest in cross-validation. For example,
 |          the tuple (1, 4) will 

In [60]:
# Show the feature labels chosen by the sequential selector.
selected_features_df

Unnamed: 0,Selected Features
0,449.43051
1,452.84738
2,490.4328
3,495.2164
4,504.7836
5,514.35077
6,515.03418
7,515.71753
8,519.81775
9,650.34167


In [61]:
# Persist the selected features. With to_csv's defaults the row index is written as well.
# NOTE(review): the file name says "outlier_removed_search" while the data loaded
# at the top is the PageRank-scored file — confirm the output name is still current.
selected_features_df.to_csv("../data/outlier_removed_search.csv")

In [62]:
# Rename the column to "WaveNumber", the name the plotting cell below reads.
selected_features_df = selected_features_df.rename(columns={"Selected Features" : "WaveNumber"})

In [63]:
# Figure with two y-axes sharing one WaveNumber x-axis.
fig, ax1 = plt.subplots(figsize=(10, 6))

# Left axis: one bar of height 1 at every selected wavenumber.
# NOTE(review): the bars have constant height, so the "Importance" axis only
# marks which wavenumbers were selected — it is not a measured importance.
selected_wavenumbers = selected_features_df['WaveNumber']
colors = ['lightblue'] * len(selected_wavenumbers)
ax1.bar(selected_wavenumbers, 1, color=colors)
ax1.set_xlabel('WaveNumber')
ax1.set_ylabel('Importance', color='tab:blue')
ax1.tick_params(axis='y', labelcolor='tab:blue')

# Right axis, twinned onto the same x-axis.
ax2 = ax1.twinx()

# Mean absorbance per (WaveNumber, Status).
# NOTE(review): this uses the raw 'Absorbance' column of the full frame `df`,
# not the filtered / scaled values the model was trained on — confirm intent.
avg_absorbance = (
    df.groupby(['WaveNumber', 'Status'])['Absorbance']
    .mean()
    .reset_index()
)

# Line colour for each status label.
color_map = {'Normal': 'blue', 'Hyperglycemia': 'orange', 'Hypoglycemia': 'green'}

# One mean-spectrum line per status.
for status_label, status_rows in avg_absorbance.groupby('Status'):
    ax2.plot(
        status_rows['WaveNumber'],
        status_rows['Absorbance'],
        label=status_label,
        color=color_map[status_label],
    )

ax2.set_ylabel('Average Absorbance', color='tab:red')
ax2.tick_params(axis='y', labelcolor='tab:red')

# Legend so the status lines can be told apart.
ax2.legend(title='Status')

# Title applied through pyplot, as in the original cell.
plt.title('Feature Importance and Average Spectra of each Status')

# The explicit layout/show calls stay disabled, as in the original cell:
# plt.tight_layout()
# plt.show()

NameError: name 'plt' is not defined