# ML for breathing - copy based on scikit-learn 1.1.1

Applying simple, common machine learning models to segments of EMG to predict whether each segment was recorded with or without respiratory muscle effort, i.e. whether we are looking at a patient-created breath.

## Import libraries

In [2]:
import resurfemg.helper_functions as hf

In [3]:
hf.bad_end_cutter?

In [None]:
# Import locally: the bare name `pandas` was never bound (the notebook only
# imports `pandas as pd`, and does so in a later cell), so this cell raised
# NameError when run in order.
import pandas as pd

csv = pd.read_csv('../researcher_interface/ML_files/for_ml_csv.csv')

In [None]:
#basic ds
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#basic system
import sys
import os
import glob


# math and signals
import math
from scipy.stats import entropy
from scipy.signal import savgol_filter
from scipy.signal import find_peaks
# demo stuff
import ipywidgets as widgets
import seaborn 

In [None]:
# ml stuff
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.metrics import confusion_matrix
import joblib

In [None]:
# our stuff
from resurfemg.config import Config
sys.path.insert(0, '../resurfemg')
import helper_functions as hf

In [None]:
# get new changes in our library i.e. the tmsisdk

sys.path.insert(0, '../resurfemg')

from tmsisdk_lite import Poly5Reader

## Import and set up data ( see ML_snippet_maker notebook for generation)

In [None]:

# Load the two snippet tables (generated by the ML_snippet_maker notebook).
csv, csv2 = (
    pd.read_csv(path)
    for path in (
        '../researcher_interface/ML_files/for_ml_csv.csv',
        '../researcher_interface/ML_files/for_ml_csv2.csv',
    )
)

In [None]:
csv2.head()

In [None]:
# Discard every row that contains at least one missing value.
csv, csv2 = csv.dropna(), csv2.dropna()

In [None]:
#len(csv2)

In [None]:
# Names of the raw-sample columns: the strings '0' .. '998'.
listicle = [*range(999)]
fu = [str(number) for number in listicle]

We are going to add some random values for sanity checking... ML based on random values should give us an accuracy approaching 0.5 in this case.

In [None]:
# Summary features of each snippet (row), computed across its raw sample columns.
csv2['std'] = csv2[fu].std(axis=1)
csv2['max'] = csv2[fu].max(axis=1)
csv2['min'] = csv2[fu].min(axis=1)
csv2['mean'] = csv2[fu].mean(axis=1)
csv2['entropy'] = csv2[fu].apply(entropy, axis=1)
# Sanity-check features: integer noise in [0, 100), drawn independently of the
# signal. The upper bound used to be `100 - sample value`, which tied the
# "random" columns to the data and raised ValueError for sample values >= 100.
csv2['random1'] = np.random.randint(0, 100, size=len(csv2))
csv2['random2'] = np.random.randint(0, 100, size=len(csv2))


In [None]:
# Summary features of each snippet (row), computed across its raw sample columns.
csv['std'] = csv[fu].std(axis=1)
csv['max'] = csv[fu].max(axis=1)
csv['min'] = csv[fu].min(axis=1)
csv['mean'] = csv[fu].mean(axis=1)
csv['entropy'] = csv[fu].apply(entropy, axis=1)
# Sanity-check features: integer noise in [0, 100), drawn independently of the
# signal. The upper bound used to be `100 - sample value`, which tied the
# "random" columns to the data and raised ValueError for sample values >= 100.
csv['random1'] = np.random.randint(0, 100, size=len(csv))
csv['random2'] = np.random.randint(0, 100, size=len(csv))

In [None]:
# Encode the text labels as integers in both tables: exhale -> 0, inhale -> 1.
for frame in (csv, csv2):
    for text_label, code in (('exhale', 0), ('inhale', 1)):
        frame.loc[frame['label'] == text_label, 'label'] = code

In [None]:
# Stack both tables into one. Renumber the rows so the combined frame does not
# carry duplicate index labels inherited from the two source tables.
csv_to = pd.concat([csv, csv2], ignore_index=True)

Now we have a data frame of the raw data (columns 0:999) from a bunch of EMG snippets (the rows), and some features of them, e.g. maximum value, mean, etc.

In [None]:
len(csv_to)

In [None]:
csv_to.head()

# Correlations

In [None]:
# Feature columns plus the label, selected for the correlation analysis.
loco = csv_to[['std','max', 'min', 'mean', 'entropy', 'random1', 'random2', 'label']]
# Work on an explicit copy: pd.DataFrame(loco) wraps the same data without
# copying, so adding a column to it would modify an alias of the selection.
local = loco.copy()
# Integer-typed duplicate of the label column.
local['label_int'] = local['label'].astype('int')

In [None]:
#local.corr()

In [None]:
%matplotlib inline
seaborn.heatmap(local.corr(), annot = True)

What is important to note is that, in terms of our label, no single feature even reaches 90% correlation. We need a more complex model than picking breaths by one parameter. Hence machine learning (ML) to the rescue.

## ML

You can pick more than one feature with the shift key

In [None]:
# Multi-select list of candidate model inputs; 'min' is pre-selected.
features = widgets.SelectMultiple(
    description='Features',
    options=['std','max', 'min', 'mean', 'entropy', 'random1', 'random2'],
    value=['min'],
    disabled=False,
)
features

In [None]:
features_list = list(features.value)
features_list

In [None]:
# Feature matrix: the selected feature columns as a float array.
X = csv_to.drop(columns='label')[features_list].values.astype('float')

In [None]:
len(X)

In [None]:
# Target vector: the label column as an integer array.
y = csv_to['label'].values.astype('int')

In [None]:
# Keep 20% of the rows aside for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Standardise the features: fit the scaler on the training split only, then
# apply that same transform to both splits.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Linear-kernel support vector classifier, fitted on the scaled training split.
svm = SVC(C=0.1, kernel='linear', random_state=1)
svm.fit(X_train, y_train)

In [None]:
y_pred = svm.predict(X_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix of the latest y_pred against y_test, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
ax = plt.subplot()
seaborn.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title='Confusion Matrix',
)
ax.xaxis.set_ticklabels(['no breath', 'breath'])
# The trailing ';' keeps the notebook from echoing the returned tick labels.
ax.yaxis.set_ticklabels(['no breath', 'breath']);

In [None]:
# Logistic regression (liblinear solver), fitted on the scaled training split.
lr = LogisticRegression(random_state=0, solver='liblinear')
lr.fit(X_train, y_train)

In [None]:
y_pred = lr.predict(X_test)

In [None]:
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix of the latest y_pred against y_test, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
ax = plt.subplot()
seaborn.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title='Confusion Matrix',
)
ax.xaxis.set_ticklabels(['no breath', 'breath'])
# The trailing ';' keeps the notebook from echoing the returned tick labels.
ax.yaxis.set_ticklabels(['no breath', 'breath']);

In [None]:
# Decision tree classifier fitted on the scaled training split. Seeded like the
# other two models so that re-running the cell reproduces the same tree
# (scikit-learn randomly permutes the features at each split).
dt = tree.DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)

In [None]:
y_pred = dt.predict(X_test)

In [None]:
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix of the latest y_pred against y_test, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
ax = plt.subplot()
seaborn.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set(
    xlabel='Predicted labels',
    ylabel='True labels',
    title='Confusion Matrix',
)
ax.xaxis.set_ticklabels(['no breath', 'breath'])
# The trailing ';' keeps the notebook from echoing the returned tick labels.
ax.yaxis.set_ticklabels(['no breath', 'breath']);

OK, OK, a model with only max and entropy works 100%. This makes sense. Breaths are taller and more entropic... by our definition. What we would need to do is cut the breaths by expert hand, include some edge cases, then run this ML over that (potentially different) dataset. Also remember we only had a few features in our dataset, of about 100 samples. What we can do is add 3x as much data and features, with and without entropy. We will discuss with the scientists exactly what features they want.

In [None]:
# Persist the fitted SVM to disk.
filename = '../ml_models/finalized_svm_model_in_111.sav'
# joblib.dump does not create missing folders; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(svm, filename)

# some time later... the model can be loaded back from disk with joblib.load(filename)


In [None]:
# Persist the fitted logistic regression model to disk.
filename = '../ml_models/finalized_lr_model_in_111.sav'
# joblib.dump does not create missing folders; make sure the target exists so
# the cell also works on a fresh checkout.
os.makedirs(os.path.dirname(filename), exist_ok=True)
joblib.dump(lr, filename)

In [None]:
# let's explore how we might apply a model over a running emg

In [None]:
# Collect every .Poly5 recording below the data folder (searched recursively).
root_emg_directory = os.path.join('../not_pushed','topspin_data_anonymized')
emg_pattern = os.path.join(root_emg_directory, '**/*.Poly5')
# glob returns matches in arbitrary, filesystem-dependent order; sort them so a
# given position in the file lists always refers to the same recording.
emg_and_draeger_files = sorted(glob.glob(emg_pattern, recursive=True))

# Split on the path text: paths containing 'Draeger' go to draeger_files,
# everything else to emg_files.
draeger_files = [path for path in emg_and_draeger_files if 'Draeger' in path]
emg_files = [path for path in emg_and_draeger_files if 'Draeger' not in path]

In [None]:
# One dropdown entry per EMG file, labelled with its position in emg_files.
list_of_numbers_strung = [str(position) for position, _ in enumerate(emg_files)]

btn = widgets.Dropdown(
    description='Picked File:',
    options=list_of_numbers_strung,
    value='0',
    disabled=False,
)
display(btn)

In [None]:
number_chosen = int(btn.value)
file_chosen = emg_files[number_chosen] 
print("The file you chose is:",file_chosen)

In [None]:
# Read the chosen recording and build two x-axes for it: the sample index and
# the corresponding time in seconds (index / sample rate).
data_emg = Poly5Reader(file_chosen)
data_samples = data_emg.samples
emg_sample_rate = data_emg.sample_rate
converted_to_samples = list(range(len(data_samples[0])))
converted_to_seconds = [index / emg_sample_rate for index in converted_to_samples]

In [None]:
len(data_samples[1])

In [None]:
%matplotlib inline
# Plot the first 20000 samples of the first three leads: against the sample
# index (left column) and against time in seconds (right column).
# The recording is indexed as data_samples[lead][sample] (the sample count is
# taken from len(data_samples[0])), so the window must be cut along the second
# axis. `data_samples[:20000]` sliced the lead axis instead and selected everything.
# NOTE(review): assumes data_samples is a 2-D (lead, sample) array - confirm.
x = np.asarray(data_samples)[:, :20000]
seconds = converted_to_seconds[:x.shape[1]]  # time axis trimmed to the same window
fig, axis = plt.subplots(nrows=3, ncols=2, figsize=(16, 6))
axis[0, 0].grid(True)
axis[0, 0].set(title='leads in samples')
axis[0, 1].grid(True)
axis[0, 1].set(title='leads in seconds')
for lead in range(3):
    axis[lead, 0].plot(x[lead])
    axis[lead, 1].plot(seconds, x[lead])

In [None]:
alt_emg_processed = hf.working_pipeline_pre_ml(data_samples, 'peaks')

In [None]:
emg_processed = hf.working_pipeline_pre_ml(data_samples, 'heart')

In [None]:
plt.plot(emg_processed)

In [None]:
# we had a final step in the snippet maker to make everything positive
our_emg_processed = abs(emg_processed)

In [None]:
plt.plot(our_emg_processed[:2000])
plt.show()

In [None]:
# Slide a window over the first 80000 processed samples, compute for every
# window the same features the scaler and models were fitted on, and classify.
toy_array = our_emg_processed[:80000]

# How to compute each trainable feature on a single window. The live features
# must follow `features_list` (same features, same order) because `sc`, `svm`
# and `lr` were fitted on exactly those columns; the previous hard-coded
# mean + entropy pair only matched when that exact selection had been made.
# 'std' uses ddof=1 to match the pandas default used for the training column.
window_feature_functions = {
    'std': lambda window: np.std(window, ddof=1),
    'max': np.max,
    'min': np.min,
    'mean': np.mean,
    'entropy': entropy,
}
unsupported = [name for name in features_list if name not in window_feature_functions]
if unsupported:
    # e.g. the random sanity-check columns cannot be recomputed on a live signal
    raise ValueError(
        'cannot compute these features on a live signal: {}'.format(unsupported)
    )

index_ml_hold = []
predictions_made = []
holder = []  # the raw windows, kept for later inspection
# presumably window length 1000 and step 1 - see hf.slices_jump_slider
for window in hf.slices_jump_slider(toy_array, 1000, 1):
    holder.append(window)
    index_ml_hold.append(
        [window_feature_functions[name](window) for name in features_list]
    )

# Scale with the training scaler, then predict with both fitted models.
X_test_live = sc.transform(index_ml_hold)
predictions_svm = svm.predict(X_test_live)
predictions_lr = lr.predict(X_test_live)


In [None]:
len(index_ml_hold)

In [None]:
predictions_lr

In [None]:
sum(predictions_svm)

In [None]:
sum(predictions_lr)

In [None]:
plt.plot(predictions_svm, color='purple', alpha= 0.5)
plt.plot(predictions_lr, color='green',  alpha= 0.5)
plt.plot(toy_array*1000, alpha=0.4)

Remember that at each point the model looks 1000 samples forward and makes a prediction over that whole window; thus we should probably shift the prediction arrays forward by 500.

In [None]:
# 500 placeholder values (all 3.0) used to pad the front of the prediction arrays.
shifter = np.full(500, 3.0)


In [None]:
# Put the placeholder block in front of each prediction array, so every
# prediction sits len(shifter) positions later.
shifted_lr = np.concatenate((shifter, predictions_lr))
shifted_svm = np.concatenate((shifter, predictions_svm))

In [None]:
plt.plot(shifted_svm, color='purple', alpha= 0.5)
plt.plot(shifted_lr, color='green',  alpha= 0.5)
plt.plot(toy_array*1000, alpha=0.4)

Perfect predictions? Let's try more cases, and see how our models do.

In [None]:
# NOTE(review): unexplained scratch calculation - presumably a sample count
# divided by a sample rate in Hz, giving a duration in seconds; confirm or remove.
150000/2048