In [1]:
import pickle, time
import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

## Load Dataset

In [2]:
file = open('df_hudgin_250_125.pkl', 'rb')
df = pickle.load(file)
file.close()

In [3]:
df.head(3)

Unnamed: 0,subject,day,session,motion,repetition,window_emg,mav,wl,ssc,zc
0,sub03,D3,S1,OH,1,"[[0.4140625, 0.6015625, -0.203125, -0.2109375,...","[0.26875, 0.1434375, 0.17390625, 0.03375, 0.03...","[18.875, 10.3515625, 14.3125, 2.296875, 2.1562...","[29, 27, 40, 29, 26, 29, 34, 42]","[24, 20, 30, 27, 18, 25, 35, 30]"
1,sub03,D3,S1,OH,1,"[[0.2421875, -0.0546875, -0.0078125, 0.015625,...","[0.2403125, 0.09796875, 0.135, 0.0196875, 0.02...","[18.3125, 6.9453125, 12.1015625, 1.4375, 2.218...","[33, 31, 41, 29, 33, 28, 37, 34]","[28, 19, 35, 21, 26, 25, 36, 28]"
2,sub03,D3,S1,OH,1,"[[-0.5546875, -0.125, -0.0625, -0.0234375, -0....","[0.153125, 0.1021875, 0.130625, 0.01890625, 0....","[11.6640625, 7.6328125, 12.078125, 1.3125, 1.9...","[31, 31, 40, 29, 29, 25, 37, 25]","[29, 26, 37, 18, 25, 22, 29, 24]"


In [4]:
df.shape

(585900, 10)

## Preprocessing

### Convert Labels to Numerics

In [5]:
df['label'] = df.motion.replace(
    df.motion.drop_duplicates().tolist(),
    [i for i in range(9)]
)

  df['label'] = df.motion.replace(


In [9]:
df.label.value_counts()

label
0    65100
1    65100
2    65100
3    65100
4    65100
5    65100
6    65100
7    65100
8    65100
Name: count, dtype: int64

### Only Account 7 Motions

In [15]:
df_7 = df[~df.motion.isin(['IN', 'GR'])]

In [17]:
df_7.motion.value_counts()

motion
OH    65100
CH    65100
EX    65100
FL    65100
SU    65100
PR    65100
RT    65100
Name: count, dtype: int64

### Concat All Features

In [16]:
def featurize(data):
    """
    Concatenate MAV, WL, SSC, and ZC features into a long list of features
    """
    features = None

    # Concatenate all relevant features
    for f in ['mav', 'wl', 'ssc', 'zc']:
        if type(features) != type(None):
            features = np.concatenate([features, np.vstack(data[f])], axis=1)
        else:
            features = np.vstack(data[f])

    return features

## Model Training Overall

In [18]:
subjects = ['sub0' + str(i) for i in range(1, 8)]
days = ['D' + str(i) for i in range(1, 16)]
sessions = ['S1', 'S2']

### Within-Session Analysis

In [151]:
ws_df = pd.DataFrame(columns=['subject', 'day', 'session', 'performance_cv', 'train_time', 'inference_time'])

# Analyse for each subject
for sub in tqdm(subjects):
    sub_df = df_7[df_7.subject == sub]
    
    # Analyse each session in each day
    for day in days:
        day_df = sub_df[sub_df.day == day]

        for sess in sessions:
            sess_df = day_df[day_df.session == sess]
            accuracies = list()
            train_times = list()
            inf_times = list()
            
            # Cross-validate 10 folds for each session
            for i in range(1, 11):
                # Devide test and train dataset
                train_df = sess_df[sess_df.repetition != i]
                test_df = sess_df[sess_df.repetition == i]

                # Set up features
                X_train = featurize(train_df)
                X_test = featurize(test_df)
                y_train = train_df.label.tolist()
                y_test = test_df.label.tolist()

                # Train Classifier
                lda = LinearDiscriminantAnalysis()
                start = time.time()
                lda.fit(X_train, y_train)
                train_time = (time.time() - start) * 1e6 # in microsecond
                train_times.append(train_time)

                # Get Accuracy
                start = time.time()
                y_pred = lda.predict(X_test)
                inf_time = (time.time() - start) * 1e6 # in microsecond
                inf_times.append(inf_time)
                accuracy = accuracy_score(y_test, y_pred)

                # Save Accuracy
                accuracies.append(accuracy)

            ws_df.loc[len(ws_df)] = [sub, day, sess, accuracies, train_times, inf_times]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:29<00:00,  4.22s/it]


In [152]:
# Average accross session
ws_df['session_mean'] = ws_df.performance_cv.apply(lambda x: np.mean(x))

In [153]:
ws_df['train_time_mean'] = ws_df.train_time.apply(lambda x: np.mean(x))

In [154]:
ws_df['inference_time_mean'] = ws_df.inference_time.apply(lambda x: np.mean(x))

In [155]:
ws_df.head()

Unnamed: 0,subject,day,session,performance_cv,train_time,inference_time,session_mean,train_time_mean,inference_time_mean
0,sub01,D1,S1,"[0.9861751152073732, 0.9769585253456221, 0.986...","[11936.90299987793, 10972.976684570312, 19439....","[112.05673217773438, 184.05914306640625, 87.02...",0.986175,11868.11924,92.935562
1,sub01,D1,S2,"[0.9308755760368663, 1.0, 1.0, 0.9907834101382...","[10228.157043457031, 9017.9443359375, 8841.037...","[78.2012939453125, 186.68174743652344, 71.0487...",0.980184,9368.276596,93.173981
2,sub01,D2,S1,"[0.9723502304147466, 0.9815668202764977, 0.953...","[9402.036666870117, 8732.795715332031, 8440.01...","[66.99562072753906, 131.13021850585938, 81.062...",0.981567,9303.045273,117.182732
3,sub01,D2,S2,"[0.8894009216589862, 0.9815668202764977, 0.949...","[11327.028274536133, 10749.101638793945, 8302....","[69.85664367675781, 49.82948303222656, 160.694...",0.972811,9831.666946,116.276741
4,sub01,D3,S1,"[0.8894009216589862, 0.9447004608294931, 0.995...","[8533.000946044922, 8562.088012695312, 10122.0...","[71.04873657226562, 50.78315734863281, 49.8294...",0.972811,8670.687675,79.751015


In [156]:
ws_df.shape

(210, 9)

In [157]:
ws_df.session_mean.mean()

0.9472767171384683

In [158]:
print(ws_df.train_time_mean.mean(), ws_df.inference_time_mean.mean())

9441.832020169213 107.35296067737397


In [159]:
# Save dataframe
file = open('./analysis/lda-2/within-session-lda.pkl', 'wb')
pickle.dump(ws_df, file)
file.close()

### Between-Sessions Analysis

In [161]:
bs_df = pd.DataFrame(columns=['subject', 'day', 'performance_cv', 'train_time', 'inference_time'])

# Analyse for each subject
for sub in tqdm(subjects):
    sub_df = df_7[df_7.subject == sub]
    
    # Analyse between sessions in a day
    for day in days:
        day_df = sub_df[sub_df.day == day]
        accuracies = list()
        train_times = list()
        inf_times = list()
        sess1_df = day_df[day_df.session == 'S1']
        sess2_df = day_df[day_df.session == 'S2']

        # Cross-validate 10 folds for each repetition in the other session
        for i in range(1, 11):
            # Devide test and train dataset
            train1_df = sess1_df[sess1_df.repetition != i]
            test1_df = sess2_df[sess2_df.repetition == i]
            train2_df = sess2_df[sess2_df.repetition != i]
            test2_df = sess1_df[sess1_df.repetition == i]

            # Set up features
            X1_train = featurize(train1_df)
            X1_test = featurize(test1_df)
            X2_train = featurize(train2_df)
            X2_test = featurize(test2_df)
            y1_train = train1_df.label.tolist()
            y1_test = test1_df.label.tolist()
            y2_train = train2_df.label.tolist()
            y2_test = test2_df.label.tolist()

            # Train Classifier
            lda1 = LinearDiscriminantAnalysis()
            lda2 = LinearDiscriminantAnalysis()

            start1 = time.time() # First model
            lda1.fit(X1_train, y1_train)
            ttime1 = (time.time() - start1) * 1e6 # in microsecond
            train_times.append(ttime1)
            
            start2 = time.time() # Second model
            lda2.fit(X2_train, y2_train)
            ttime2 = (time.time() - start2) * 1e6 # in microsecond
            train_times.append(ttime2)

            # Get Accuracy
            start1 = time.time() # First inference
            y1_pred = lda1.predict(X1_test)
            itime1 = (time.time() - start1) * 1e6 # in microsecond
            inf_times.append(itime1)

            start2 = time.time() # Second inference
            y2_pred = lda2.predict(X2_test)
            itime2 = (time.time() - start2) * 1e6 # in microsecond
            inf_times.append(itime2)
            
            acc1 = accuracy_score(y1_test, y1_pred)
            acc2 = accuracy_score(y2_test, y2_pred)

            # Save Accuracy
            accuracies.append(acc1)
            accuracies.append(acc2)

        bs_df.loc[len(bs_df)] = [sub, day, accuracies, train_times, inf_times]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:29<00:00,  4.28s/it]


In [162]:
bs_df['bs_mean'] = bs_df.performance_cv.apply(lambda x: np.mean(x))
bs_df['train_time_mean'] = bs_df.train_time.apply(lambda x: np.mean(x))
bs_df['inference_time_mean'] = bs_df.inference_time.apply(lambda x: np.mean(x))

In [163]:
bs_df.bs_mean.mean()

0.9246785165679174

In [167]:
print(bs_df.train_time_mean.mean(), bs_df.inference_time_mean.mean())

9711.67348680042 87.88562956310454


In [164]:
bs_df.head()

Unnamed: 0,subject,day,performance_cv,train_time,inference_time,bs_mean,train_time_mean,inference_time_mean
0,sub01,D1,"[0.8940092165898618, 1.0, 0.9769585253456221, ...","[10978.937149047852, 9562.73078918457, 9647.13...","[113.96408081054688, 58.88938903808594, 76.055...",0.979724,9145.641327,107.061863
1,sub01,D2,"[0.8387096774193549, 0.9769585253456221, 0.976...","[8899.927139282227, 8541.107177734375, 10882.8...","[83.92333984375, 41.961669921875, 84.877014160...",0.964977,10798.597336,77.843666
2,sub01,D3,"[0.8571428571428571, 0.9400921658986175, 0.972...","[8511.066436767578, 9396.076202392578, 8450.98...","[331.878662109375, 47.206878662109375, 67.2340...",0.965668,9127.104282,87.225437
3,sub01,D4,"[0.8709677419354839, 0.8525345622119815, 0.949...","[9103.29818725586, 8451.93862915039, 9019.1364...","[54.836273193359375, 54.12101745605469, 86.069...",0.956452,9856.438637,108.385086
4,sub01,D5,"[0.8940092165898618, 0.9815668202764977, 0.972...","[8459.091186523438, 9654.045104980469, 10845.8...","[55.07469177246094, 42.91534423828125, 52.9289...",0.974885,9883.594513,88.41753


In [165]:
bs_df.shape

(105, 8)

In [166]:
# Save dataframe
file = open('./analysis/lda-2/between-sessions-lda.pkl', 'wb')
pickle.dump(bs_df, file)
file.close()

### Between-Pair-of-Days Analysis

In [168]:
bpd_df = pd.DataFrame(columns=['subject', 'day_train', 'day_test', 'performance_cv', 'train_time', 'inference_time'])

# Analyse for each subject
for sub in tqdm(subjects):
    sub_df = df[df.subject == sub]

    # Analyse performance between pairs of days
    for day_train in days:
        for day_test in days:
            if day_train != day_test:
                dtrain_df = sub_df[sub_df.day == day_train]
                dtest_df = sub_df[sub_df.day == day_test]
                accuracies = list()
                train_times = list()
                inf_times = list()
                
                # Cross-validate 10 folds for each repetition in the other day
                for i in range(1, 11):
                    # Devide test and train dataset
                    train_df = dtrain_df[dtrain_df.repetition != i]
                    test_df = dtest_df[dtest_df.repetition == i]
        
                    # Set up features
                    X_train = featurize(train_df)
                    X_test = featurize(test_df)
                    y_train = train_df.label.tolist()
                    y_test = test_df.label.tolist()
        
                    # Train Classifier
                    lda = LinearDiscriminantAnalysis()
                    start = time.time()
                    lda.fit(X_train, y_train)
                    ttime = (time.time() - start) * 1e6 # in microsecond
                    train_times.append(ttime)
        
                    # Get Accuracy
                    start = time.time()
                    y_pred = lda.predict(X_test)
                    itime = (time.time() - start) * 1e6 # in microsecond
                    inf_times.append(itime)
                    
                    accuracy = accuracy_score(y_test, y_pred)
                    accuracies.append(accuracy)
        
                # Save Accuracy
                bpd_df.loc[len(bpd_df)] = [sub, day_train, day_test, accuracies, train_times, inf_times]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [06:36<00:00, 56.65s/it]


In [169]:
bpd_df['bpd_mean'] = bpd_df.performance_cv.apply(lambda x: np.mean(x))
bpd_df['train_time_mean'] = bpd_df.train_time.apply(lambda x: np.mean(x))
bpd_df['inference_time_mean'] = bpd_df.inference_time.apply(lambda x: np.mean(x))

In [170]:
bpd_df.bpd_mean.mean()

0.8262760588106209

In [171]:
print(bpd_df.train_time_mean.mean(), bpd_df.inference_time_mean.mean())

15734.47965440296 148.00092800944842


In [172]:
bpd_df.head()

Unnamed: 0,subject,day_train,day_test,performance_cv,train_time,inference_time,bpd_mean,train_time_mean,inference_time_mean
0,sub01,D1,D2,"[0.8405017921146953, 0.9121863799283154, 0.930...","[26171.207427978516, 25715.82794189453, 25542....","[170.9461212158203, 111.81831359863281, 88.930...",0.927419,19634.771347,129.580498
1,sub01,D1,D3,"[0.9247311827956989, 0.8745519713261649, 0.926...","[13991.117477416992, 15336.036682128906, 14504...","[82.96966552734375, 70.33348083496094, 158.071...",0.918817,14803.123474,94.604492
2,sub01,D1,D4,"[0.7634408602150538, 0.8602150537634409, 0.937...","[14326.095581054688, 15532.970428466797, 14161...","[161.17095947265625, 74.86343383789062, 338.07...",0.901613,14823.722839,154.232979
3,sub01,D1,D5,"[0.8494623655913979, 0.9480286738351255, 0.908...","[14913.082122802734, 14956.951141357422, 14958...","[224.11346435546875, 268.9361572265625, 72.240...",0.908961,15521.454811,149.440765
4,sub01,D1,D6,"[0.8924731182795699, 0.9121863799283154, 0.890...","[15619.993209838867, 15011.072158813477, 15323...","[118.01719665527344, 97.99003601074219, 154.01...",0.898029,14798.71273,172.352791


In [173]:
bpd_df.shape

(1470, 9)

In [174]:
# Save dataframe
file = open('./analysis/lda-2/between-pairwise-days-lda.pkl', 'wb')
pickle.dump(bpd_df, file)
file.close()

In [175]:
bpd_avg = bpd_df.groupby(by=['day_train', 'day_test']).mean('bpd_mean').reset_index()

In [176]:
bpd_avg_index_sorted = bpd_avg.pivot(index='day_train', columns='day_test', values='bpd_mean').sort_index(key=lambda x: x.map(lambda y: int(y[1:])))

In [177]:
bpd_avg_index_sorted = bpd_avg_index_sorted.reindex(columns=sorted(bpd_avg_index_sorted.columns, key=lambda x: int(x[1:])))

In [4]:
bpd_avg_index_sorted * 100

day_test,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15
day_train,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
D1,,88.801843,83.632873,77.903226,79.472606,78.520225,79.22171,79.549411,76.154634,77.685612,77.296467,77.616487,74.805428,77.867384,75.089606
D2,87.117256,,87.119816,82.02509,83.952893,82.857143,82.800819,83.7532,81.40041,80.125448,80.921659,80.481311,78.323093,79.69022,77.534562
D3,81.915003,86.003584,,82.18638,85.811572,82.81362,85.135689,85.276498,82.18894,81.641065,82.895545,81.986687,79.173067,79.605735,76.479775
D4,77.583205,82.004608,81.484895,,85.791091,83.105479,83.927291,85.30722,85.386585,81.262161,83.417819,80.995904,81.362007,81.490015,79.557092
D5,79.098822,81.492576,84.703021,85.322581,,86.658986,86.981567,87.611367,85.412186,86.71531,85.240655,85.432668,80.670763,80.16129,77.990271
D6,81.331285,83.817204,85.491551,83.958013,88.502304,,88.27957,87.171019,84.889913,86.55914,84.633897,83.942652,79.948797,81.845878,78.822325
D7,81.059908,84.170507,85.665643,85.483871,88.146441,87.772657,,89.546851,86.052227,87.665131,86.820276,86.620584,82.647209,83.445981,80.816692
D8,78.28725,82.014849,84.285714,84.554531,87.688172,85.138249,88.878648,,88.870968,88.719918,87.099334,87.265745,84.157706,82.050691,80.716846
D9,73.806964,78.998976,79.344598,82.465438,83.860727,82.322069,83.794163,87.721454,,87.158218,86.303123,86.889401,84.759345,82.803379,83.082437
D10,74.06554,78.243728,78.90425,79.091142,84.482847,82.242704,84.608295,85.588838,86.56938,,87.585765,88.58679,83.115719,81.001024,81.438812


In [179]:
# Save dataframe
file = open('./analysis/lda-2/between-pairwise-days-avg-lda.pkl', 'wb')
pickle.dump(bpd_avg_index_sorted, file)
file.close()

In [2]:
# Save dataframe
file = open('./analysis/lda-2/between-pairwise-days-avg-lda.pkl', 'rb')
bpd_avg_index_sorted = pickle.load(file)
file.close()

### Between-Days-Leave-One-Out Analysis

In [180]:
bd_df = pd.DataFrame(columns=['subject', 'day_test', 'performance_cv', 'train_time', 'inference_time'])

# Analyse for each subject
for sub in tqdm(subjects):
    sub_df = df[df.subject == sub]

    # Analyse performance between days
    for day_test in days:
        dtrain_df = sub_df[sub_df.day != day_test]
        dtest_df = sub_df[sub_df.day == day_test]
        accuracies = list()
        train_time = list()
        inf_time = list()
        
        # Cross-validate 10 folds for each repetition in the test day
        for i in range(1, 11):
            # Devide test and train dataset
            train_df = dtrain_df[dtrain_df.repetition != i]
            test_df = dtest_df[dtest_df.repetition == i]

            # Set up features
            X_train = featurize(train_df)
            X_test = featurize(test_df)
            y_train = train_df.label.tolist()
            y_test = test_df.label.tolist()

            # Train Classifier
            lda = LinearDiscriminantAnalysis()
            start = time.time()
            lda.fit(X_train, y_train)
            ttime = (time.time() - start) * 1e6 # in microsecond
            train_time.append(ttime)

            # Get Accuracy
            start = time.time()
            y_pred = lda.predict(X_test)
            itime = (time.time() - start) * 1e6 # in microsecond
            inf_time.append(itime)
            
            accuracy = accuracy_score(y_test, y_pred)
            accuracies.append(accuracy)

        # Save Accuracy
        bd_df.loc[len(bd_df)] = [sub, day_test, accuracies, train_time, inf_time]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [04:42<00:00, 40.40s/it]


In [181]:
bd_df['bd_mean'] = bd_df.performance_cv.apply(lambda x: np.mean(x))
bd_df['train_time_mean'] = bd_df.train_time.apply(lambda x: np.mean(x))
bd_df['inference_time_mean'] = bd_df.inference_time.apply(lambda x: np.mean(x))

In [182]:
bd_df.bd_mean.mean()

0.8766922683051714

In [183]:
print(bd_df.train_time_mean.mean(), bd_df.inference_time_mean.mean())

137604.23410506477 225.95519111269996


In [184]:
# Save dataframe
file = open('./analysis/lda-2/between-days-lda.pkl', 'wb')
pickle.dump(bd_df, file)
file.close()

In [187]:
bd_df.groupby('day_test').mean('bd_mean')

Unnamed: 0_level_0,bd_mean,train_time_mean,inference_time_mean
day_test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
D1,0.824808,138234.220232,227.594376
D10,0.891526,137045.386859,222.853252
D11,0.895955,137130.907604,217.284475
D12,0.900461,136802.448545,238.462857
D13,0.872862,137764.082636,213.72182
D14,0.873016,136652.629716,227.825982
D15,0.859677,137228.969165,239.402907
D2,0.858653,139208.306585,209.389414
D3,0.871045,139364.273208,234.777587
D4,0.864747,136980.039733,211.140088
