In [None]:
import os
import pickle
import random
import pandas as pd
import numpy as np
from scipy import signal

# AU definitions: maps each action-unit column name to a string of the form
# "<action unit description>, <upper|lower>". Insertion order follows the AU number.
au_columns_name = {
    'AU01_r': 'Inner brow raiser, upper',
    'AU02_r': 'Outer brow raiser, upper',
    'AU04_r': 'Brow lowerer, upper',
    'AU05_r': 'Upper lid raiser, upper',
    'AU06_r': 'Cheekraiser, upper',
    'AU07_r': 'Lid tightener, upper',
    'AU09_r': 'Nose wrinkler, lower',
    'AU10_r': 'Upper lip raiser, lower',
    'AU12_r': 'Lip corner puller, lower',
    'AU14_r': 'Dimpler, lower',
    'AU15_r': 'Lip corner depressor, lower',
    'AU17_r': 'Chin raiser, lower',
    'AU20_r': 'Lipstretcher, lower',
    'AU23_r': 'Lip tightener, lower',
    'AU25_r': 'Lips part, lower',
    'AU26_r': 'Jaw drop, lower',
    'AU45_r': 'Blink, upper',
}

def _first_existing_path(candidates, description):
    """Return the first path in ``candidates`` that exists on disk.

    Raises:
        FileNotFoundError: if none of the candidates exists. ``description``
            is included in the message. Failing here prevents a missing
            recording from silently reusing data loaded in a previous
            loop iteration.
    """
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    raise FileNotFoundError(
        "No {} found; tried: {}".format(description, ', '.join(candidates)))


def _lookup_fps(fps_dataset, subject, study, paradigm):
    """Return the integer FPS stored for one subject/study/paradigm recording.

    Paradigm and study are matched case-insensitively; the subject is matched
    numerically when possible and as a string otherwise.

    Raises:
        LookupError: if the FPS table has no row for the recording.
    """
    paradigm_mask = fps_dataset['Paradigm'].astype(str).str.upper() == str(paradigm).upper()
    study_mask = fps_dataset['Study'].astype(str).str.upper() == str(study).upper()
    try:
        subject_mask = pd.to_numeric(fps_dataset['Subject'], errors='coerce') == float(subject)
    except (TypeError, ValueError):
        subject_mask = fps_dataset['Subject'].astype(str) == str(subject)

    matches = fps_dataset.loc[paradigm_mask & study_mask & subject_mask, 'FPS']
    if matches.empty:
        raise LookupError(
            "No FPS entry for subject={}, study={}, paradigm={}".format(subject, study, paradigm))
    # .iloc[0] instead of int(Series), which pandas has deprecated.
    return int(matches.iloc[0])


def generate_dataset(subjects, studies, paradigms, stack=False):
    """Append one fixed-length AU window per trial to the module-level ``X``/``Y`` lists.

    For every subject/study/paradigm combination the trial-frames CSV and the
    action-unit (AU) CSV are located under ``../dataset`` (several upper/lower
    case spellings of the paradigm are tried). For each of the first 50 trials
    a window of ``1.5 * fps`` frames starting at ``trial_start`` is cut from
    the AU columns named in ``au_columns_name`` (sorted by column name),
    resampled to ``int(1.5 * max_fps)`` samples, rounded to 3 decimals and
    appended to ``X`` as a float32 array; the trial's ``code`` is appended to
    ``Y`` as an int.

    ``fps`` is looked up in the FPS table for the recording currently being
    processed; ``max_fps`` is the largest FPS in that table, so every window
    ends up with the same number of samples.

    Args:
        subjects: iterable of subject identifiers (strings, used in paths).
        studies: iterable of study names (strings, used in paths).
        paradigms: iterable of paradigm names (strings, any case).
        stack: if true, the global ``X`` is finally replaced by the stacked
            array (same axis permutation as ``stack_dataset``).

    Raises:
        FileNotFoundError: if a trial-frames or AU CSV cannot be found.
        LookupError: if the FPS table has no entry for a recording.
        ValueError: if a trial window contains no frames.

    Note:
        The caller must initialise the module-level ``X`` and ``Y`` lists
        (e.g. ``X = []; Y = []``) before calling; results are appended to them.
    """
    global X
    global Y

    fps_dataset = pd.read_csv('../dataset/csv_labels/FPS_of_stutter_dataset.csv', index_col=0)
    # Loop-invariant values: common target length and the AU columns to extract.
    max_fps = int(max(fps_dataset['FPS']))
    n_samples = int(1.5 * max_fps)
    au_columns = sorted(au_columns_name)

    for subject in subjects:
        for study in studies:
            for paradigm in paradigms:
                print("{}/{}/{}".format(subject, study, paradigm))
                fps = _lookup_fps(fps_dataset, subject, study, paradigm)

                lower, upper = paradigm.lower(), paradigm.upper()

                # Trial boundaries and labels; the folder may be lower or upper case.
                trial_file = '_'.join(['trial', 'frames', subject, study.lower(), lower]) + '.csv'
                trials_path = _first_existing_path(
                    ['/'.join(['../dataset', 'csv_labels', subject, study, folder, trial_file])
                     for folder in (lower, upper)],
                    'trial-frames CSV')
                df_with_trials = pd.read_csv(trials_path)

                # AU intensities; folder and file name may each be lower or upper case.
                au_path = _first_existing_path(
                    ['/'.join(['../dataset', 'au', subject, study, folder,
                               '_'.join([subject, stem]) + '.csv'])
                     for folder, stem in ((lower, lower), (upper, upper),
                                          (lower, upper), (upper, lower))],
                    'action-unit CSV')
                # A multi-character separator needs the python engine; naming it
                # explicitly avoids the ParserWarning about the implicit fallback.
                full_au_dataset = pd.read_csv(au_path, sep=', ', engine='python')

                # (frames, n_aus) matrix of the selected AU columns.
                au_values = full_au_dataset[au_columns].to_numpy()

                # for all 50 trials per subject
                for i in range(50):
                    start = int(df_with_trials['trial_start'].iloc[i])
                    stop = int(start + 1.5 * fps)
                    window = au_values[start:stop]
                    if window.shape[0] == 0:
                        raise ValueError(
                            "Empty AU window for {}/{}/{} trial {} (frames {}:{})".format(
                                subject, study, paradigm, i, start, stop))

                    # Resample every AU column (axis 0 = time) to the common length.
                    resampled = signal.resample(window, n_samples, axis=0)

                    # Round the values to 3 decimals (ablation study for preprocessing maybe?)
                    X.append(np.asarray(np.round(resampled, 3), np.float32))
                    Y.append(int(df_with_trials['code'].iloc[i]))

    if stack:
        # Moving axes to DL format NHWC. However, here, we don't consider 'c (channels)'
        # because we do the permute and reshape operations during training in Keras.
        # As such the only change needed to this X array is to add a new axis
        # for the channels and make it 4D NHWC or NCHW.
        X = np.dstack(X)
        X = np.moveaxis(X, [0, 1], [2, 1])

def stack_dataset(X):
    """Stack a sequence of equally shaped arrays and exchange the first and third axes.

    ``np.dstack`` joins the inputs along the third axis; swapping axes 0 and 2
    then puts the index of the input array first. For 2-D inputs of shape
    ``(rows, cols)`` the result has shape ``(len(X), cols, rows)``.

    No channel axis is added here: per the original note, the permute/reshape
    to a 4-D NHWC/NCHW layout is done later, during training.
    """
    depth_stacked = np.dstack(X)
    return np.swapaxes(depth_stacked, 0, 2)

def correct_dataset_labels(_X, _Y):
    """Collapse labels 1-3 into 1 and drop every sample whose label is above 3.

    Labels equal to 0 (and any negative labels) are kept unchanged.

    Side effect: labels 1-3 are rewritten to 1 directly in ``_Y``; samples with
    a label above 3 stay in ``_Y`` and are only removed from the returned
    copies. Callers in this file rely on that in-place relabelling when they
    later build class masks from the original label array.

    Args:
        _X: sequence of samples, indexed in parallel with ``_Y``.
        _Y: mutable sequence (list or array) of numeric labels.

    Returns:
        Tuple ``(X_kept, Y_kept)`` of numpy arrays holding the retained samples
        and their corrected labels.
    """
    # A set gives O(1) membership tests in the filters below.
    unwanted_indexes = set()
    for i, y in enumerate(_Y):
        if 0 < y < 4:
            _Y[i] = 1
        elif y > 3:
            unwanted_indexes.add(i)

    _X_ = [v for i, v in enumerate(_X) if i not in unwanted_indexes]
    _Y_ = [v for i, v in enumerate(_Y) if i not in unwanted_indexes]
    return (np.asarray(_X_), np.asarray(_Y_))

def save_data_to_file(subjects, studies, paradigms, X_array, X_array_corrected, X_array_balanced, Y_array, Y_array_corrected, Y_array_balanced):
    """Pickle the raw, label-corrected and balanced X/Y arrays as float32.

    Files are written to ``../dataset/pickled_datasets/`` and are named after
    ``subjects[0]``. The file name carries no extra tag when ``paradigms``
    contains both cue-word (caw*/cw*) and word-go (wag*/wg*) entries, the tag
    ``CueWord_`` when only cue-word entries are present and ``WordGo_`` when
    only word-go entries are present. ``studies`` is accepted but not used.

    Raises:
        ValueError: if ``paradigms`` contains none of the known paradigm names;
            nothing is written in that case.
    """
    has_cue_word = any(p in paradigms for p in ['caw1', 'caw2', 'cw1', 'cw2'])
    has_word_go = any(p in paradigms for p in ['wag1', 'wag2', 'wg1', 'wg2'])

    if has_cue_word and has_word_go:
        tag, message = '', "Saved all paradigms.. "
    elif has_cue_word:
        tag, message = 'CueWord_', "Saved CW paradigms.. "
    elif has_word_go:
        tag, message = 'WordGo_', "Saved WG paradigms.. "
    else:
        raise ValueError('Files not saved !!!')

    # Written in this order: raw, label corrected, balanced (X before Y each time).
    payloads = (
        ('X_array_raw', X_array),
        ('Y_array_raw', Y_array),
        ('X_array_corrected', X_array_corrected),
        ('Y_array_corrected', Y_array_corrected),
        ('X_array_balanced', X_array_balanced),
        ('Y_array_balanced', Y_array_balanced),
    )
    for prefix, data in payloads:
        target = ('../dataset/pickled_datasets/' + prefix + '_upsampled_S1S2_'
                  + tag + str(subjects[0]) + '.pkl')
        with open(target, 'wb') as f:
            pickle.dump(np.asarray(data, np.float32), f)
    print(message)

def generate_single_subject(subject, studies, paradigms, save=True):
    """Build, balance and optionally save the dataset for one subject.

    ex: generate_single_subject(['942'], ['S1', 'S2'], ['caw1', 'caw2', 'cw1', 'cw2'])

    Steps: ``generate_dataset`` fills the module-level ``X``/``Y`` lists, the
    trials are stacked, labels are corrected with ``correct_dataset_labels``,
    and a 50-50 balanced subset is drawn by randomly subsampling the majority
    class without replacement.

    Args:
        subject: list of subject identifiers, passed on as ``subjects``.
        studies: list of study names.
        paradigms: list of paradigm names.
        save: when true, the raw/corrected/balanced arrays are written with
            ``save_data_to_file``.

    Note:
        The module-level ``X`` and ``Y`` lists must exist before the call and
        are not reset here, so trials already stored in them are included.
    """
    subjects = subject
    generate_dataset(subjects, studies, paradigms)

    # stack datasets
    X_array = stack_dataset(X)
    Y_array = np.asarray(Y, dtype=np.int32)

    # Create label corrected dataset. correct_dataset_labels also rewrites
    # Y_array in place (labels 1-3 -> 1); the class masks below rely on that.
    X_array_corrected, Y_array_corrected = correct_dataset_labels(X_array, Y_array)

    # Count the classes directly: histogram bins misreport the counts (0/0)
    # when only a single class is present.
    n_fluent = int((Y_array_corrected == 0).sum())
    n_stutter = int((Y_array_corrected == 1).sum())
    n_total = n_fluent + n_stutter
    pct_fluent = 100 * (n_fluent / n_total) if n_total else 0.0
    pct_stutter = 100 * (n_stutter / n_total) if n_total else 0.0
    print("Fluent Trials: {} ({:.2f}%), Stutter Trials: {} ({:.2f}%)".format(n_fluent, pct_fluent, n_stutter, pct_stutter))

    print("Creating Balanced Dataset with 50-50 Split")
    fluent_indices = np.where(Y_array == 0)[0]
    stutter_indices = np.where(Y_array == 1)[0]
    # Pick the majority class explicitly instead of waiting for
    # np.random.choice to raise ValueError on an over-sized sample.
    if fluent_indices.shape[0] >= stutter_indices.shape[0]:
        majority_indices, minority_indices = fluent_indices, stutter_indices
    else:
        majority_indices, minority_indices = stutter_indices, fluent_indices

    # Draw as many majority samples as there are minority samples, keep all
    # minority samples, then shuffle the combined index list.
    random_data_points = np.random.choice(majority_indices, size=minority_indices.shape[0], replace=False)
    random_data_points = np.concatenate((random_data_points, minority_indices))
    random.shuffle(random_data_points)

    X_array_balanced = X_array[random_data_points]
    Y_array_balanced = Y_array[random_data_points]

    print("Balanced Shapes X:", X_array_balanced.shape, "Y:", Y_array_balanced.shape)

    if save:
        save_data_to_file(subjects, studies, paradigms, X_array, X_array_corrected, X_array_balanced, Y_array, Y_array_corrected, Y_array_balanced)

In [None]:
# Paradigm family to export: 'all', 'cw' (caw*/cw* paradigms) or 'wg' (wag*/wg* paradigms).
paradigm_type = 'all'

# Subject 942
# Fresh module-level buffers; generate_dataset() appends to them.
X, Y = [], []
subjects, studies = ['942'], ['S1', 'S2']
if paradigm_type == 'all':
    paradigms = ['caw1','caw2', 'wag1', 'wag2', 'cw1', 'cw2', 'wg1', 'wg2']
elif paradigm_type == 'cw':
    paradigms = ['caw1', 'caw2', 'cw1', 'cw2']
elif paradigm_type == 'wg':
    paradigms = ['wag1', 'wag2', 'wg1', 'wg2']

generate_dataset(subjects, studies, paradigms)

# Turn the accumulated trials into arrays: stacked AU windows and int32 labels.
X_array, Y_array = stack_dataset(X), np.asarray(Y, dtype=np.int32)

# Label-corrected dataset. NOTE: correct_dataset_labels() also rewrites
# Y_array in place, and the class masks used for balancing below depend on it.
X_array_corrected, Y_array_corrected = correct_dataset_labels(X_array, Y_array)
Y_hist = np.histogram(Y_array_corrected)
# The first and last histogram bins are reported as the fluent / stutter counts.
Y_hist_sum = Y_hist[0][0] + Y_hist[0][-1]
print("Fluent Trials: {} ({:.2f}%), Stutter Trials: {} ({:.2f}%)".format(
    Y_hist[0][0], 100*(Y_hist[0][0]/Y_hist_sum),
    Y_hist[0][-1], 100*(Y_hist[0][-1]/Y_hist_sum)))

print("Creating Balanced Dataset with 50-50 Split")
try:
    # At least as many fluent (0) as stutter (1) trials: draw, without
    # replacement, one fluent index per stutter trial and keep all stutter indices.
    random_data_points = np.random.choice(
        np.where(Y_array == 0)[0], size=len(X_array[Y_array == 1]), replace=False)
    assert len(random_data_points) == len(X_array[Y_array == 1])
    random_data_points = np.concatenate([random_data_points, np.where(Y_array == 1)[0]])
    random.shuffle(random_data_points)
except ValueError:
    # np.random.choice could not draw that many fluent trials, so stutter is
    # the larger class: subsample the stutter trials instead.
    random_data_points = np.random.choice(
        np.where(Y_array == 1)[0], size=len(X_array[Y_array == 0]), replace=False)
    assert len(random_data_points) == len(X_array[Y_array == 0])
    random_data_points = np.concatenate([random_data_points, np.where(Y_array == 0)[0]])
    random.shuffle(random_data_points)

X_array_balanced, Y_array_balanced = X_array[random_data_points], Y_array[random_data_points]
print("Balanced Shapes X:", X_array_balanced.shape, "Y:", Y_array_balanced.shape)

# Persist the raw, corrected and balanced arrays for this subject.
save_data_to_file(
    subjects, studies, paradigms,
    X_array, X_array_corrected, X_array_balanced,
    Y_array, Y_array_corrected, Y_array_balanced,
)

In [None]:
# Subject 970
# Fresh module-level buffers; generate_dataset() appends to them.
X, Y = [], []
subjects, studies = ['970'], ['S5']
if paradigm_type == 'all':
    paradigms = ['caw1','caw2', 'wag1', 'wag2', 'cw1', 'cw2', 'wg1', 'wg2']
elif paradigm_type == 'cw':
    paradigms = ['caw1', 'caw2', 'cw1', 'cw2']
elif paradigm_type == 'wg':
    paradigms = ['wag1', 'wag2', 'wg1', 'wg2']
generate_dataset(subjects, studies, paradigms)

# Turn the accumulated trials into arrays: stacked AU windows and int32 labels.
X_array, Y_array = stack_dataset(X), np.asarray(Y, dtype=np.int32)

# Label-corrected dataset. NOTE: correct_dataset_labels() also rewrites
# Y_array in place, and the class masks used for balancing below depend on it.
X_array_corrected, Y_array_corrected = correct_dataset_labels(X_array, Y_array)
Y_hist = np.histogram(Y_array_corrected)
# The first and last histogram bins are reported as the fluent / stutter counts.
Y_hist_sum = Y_hist[0][0] + Y_hist[0][-1]
print("Fluent Trials: {} ({:.2f}%), Stutter Trials: {} ({:.2f}%)".format(
    Y_hist[0][0], 100*(Y_hist[0][0]/Y_hist_sum),
    Y_hist[0][-1], 100*(Y_hist[0][-1]/Y_hist_sum)))

print("Creating Balanced Dataset with 50-50 Split")
try:
    # At least as many fluent (0) as stutter (1) trials: draw, without
    # replacement, one fluent index per stutter trial and keep all stutter indices.
    random_data_points = np.random.choice(
        np.where(Y_array == 0)[0], size=len(X_array[Y_array == 1]), replace=False)
    assert len(random_data_points) == len(X_array[Y_array == 1])
    random_data_points = np.concatenate([random_data_points, np.where(Y_array == 1)[0]])
    random.shuffle(random_data_points)
except ValueError:
    # np.random.choice could not draw that many fluent trials, so stutter is
    # the larger class: subsample the stutter trials instead.
    random_data_points = np.random.choice(
        np.where(Y_array == 1)[0], size=len(X_array[Y_array == 0]), replace=False)
    assert len(random_data_points) == len(X_array[Y_array == 0])
    random_data_points = np.concatenate([random_data_points, np.where(Y_array == 0)[0]])
    random.shuffle(random_data_points)

X_array_balanced, Y_array_balanced = X_array[random_data_points], Y_array[random_data_points]
print("Balanced Shapes X:", X_array_balanced.shape, "Y:", Y_array_balanced.shape)

# Persist the raw, corrected and balanced arrays for this subject.
save_data_to_file(
    subjects, studies, paradigms,
    X_array, X_array_corrected, X_array_balanced,
    Y_array, Y_array_corrected, Y_array_balanced,
)

In [None]:
# Subject 971
# Fresh module-level buffers; each generate_dataset() call below appends to them.
X, Y = [], []

# Studies S3 and S5: full paradigm list.
subjects, studies = ['971'], ['S3', 'S5']
if paradigm_type == 'all':
    paradigms = ['caw1','caw2', 'wag1', 'wag2', 'cw1', 'cw2', 'wg1', 'wg2']
elif paradigm_type == 'cw':
    paradigms = ['caw1', 'caw2', 'cw1', 'cw2']
elif paradigm_type == 'wg':
    paradigms = ['wag1', 'wag2', 'wg1', 'wg2']
generate_dataset(subjects, studies, paradigms)

# Study S4: CW2 is left out.
subjects, studies = ['971'], ['S4']
if paradigm_type == 'all':
    paradigms = ['caw1', 'caw2', 'cw1', 'wag1', 'wag2', 'wg1', 'wg2'] # no frames in CW2
elif paradigm_type == 'cw':
    paradigms = ['caw1', 'caw2', 'cw1']
elif paradigm_type == 'wg':
    paradigms = ['wag1', 'wag2', 'wg1', 'wg2']
generate_dataset(subjects, studies, paradigms)

# Turn the accumulated trials into arrays: stacked AU windows and int32 labels.
X_array, Y_array = stack_dataset(X), np.asarray(Y, dtype=np.int32)

# Label-corrected dataset. NOTE: correct_dataset_labels() also rewrites
# Y_array in place, and the class masks used for balancing below depend on it.
X_array_corrected, Y_array_corrected = correct_dataset_labels(X_array, Y_array)
Y_hist = np.histogram(Y_array_corrected)
# The first and last histogram bins are reported as the fluent / stutter counts.
Y_hist_sum = Y_hist[0][0] + Y_hist[0][-1]
print("Fluent Trials: {} ({:.2f}%), Stutter Trials: {} ({:.2f}%)".format(
    Y_hist[0][0], 100*(Y_hist[0][0]/Y_hist_sum),
    Y_hist[0][-1], 100*(Y_hist[0][-1]/Y_hist_sum)))

print("Creating Balanced Dataset with 50-50 Split")
try:
    # At least as many fluent (0) as stutter (1) trials: draw, without
    # replacement, one fluent index per stutter trial and keep all stutter indices.
    random_data_points = np.random.choice(
        np.where(Y_array == 0)[0], size=len(X_array[Y_array == 1]), replace=False)
    assert len(random_data_points) == len(X_array[Y_array == 1])
    random_data_points = np.concatenate([random_data_points, np.where(Y_array == 1)[0]])
    random.shuffle(random_data_points)
except ValueError:
    # np.random.choice could not draw that many fluent trials, so stutter is
    # the larger class: subsample the stutter trials instead.
    random_data_points = np.random.choice(
        np.where(Y_array == 1)[0], size=len(X_array[Y_array == 0]), replace=False)
    assert len(random_data_points) == len(X_array[Y_array == 0])
    random_data_points = np.concatenate([random_data_points, np.where(Y_array == 0)[0]])
    random.shuffle(random_data_points)

X_array_balanced, Y_array_balanced = X_array[random_data_points], Y_array[random_data_points]
print("Balanced Shapes X:", X_array_balanced.shape, "Y:", Y_array_balanced.shape)

# Persist the raw, corrected and balanced arrays for this subject.
save_data_to_file(
    subjects, studies, paradigms,
    X_array, X_array_corrected, X_array_balanced,
    Y_array, Y_array_corrected, Y_array_balanced,
)

In [None]:
# Subject 982
# Fresh module-level buffers; each generate_dataset() call below appends to them.
X, Y = [], []

# Study S4: WAG1 is left out.
subjects, studies = ['982'], ['S4']
if paradigm_type == 'all':
    paradigms = ['caw1', 'caw2', 'wag2', 'cw1', 'cw2', 'wg1', 'wg2'] # no frames in WAG1
elif paradigm_type == 'cw':
    paradigms = ['caw1', 'caw2', 'cw1', 'cw2']
elif paradigm_type == 'wg':
    paradigms = ['wag2', 'wg1', 'wg2']
generate_dataset(subjects, studies, paradigms)

# Studies S3 and S5: full paradigm list.
subjects, studies = ['982'], ['S3', 'S5']
if paradigm_type == 'all':
    paradigms = ['caw1','caw2', 'wag1', 'wag2', 'cw1', 'cw2', 'wg1', 'wg2']
elif paradigm_type == 'cw':
    paradigms = ['caw1', 'caw2', 'cw1', 'cw2']
elif paradigm_type == 'wg':
    paradigms = ['wag1', 'wag2', 'wg1', 'wg2']
generate_dataset(subjects, studies, paradigms)

# Study S1: WAG1 is left out.
subjects, studies = ['982'], ['S1']
if paradigm_type == 'all':
    paradigms = ['caw1', 'caw2', 'wag2', 'cw1', 'cw2', 'wg1', 'wg2'] # word mismatch in WAG1
elif paradigm_type == 'cw':
    paradigms = ['caw1', 'caw2', 'cw1', 'cw2']
elif paradigm_type == 'wg':
    paradigms = ['wag2', 'wg1', 'wg2']
generate_dataset(subjects, studies, paradigms)

# Study S2: CAW1, CW1 and WG2 are left out.
subjects, studies = ['982'], ['S2']
if paradigm_type == 'all': # no frames in CAW1, CW1, and WG2
    paradigms = ['caw2', 'cw2', 'wag1', 'wag2', 'wg1']
elif paradigm_type == 'cw':
    paradigms = ['caw2', 'cw2']
elif paradigm_type == 'wg':
    paradigms = ['wag1', 'wag2', 'wg1']
generate_dataset(subjects, studies, paradigms)

# Turn the accumulated trials into arrays: stacked AU windows and int32 labels.
X_array, Y_array = stack_dataset(X), np.asarray(Y, dtype=np.int32)

# Label-corrected dataset. NOTE: correct_dataset_labels() also rewrites
# Y_array in place, and the class masks used for balancing below depend on it.
X_array_corrected, Y_array_corrected = correct_dataset_labels(X_array, Y_array)
Y_hist = np.histogram(Y_array_corrected)
# The first and last histogram bins are reported as the fluent / stutter counts.
Y_hist_sum = Y_hist[0][0] + Y_hist[0][-1]
print("Fluent Trials: {} ({:.2f}%), Stutter Trials: {} ({:.2f}%)".format(
    Y_hist[0][0], 100*(Y_hist[0][0]/Y_hist_sum),
    Y_hist[0][-1], 100*(Y_hist[0][-1]/Y_hist_sum)))

print("Creating Balanced Dataset with 50-50 Split")
try:
    # At least as many fluent (0) as stutter (1) trials: draw, without
    # replacement, one fluent index per stutter trial and keep all stutter indices.
    random_data_points = np.random.choice(
        np.where(Y_array == 0)[0], size=len(X_array[Y_array == 1]), replace=False)
    assert len(random_data_points) == len(X_array[Y_array == 1])
    random_data_points = np.concatenate([random_data_points, np.where(Y_array == 1)[0]])
    random.shuffle(random_data_points)
except ValueError:
    # np.random.choice could not draw that many fluent trials, so stutter is
    # the larger class: subsample the stutter trials instead.
    random_data_points = np.random.choice(
        np.where(Y_array == 1)[0], size=len(X_array[Y_array == 0]), replace=False)
    assert len(random_data_points) == len(X_array[Y_array == 0])
    random_data_points = np.concatenate([random_data_points, np.where(Y_array == 0)[0]])
    random.shuffle(random_data_points)

X_array_balanced, Y_array_balanced = X_array[random_data_points], Y_array[random_data_points]
print("Balanced Shapes X:", X_array_balanced.shape, "Y:", Y_array_balanced.shape)

# Persist the raw, corrected and balanced arrays for this subject.
save_data_to_file(
    subjects, studies, paradigms,
    X_array, X_array_corrected, X_array_balanced,
    Y_array, Y_array_corrected, Y_array_balanced,
)

In [None]:
# Subject 1131
paradigm_type = 'all'
# Fresh module-level buffers; generate_dataset() appends to them.
X, Y = [], []
subjects, studies = ['1131'], ['S1', 'S2', 'S3', 'S4', 'S5']
if paradigm_type == 'all':
    paradigms = ['caw1','caw2', 'wag1', 'wag2', 'cw1', 'cw2', 'wg1', 'wg2']
elif paradigm_type == 'cw':
    paradigms = ['caw1', 'caw2', 'cw1', 'cw2']
elif paradigm_type == 'wg':
    paradigms = ['wag1', 'wag2', 'wg1', 'wg2']
generate_dataset(subjects, studies, paradigms)

# Turn the accumulated trials into arrays: stacked AU windows and int32 labels.
X_array, Y_array = stack_dataset(X), np.asarray(Y, dtype=np.int32)

# Label-corrected dataset. NOTE: correct_dataset_labels() also rewrites
# Y_array in place, and the class masks used for balancing below depend on it.
X_array_corrected, Y_array_corrected = correct_dataset_labels(X_array, Y_array)
Y_hist = np.histogram(Y_array_corrected)
# The first and last histogram bins are reported as the fluent / stutter counts.
Y_hist_sum = Y_hist[0][0] + Y_hist[0][-1]
print("Fluent Trials: {} ({:.2f}%), Stutter Trials: {} ({:.2f}%)".format(
    Y_hist[0][0], 100*(Y_hist[0][0]/Y_hist_sum),
    Y_hist[0][-1], 100*(Y_hist[0][-1]/Y_hist_sum)))

print("Creating Balanced Dataset with 50-50 Split")
try:
    # At least as many fluent (0) as stutter (1) trials: draw, without
    # replacement, one fluent index per stutter trial and keep all stutter indices.
    random_data_points = np.random.choice(
        np.where(Y_array == 0)[0], size=len(X_array[Y_array == 1]), replace=False)
    assert len(random_data_points) == len(X_array[Y_array == 1])
    random_data_points = np.concatenate([random_data_points, np.where(Y_array == 1)[0]])
    random.shuffle(random_data_points)
except ValueError:
    # np.random.choice could not draw that many fluent trials, so stutter is
    # the larger class: subsample the stutter trials instead.
    random_data_points = np.random.choice(
        np.where(Y_array == 1)[0], size=len(X_array[Y_array == 0]), replace=False)
    assert len(random_data_points) == len(X_array[Y_array == 0])
    random_data_points = np.concatenate([random_data_points, np.where(Y_array == 0)[0]])
    random.shuffle(random_data_points)

X_array_balanced, Y_array_balanced = X_array[random_data_points], Y_array[random_data_points]
print("Balanced Shapes X:", X_array_balanced.shape, "Y:", Y_array_balanced.shape)

# Persist the raw, corrected and balanced arrays for this subject.
save_data_to_file(
    subjects, studies, paradigms,
    X_array, X_array_corrected, X_array_balanced,
    Y_array, Y_array_corrected, Y_array_balanced,
)

In [None]:
# Subject 1196
# Fresh module-level buffers; generate_dataset() appends to them.
X, Y = [], []
subjects, studies = ['1196'], ['S5', 'S4', 'S3', 'S1', 'S2']
if paradigm_type == 'all':
    paradigms = ['caw1','caw2', 'wag1', 'wag2', 'cw1', 'cw2', 'wg1', 'wg2']
elif paradigm_type == 'cw':
    paradigms = ['caw1', 'caw2', 'cw1', 'cw2']
elif paradigm_type == 'wg':
    paradigms = ['wag1', 'wag2', 'wg1', 'wg2']
generate_dataset(subjects, studies, paradigms)

# Turn the accumulated trials into arrays: stacked AU windows and int32 labels.
X_array, Y_array = stack_dataset(X), np.asarray(Y, dtype=np.int32)

# Label-corrected dataset. NOTE: correct_dataset_labels() also rewrites
# Y_array in place, and the class masks used for balancing below depend on it.
X_array_corrected, Y_array_corrected = correct_dataset_labels(X_array, Y_array)
Y_hist = np.histogram(Y_array_corrected)
# The first and last histogram bins are reported as the fluent / stutter counts.
Y_hist_sum = Y_hist[0][0] + Y_hist[0][-1]
print("Fluent Trials: {} ({:.2f}%), Stutter Trials: {} ({:.2f}%)".format(
    Y_hist[0][0], 100*(Y_hist[0][0]/Y_hist_sum),
    Y_hist[0][-1], 100*(Y_hist[0][-1]/Y_hist_sum)))

print("Creating Balanced Dataset with 50-50 Split")
try:
    # At least as many fluent (0) as stutter (1) trials: draw, without
    # replacement, one fluent index per stutter trial and keep all stutter indices.
    random_data_points = np.random.choice(
        np.where(Y_array == 0)[0], size=len(X_array[Y_array == 1]), replace=False)
    assert len(random_data_points) == len(X_array[Y_array == 1])
    random_data_points = np.concatenate([random_data_points, np.where(Y_array == 1)[0]])
    random.shuffle(random_data_points)
except ValueError:
    # np.random.choice could not draw that many fluent trials, so stutter is
    # the larger class: subsample the stutter trials instead.
    random_data_points = np.random.choice(
        np.where(Y_array == 1)[0], size=len(X_array[Y_array == 0]), replace=False)
    assert len(random_data_points) == len(X_array[Y_array == 0])
    random_data_points = np.concatenate([random_data_points, np.where(Y_array == 0)[0]])
    random.shuffle(random_data_points)

X_array_balanced, Y_array_balanced = X_array[random_data_points], Y_array[random_data_points]
print("Balanced Shapes X:", X_array_balanced.shape, "Y:", Y_array_balanced.shape)

# Persist the raw, corrected and balanced arrays for this subject.
save_data_to_file(
    subjects, studies, paradigms,
    X_array, X_array_corrected, X_array_balanced,
    Y_array, Y_array_corrected, Y_array_balanced,
)

In [None]:
# Subject 1214
# Fresh module-level buffers; generate_dataset() appends to them.
X, Y = [], []
subjects, studies = ['1214'], ['S1']
if paradigm_type == 'all':
    paradigms = ['caw1','caw2', 'wag1', 'wag2', 'cw1', 'cw2', 'wg1', 'wg2']
elif paradigm_type == 'cw':
    paradigms = ['caw1', 'caw2', 'cw1', 'cw2']
elif paradigm_type == 'wg':
    paradigms = ['wag1', 'wag2', 'wg1', 'wg2']
generate_dataset(subjects, studies, paradigms)

# Turn the accumulated trials into arrays: stacked AU windows and int32 labels.
X_array, Y_array = stack_dataset(X), np.asarray(Y, dtype=np.int32)

# Label-corrected dataset. NOTE: correct_dataset_labels() also rewrites
# Y_array in place, and the class masks used for balancing below depend on it.
X_array_corrected, Y_array_corrected = correct_dataset_labels(X_array, Y_array)
Y_hist = np.histogram(Y_array_corrected)
# The first and last histogram bins are reported as the fluent / stutter counts.
Y_hist_sum = Y_hist[0][0] + Y_hist[0][-1]
print("Fluent Trials: {} ({:.2f}%), Stutter Trials: {} ({:.2f}%)".format(
    Y_hist[0][0], 100*(Y_hist[0][0]/Y_hist_sum),
    Y_hist[0][-1], 100*(Y_hist[0][-1]/Y_hist_sum)))

print("Creating Balanced Dataset with 50-50 Split")
try:
    # At least as many fluent (0) as stutter (1) trials: draw, without
    # replacement, one fluent index per stutter trial and keep all stutter indices.
    random_data_points = np.random.choice(
        np.where(Y_array == 0)[0], size=len(X_array[Y_array == 1]), replace=False)
    assert len(random_data_points) == len(X_array[Y_array == 1])
    random_data_points = np.concatenate([random_data_points, np.where(Y_array == 1)[0]])
    random.shuffle(random_data_points)
except ValueError:
    # np.random.choice could not draw that many fluent trials, so stutter is
    # the larger class: subsample the stutter trials instead.
    random_data_points = np.random.choice(
        np.where(Y_array == 1)[0], size=len(X_array[Y_array == 0]), replace=False)
    assert len(random_data_points) == len(X_array[Y_array == 0])
    random_data_points = np.concatenate([random_data_points, np.where(Y_array == 0)[0]])
    random.shuffle(random_data_points)

X_array_balanced, Y_array_balanced = X_array[random_data_points], Y_array[random_data_points]
print("Balanced Shapes X:", X_array_balanced.shape, "Y:", Y_array_balanced.shape)

# Persist the raw, corrected and balanced arrays for this subject.
save_data_to_file(
    subjects, studies, paradigms,
    X_array, X_array_corrected, X_array_balanced,
    Y_array, Y_array_corrected, Y_array_balanced,
)