# Group and Split

Code that splits the data into training and validation sets while keeping all segments of each subject together, so that no subject appears in both sets.


In [1]:
import pandas as pd
import numpy as np

In [4]:
# Read the preprocessed dataset and discard the 'Unnamed: 0' column
# (presumably a row index that was saved into the CSV -- confirm).
data = pd.read_csv("../Data/preprocessed_data_final.csv").drop(columns=['Unnamed: 0'])
data

Unnamed: 0,TestID,Conditie,SubjectID,SegmentNr,WalkingDirection,SegmentData,Height,Weight,Age,BMI,Balance_MLrange,Balance_MLstdev,Balance_MLmeanVelocity,Balance_APrange,Balance_APstdev',Balance_APmeanVelocity,Balance_MeanVelocity,GaitVelocity,MovementVelocity
0,006A01',norm,6,1,away,70x4 cell,177,70.0,20,22.343516,10.421641,2.753424,3.233032,8.955034,2.367358,2.674099,4.725911,1.130575,1.133883
1,006A01',norm,6,2,towards,62x4 cell,177,70.0,20,22.343516,10.421641,2.753424,3.233032,8.955034,2.367358,2.674099,4.725911,1.237257,1.246712
2,006A01',norm,6,3,away,61x4 cell,177,70.0,20,22.343516,10.421641,2.753424,3.233032,8.955034,2.367358,2.674099,4.725911,1.274628,1.286475
3,006A01',norm,6,4,towards,70x4 cell,177,70.0,20,22.343516,10.421641,2.753424,3.233032,8.955034,2.367358,2.674099,4.725911,1.060707,1.082892
4,009A01',norm,9,1,away,62x4 cell,195,86.0,23,22.616700,12.458077,2.611063,4.441906,5.751258,1.077709,1.763952,5.022392,1.093219,1.107944
5,009A01',norm,9,2,towards,69x4 cell,195,86.0,23,22.616700,12.458077,2.611063,4.441906,5.751258,1.077709,1.763952,5.022392,1.136413,1.138623
6,009A01',norm,9,3,away,70x4 cell,195,86.0,23,22.616700,12.458077,2.611063,4.441906,5.751258,1.077709,1.763952,5.022392,1.107233,1.112921
7,009A01',norm,9,4,towards,68x4 cell,195,86.0,23,22.616700,12.458077,2.611063,4.441906,5.751258,1.077709,1.763952,5.022392,1.123102,1.124893
8,010A01',norm,10,1,away,80x4 cell,165,65.0,28,23.875115,10.540746,2.647560,3.597897,3.566472,0.860375,1.478939,4.189830,0.962063,0.970962
9,010A01',norm,10,2,towards,90x4 cell,165,65.0,28,23.875115,10.540746,2.647560,3.597897,3.566472,0.860375,1.478939,4.189830,0.811524,0.824569


In [5]:
# Unique subject identifiers, in order of first appearance
sidlist = data['SubjectID'].unique()

# Build one DataFrame per subject containing all rows of that subject.
# A single groupby pass replaces one full-column comparison per subject;
# sort=False keeps the groups in first-appearance order, matching sidlist.
# Rows without a SubjectID (NaN) do not form a group.
grouped_data = [rows for _, rows in data.groupby('SubjectID', sort=False)]
grouped_data[0]

Unnamed: 0,TestID,Conditie,SubjectID,SegmentNr,WalkingDirection,SegmentData,Height,Weight,Age,BMI,Balance_MLrange,Balance_MLstdev,Balance_MLmeanVelocity,Balance_APrange,Balance_APstdev',Balance_APmeanVelocity,Balance_MeanVelocity,GaitVelocity,MovementVelocity
0,006A01',norm,6,1,away,70x4 cell,177,70.0,20,22.343516,10.421641,2.753424,3.233032,8.955034,2.367358,2.674099,4.725911,1.130575,1.133883
1,006A01',norm,6,2,towards,62x4 cell,177,70.0,20,22.343516,10.421641,2.753424,3.233032,8.955034,2.367358,2.674099,4.725911,1.237257,1.246712
2,006A01',norm,6,3,away,61x4 cell,177,70.0,20,22.343516,10.421641,2.753424,3.233032,8.955034,2.367358,2.674099,4.725911,1.274628,1.286475
3,006A01',norm,6,4,towards,70x4 cell,177,70.0,20,22.343516,10.421641,2.753424,3.233032,8.955034,2.367358,2.674099,4.725911,1.060707,1.082892
360,006B01',bril,6,1,away,91x4 cell,177,70.0,20,22.343516,16.03555,4.575012,8.982034,11.286483,2.829856,3.584385,10.095526,0.781418,0.786058
361,006B01',bril,6,2,towards,82x4 cell,177,70.0,20,22.343516,16.03555,4.575012,8.982034,11.286483,2.829856,3.584385,10.095526,0.926847,0.930523
362,006B01',bril,6,3,away,77x4 cell,177,70.0,20,22.343516,16.03555,4.575012,8.982034,11.286483,2.829856,3.584385,10.095526,0.93273,0.940267
363,006B01',bril,6,4,towards,79x4 cell,177,70.0,20,22.343516,16.03555,4.575012,8.982034,11.286483,2.829856,3.584385,10.095526,0.958261,0.961042


In [7]:
# Splits data into training and validation according to specified ratio
def validation_split(data, ratio, seed=None):
    """Randomly split a sequence of items into training and validation parts.

    The caller's ``data`` is left untouched: an index permutation is shuffled
    instead of the sequence itself, so the original order is preserved.

    Args:
        data: Sequence of items to split (for example a list holding one
            DataFrame per subject). Every item lands in exactly one part.
        ratio: Fraction of the items, between 0 and 1 inclusive, that goes to
            the training part. The count is truncated to
            ``int(ratio * len(data))``.
        seed: Optional seed for a reproducible split. When omitted, NumPy's
            global random state is used.

    Returns:
        A ``(training, validation)`` tuple of lists.

    Raises:
        ValueError: If ``ratio`` is not within ``[0, 1]``.
    """
    if not 0 <= ratio <= 1:
        raise ValueError("ratio must be between 0 and 1, got {!r}".format(ratio))

    items = list(data)
    # Use a private generator only when a seed is requested, so unseeded calls
    # still respond to np.random.seed() like the in-place shuffle did.
    rng = np.random if seed is None else np.random.RandomState(seed)
    order = rng.permutation(len(items))

    cut = int(ratio * len(items))
    training = [items[i] for i in order[:cut]]
    validation = [items[i] for i in order[cut:]]
    return training, validation

# Converts the per-subject groups in a list to one flattened dataframe
def flatten(data):
    """Stack a list of row groups into a single DataFrame.

    Args:
        data: Iterable of DataFrames sharing the same columns, or of 2-D
            arrays with 19 columns each.

    Returns:
        One DataFrame containing the rows of every group in order, with a
        fresh 0..n-1 index. For DataFrame input the column names and dtypes of
        the groups are kept. For array input the fixed dataset column names
        are applied. Empty input yields an empty DataFrame with those fixed
        column names.
    """
    frames = list(data)

    # pd.concat keeps each column's dtype; stacking through NumPy would coerce
    # every mixed-type column to ``object``.
    if frames and all(isinstance(frame, pd.DataFrame) for frame in frames):
        return pd.concat(frames, ignore_index=True)

    columns = ['TestID', 'Conditie', 'SubjectID', 'SegmentNr', 'WalkingDirection',
               'SegmentData', 'Height', 'Weight', 'Age', 'BMI', 'Balance_MLrange',
               'Balance_MLstdev', 'Balance_MLmeanVelocity', 'Balance_APrange',
               "Balance_APstdev'", 'Balance_APmeanVelocity', 'Balance_MeanVelocity',
               'GaitVelocity', 'MovementVelocity']

    # Nothing to stack (e.g. an empty validation part): return an empty table
    # with the expected columns rather than failing.
    if not frames:
        return pd.DataFrame(columns=columns)

    # Plain arrays carry no column names, so fall back to the fixed list.
    return pd.DataFrame(np.vstack(frames), columns=columns)

# Split the per-subject groups with 70% going to training, then collapse each
# part into a single DataFrame.
training, validation = map(flatten, validation_split(grouped_data, 0.7))
training

Unnamed: 0,TestID,Conditie,SubjectID,SegmentNr,WalkingDirection,SegmentData,Height,Weight,Age,BMI,Balance_MLrange,Balance_MLstdev,Balance_MLmeanVelocity,Balance_APrange,Balance_APstdev',Balance_APmeanVelocity,Balance_MeanVelocity,GaitVelocity,MovementVelocity
0,158A01',norm,158,1,away,69x4 cell,190,90,39,24.9307,16.0268,5.19749,3.87369,5.53695,1.53929,1.62204,4.51306,1.11566,1.12375
1,158A01',norm,158,2,towards,68x4 cell,190,90,39,24.9307,16.0268,5.19749,3.87369,5.53695,1.53929,1.62204,4.51306,1.10332,1.10488
2,158A01',norm,158,3,away,69x4 cell,190,90,39,24.9307,16.0268,5.19749,3.87369,5.53695,1.53929,1.62204,4.51306,1.15603,1.16156
3,158A01',norm,158,4,towards,70x4 cell,190,90,39,24.9307,16.0268,5.19749,3.87369,5.53695,1.53929,1.62204,4.51306,1.08023,1.08891
4,158B01',bril,158,1,away,84x4 cell,190,90,39,24.9307,24.9281,6.66753,11.2991,11.173,2.61447,5.46335,13.7144,0.917822,0.947785
5,158B01',bril,158,2,towards,77x4 cell,190,90,39,24.9307,24.9281,6.66753,11.2991,11.173,2.61447,5.46335,13.7144,1.01578,1.02319
6,158B01',bril,158,3,away,86x4 cell,190,90,39,24.9307,24.9281,6.66753,11.2991,11.173,2.61447,5.46335,13.7144,0.936816,0.95533
7,158B01',bril,158,4,towards,73x4 cell,190,90,39,24.9307,24.9281,6.66753,11.2991,11.173,2.61447,5.46335,13.7144,1.0026,1.02901
8,159A01',norm,159,1,away,65x4 cell,174,67,36,22.1297,23.1343,6.83411,5.43758,12.3539,3.90858,2.62881,6.62689,1.17226,1.17281
9,159A01',norm,159,2,towards,70x4 cell,174,67,36,22.1297,23.1343,6.83411,5.43758,12.3539,3.90858,2.62881,6.62689,1.07831,1.08005
