In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline

In [2]:
# Load the per-posture feature files and translate German posture labels to English
PATH = 'assets/Combined/AllSubjectsCombined/'


def _load_posture(file_name, german_label=None, english_label=None):
    """Read one headerless posture CSV located under PATH.

    If ``german_label`` is given, every cell equal to it is replaced by
    ``english_label``; otherwise the frame is returned exactly as read.
    """
    frame = pd.read_csv(PATH + file_name, header=None)
    if german_label is None:
        return frame
    return frame.replace(german_label, english_label)


# climbingUp, climbingDown and unknown are loaded without any label replacement
cUp = _load_posture('climbingUp.csv')
cDown = _load_posture('climbingDown.csv')

standing = _load_posture('standing.csv', "stehen", "Standing")
sitting = _load_posture('sitting.csv', "sitzen", "Sitting")
walking = _load_posture('walking.csv', "gehen", "Walking")
running = _load_posture('running.csv', "rennen", "Running")
recumbency = _load_posture('recumbency.csv', "liegen", "Recumbency")
notSpecified = _load_posture('notSpecified.csv', "Nicht festgelegt", "Not Specified")

unknown = _load_posture('unknown.csv')

jumping = _load_posture('jumping.csv', "springen", "Jumping")

In [9]:
# Draw a reproducible random sample of at most N_SAMPLES rows from every posture
N_SAMPLES = 500
SAMPLE_SEED = 42  # fixed seed so a fresh Restart & Run All reproduces the same sample


def _sample_up_to(frame, n=N_SAMPLES, seed=SAMPLE_SEED):
    """Return ``n`` randomly chosen rows of ``frame``.

    A frame holding fewer than ``n`` rows is returned unchanged, because
    ``DataFrame.sample`` raises when asked for more rows than exist
    (sampling is done without replacement).
    """
    if len(frame) < n:
        return frame
    return frame.sample(n=n, random_state=seed)


cUpSample = _sample_up_to(cUp)
cDownSample = _sample_up_to(cDown)
standingSample = _sample_up_to(standing)
sittingSample = _sample_up_to(sitting)
walkingSample = _sample_up_to(walking)
runningSample = _sample_up_to(running)
recumbencySample = _sample_up_to(recumbency)
notSpecifiedSample = _sample_up_to(notSpecified)
unknownSample = _sample_up_to(unknown)
# jumping has fewer than N_SAMPLES rows, so the helper hands it back whole
jumpingSample = _sample_up_to(jumping)

In [3]:
# Stack every full posture dataframe into a single frame (row labels are kept as loaded)
allPostureFrames = [
    cUp, cDown, standing, sitting, walking,
    running, recumbency, notSpecified, unknown, jumping,
]
result = pd.concat(allPostureFrames)

In [11]:
# Stack every sampled posture dataframe into a single frame
sampledPostureFrames = [
    cUpSample, cDownSample, standingSample, sittingSample, walkingSample,
    runningSample, recumbencySample, notSpecifiedSample, unknownSample, jumpingSample,
]
resultSample = pd.concat(sampledPostureFrames)

In [12]:
# Non-null count of every feature column per posture label (column 13).
# Within one row of the output, differing counts mean some columns contain missing values.
resultSample.groupby(by=13).count()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12
13,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Climbing (down),500,500,500,500,500,500,500,500,500,500,500,500,500
Climbing (up),500,500,500,500,500,500,500,500,500,500,500,500,500
Jumping,318,318,318,318,318,318,318,318,318,318,318,318,318
Not Specified,500,500,500,500,500,500,500,500,500,500,500,500,500
Recumbency,500,500,500,500,500,500,500,500,500,500,500,500,500
Running,500,500,500,500,500,500,500,500,500,500,500,500,500
Sitting,500,500,500,500,500,500,500,500,500,500,500,500,500
Standing,500,500,500,500,500,500,500,500,500,500,500,500,500
Walking,500,500,500,500,500,500,500,500,500,500,500,500,500
unknown,500,500,500,500,500,500,500,500,500,500,500,500,500


In [5]:
# ONLY CONTINUE WHEN RECORDS CONTAIN "NaN" (i.e. different counts, same row, different column)
# Keep only the rows whose correlation features are all finite.
# Columns 9, 10 and 11 are CorrXY, CorrXZ and CorrYZ according to the attribute
# order of the ARFF header written by this notebook. Checking column 9 alone lets
# rows with a NaN in CorrXZ or CorrYZ slip through into all.csv.
CORRELATION_COLUMNS = [9, 10, 11]
# A plain boolean array is used as the mask so that no index alignment is involved
# (the concatenated frame carries repeated row labels).
finiteCorrelations = np.isfinite(result[CORRELATION_COLUMNS].values).all(axis=1)
result = result[finiteCorrelations]

In [14]:
# Write the ARFF header. This can be merged with all.csv to create the ARFF file used to train the model

# One entry per header line; the separator after each attribute name is spelled
# out explicitly (space + tab, a single space for timeStamp).
arffHeaderLines = [
    '@RELATION features',
    '',
    '@ATTRIBUTE xMean \tNUMERIC',
    '@ATTRIBUTE xStdDev \tNUMERIC',
    '@ATTRIBUTE xIntqr \tNUMERIC',
    '@ATTRIBUTE yMean \tNUMERIC',
    '@ATTRIBUTE yStdDev \tNUMERIC',
    '@ATTRIBUTE yIntqr \tNUMERIC',
    '@ATTRIBUTE zMean \tNUMERIC',
    '@ATTRIBUTE zStdDev \tNUMERIC',
    '@ATTRIBUTE zIntqr \tNUMERIC',
    '@ATTRIBUTE CorrXY \tNUMERIC',
    '@ATTRIBUTE CorrXZ \tNUMERIC',
    '@ATTRIBUTE CorrYZ \tNUMERIC',
    '@ATTRIBUTE timeStamp NUMERIC',
    '@ATTRIBUTE class \t{"Sitting","Not Specified","Walking","Standing","Climbing (up)","Climbing (down)","Running","Recumbency","unknown","Jumping"}',
    '',
    '@DATA',
]

# The context manager closes the file even if the write fails
with open(PATH + "arffHeader.csv", "w") as text_file:
    text_file.write('\n'.join(arffHeaderLines) + '\n')

In [13]:
# Save the combined per-posture samples; the row index is omitted and every non-numeric field is quoted
sampleCsvPath = PATH + "allSample500.csv"
resultSample.to_csv(sampleCsvPath, quoting=csv.QUOTE_NONNUMERIC, index=False)

In [61]:
# Save the combined full dataset; the row index is omitted and every non-numeric field is quoted
allCsvPath = PATH + "all.csv"
result.to_csv(allCsvPath, quoting=csv.QUOTE_NONNUMERIC, index=False)

In [50]:
############# Stratified Sampling BEGIN ############# 

In [76]:
# Fraction of each posture's rows to keep in the stratified sample
stratfraction = 0.5
# Read back only column 13 (the posture label) of the combined file
combinedCsvPath = PATH + 'all.csv'
df = pd.read_csv(combinedCsvPath, usecols=[13])

In [77]:
# Add a constant 'Obs' column of ones so that counting it per group gives the group size,
# then group the rows by their posture label
df = df.assign(Obs=1)
gp = df.groupby(by=['13'])

In [78]:
# Number of rows to keep per posture: ceil(group size * stratfraction)
df2 = np.ceil(gp.count()*stratfraction)

# Collect the row labels of the first ceil(size * stratfraction) rows of every posture.
# The quota is computed from each group's own length (identical to its 'Obs' count,
# because 'Obs' is 1 on every row), so nothing depends on df2 and gp.groups being
# iterated in the same order.
stratsample = []
for groupRows in gp.groups.values():
    quota = int(np.ceil(len(groupRows) * stratfraction))
    stratsample.extend(groupRows[:quota])

# read_csv has no "rows to keep" option, so the rows to skip are listed instead.
# skiprows counts physical file lines (0-indexed) and line 0 of all.csv is the
# header row, so data row i sits on file line i + 1. Passing the bare row labels
# would shift the whole selection by one line.
rowsNotSampled = set(df.index.values).difference(stratsample)
RowsToSkip = {int(row) + 1 for row in rowsNotSampled}

# Load only the requested rows.
# NOTE: df is rebound to the sampled frame here (kept from the original cell), so the
# cells that build df and gp must be re-run before this cell is executed again.
df3 = df = pd.read_csv(PATH + 'all.csv', skiprows = RowsToSkip)

In [80]:
# Save the stratified sample without header row or row index; every non-numeric field is quoted
stratifiedCsvPath = PATH + "stratifiedall.csv"
df3.to_csv(stratifiedCsvPath, header=False, index=False, quoting=csv.QUOTE_NONNUMERIC)

In [None]:
############# Stratified Sampling END ############# 