In [2]:
# For number crunching
import numpy as np
import pandas as pd

# For visualisation
import matplotlib.pyplot as pl 

# For prediction 
import sklearn

# Misc
from itertools import cycle
import json 
import os

In [4]:
# from utils import *
# public_data_path, metadata_path = define_paths()
# x_df, y_df = load_XY_dfs(public_data_path)
# train_x, train_y, test_x, test_y = load_train_test_arrays(public_data_path)
# train_x, train_y, test_x, test_y = simple_impute(train_x, test_x, train_y, test_y)

In [5]:
print('  numpy version: {}'.format(np.__version__))
print(' pandas version: {}'.format(pd.__version__))
print('   json version: {}'.format(json.__version__))
print('sklearn version: {}'.format(sklearn.__version__))

  numpy version: 1.19.2
 pandas version: 1.2.3
   json version: 2.0.9
sklearn version: 0.24.1


In [6]:
import sys
nb_dir = os.path.split(os.getcwd())
nb_dir2 = nb_dir[0]+'/'+nb_dir[1]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
if nb_dir2 not in sys.path:
    sys.path.append(nb_dir2)

public_data_path = nb_dir2+'/data' # 

# metadata_path = '/Users/fl20994/Documents/IAI_CDT/TB2/Applied_Data_Science/SPHERE/public_data/metadata'
# metadata_path = '../public_data/metadata'
metadata_path = nb_dir2+'/data/metadata'

# Load Data

In [7]:
"""
We will define two convenience function to load the extracted features and their 
"""


def load_sequence(file_id):
    filename = str(file_id).zfill(5) # zfill fills with 5 zeros at the beginning of the string

    df = pd.read_csv('{}/train/{}/columns_1000ms.csv'.format(public_data_path, filename))
    data = df.values
    target = np.asarray(pd.read_csv('{}/train/{}/targets.csv'.format(public_data_path, filename)))[:, 2:]

    return data, target


def load_sequences(file_ids):
    x_es = []
    y_es = []

    for file_id in file_ids:
        data, target = load_sequence(file_id)

        x_es.append(data)
        y_es.append(target)

    return np.row_stack(x_es), np.row_stack(y_es)

In [83]:
# Load the training and testing data - all from the train directory(?)
# The functions above grab the dataframes from each of the train directories and combine them together. (The dfs in those directories are themselves combinations of the data from the different modalities)
train_x, train_y = load_sequences([1, 2, 3, 4, 5, 6, 7, 8])
test_x, test_y = load_sequences([9, 10])

# also load in as csv
# First load in as pandas dataframe (to see all columsn etc.)
x_df = pd.read_csv('{}/train/{}/columns_1000ms.csv'.format(public_data_path, '00001'))
for file_id in [2,3,4,5,6,7,8,9,10]:
    filename = str(file_id).zfill(5) # zfill fills with 5 zeros at the beginning of the string

    new_df = pd.read_csv('{}/train/{}/columns_1000ms.csv'.format(public_data_path, filename))
    x_df = x_df.append(new_df)
 
y_df = pd.read_csv('{}/train/{}/targets.csv'.format(public_data_path, '00001'))
for file_id in [2,3,4,5,6,7,8,9,10]:
    filename = str(file_id).zfill(5) # zfill fills with 5 zeros at the beginning of the string

    new_df = pd.read_csv('{}/train/{}/targets.csv'.format(public_data_path, filename))
    y_df = y_df.append(new_df)

# Data Imputation

In [8]:
print ("Check whether the train/test features are all finite (before imputation)")
print ('All training data finite:', np.all(np.isfinite(train_x)))
print ('All testing data finite:', np.all(np.isfinite(test_x)))

# We will want to impute the missing data 
from sklearn.impute import SimpleImputer
imputer = SimpleImputer()
imputer.fit(train_x)

train_x = imputer.transform(train_x)
test_x = imputer.transform(test_x)

print ("Check whether the train/test features are all finite (after imputation)")
print ('All training data finite:', np.all(np.isfinite(train_x)))
print ('All testing data finite:', np.all(np.isfinite(test_x)))


# Load the label names 
labels = json.load(open(metadata_path + '/annotations.json'))
n_classes = len(labels)

"""
Note, not all data is annotated, so we select only the annotated rows
"""
train_y_has_annotation = np.isfinite(train_y.sum(1))
train_x = train_x[train_y_has_annotation]
train_y = train_y[train_y_has_annotation]

test_y_has_annotation = np.isfinite(test_y.sum(1))
test_x = test_x[test_y_has_annotation]
test_y = test_y[test_y_has_annotation]


"""
Print simple statistics regarding the number of instances
"""
print ("Training data shapes:")
print ("train_x.shape: {}".format(train_x.shape))
print ("train_y.shape: {}".format(train_y.shape))
print 

print ("Testing data shapes")
print ("test_x.shape: {}".format(test_x.shape))
print ("test_y.shape: {}".format(test_y.shape))

Check whether the train/test features are all finite (before imputation)
All training data finite: False
All testing data finite: False


ValueError: X has 20 features, but SimpleImputer is expecting 366 features as input.

# Class Weights

In [85]:
activity_names = json.load(open(metadata_path + '/annotations.json', 'r'))
class_weights = np.asarray(json.load(open(metadata_path + '/class_weights.json', 'r')))

class_prior = train_y.mean(0)

df = pd.DataFrame({
        'Activity': activity_names, 
        'Class Weight': class_weights,
        'Prior Class Distribution': class_prior
    })

df.set_index('Activity', inplace=True)

In [86]:
knn_brier = 0.2904930016955408

prior_brier = 0.29301964202920805

In [88]:
y_df.head(2)


Unnamed: 0,start,end,a_ascend,a_descend,a_jump,a_loadwalk,a_walk,p_bent,p_kneel,p_lie,...,p_stand,t_bend,t_kneel_stand,t_lie_sit,t_sit_lie,t_sit_stand,t_stand_kneel,t_stand_sit,t_straighten,t_turn
0,0.0,1.0,,,,,,,,,...,,,,,,,,,,
1,1.0,2.0,,,,,,,,,...,,,,,,,,,,


# Tensorflow model test

In [9]:
import tensorflow as tf
from numpy import argmax
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

In [10]:
# load the dataset

# We will want to impute the missing data 
from sklearn.impute import SimpleImputer
imputer = SimpleImputer()


# train_x, train_y = load_sequences([1, 2, 3, 4, 5, 6, 7, 8,9,10])
# test_x, test_y = load_sequences([9, 10])
X, dummy_y = load_sequences([1, 2, 3, 4, 5, 6, 7, 8,9,10])
imputer.fit(X)
X = imputer.transform(X)


# Load the label names 
labels = json.load(open(metadata_path + '/annotations.json'))
n_classes = len(labels)

"""
Note, not all data is annotated, so we select only the annotated rows
"""
y_has_annotation = np.isfinite(dummy_y.sum(1))
X = X[y_has_annotation]
dummy_y = dummy_y[y_has_annotation]



In [12]:
# ensure all data are floating point values
X = X.astype('float32')


array([[ 9.6679997e-01,  1.1235551e-01,  6.7799997e-01, ...,
         3.4548789e+03,  3.6600300e+03,  8.2896531e+04],
       [ 3.7689999e-01,  4.0401804e-01, -2.2800000e-01, ...,
         3.4548789e+03,  3.6600300e+03,  8.2896531e+04],
       [ 6.8070000e-01,  4.7290984e-01, -1.5800001e-01, ...,
         3.4548789e+03,  3.6600300e+03,  8.2896531e+04],
       ...,
       [-8.6680001e-01,  1.0684568e-02, -8.8999999e-01, ...,
         3.4548789e+03,  3.6600300e+03,  8.2896531e+04],
       [-8.7059999e-01,  7.7999998e-03, -8.8800001e-01, ...,
         3.4548789e+03,  3.6600300e+03,  8.2896531e+04],
       [-8.7400001e-01,  7.4833147e-03, -8.8800001e-01, ...,
         3.4548789e+03,  3.6600300e+03,  8.2896531e+04]], dtype=float32)

In [13]:
# split into train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, dummy_y, test_size=0.33)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# determine the number of input features
n_features = X_train.shape[1]
n_features

(10789, 366) (5315, 366) (10789, 20) (5315, 20)


366

In [14]:
y_train.shape

(10789, 20)

In [15]:
# Define the model - input_shape is the shape of each sample input

model = Sequential()
model.add(Dense(10, activation='relu', kernel_initializer='he_normal', input_dim=n_features))
model.add(Dense(8, activation='relu', kernel_initializer='he_normal'))
model.add(Dense(20, activation='softmax'))

# Compile the model 
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [33]:

# fit the model
model.fit(X_train, y_train, epochs=150, batch_size=32, verbose =0)

# evaluate the model
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print('Test Accuracy: %.3f' % acc)

# make a prediction
row = [5.1,3.5,1.4,0.2]
yhat = model.predict([row])

print('Predicted: %s (class=%d)' % (yhat, argmax(yhat)))

InvalidArgumentError:  logits and labels must have the same first dimension, got logits shape [32,20] and labels shape [640]
	 [[node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits (defined at <ipython-input-33-b5ecdd7da8da>:2) ]] [Op:__inference_train_function_2108708]

Function call stack:
train_function


# ML Mastery Tutorial

In [16]:
# multi-class classification with Keras -- importation
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
# from tensorflow.keras.utils.np_utils import to_categorical
from tensorflow.keras import utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [17]:
# load the dataset and impute
# We will want to impute the missing data 
from sklearn.impute import SimpleImputer
imputer = SimpleImputer()


# train_x, train_y = load_sequences([1, 2, 3, 4, 5, 6, 7, 8,9,10])
# test_x, test_y = load_sequences([9, 10])
X, dummy_y = load_sequences([1, 2, 3, 4, 5, 6, 7, 8,9,10])
imputer.fit(X)
X = imputer.transform(X)


# Load the label names 
labels = json.load(open(metadata_path + '/annotations.json'))
n_classes = len(labels)

"""
Note, not all data is annotated, so we select only the annotated rows
"""
y_has_annotation = np.isfinite(dummy_y.sum(1))
X = X[y_has_annotation]
dummy_y = dummy_y[y_has_annotation]




In [13]:
X.shape

(16104, 366)

In [18]:
# define baseline model
def baseline_model():
	# create model
	model = Sequential()
	model.add(Dense(30, input_dim=366, activation='relu'))
	model.add(Dense(20, activation='softmax'))
	# Compile model
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

In [19]:
estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=1)
kfold = KFold(n_splits=5, shuffle=True)
results = cross_val_score(estimator, X, dummy_y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

uracy: 0.3805
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 156/200
Epoch 157/200
Epoch 158/200
Epoch 159/200
Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 

KeyboardInterrupt: 

In [6]:

# multi-class classification with Keras
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
# from tensorflow.keras.utils.np_utils import to_categorical
from tensorflow.keras import utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
# load dataset
path = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/iris.csv'
dataframe = pd.read_csv(path, header=None)
# dataframe = pandas.read_csv("iris.data", header=None)
dataset = dataframe.values
X = dataset[:,0:4].astype(float)
Y = dataset[:,4]
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = utils.to_categorical(encoded_Y)
 
# define baseline model
def baseline_model():
	# create model
	model = Sequential()
	model.add(Dense(8, input_dim=4, activation='relu'))
	model.add(Dense(3, activation='softmax'))
	# Compile model
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model
 
estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5, verbose=0)
kfold = KFold(n_splits=10, shuffle=True)
results = cross_val_score(estimator, X, dummy_y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Baseline: 97.33% (3.27%)
