In [22]:
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import os
import numpy as np
import matplotlib.pyplot as plt
import entropy as ent

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score
from scipy import stats
from scipy.signal import find_peaks
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestClassifier

# Author: Antonio Sutera & Yann Claess
def load_data(data_path):

    FEATURES = range(2, 33)
    N_TIME_SERIES = 3500

    # Create the training and testing samples
    LS_path = os.path.join(data_path, 'LS')
    TS_path = os.path.join(data_path, 'TS')
    X_train, X_test = [np.zeros((N_TIME_SERIES, (len(FEATURES) * 512))) for i in range(2)]

    for f in FEATURES:
        data = np.loadtxt(os.path.join(LS_path, 'LS_sensor_{}.txt'.format(f)))
        X_train[:, (f-2)*512:(f-2+1)*512] = data
        data = np.loadtxt(os.path.join(TS_path, 'TS_sensor_{}.txt'.format(f)))
        X_test[:, (f-2)*512:(f-2+1)*512] = data
    
    y_train = np.loadtxt(os.path.join(LS_path, 'activity_Id.txt'))

    print('X_train size: {}.'.format(X_train.shape))
    print('y_train size: {}.'.format(y_train.shape))
    print('X_test size: {}.'.format(X_test.shape))

    return X_train, y_train, X_test


def write_submission(y, where, submission_name='toy_submission.csv'):

    os.makedirs(where, exist_ok=True)

    SUBMISSION_PATH = os.path.join(where, submission_name)
    if os.path.exists(SUBMISSION_PATH):
        os.remove(SUBMISSION_PATH)

    y = y.astype(int)
    outputs = np.unique(y)

    # Verify conditions on the predictions
    if np.max(outputs) > 14:
        raise ValueError('Class {} does not exist.'.format(np.max(outputs)))
    if np.min(outputs) < 1:
        raise ValueError('Class {} does not exist.'.format(np.min(outputs)))
    
    # Write submission file
    with open(SUBMISSION_PATH, 'a') as file:
        n_samples = len(y)
        if n_samples != 3500:
            raise ValueError('Check the number of predicted values.')

        file.write('Id,Prediction\n')

        for n, i in enumerate(y):
            file.write('{},{}\n'.format(n+1, int(i)))

    print('Submission {} saved in {}.'.format(submission_name, SUBMISSION_PATH))

In [3]:
def feature_extraction(X_train, X_test, data_path):
    FEATURES = range(2, 33)
    N_TIME_SERIES = 3500
    N_ATTRIBUTES = 16
    
    LS_path = os.path.join(data_path, 'LS')
    TS_path = os.path.join(data_path, 'TS')
    
    new_X_train = np.zeros((N_TIME_SERIES, len(FEATURES) * N_ATTRIBUTES))
    new_X_test = np.zeros((N_TIME_SERIES, len(FEATURES) * N_ATTRIBUTES))
    
    LS_subject_id = np.loadtxt(os.path.join(LS_path, 'subject_Id.txt'))
    TS_subject_id = np.loadtxt(os.path.join(TS_path, 'subject_Id.txt'))
    
    # subject id
    new_X_train[:, 0] = LS_subject_id
    new_X_test[:, 0] = TS_subject_id
    
    for i in range(N_TIME_SERIES):
        for f in FEATURES:
            # mean
            new_X_train[i][(f-2)*N_ATTRIBUTES+1] = np.mean(X_train[i][(f-2)*512:(f-2+1)*512])
            new_X_test[i][(f-2)*N_ATTRIBUTES+1] = np.mean(X_test[i][(f-2)*512:(f-2+1)*512])

            # stddev
            new_X_train[i][(f-2)*N_ATTRIBUTES+2] = np.std(X_train[i][(f-2)*512:(f-2+1)*512])
            new_X_test[i][(f-2)*N_ATTRIBUTES+2] = np.std(X_test[i][(f-2)*512:(f-2+1)*512])

            # median
            new_X_train[i][(f-2)*N_ATTRIBUTES+3] = np.median(X_train[i][(f-2)*512:(f-2+1)*512])
            new_X_test[i][(f-2)*N_ATTRIBUTES+3] = np.median(X_test[i][(f-2)*512:(f-2+1)*512])

            # min
            new_X_train[i][(f-2)*N_ATTRIBUTES+4] = np.min(X_train[i][(f-2)*512:(f-2+1)*512])
            new_X_test[i][(f-2)*N_ATTRIBUTES+4] = np.min(X_test[i][(f-2)*512:(f-2+1)*512])

            # max
            new_X_train[i][(f-2)*N_ATTRIBUTES+5] = np.max(X_train[i][(f-2)*512:(f-2+1)*512])
            new_X_test[i][(f-2)*N_ATTRIBUTES+5] = np.max(X_test[i][(f-2)*512:(f-2+1)*512])
            
            # median absolute deviation
            new_X_train[i][(f-2)*N_ATTRIBUTES+6] = np.apply_along_axis(lambda x: np.median(np.absolute(x - np.median(x))), 0, X_train[i][(f-2)*512:(f-2+1)*512])
            new_X_test[i][(f-2)*N_ATTRIBUTES+6] = np.apply_along_axis(lambda x: np.median(np.absolute(x - np.median(x))), 0, X_test[i][(f-2)*512:(f-2+1)*512])
            
            # range
            new_X_train[i][(f-2)*N_ATTRIBUTES+7] = np.max(X_train[i][(f-2)*512:(f-2+1)*512]) - np.min(X_train[i][(f-2)*512:(f-2+1)*512])
            new_X_test[i][(f-2)*N_ATTRIBUTES+7] = np.max(X_test[i][(f-2)*512:(f-2+1)*512]) - np.min(X_test[i][(f-2)*512:(f-2+1)*512])
            
            # interquartile range
            new_X_train[i][(f-2)*N_ATTRIBUTES+8] = np.apply_along_axis(lambda x: np.percentile(x, 75) - np.percentile(x, 25), 0, X_train[i][(f-2)*512:(f-2+1)*512])
            new_X_test[i][(f-2)*N_ATTRIBUTES+8] = np.apply_along_axis(lambda x: np.percentile(x, 75) - np.percentile(x, 25), 0, X_test[i][(f-2)*512:(f-2+1)*512])
            
            # positive values
            new_X_train[i][(f-2)*N_ATTRIBUTES+9] = np.apply_along_axis(lambda x: np.sum(x > 0), 0, X_train[i][(f-2)*512:(f-2+1)*512])
            new_X_test[i][(f-2)*N_ATTRIBUTES+9] = np.apply_along_axis(lambda x: np.sum(x > 0), 0, X_test[i][(f-2)*512:(f-2+1)*512])
            
            # negative values
            new_X_train[i][(f-2)*N_ATTRIBUTES+10] = np.apply_along_axis(lambda x: np.sum(x < 0), 0, X_train[i][(f-2)*512:(f-2+1)*512])
            new_X_test[i][(f-2)*N_ATTRIBUTES+10] = np.apply_along_axis(lambda x: np.sum(x < 0), 0, X_test[i][(f-2)*512:(f-2+1)*512])
            
            # values above mean
            new_X_train[i][(f-2)*N_ATTRIBUTES+11] = np.apply_along_axis(lambda x: np.sum(x > np.mean(x)), 0, X_train[i][(f-2)*512:(f-2+1)*512])
            new_X_test[i][(f-2)*N_ATTRIBUTES+11] = np.apply_along_axis(lambda x: np.sum(x > np.mean(x)), 0, X_test[i][(f-2)*512:(f-2+1)*512])
            
            # nb of peaks
            new_X_train[i][(f-2)*N_ATTRIBUTES+12] = np.apply_along_axis(lambda x: len(find_peaks(x)[0]), 0, X_train[i][(f-2)*512:(f-2+1)*512])
            new_X_test[i][(f-2)*N_ATTRIBUTES+12] = np.apply_along_axis(lambda x: len(find_peaks(x)[0]), 0, X_test[i][(f-2)*512:(f-2+1)*512])
            
            # skewness
            new_X_train[i][(f-2)*N_ATTRIBUTES+13] = np.apply_along_axis(lambda x: stats.skew(x), 0, X_train[i][(f-2)*512:(f-2+1)*512])
            new_X_test[i][(f-2)*N_ATTRIBUTES+13] = np.apply_along_axis(lambda x: stats.skew(x), 0, X_test[i][(f-2)*512:(f-2+1)*512])
            
            # kurtosis
            new_X_train[i][(f-2)*N_ATTRIBUTES+14] = np.apply_along_axis(lambda x: stats.kurtosis(x), 0, X_train[i][(f-2)*512:(f-2+1)*512])
            new_X_test[i][(f-2)*N_ATTRIBUTES+14] = np.apply_along_axis(lambda x: stats.kurtosis(x), 0, X_test[i][(f-2)*512:(f-2+1)*512])
            
            # mean absolute deviation
            new_X_train[i][(f-2)*N_ATTRIBUTES+15] = np.apply_along_axis(lambda x: np.mean(np.absolute(x - np.mean(x))), 0, X_train[i][(f-2)*512:(f-2+1)*512])
            new_X_test[i][(f-2)*N_ATTRIBUTES+15] = np.apply_along_axis(lambda x: np.mean(np.absolute(x - np.mean(x))), 0, X_test[i][(f-2)*512:(f-2+1)*512])
    
    return new_X_train, new_X_test

In [None]:
if __name__ == '__main__':
    # Directory containing the data folders
    DATA_PATH = 'data'
    init_X_train, y_train, init_X_test = load_data(DATA_PATH)

In [20]:
    # Replace missing values
    imputer = KNNImputer(n_neighbors = 5, weights = 'distance', missing_values = -999999.99)
    init_X_train = imputer.fit_transform(init_X_train)

In [9]:
    # Feature extraction
    X_train, X_test = feature_extraction(init_X_train, init_X_test, DATA_PATH)
    print(X_train.shape, X_test.shape)
    print(X_train)

(3500, 496) (3500, 496)
[[ 2.00000000e+00  1.22061719e+02  1.70005098e-01 ...  2.86800564e-01
  -1.46018010e+00  8.73728335e+00]
 [ 3.00000000e+00  6.68134766e+01  3.45229803e-01 ... -9.97044943e+00
   9.74098619e+01  5.80215454e-04]
 [ 2.00000000e+00  1.00499531e+02  5.51130740e-01 ...  1.51895414e+00
   1.17736066e+00  4.20644028e+00]
 ...
 [ 4.00000000e+00  1.30651211e+02  2.26336061e+00 ...  1.72911636e-01
  -1.07650490e+00  8.28618599e+00]
 [ 3.00000000e+00  7.74682812e+01  3.95819185e-01 ...  6.00865217e+00
   3.42104961e+01  8.46321106e-02]
 [ 5.00000000e+00  9.71083008e+01  4.61875046e-01 ...  6.95352379e-01
  -1.47716135e+00  7.98256592e+00]]


In [23]:
    clf = RandomForestClassifier(random_state = 0, n_estimators = 1000)
    cv = ShuffleSplit(n_splits = 10, test_size=0.33, random_state=0)
    scores = cross_val_score(clf, init_X_train, y_train, cv = cv)
    score = scores.mean()
    print(score)
    
    

0.9279653679653679


NameError: name 'init_y_train' is not defined

In [24]:
    clf.fit(init_X_train, y_train)
    y_test = clf.predict(X_test)
    write_submission(y_test, 'submissions')

ValueError: X has 496 features, but DecisionTreeClassifier is expecting 15872 features as input.