In [7]:
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import os
import numpy as np
import matplotlib.pyplot as plt
import entropy as ent
import random

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score
from scipy import stats
from scipy.signal import find_peaks
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier

# Author: Antonio Sutera & Yann Claess
def load_data(data_path):
    """Load the learning-set (LS) and test-set (TS) sensor recordings.

    For every sensor id in ``range(2, 33)`` the files
    ``<data_path>/LS/LS_sensor_<id>.txt`` and
    ``<data_path>/TS/TS_sensor_<id>.txt`` are read with ``np.loadtxt`` and the
    per-sensor blocks are concatenated column-wise, in sensor order.

    The number of time series (rows) and the number of samples per sensor
    (columns of each file) are taken from the files themselves instead of
    being hard-coded.

    Parameters
    ----------
    data_path : str
        Directory containing the ``LS`` and ``TS`` sub-directories.

    Returns
    -------
    X_train : ndarray of shape (n_train_series, 31 * n_samples_per_sensor)
    y_train : ndarray of shape (n_train_series,)
        Content of ``LS/activity_Id.txt``.
    X_test : ndarray of shape (n_test_series, 31 * n_samples_per_sensor)

    Raises
    ------
    ValueError
        Raised by ``np.hstack`` if the sensor files of one set do not all
        have the same number of rows.
    """
    FEATURES = range(2, 33)

    # Create the training and testing samples
    LS_path = os.path.join(data_path, 'LS')
    TS_path = os.path.join(data_path, 'TS')

    # ndmin=2 keeps a file holding a single time series as one row instead of
    # letting loadtxt squeeze it to 1-D.
    train_blocks = [
        np.loadtxt(os.path.join(LS_path, 'LS_sensor_{}.txt'.format(f)), ndmin=2)
        for f in FEATURES
    ]
    test_blocks = [
        np.loadtxt(os.path.join(TS_path, 'TS_sensor_{}.txt'.format(f)), ndmin=2)
        for f in FEATURES
    ]

    # Sensor blocks are laid out side by side: sensor 2 first, sensor 32 last.
    X_train = np.hstack(train_blocks)
    X_test = np.hstack(test_blocks)

    y_train = np.loadtxt(os.path.join(LS_path, 'activity_Id.txt'), ndmin=1)

    print('X_train size: {}.'.format(X_train.shape))
    print('y_train size: {}.'.format(y_train.shape))
    print('X_test size: {}.'.format(X_test.shape))

    return X_train, y_train, X_test


def write_submission(y, where, submission_name='toy_submission.csv'):
    """Write the predictions ``y`` as a submission CSV file.

    The file has a ``Id,Prediction`` header followed by one line per
    prediction, with ids starting at 1.

    All checks are performed before the file system is touched, so an invalid
    ``y`` neither deletes an existing submission nor leaves an empty file
    behind.

    Parameters
    ----------
    y : array-like
        Predicted classes; values are cast to ``int``.
    where : str
        Output directory, created if it does not exist.
    submission_name : str, default 'toy_submission.csv'
        Name of the file written inside ``where``. An existing file with the
        same name is overwritten.

    Raises
    ------
    ValueError
        If a predicted class is greater than 14 or smaller than 1, or if the
        number of predictions is not 3500.
    """
    y = np.asarray(y).astype(int)
    n_samples = len(y)

    # An empty prediction vector has no max/min to check; report it as a
    # size problem instead of letting the reduction fail.
    if n_samples == 0:
        raise ValueError('Check the number of predicted values.')

    outputs = np.unique(y)

    # Verify conditions on the predictions
    if np.max(outputs) > 14:
        raise ValueError('Class {} does not exist.'.format(np.max(outputs)))
    if np.min(outputs) < 1:
        raise ValueError('Class {} does not exist.'.format(np.min(outputs)))
    if n_samples != 3500:
        raise ValueError('Check the number of predicted values.')

    os.makedirs(where, exist_ok=True)
    SUBMISSION_PATH = os.path.join(where, submission_name)

    # Write submission file ('w' truncates a previous submission, if any)
    with open(SUBMISSION_PATH, 'w') as handle:
        handle.write('Id,Prediction\n')

        for n, i in enumerate(y):
            handle.write('{},{}\n'.format(n+1, int(i)))

    print('Submission {} saved in {}.'.format(submission_name, SUBMISSION_PATH))

In [26]:
def _window_statistics(block):
    """Return the 15 summary statistics of every row of ``block``.

    ``block`` is a 2-D array with one sensor window per row. The result has
    one row per window and 15 columns, in this order: mean, standard
    deviation, median, min, max, median absolute deviation, range,
    interquartile range, number of positive values, number of negative
    values, number of values above the mean, number of peaks, skewness,
    kurtosis, mean absolute deviation.
    """
    mean = np.mean(block, axis=1)
    median = np.median(block, axis=1)
    lowest = np.min(block, axis=1)
    highest = np.max(block, axis=1)
    q75, q25 = np.percentile(block, [75, 25], axis=1)

    # find_peaks only accepts 1-D input, so this statistic stays row-wise.
    n_peaks = np.array([len(find_peaks(row)[0]) for row in block])

    return np.column_stack([
        mean,
        np.std(block, axis=1),
        median,
        lowest,
        highest,
        np.median(np.absolute(block - median[:, np.newaxis]), axis=1),
        highest - lowest,
        q75 - q25,
        np.sum(block > 0, axis=1),
        np.sum(block < 0, axis=1),
        np.sum(block > mean[:, np.newaxis], axis=1),
        n_peaks,
        stats.skew(block, axis=1),
        stats.kurtosis(block, axis=1),
        np.mean(np.absolute(block - mean[:, np.newaxis]), axis=1),
    ])


def _extract_features(X, subject_id, n_sensors, n_attributes):
    """Build the feature matrix of one data set (learning or test)."""
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError('Expected a 2-D array, got {} dimension(s).'.format(X.ndim))

    n_series, n_columns = X.shape
    if n_columns % n_sensors != 0:
        raise ValueError('{} columns cannot be split evenly between {} sensors.'
                         .format(n_columns, n_sensors))
    window = n_columns // n_sensors

    features = np.zeros((n_series, n_sensors * n_attributes))

    # subject id
    features[:, 0] = subject_id

    for sensor in range(n_sensors):
        block = X[:, sensor * window:(sensor + 1) * window]
        # Slot 0 of every sensor group is skipped: the statistics occupy
        # slots 1..n_attributes-1 of the group.
        first = sensor * n_attributes + 1
        features[:, first:first + n_attributes - 1] = _window_statistics(block)

    return features


def feature_extraction(X_train, X_test, data_path):
    """Summarise each raw sensor window by a fixed set of statistics.

    Each row of ``X_train`` / ``X_test`` is split into 31 equally sized
    consecutive sensor windows and every window is replaced by 15 statistics
    (see ``_window_statistics``). The number of rows and the window length
    are inferred from the inputs, so the two sets may have different sizes.

    Output layout (16 columns per sensor, 496 columns in total):
    column 0 holds the subject id read from ``subject_Id.txt``; for the
    sensor at 0-based position ``s`` the statistics are stored in columns
    ``16*s + 1`` to ``16*s + 15``. Columns ``16*s`` with ``s >= 1`` are never
    written and stay at 0; this layout is kept unchanged so that the shape and
    column positions of the result do not change.

    Parameters
    ----------
    X_train, X_test : 2-D arrays
        Raw time series, sensor windows laid out side by side.
    data_path : str
        Directory containing ``LS/subject_Id.txt`` and ``TS/subject_Id.txt``.

    Returns
    -------
    new_X_train, new_X_test : ndarrays of shape (n_series, 496)

    Raises
    ------
    ValueError
        If an input is not 2-D or its column count is not a multiple of 31.
    """
    FEATURES = range(2, 33)
    N_ATTRIBUTES = 16

    LS_path = os.path.join(data_path, 'LS')
    TS_path = os.path.join(data_path, 'TS')

    LS_subject_id = np.loadtxt(os.path.join(LS_path, 'subject_Id.txt'))
    TS_subject_id = np.loadtxt(os.path.join(TS_path, 'subject_Id.txt'))

    new_X_train = _extract_features(X_train, LS_subject_id, len(FEATURES), N_ATTRIBUTES)
    new_X_test = _extract_features(X_test, TS_subject_id, len(FEATURES), N_ATTRIBUTES)

    return new_X_train, new_X_test

In [17]:
if __name__ == '__main__':
    # Directory containing the data folders (load_data reads its 'LS' and
    # 'TS' sub-directories)
    DATA_PATH = 'data'
    # Raw learning-set inputs, learning-set labels and raw test-set inputs
    init_X_train, y_train, init_X_test = load_data(DATA_PATH)

X_train size: (3500, 15872).
y_train size: (3500,).
X_test size: (3500, 15872).


In [18]:
    # Replace missing values (encoded as -999999.99 in the raw data).
    # The imputer is fitted on the learning set only and then applied to both
    # sets, so the test set no longer carries the sentinel value into feature
    # extraction and no test-set information is used for fitting.
    imputer = KNNImputer(n_neighbors = 5, weights = 'distance', missing_values = -999999.99)
    init_X_train = imputer.fit_transform(init_X_train)
    init_X_test = imputer.transform(init_X_test)

In [27]:
    # Feature extraction
    # Turn the raw time series into the feature matrices used by the
    # classifiers below (see feature_extraction); DATA_PATH is passed because
    # the subject ids are read from disk there.
    X_train, X_test = feature_extraction(init_X_train, init_X_test, DATA_PATH)

In [28]:
    # Subject-wise hold-out evaluation: repeatedly train on the series of
    # three randomly chosen subjects and score on the series of all remaining
    # subjects, then report the mean accuracy.
    LS_path = os.path.join(DATA_PATH, 'LS')
    LS_subject_id = np.loadtxt(os.path.join(LS_path, 'subject_Id.txt'))

    iter_nb = 10
    n_learning_subjects = 3

    ids = [1, 2, 3, 4, 5]
    scores = np.zeros(iter_nb)

    # Fixed seed so that the reported score can be reproduced on a re-run.
    random.seed(0)

    for j in range(iter_nb):
        # Subjects whose series are used for training in this iteration
        learning_id = random.sample(ids, n_learning_subjects)

        # Row mask instead of manual copying: no assumption on how many
        # series each subject has, or on the total number of series.
        in_learning = np.isin(LS_subject_id, learning_id)

        X_train_split = X_train[in_learning]
        y_train_split = y_train[in_learning]
        X_test_split = X_train[~in_learning]
        y_test_split = y_train[~in_learning]

        clf = RandomForestClassifier(random_state = 0, n_estimators= 1000).fit(X_train_split, y_train_split)
        y_pred = clf.predict(X_test_split)
        scores[j] = accuracy_score(y_test_split, y_pred)

    print(np.mean(scores))

0.7003418551418993


In [35]:
    # Train on the complete learning set, predict the test set and write the
    # submission file into the 'submissions' directory.
    # NOTE(review): `clf` is the estimator object left over from the last
    # iteration of the evaluation loop, so this cell only works after that
    # loop has run; presumably fit() discards the earlier fit - confirm.
    clf.fit(X_train, y_train)
    y_test = clf.predict(X_test)
    write_submission(y_test, 'submissions')

Submission toy_submission.csv saved in submissions/toy_submission.csv.
