In [1]:
# Authors: Maxime Goffart and Olivier Joris

import os
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import KNNImputer

In [2]:
def load_data(data_path):
    """
    Load the data for the classifier.
    Modified from the method given with the assignment. Authors: Antonio Sutera & Yann Claess.

    Reads the sensor files 2 to 32 of the learning set (LS) and of the test
    set (TS), replaces the missing values of both sets and keeps only the
    features retained by an extra-trees based `SelectFromModel`.

    Argument:
    ---------
    - `data_path`: Path to the data folder. It must contain the `LS` and `TS`
      sub-folders.

    Return:
    -------
    - `X_train`: Learning-set inputs after imputation and feature selection.
    - `y_train`: Learning-set labels read from `LS/activity_Id.txt`.
    - `X_test`: Test-set inputs after the same imputation and feature selection.
    """

    FEATURES = range(2, 33)
    N_TIME_SERIES = 3500
    # Number of values each sensor file contributes per time series.
    SERIES_LENGTH = 512
    # Sentinel used in the data files to flag a missing measurement.
    MISSING_VALUE = -999999.99

    print("Loading data...")

    # Create the training and testing samples
    LS_path = os.path.join(data_path, 'LS')
    TS_path = os.path.join(data_path, 'TS')
    X_train, X_test = [np.zeros((N_TIME_SERIES, len(FEATURES) * SERIES_LENGTH)) for _ in range(2)]

    for f in FEATURES:
        print("Loadinf feature {}...".format(f))
        # Each sensor fills its own contiguous slice of SERIES_LENGTH columns.
        start = (f - FEATURES[0]) * SERIES_LENGTH
        stop = start + SERIES_LENGTH
        X_train[:, start:stop] = np.loadtxt(os.path.join(LS_path, 'LS_sensor_{}.txt'.format(f)))
        X_test[:, start:stop] = np.loadtxt(os.path.join(TS_path, 'TS_sensor_{}.txt'.format(f)))

    y_train = np.loadtxt(os.path.join(LS_path, 'activity_Id.txt'))

    print('X_train size: {}.'.format(X_train.shape))
    print('y_train size: {}.'.format(y_train.shape))
    print('X_test size: {}.'.format(X_test.shape))

    # Replace missing values
    print("Replace missing values...")
    imputer = KNNImputer(n_neighbors = 5, weights = 'distance', missing_values = MISSING_VALUE)
    # The imputer is fitted on the learning set only, then applied to both sets.
    # Without the second call the test set would keep the sentinel as if it were
    # a real measurement, and its columns could stop matching those of X_train.
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)

    # Features selection
    print("Features selection...")
    etc = ExtraTreesClassifier(n_estimators = 1000, random_state=0)

    print("X_train shape before feature selection: " + str(X_train.shape))

    print("SelectFromModel...")
    selector = SelectFromModel(estimator = etc).fit(X_train, y_train)
    print("Transform X_train...")
    X_train = selector.transform(X_train)
    print("Transform X_test...")
    X_test = selector.transform(X_test)

    print("X_train shape after feature selection: " + str(X_train.shape))
    print("y_train shape after feature selection: " + str(y_train.shape))

    return X_train, y_train, X_test

In [3]:
# Load the learning and test sets from the relative 'data' folder; the returned
# matrices are already imputed/feature-selected by load_data.
X_train, y_train, X_test = load_data('data')

Loading data...
Loadinf feature 2...
Loadinf feature 3...
Loadinf feature 4...
Loadinf feature 5...
Loadinf feature 6...
Loadinf feature 7...
Loadinf feature 8...
Loadinf feature 9...
Loadinf feature 10...
Loadinf feature 11...
Loadinf feature 12...
Loadinf feature 13...
Loadinf feature 14...
Loadinf feature 15...
Loadinf feature 16...
Loadinf feature 17...
Loadinf feature 18...
Loadinf feature 19...
Loadinf feature 20...
Loadinf feature 21...
Loadinf feature 22...
Loadinf feature 23...
Loadinf feature 24...
Loadinf feature 25...
Loadinf feature 26...
Loadinf feature 27...
Loadinf feature 28...
Loadinf feature 29...
Loadinf feature 30...
Loadinf feature 31...
Loadinf feature 32...
X_train size: (3500, 15872).
y_train size: (3500,).
X_test size: (3500, 15872).
Replace missing values...
Features selection...
X_train shape before feature selection: (3500, 15872)
SelectFromModel...
Transform X_train...
Transform X_test...
X_train shape after feature selection: (3500, 4742)
y_train shape after feature selection: (3500,)

In [4]:
# Estimate the accuracy of a random forest with 10-fold cross-validation on the
# feature-selected learning set returned by load_data.
# NOTE(review): load_data fits the imputer and the feature selector on the whole
# learning set (the selector also sees y_train) before this split, so every
# validation fold has already influenced the selected features. The mean score
# printed below is therefore likely optimistic; refitting those steps inside
# each fold (e.g. with a Pipeline on the raw features) would avoid that.
forest = RandomForestClassifier(n_estimators=40, min_samples_split=25, n_jobs=-1, random_state=0)
scores = cross_val_score(forest, X_train, y_train, cv=10, n_jobs=-1)
print(scores.mean())

0.9074285714285713
