# Some imports

In [2]:
from scipy import signal # Signal Processing Library
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd # DataBase management
import seaborn # Improve Images
%matplotlib inline

# Feature Engineering

In [19]:
## Column names ##
acc_nm = ['ax', 'ay', 'az']  # the three accelerometer signal columns
nm = ['ind'] + acc_nm + ['label']  # names passed to read_csv for the raw CSV columns
## Dataset location
link = "Dataset/1.csv"
def loadDB(link):
    """Load one recording from a CSV file and return it as a DataFrame.

    The file is read as comma-separated text using the module-level column
    names ``nm``. The ``ind`` column is dropped, rows whose label is 0 are
    discarded, and a human-readable ``label_str`` column is added.

    Parameters
    ----------
    link : str
        Path of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``nm`` plus ``label_str``. The original row
        index is kept, so it has gaps wherever label-0 rows were removed.

    Raises
    ------
    KeyError
        If the file contains a label that is neither 0 nor one of 1-7.
    """
    # Raw string for label 5: the backslash is literal text, not an escape sequence.
    label2str = {
        1: 'Working at Computer',
        2: 'Standing Up, Walking and Going up-down stairs',
        3: 'Standing',
        4: 'Walking',
        5: r'Going Up\Down Stairs',
        6: 'Walking and Talking with Someone',
        7: 'Talking while Standing',
    }
    df = pd.read_csv(link, sep=',', names=nm)
    df = df.drop('ind', axis=1)
    # Label 0 marks unusable rows. The explicit copy makes the filtered frame
    # independent, so adding a column below does not write to a slice.
    df = df[df.label != 0].copy()
    df['label_str'] = df.label.apply(lambda x: label2str[x])  # used by some plots
    return df

# Read the recording at `link`; label-0 rows are removed and `label_str` is added by loadDB.
df_raw = loadDB(link)

def widen_signal(df, names=None, fs=52, f_cut=1):
    """Return a copy of ``df`` extended with derived signal columns.

    For every signal column the following are appended: a median-filtered
    version (``_median``), the first difference (``_diff``), a 3rd-order
    Butterworth low-pass (``_low-p``) and high-pass (``_high-p``) version.
    A ``mag`` column holding the Euclidean norm of the signal columns is
    added as well.

    All derived columns are built on the index of ``df``, so rows stay
    aligned even when that index has gaps (for example after rows were
    filtered out). The input frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns listed in ``names``.
    names : sequence of str, optional
        Signal columns to process. Defaults to the module-level ``acc_nm``.
    fs : float, optional
        Sampling frequency used to normalise the cutoff (default 52).
    f_cut : float, optional
        Cutoff frequency of both Butterworth filters (default 1).

    Returns
    -------
    pandas.DataFrame
        ``df`` plus ``mag`` and four derived columns per signal column.
    """
    if names is None:
        names = acc_nm
    names = list(names)

    widened = df.copy()  # work on a copy so the caller's frame is left untouched
    axes = widened[names]

    # Magnitude of the signal vector
    widened['mag'] = np.sqrt(np.square(axes).sum(axis=1))

    def per_column(suffix, transform):
        """Apply ``transform`` to each signal column, keeping the index of ``df``."""
        new_names = [col + suffix for col in names]
        data = dict((col + suffix, transform(axes[col].values)) for col in names)
        return pd.DataFrame(data, index=widened.index, columns=new_names)

    # Median filter (default kernel size of scipy.signal.medfilt)
    df_med = per_column('_median', signal.medfilt)

    # Differential (jerk): the leading NaN is back-filled from the second row
    df_diff = axes.diff(periods=1, axis=0).bfill()
    df_diff.columns = [col + '_diff' for col in names]

    # Butterworth filters share the same normalised cutoff frequency
    wn = f_cut * 2.0 / fs
    b_low, a_low = signal.butter(N=3, Wn=wn, btype='low')
    df_lp = per_column('_low-p', lambda x: signal.lfilter(b_low, a_low, x))
    b_high, a_high = signal.butter(N=3, Wn=wn, btype='high')
    df_hp = per_column('_high-p', lambda x: signal.lfilter(b_high, a_high, x))

    # Every piece carries the same index, so this is a plain side-by-side join
    return pd.concat([widened, df_med, df_diff, df_lp, df_hp], axis=1)
# Add the derived signal columns to the raw frame, then display the (rows, columns) shape.
df_widen = widen_signal(df_raw)
df_widen.shape

(162500, 18)

In [18]:
def windowing(signal, size, step):
    """Cut a 1-D signal into fixed-length, possibly overlapping windows.

    Window ``j`` covers samples ``j*step`` up to ``j*step + size - 1``. Only
    windows that fit entirely inside the signal are produced; trailing
    samples that cannot fill a whole window are dropped.

    Parameters
    ----------
    signal : array-like
        1-D sequence of numeric samples (list, ndarray or pandas Series).
        Note: inside this function the name shadows any module called
        ``signal``.
    size : int
        Number of samples per window.
    step : int
        Offset between the starts of consecutive windows (must be >= 1).

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_windows, size)``; shape ``(0, size)`` when
        the signal is shorter than ``size``.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError("step must be a positive integer, got {!r}".format(step))
    values = np.asarray(signal, dtype=float)
    n_samples = len(values)
    if n_samples < size:
        return np.zeros((0, size))
    # The last valid window starts at n_samples - size, hence this count.
    n_windows = (n_samples - size) // step + 1
    starts = step * np.arange(n_windows)
    # Row j of the index grid is starts[j], starts[j]+1, ..., starts[j]+size-1.
    return values[starts[:, None] + np.arange(size)]
def window_labels(labels, size, step):
    """Assign one label to each window: the most frequent label inside it.

    Windows are laid out as ``j*step`` to ``j*step + size - 1``, and only
    windows that fit entirely inside ``labels`` are considered. When several
    labels are equally frequent, the smallest one wins (first maximum of the
    bin counts).

    Parameters
    ----------
    labels : array-like of non-negative int
        Per-sample labels (list, ndarray or pandas Series).
    size : int
        Number of samples per window.
    step : int
        Offset between the starts of consecutive windows (must be >= 1).

    Returns
    -------
    numpy.ndarray
        Float array of length ``n_windows``; empty when ``labels`` is shorter
        than ``size``.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError("step must be a positive integer, got {!r}".format(step))
    values = np.asarray(labels)
    n_samples = len(values)
    if n_samples < size:
        return np.zeros(0)
    # The last valid window starts at n_samples - size, hence this count.
    n_windows = (n_samples - size) // step + 1
    majority = np.zeros(n_windows)
    for j in range(n_windows):
        start = j * step
        majority[j] = np.argmax(np.bincount(values[start:start + size]))
    return majority
def extract_windows(df, size, step):
    """Extract windows of the given size and step from every signal in ``df``.

    Every column except ``label`` and ``label_str`` is windowed with
    ``windowing``; the ``label`` column is reduced to one label per window
    with ``window_labels``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column and any number of signal columns.
    size : int
        Number of samples per window.
    step : int
        Offset between the starts of consecutive windows.

    Returns
    -------
    L : list of pandas.DataFrame
        One frame per window, with ``size`` rows and one column per signal
        (same column order as ``df``).
    labels : numpy.ndarray
        Label of each window, same length as ``L``.
    """
    feature_columns = [col for col in df.columns if col not in ('label', 'label_str')]
    per_column = dict((col, windowing(df[col], size, step)) for col in feature_columns)
    labels = window_labels(df['label'], size, step)

    # Take the window count from the arrays that were actually produced rather
    # than re-deriving it, so frames and labels can never disagree in length.
    n_windows = len(labels)
    for windows in per_column.values():
        n_windows = min(n_windows, len(windows))
    labels = labels[:n_windows]

    L = [
        pd.DataFrame(
            dict((col, per_column[col][i]) for col in feature_columns),
            columns=feature_columns,
        )
        for i in range(n_windows)
    ]
    return L, labels



def compute_features(df):
    """Compute a flat vector of per-column statistics for one window.

    For each column of ``df`` nine statistics are computed, and the result is
    laid out statistic by statistic: all means first, then all mean absolute
    deviations, standard deviations, variances, minima, maxima, skewnesses,
    kurtoses and interquartile ranges.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame, one column per signal.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``9 * number_of_columns``.
    """
    mean = df.mean(axis=0)
    # Mean absolute deviation around the mean, computed explicitly because
    # DataFrame.mad is no longer available in recent pandas releases.
    mean_abs_dev = (df - mean).abs().mean(axis=0)
    inter_quartile = df.quantile(q=0.75, axis=0) - df.quantile(q=0.25, axis=0)

    statistics = [
        mean,                  # Mean
        mean_abs_dev,          # Mean absolute deviation
        df.std(axis=0),        # Standard deviation
        df.var(axis=0),        # Variance
        df.min(axis=0),        # Minimum
        df.max(axis=0),        # Maximum
        df.skew(axis=0),       # Skewness
        df.kurtosis(axis=0),   # Kurtosis
        inter_quartile,        # Interquartile range
    ]
    # TODO: auto-regressive coefficients, min-max and signal integration
    # are planned features that are not computed yet.
    return np.hstack([stat.values for stat in statistics])


def compute_matrix_data(df, N_samples=52, percentage=0.5):
    """Build the feature matrix and label vector from a signal frame.

    The frame is cut into windows of ``N_samples`` rows whose starts are
    ``int(percentage * N_samples)`` rows apart; each window is turned into
    one feature row by ``compute_features``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by ``extract_windows``.
    N_samples : int, optional
        Window length in samples (default 52).
    percentage : float, optional
        Fraction of ``N_samples`` used as the step between windows
        (default 0.5).

    Returns
    -------
    X : numpy.ndarray
        2-D array with one row per window, also when there is only one window.
    y : numpy.ndarray
        Label of each window.

    Raises
    ------
    ValueError
        If the resulting step is smaller than 1, or if no window fits in ``df``.
    """
    step = int(percentage * N_samples)
    if step < 1:
        raise ValueError(
            "percentage * N_samples must be at least 1, got step {}".format(step))
    windows, labels = extract_windows(df, N_samples, step)
    if len(windows) == 0:
        raise ValueError(
            "no window of {} samples fits in a frame of {} rows".format(N_samples, len(df)))
    # Stack all feature rows in one call instead of growing the matrix row by row.
    X = np.vstack([compute_features(window) for window in windows])
    y = np.array(labels)  # vector of window labels
    return X, y

# Feature matrix X (one row per window) and the matching window labels y
X, y = compute_matrix_data(df_widen)
# Gather both in a single DataFrame, which is more flexible to work with
df = pd.DataFrame(X).assign(label=y)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,135,136,137,138,139,140,141,142,143,label
0,1598.923077,2001.673077,2004.0,3254.690706,1597.423077,1997.326923,2004.961538,4.596154,-2.711538,-3.846154,...,70.75,73.75,84.0,1223.30238,1550.704693,1558.615893,366.926569,470.672173,466.663607,1.0
1,1597.538462,1993.519231,2066.461538,3286.559354,1597.480769,1992.288462,2066.134615,1.019231,-0.538462,1.153846,...,40.5,73.75,82.0,102.899394,174.142867,74.281043,95.173809,185.847098,143.627671,1.0
2,1627.096154,2099.0,2066.192308,3367.022529,1626.25,2097.057692,2058.788462,2.019231,0.5,-2.115385,...,46.0,77.5,97.5,45.97089,88.893624,29.106268,55.871237,71.354734,80.137217,1.0
3,1703.403846,2207.961538,1979.961538,3424.511204,1698.788462,2199.211538,1963.346154,6.826923,4.115385,-8.134615,...,54.0,74.25,116.75,80.452349,174.031032,89.000927,66.065201,66.776862,116.828517,1.0
4,1882.596154,2265.307692,1847.326923,3482.999132,1883.173077,2261.480769,1837.153846,9.807692,-0.615385,-2.480769,...,50.25,54.0,79.75,152.166466,23.009431,137.223311,84.423848,55.550298,136.962691,1.0


# Classical algorithms

In [22]:
## Methods
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier # Ensemble
from sklearn.svm import SVC # SVM
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression # Logistic Regression
## Utils
from sklearn.model_selection import GridSearchCV # Choose parameters
from sklearn.preprocessing import scale # Normalise matrix
from sklearn.metrics import confusion_matrix 
from sklearn.utils import shuffle
from sklearn.model_selection import cross_val_score

In [23]:
# Shuffle samples and labels together with a fixed seed, then scale the data matrix
X_train, y_train = shuffle(X, y, random_state=0)
X_train = scale(X_train)

In [24]:
# Keep the worst of the 5 cross-validation scores for each classifier.
lr = LogisticRegressionCV()
cv_lr = min(cross_val_score(lr, X_train, y_train, cv=5))
print("Logistic Regression"+str(cv_lr))
svm = SVC(C=15, kernel='rbf')
# Score the SVM itself: passing `lr` here would only repeat the logistic-regression result.
cv_svm = min(cross_val_score(svm, X_train, y_train, cv=5))
print("SVM"+str(cv_svm))

[ 0.88977636  0.88089528  0.8896      0.8974359   0.88461538]


In [None]:
# Search space: a linear kernel over C, and an RBF kernel over C and gamma
param_grid = [
    dict(C=[1, 10, 100, 1000], kernel=['linear']),
    dict(C=[1, 10, 100, 1000], gamma=[0.001, 0.0001], kernel=['rbf']),
]
# Exhaustive search over the grid with a fresh SVM
svm = SVC()
clf = GridSearchCV(svm, param_grid)
clf.fit(X_train, y_train)