## 1. Data pre-processing

In [1]:
#import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

### 1.1 Train/Dev/Test split

In [2]:
from sklearn.model_selection import train_test_split

The dataset has been already splitted into train and test sets, with a ratio of about 70/30%. 

For our classification task, in order to fine tune the hyperparameters of the model and select the best features from the data, we further split those sets. So, we end up with train, evaluation and test sets of about 60/20/20% of the overall data, respectively.

In [3]:
dataset = pd.read_csv('./data/dataset.csv')

In [4]:
# take 2/5 of the train set for cross validation and test
train_set, dev_test_set = train_test_split(dataset, test_size=0.4, random_state=12)

# take 1/2 of the test set for cross validation
test_set, dev_set = train_test_split(dev_test_set, test_size=0.5, random_state=12)

In [5]:
train_set.head()

Unnamed: 0,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,tBodyAcc-mad()-Z,tBodyAcc-max()-X,...,fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)",subject,Activity
6505,0.278909,-0.016122,-0.108804,-0.991421,-0.958884,-0.943642,-0.992854,-0.95969,-0.944281,-0.926421,...,-0.604272,-0.135149,-0.006428,-0.840651,0.548457,-0.690776,0.291574,0.11007,28,SITTING
4384,0.278544,-0.017497,-0.111402,-0.99756,-0.980832,-0.987017,-0.997578,-0.979116,-0.988149,-0.942968,...,-0.919445,-0.08911,-0.044198,-0.607149,0.601884,-0.817034,0.214795,-0.032885,22,STANDING
7480,0.243931,-0.004004,-0.122676,-0.092084,0.010789,0.199388,-0.19351,-0.041149,0.186226,0.315973,...,-0.639589,0.368217,0.310372,0.969768,-0.504041,-0.496266,0.268661,0.327231,2,WALKING_DOWNSTAIRS
5960,0.278354,-0.016345,-0.111278,-0.98523,-0.988805,-0.99076,-0.985383,-0.987864,-0.990653,-0.930099,...,-0.950759,0.332895,0.404353,0.222148,0.056738,0.604472,-0.363929,-0.648498,27,LAYING
10196,0.291115,-0.014618,-0.112456,-0.968179,-0.982618,-0.983055,-0.967095,-0.982565,-0.980663,-0.916381,...,-0.94091,-0.090369,0.008545,0.387679,-0.204587,0.449349,-0.501789,-0.490746,24,LAYING


In [6]:
y_train_set = train_set[['Activity']].values.ravel()
x_train_set = train_set.drop(['Activity','subject'], axis=1).values

x_dev_set = dev_set.drop(['Activity','subject'], axis=1).values
y_dev_set = dev_set[['Activity']].values.ravel()

x_test_set = test_set.drop(['Activity','subject'], axis=1).values
y_test_set = test_set[['Activity']].values.ravel()

### 1.2 Feature selection

In [7]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

The feature selection has the purpose to retain from the data only those features that are the most relevant, i.e. useful, purging the data from those which add few or no significant improvement.

Dimensionality reduction is different from feature selection, cause it projects the data in a new space, giving as a result a set of new features, whereas feature selection filters out the original features.

Feature selection is carried out by statical analyses, such as correlation analysis. For classification tasks on numerical data the ANOVA statistical model is known to be quite good. Then, we leverage its implementation in sklearn.

In [8]:
fs = SelectKBest(score_func=f_classif, k=561)

In [9]:
y_train_set = train_set[['Activity']].values.ravel()
x_train_set = fs.fit_transform(train_set.drop(['Activity','subject'], axis=1).values, y_train_set)

x_dev_set = fs.transform(dev_set.drop(['Activity','subject'], axis=1))
y_dev_set = dev_set[['Activity']].values.ravel()

x_test_set = fs.transform(test_set.drop(['Activity','subject'], axis=1))
y_test_set = test_set[['Activity']].values.ravel()

At this point, we separate the numeric values from the class labels, dropping the user ID which is useless in our analysis.

### 1.3 Dimension reduction

In [8]:
from sklearn.decomposition import TruncatedSVD

In [9]:
svd = TruncatedSVD(n_components=310)
x_train_set = svd.fit_transform(x_train_set)
x_dev_set = svd.transform(x_dev_set)

In [12]:
np.sum(svd.explained_variance_ratio_) * 100

99.94392155097991

### 1.4 Encoding Labels

Encodes the activity labels to numerical labels.

In [9]:
from sklearn import preprocessing


le = preprocessing.LabelEncoder()
y_train_set = le.fit_transform(y_train_set)

le = preprocessing.LabelEncoder()
y_dev_set = le.fit_transform(y_dev_set)

le = preprocessing.LabelEncoder()
y_test_set = le.fit_transform(y_test_set)

See the actual corresponding classes.

In [10]:
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)

{'LAYING': 0, 'SITTING': 1, 'STANDING': 2, 'WALKING': 3, 'WALKING_DOWNSTAIRS': 4, 'WALKING_UPSTAIRS': 5}


# 2. Model tuning

In [11]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from tqdm import tqdm

In [16]:
svclassifier = SVC(kernel='rbf', C=100.0, gamma=0.1)
svclassifier.fit(x_train_set, y_train_set)

SVC(C=100.0, gamma=0.1)

In [17]:
y_pred = svclassifier.predict(x_dev_set)

In [18]:
print(confusion_matrix(y_dev_set,y_pred))
print(classification_report(y_dev_set,y_pred))

[[386   0   0   0   5   0]
 [  0 337   9   0   2   0]
 [  0   6 359   0   0   0]
 [  0   0   0 366   1   0]
 [  0   0   0   1 288   0]
 [  0   0   0   1   1 298]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       391
           1       0.98      0.97      0.98       348
           2       0.98      0.98      0.98       365
           3       0.99      1.00      1.00       367
           4       0.97      1.00      0.98       289
           5       1.00      0.99      1.00       300

    accuracy                           0.99      2060
   macro avg       0.99      0.99      0.99      2060
weighted avg       0.99      0.99      0.99      2060



In [19]:
x_test_set = svd.transform(x_test_set)
y_pred = svclassifier.predict(x_test_set)

In [20]:
print(confusion_matrix(y_test_set,y_pred))
print(classification_report(y_test_set,y_pred))

[[394   0   0   0   5   0]
 [  0 352   7   0   1   0]
 [  0   8 370   0   3   0]
 [  0   0   0 324   1   0]
 [  0   0   0   0 268   0]
 [  0   0   0   0   2 325]]
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       399
           1       0.98      0.98      0.98       360
           2       0.98      0.97      0.98       381
           3       1.00      1.00      1.00       325
           4       0.96      1.00      0.98       268
           5       1.00      0.99      1.00       327

    accuracy                           0.99      2060
   macro avg       0.99      0.99      0.99      2060
weighted avg       0.99      0.99      0.99      2060



In [32]:

def select_features(train_set, dev_set, n_features):
    fs = SelectKBest(score_func=f_classif, k=n_features)
    
    x_train_set = fs.fit_transform(train_set.drop(['Activity','subject'], axis=1).values, y_train_set)

    x_dev_set = fs.transform(dev_set.drop(['Activity','subject'], axis=1))
    
    return x_train_set, x_dev_set


def reduce_features(x_train_set, x_dev_set, n_components):
    svd = TruncatedSVD(n_components=n_components)
    
    x_train_set = svd.fit_transform(x_train_set)
    
    x_dev_set = svd.transform(x_dev_set)
    
    return np.vstack((x_train_set, x_dev_set))


def tune_model(train_set, dev_set, estimator, params):
    log_result = {}
    test_fold = np.append(np.full((train_set.shape[0],), -1, dtype=int), np.full((dev_set.shape[0],), 0, dtype=int))
    ps = PredefinedSplit(test_fold)
    y_train_dev = np.vstack((train_set[['Activity']].values, dev_set[['Activity']].values)).ravel()
    for n_features in tqdm((50, 100, 300, 400, 561)):
        log_result[n_features] = {}
        x_train_set, x_dev_set = select_features(train_set, dev_set, n_features)
        for n_components in range(10, n_features, 50):
            train_dev_set = reduce_features(x_train_set, x_dev_set, n_components)
            grid = GridSearchCV(estimator=estimator, param_grid=params, scoring='accuracy', cv=ps)
            grid.fit(train_dev_set, y_train_dev)
            log_result[n_features][n_components] = (grid.best_score_, grid.best_params_)
    return log_result
            

In [20]:
params={'kernel':['linear','rbf'],'C':[1,10,100],'gamma':[1e-2,1e-3,1e-4]}
result = tune_model(train_set, dev_set, SVC(), params)

100%|██████████| 5/5 [25:36<00:00, 307.27s/it]


In [29]:
max_acc = []
index = []
for n_features in (50, 100, 300, 400, 561):
    for n_components in range(10, n_features, 50):
        max_acc.append(result[n_features][n_components][0])
        index.append((n_features, n_components))
            
i = np.argsort(np.array(max_acc))
#print(np.array(max_acc)[i[-25:]])
#print(np.array(index)[i[-25:]])
print(index[i[-5]], result[561][360])

(561, 360) (0.9912621359223301, {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'})


# GDA

In [47]:
def gda(x, y, x_test, y_test):
    y_classes, y_counts = np.unique(y, return_counts=True)
    p_y = 1.0 * y_counts/len(y)
    mu = np.array([ x[y==k].mean(axis=0) for k in y_classes])
    sigma = compute_sigma(x, y, mu, y_classes)
    #adding noise
    sigma += np.ones_like(sigma) * 1e-10
    sigma = np.linalg.pinv(sigma)
    return predict(x_test, mu, sigma, p_y)

def compute_sigma(x, y, mu, y_classes):
    x_u = x.copy()
    for i in range(len(mu)):
        x_u[y==y_classes[i]] -= mu[i]
    return x_u.T.dot(x_u) / len(y)
    
def predict(data, mu, sigma, p_y):
    return np.apply_along_axis(lambda x: (np.argmax(np.exp(-0.5 * np.sum((x - mu).dot(sigma) * (x - mu), axis =1)) * p_y)), 1, data)

def score(x, y, mu, sigma, p_y):
    return (predict(x, mu, sigma, p_y) == y).mean()

In [13]:
model = gda(x_train_set, y_train_set, x_test_set, y_test_set)

In [14]:
model

0.9393203883495146

In [15]:
gda(x_train_set, y_train_set, x_dev_set, y_dev_set)

0.9436893203883495

In [48]:
fs = SelectKBest(score_func=f_classif, k=561)
svd = TruncatedSVD(n_components=260)
        
x_train_set_r = svd.fit_transform(fs.fit_transform(train_set.drop(['Activity','subject'], axis=1).values, y_train_set))

x_dev_set_r = svd.transform(fs.transform(dev_set.drop(['Activity','subject'], axis=1)))

x_test_set_r = svd.transform(fs.transform(test_set.drop(['Activity','subject'], axis=1)))

y_pred = gda(x_train_set_r, y_train_set, x_test_set_r, y_test_set)
print(classification_report(y_test_set,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       399
           1       0.93      0.94      0.93       360
           2       0.94      0.94      0.94       381
           3       0.99      0.99      0.99       325
           4       0.99      0.96      0.97       268
           5       0.98      0.99      0.98       327

    accuracy                           0.97      2060
   macro avg       0.97      0.97      0.97      2060
weighted avg       0.97      0.97      0.97      2060



In [41]:
result = {}
for n_features in (50, 100, 300, 400, 561):
    fs = SelectKBest(score_func=f_classif, k=n_features)
    result[n_features] = {}
    for n_components in range(10, n_features, 50):
        svd = TruncatedSVD(n_components=n_components)
        
        x_train_set_r = svd.fit_transform(fs.fit_transform(train_set.drop(['Activity','subject'], axis=1).values, y_train_set))

        x_dev_set_r = svd.transform(fs.transform(dev_set.drop(['Activity','subject'], axis=1)))

        x_test_set_r = svd.transform(fs.transform(test_set.drop(['Activity','subject'], axis=1)))
        result[n_features][n_components] = gda(x_train_set_r, y_train_set, x_test_set_r, y_test_set)

In [45]:
max_acc = []
index = []
for n_features in (50, 100, 300, 400, 561):
    for n_components in range(10, n_features, 50):
        max_acc.append(result[n_features][n_components])
        index.append((n_features, n_components))
        
i = np.argsort(np.array(max_acc))
#print(np.array(max_acc)[i[-20:]])
#print(np.array(index)[i[-20:]])
print(index[i[-1]], result[561][260])

(561, 260) 0.9684466019417476


In [40]:
values[56]

0.9548543689320388

In [36]:
gda(x_train_set_r, y_train_set, x_test_set_r, y_test_set)

0.9504854368932039

In [35]:
def reduce_features_gda(x_train, x_dev, n_components):
    svd = TruncatedSVD(n_components=n_components)
    
    x_train = svd.fit_transform(x_train)
    
    x_dev = svd.transform(x_dev)
    
    return x_train_set, x_dev_set



def tune_model_gda(train_set, dev_set, estimator, params):
    log_result = {}
    test_fold = np.append(np.full((train_set.shape[0],), -1, dtype=int), np.full((dev_set.shape[0],), 0, dtype=int))
    ps = PredefinedSplit(test_fold)
    y_train_dev = np.vstack((train_set[['Activity']].values, dev_set[['Activity']].values)).ravel()
    for n_features in tqdm((50, 100, 300, 400, 561)):
        log_result[n_features] = {}
        for n_components in range(10, n_features, 50):
            x_train_set_r, x_dev_set_r = select_features(train_set, dev_set, n_features)
            x_train_set_r, x_dev_set_r = reduce_features_gda(x_train_set_r, x_dev_set_r, n_components)
            log_result[n_features][n_components] = gda(x_train_set_r, train_set[['Activity']].values.ravel(), x_dev_set_r, dev_set[['Activity']].values.ravel())
    return log_result

In [36]:
tune_model_gda(train_set, dev_set, gda, None)

100%|██████████| 5/5 [00:36<00:00,  7.23s/it]


{50: {10: 0.0},
 100: {10: 0.0, 60: 0.0},
 300: {10: 0.0, 60: 0.0, 110: 0.0, 160: 0.0, 210: 0.0, 260: 0.0},
 400: {10: 0.0,
  60: 0.0,
  110: 0.0,
  160: 0.0,
  210: 0.0,
  260: 0.0,
  310: 0.0,
  360: 0.0},
 561: {10: 0.0,
  60: 0.0,
  110: 0.0,
  160: 0.0,
  210: 0.0,
  260: 0.0,
  310: 0.0,
  360: 0.0,
  410: 0.0,
  460: 0.0,
  510: 0.0,
  560: 0.0}}