# Análisis de datos de actividad física con aprendizaje supervisado. Archivo 2.


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SequentialFeatureSelector

In [2]:
df = pd.read_csv('data_set3.txt', header = None, delimiter = " ")

In [3]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,2.0,-7.139,4.193105,-1.709053,-0.017691,-4.535091,5.643398,-1.46117,-0.364405,0.061727,...,1.52273,0.325388,0.227916,0.040522,0.691939,0.739232,13.06219,8.60401,7.408485,2.140024
1,2.0,-6.739636,6.520351,-1.434409,-0.183291,-13.230455,9.966469,-0.159421,-1.152595,0.410636,...,21.670515,3.86736,0.508262,0.045961,0.718172,2.325628,5.405773,4.167626,2.986897,1.235623
2,2.0,0.282727,4.660952,-1.013207,0.326348,-10.564364,13.394672,-1.171451,-0.752986,-2.316818,...,29.986736,3.737845,2.014653,0.725725,0.823706,8.068543,1.978489,0.295042,0.10188,0.037949
3,2.0,-3.513364,1.817109,-0.499106,0.829376,-5.347727,4.262153,-1.107194,-0.556678,-0.132636,...,0.563271,0.119175,0.011387,0.010988,0.381943,2.109827,0.79897,1.500521,0.823234,0.350271
4,2.0,-0.363545,3.045831,-0.949825,-0.734333,-13.347636,13.819354,0.680512,-1.33045,-1.780818,...,16.43728,10.302964,1.491813,0.523613,1.954866,13.89347,15.766455,10.367425,2.78188,1.277786


In [4]:
df.shape

(517, 31)

In [5]:
y = df.iloc[:,0]
X = df.iloc[:,1:]

In [6]:
y.value_counts()

1.0    132
2.0    131
3.0    128
4.0    126
Name: 0, dtype: int64

Donde 1 son desplantes, 2 es correr en círculos, 3 es quedarse quieto parado y 4 es rodar en el piso.

Se aprecia que las clases están aproximadamente balanceadas.

In [7]:
kf = KFold(n_splits=5, shuffle = True)

## Evaluación del rendimiento de los modelos de clasificación

### SVM

In [8]:
clf = svm.SVC(kernel = 'linear')

acc = 0
rec = np.array([0., 0., 0., 0.])
pre = np.array([0., 0., 0., 0.])

for train_index, test_index in kf.split(X):
    
    # Training phase
    X_train = X.iloc[train_index, :]
    y_train = y.iloc[train_index]
    clf.fit(X_train, y_train)

    # Test phase
    X_test = X.iloc[test_index, :]
    y_test = y.iloc[test_index]    
    y_pred = clf.predict(X_test)
    
    acc+=(accuracy_score(y_test, y_pred))
    rec+=(recall_score(y_test, y_pred, average=None))
    pre+=(precision_score(y_test, y_pred, average=None))
    
acc = acc/5
rec = rec/5
pre = pre/5

In [9]:
print("Accuracy: ",acc)
print("Recall: ",rec)
print("Precision: ",pre)

Accuracy:  0.9091486183719194
Recall:  [0.90240537 0.9074127  1.         0.83037231]
Precision:  [0.85673981 0.9133514  1.         0.86127147]


### K-NN

In [10]:
clf = KNeighborsClassifier(n_neighbors=3)

acc = 0
rec = np.array([0., 0., 0., 0.])
pre = np.array([0., 0., 0., 0.])

for train_index, test_index in kf.split(X):
    
    # Training phase
    X_train = X.iloc[train_index, :]
    y_train = y.iloc[train_index]
    clf.fit(X_train, y_train)

    # Test phase
    X_test = X.iloc[test_index, :]
    y_test = y.iloc[test_index]    
    y_pred = clf.predict(X_test)
    
    acc+=(accuracy_score(y_test, y_pred))
    rec+=(recall_score(y_test, y_pred, average=None))
    pre+=(precision_score(y_test, y_pred, average=None))
    
acc = acc/5
rec = rec/5
pre = pre/5

In [11]:
print("Accuracy: ",acc)
print("Recall: ",rec)
print("Precision: ",pre)

Accuracy:  0.8993838685586258
Recall:  [0.90770762 0.9234086  1.         0.76226545]
Precision:  [0.8085121  0.90358974 0.96885578 0.93581818]


### MLP

In [12]:
clf = MLPClassifier(hidden_layer_sizes = (10,10),max_iter=500)

acc = 0
rec = np.array([0., 0., 0., 0.])
pre = np.array([0., 0., 0., 0.])

for train_index, test_index in kf.split(X):
    
    # Training phase
    X_train = X.iloc[train_index, :]
    y_train = y.iloc[train_index]
    clf.fit(X_train, y_train)

    # Test phase
    X_test = X.iloc[test_index, :]
    y_test = y.iloc[test_index]    
    y_pred = clf.predict(X_test)
    
    acc+=(accuracy_score(y_test, y_pred))
    rec+=(recall_score(y_test, y_pred, average=None))
    pre+=(precision_score(y_test, y_pred, average=None))
    
acc = acc/5
rec = rec/5
pre = pre/5







In [13]:
print("Accuracy: ",acc)
print("Recall: ",rec)
print("Precision: ",pre)

Accuracy:  0.9129947722180731
Recall:  [0.87751731 0.89349369 1.         0.88813929]
Precision:  [0.89864921 0.93803419 0.99354839 0.82448593]


### Random Forest

In [14]:
clf = RandomForestClassifier(max_depth=25)

acc = 0
rec = np.array([0., 0., 0., 0.])
pre = np.array([0., 0., 0., 0.])

for train_index, test_index in kf.split(X):
    
    # Training phase
    X_train = X.iloc[train_index, :]
    y_train = y.iloc[train_index]
    clf.fit(X_train, y_train)

    # Test phase
    X_test = X.iloc[test_index, :]
    y_test = y.iloc[test_index]    
    y_pred = clf.predict(X_test)
    
    acc+=(accuracy_score(y_test, y_pred))
    rec+=(recall_score(y_test, y_pred, average=None))
    pre+=(precision_score(y_test, y_pred, average=None))
    
acc = acc/5
rec = rec/5
pre = pre/5

In [15]:
print("Accuracy: ",acc)
print("Recall: ",rec)
print("Precision: ",pre)

Accuracy:  0.9304518297236744
Recall:  [0.89859307 0.95279388 1.         0.88172727]
Precision:  [0.88120154 0.91056309 1.         0.93555556]


### Naive Bayes

In [16]:
clf = GaussianNB()

acc = 0
rec = np.array([0., 0., 0., 0.])
pre = np.array([0., 0., 0., 0.])

for train_index, test_index in kf.split(X):
    
    # Training phase
    X_train = X.iloc[train_index, :]
    y_train = y.iloc[train_index]
    clf.fit(X_train, y_train)

    # Test phase
    X_test = X.iloc[test_index, :]
    y_test = y.iloc[test_index]    
    y_pred = clf.predict(X_test)
    
    acc+=(accuracy_score(y_test, y_pred))
    rec+=(recall_score(y_test, y_pred, average=None))
    pre+=(precision_score(y_test, y_pred, average=None))
    
acc = acc/5
rec = rec/5
pre = pre/5

In [17]:
print("Accuracy: ",acc)
print("Recall: ",rec)
print("Precision: ",pre)

Accuracy:  0.7717699775952204
Recall:  [0.94170351 0.82979972 1.         0.29750551]
Precision:  [0.56722519 0.8920632  1.         0.7809768 ]


## Hiperparámetros

### Hiperparámetros de cada clasificador

### SVM

In [18]:
svm.SVC().get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

### K-NN

In [19]:
KNeighborsClassifier().get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

### MLP

In [20]:
MLPClassifier().get_params()

{'activation': 'relu',
 'alpha': 0.0001,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (100,),
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'max_fun': 15000,
 'max_iter': 200,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}

### Random Forest

In [21]:
RandomForestClassifier().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### Naive Bayes

In [22]:
GaussianNB().get_params()

{'priors': None, 'var_smoothing': 1e-09}

### Hiperparámetros óptimos KNN

In [23]:
n_neigh = np.linspace(1,11,1).astype(int)
mink = [1,2,3]
w=['uniform', 'distance'] 
parameters = {'n_neighbors':n_neigh, 'p':mink, 'weights':w}
neigh = KNeighborsClassifier()
clf = GridSearchCV(neigh, parameters, scoring = 'accuracy')
clf.fit(X, y)

#print(accuracy_score())

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([1]), 'p': [1, 2, 3],
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

In [24]:
clf.best_params_

{'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}

In [25]:
clf.best_score_

0.8511202389843167

In [26]:
clf = KNeighborsClassifier(n_neighbors=1,p=1)

acc = 0
rec = np.array([0., 0., 0., 0.])
pre = np.array([0., 0., 0., 0.])

for train_index, test_index in kf.split(X):
    
    # Training phase
    X_train = X.iloc[train_index, :]
    y_train = y.iloc[train_index]
    clf.fit(X_train, y_train)

    # Test phase
    X_test = X.iloc[test_index, :]
    y_test = y.iloc[test_index]    
    y_pred = clf.predict(X_test)
    
    acc+=(accuracy_score(y_test, y_pred))
    rec+=(recall_score(y_test, y_pred, average=None))
    pre+=(precision_score(y_test, y_pred, average=None))
    
acc = acc/5
rec = rec/5
pre = pre/5

In [27]:
print("Accuracy: ",acc)
print("Recall: ",rec)
print("Precision: ",pre)

Accuracy:  0.9014002987303957
Recall:  [0.94521065 0.92237281 1.         0.7239387 ]
Precision:  [0.81438956 0.8825     1.         0.95330225]


## Método de selección de características.

In [28]:
ks = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29]
accs = []
clf = KNeighborsClassifier(n_neighbors=1,p=1)
for k in ks:    
    ffs = SequentialFeatureSelector(clf, n_features_to_select=k)
    ffs.fit(X, y)
    X_new = ffs.transform(X)
    acc = 0
    rec = np.zeros(4)
    pre = np.zeros(4) 
    for train_index, test_index in kf.split(X_new):  
        # Training phase
        X_train = X_new[train_index, :]
        y_train = y[train_index]
        clf.fit(X_train, y_train)     
        # Test phase
        X_test = X_new[test_index, :]
        y_test = y[test_index]
        y_pred = clf.predict(X_test)

        acc+=(accuracy_score(y_test, y_pred))
        rec+=(recall_score(y_test, y_pred, average=None))
        pre+=(precision_score(y_test, y_pred, average=None))

    acc = acc/5
    rec = rec/5
    pre = pre/5

    print("Number of features =", k)
    print("Accuracy: ",acc)
    print("Recall: ",rec)
    print("Precision: ",pre)    

Number of features = 1
Accuracy:  0.725242718446602
Recall:  [0.54170042 0.57673734 0.95245631 0.82842557]
Precision:  [0.55246008 0.59551659 0.89135221 0.83195127]


Number of features = 3
Accuracy:  0.8510268857356236
Recall:  [0.80392857 0.84672619 1.         0.7548003 ]
Precision:  [0.79181735 0.81796296 0.97609428 0.82435695]


Number of features = 5
Accuracy:  0.9051904406273339
Recall:  [0.85966176 0.9069453  1.         0.84923191]
Precision:  [0.8628602  0.90188083 0.99230769 0.86970116]


Number of features = 7
Accuracy:  0.8994025392083644
Recall:  [0.87291168 0.89377921 1.         0.82936508]
Precision:  [0.85872359 0.88586243 0.99130435 0.86415261]


Number of features = 9
Accuracy:  0.8974607916355488
Recall:  [0.87061326 0.90107913 1.         0.82023974]
Precision:  [0.85847223 0.88030719 0.99310345 0.86272202]


Number of features = 11
Accuracy:  0.8801344286781181
Recall:  [0.81595406 0.89395738 1.         0.80840404]
Precision:  [0.84349784 0.89613984 0.98666667 0.79693748]


Number of features = 13
Accuracy:  0.9031926811053026
Recall:  [0.87871573 0.91603374 1.         0.82685402]
Precision:  [0.84763638 0.86663216 0.99166667 0.90724638]


Number of features = 15
Accuracy:  0.9109970126960419
Recall:  [0.90688654 0.91238736 1.         0.82487645]
Precision:  [0.85423614 0.88862625 1.         0.90402212]


Number of features = 17
Accuracy:  0.9245332337565347
Recall:  [0.94378265 0.94150926 1.         0.81129426]
Precision:  [0.89399638 0.87217204 1.         0.94012253]


Number of features = 19
Accuracy:  0.9246825989544437
Recall:  [0.92322701 0.93745921 1.         0.83999512]
Precision:  [0.87456321 0.90837733 1.         0.92215082]


Number of features = 21
Accuracy:  0.9284540702016431
Recall:  [0.9373545  0.94844114 1.         0.83023882]
Precision:  [0.88444521 0.90556415 1.         0.9257563 ]


Number of features = 23
Accuracy:  0.9033420463032114
Recall:  [0.90334881 0.92203557 1.         0.7757842 ]
Precision:  [0.84733649 0.87068793 0.99310345 0.90943057]


Number of features = 25
Accuracy:  0.9246452576549664
Recall:  [0.93265173 0.94597902 1.         0.82221973]
Precision:  [0.88773307 0.89253037 0.99130435 0.9454416 ]


Number of features = 27
Accuracy:  0.9149925317401045
Recall:  [0.94473886 0.91650879 1.         0.79058467]
Precision:  [0.85960798 0.88111423 1.         0.929     ]


Number of features = 29
Accuracy:  0.8916915608663182
Recall:  [0.93068979 0.87313697 1.         0.7543081 ]
Precision:  [0.81764273 0.86879475 0.992      0.88665476]


Indique cuantas características son suficientes para obtener buenos resultados.

El modelo obtuvo mejores resultado utilizando entre 17 y 21 características.