In this notebook we look at how the classifier behaves when it predicts labels from data recorded with our own phone. We expect some problems caused by the difference between the two phones (iPhone 6 vs Honor 20), in particular sensor sensitivity and scaling. Let’s see what we can do.

In [None]:
import pandas as pd
from matplotlib import pyplot as plt
import sys
sys.path.append("creation_functions")
from creation_functions.utilities import create_time_series, preprocessing

In [None]:
# Number of features kept by the SelectKBest step of the classification pipeline.
K_FEATURES = 40

# Training table: create_time_series() with its default dataset.
# Test table: the "/honor20readings" dataset (readings from our phone).
df_train = preprocessing(create_time_series())
df_test = preprocessing(create_time_series(dataset="/honor20readings"))

# Separate the "class" target column from the feature columns.
X_train, y_train = df_train.drop(columns="class"), df_train["class"]
X_test, y_test = df_test.drop(columns="class"), df_test["class"]

In [None]:
from sklearn.feature_selection import SelectKBest
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler
from imblearn.pipeline import Pipeline
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier

# Seed for the stochastic steps (SMOTE resampling and the decision tree) so
# that a Restart & Run All reproduces the same predictions.
RANDOM_STATE = 42

# Pipeline: robust scaling -> SMOTE oversampling -> keep the K_FEATURES best
# features -> decision tree. The imblearn Pipeline applies the sampler only
# while fitting, never while predicting.
tree_pipe = Pipeline([
    ("scaler", RobustScaler()),
    ("sampling", SMOTE(random_state=RANDOM_STATE)),
    ("feature-selector", SelectKBest(k=K_FEATURES)),
    ("classifier", DecisionTreeClassifier(random_state=RANDOM_STATE)),
])

# Train on the training dataset and predict the phone readings.
tree_pipe.fit(X_train, y_train)
y_pred = tree_pipe.predict(X_test)

# Series.unique() returns labels in order of appearance, while the confusion
# matrix orders its rows/columns by sorted label. Passing the same sorted list
# as both `labels` and `display_labels` guarantees every tick names the class
# that is actually counted in that row/column.
class_labels = sorted(df_train["class"].unique())
ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred,
    labels=class_labels,
    display_labels=class_labels,
    cmap=plt.cm.Blues,
)

In [None]:
from sklearn.model_selection import train_test_split
import seaborn as sns

# Accuracy on the phone readings. y_pred / y_test still hold the phone
# predictions and labels from the previous cells. y_pred is a positional
# array, so the labels are converted to an array as well: indexing the Series
# with y_test[i] is label-based and only works with a clean 0..n-1 index.
acc_phone = (y_pred == y_test.to_numpy()).mean()

# Hold out part of the training dataset to measure accuracy on data coming
# from the same source as the training data. The split is stored under new
# names so X_train / y_train / X_test / y_test / y_pred keep their contents:
# the cell can be re-run, and later cells do not silently get a smaller
# training set.
X_dm_train, X_dm_test, y_dm_train, y_dm_test = train_test_split(
    X_train, y_train, shuffle=True, random_state=42
)
tree_pipe.fit(X_dm_train, y_dm_train)
y_dm_pred = tree_pipe.predict(X_dm_test)
acc = (y_dm_pred == y_dm_test.to_numpy()).mean()

# Bar chart comparing the two accuracies.
ax = sns.barplot(x=["DeviceMotion", "Phone data"], y=[acc, acc_phone])
ax.set(xlabel="Dataset", ylabel="Accuracy")

We can easily see that the data recorded with our phone is often misclassified: its accuracy is very low compared to the score obtained when classifying only readings from the DeviceMotion dataset. The confusion matrix shows the most relevant misclassifications. In particular, standing is always recognized as sitting, walking as jogging, and upstairs and downstairs are never recognized properly. Why do we get such poor results? Let’s take a look at jogging in the two datasets and compare the signals.

In [None]:
# Rows kept from each recording for the visual comparison.
JOG_WINDOW = slice(100, 400)

# DeviceMotion columns dropped before plotting (CSV index, attitude, gravity).
DEVICE_MOTION_DROPPED = ["Unnamed: 0", "attitude.roll", "attitude.pitch", "attitude.yaw", "gravity.x", "gravity.y", "gravity.z"]

# One jogging recording from the DeviceMotion dataset ...
df_jog_deviceMotion = pd.read_csv("A_DeviceMotion_data/jog_9/sub_1.csv")
df_jog_deviceMotion = df_jog_deviceMotion.drop(columns=DEVICE_MOTION_DROPPED)[JOG_WINDOW]

# ... and one jogging recording from the original Honor 20 readings, without its label column.
df_jog_honor = pd.read_csv("../honor20readings_original/dataset_honor20_jogging1.csv")
df_jog_honor = df_jog_honor.drop(columns="class")[JOG_WINDOW]

In [None]:
# Compare every signal side by side: DeviceMotion in the left panel of a pair,
# Honor in the right one. Each column uses two adjacent panels, so the 3x4
# grid has room for six columns.
f, axes = plt.subplots(3, 4, figsize=(20, 10), sharey=True)
f.suptitle("DeviceMotion vs Honor, jogging", fontsize=20)
for pair_index, column in enumerate(df_jog_deviceMotion):
    # Two panels per column, four panels per row.
    row, left = divmod(2 * pair_index, 4)
    device_ax, honor_ax = axes[row][left], axes[row][left + 1]
    device_ax.plot(df_jog_deviceMotion[column])
    device_ax.set_xlabel("DeviceMotion "+column)
    honor_ax.plot(df_jog_honor[column])
    honor_ax.set_xlabel("Honor "+column)

In [None]:
# Same jogging recording as before, read from the scaled Honor 20 readings
# (label column removed, rows 100-399 kept).
honor_scaled_raw = pd.read_csv("../honor20readings/dataset_honor20_jogging1.csv")
df_jog_honor_scaled = honor_scaled_raw.drop(columns="class")[100:400]

# Side-by-side panels: DeviceMotion first, scaled Honor second, for each column.
f, axes = plt.subplots(3, 4, figsize=(20, 10), sharey=True)
f.suptitle("DeviceMotion vs Honor scaled, jogging", fontsize=20)
for k, column in enumerate(df_jog_deviceMotion):
    row, left = divmod(2 * k, 4)  # two panels per column, four panels per row
    panel_sources = (
        (axes[row][left], df_jog_deviceMotion, "DeviceMotion "),
        (axes[row][left + 1], df_jog_honor_scaled, "Honor Scaled "),
    )
    for panel, frame, prefix in panel_sources:
        panel.plot(frame[column])
        panel.set_xlabel(prefix+column)

In [None]:
# FIXME(review): this assignment has no right-hand side, so the cell raises a
# SyntaxError and the next cell fails with a NameError on `df_honor_scaled`.
# The next cell reads a "class" column from it and feeds the remaining columns
# to tree_pipe.predict, so it is presumably the preprocessed feature table of
# the scaled Honor 20 readings -- confirm the intended loader call.
df_honor_scaled=

In [None]:
# Features and labels of the scaled phone readings.
X_test_scaled = df_honor_scaled.drop("class", axis=1)
y_test_scaled = df_honor_scaled["class"]

# Rebuild the full training set from df_train, so the result does not depend
# on whether an earlier cell replaced X_train / y_train with a subset.
X_full = df_train.drop("class", axis=1)
y_full = df_train["class"]

# Train on the whole training dataset and predict the scaled phone readings.
tree_pipe.fit(X_full, y_full)
y_pred_scaled = tree_pipe.predict(X_test_scaled)

f, axes = plt.subplots(1, 2, figsize=(20, 10))

# Use one sorted label list for both `labels` and `display_labels`:
# Series.unique() alone is in order of appearance and could attach tick labels
# to the wrong rows/columns of the matrix.
class_labels = sorted(df_train["class"].unique())
ConfusionMatrixDisplay.from_predictions(
    y_test_scaled,
    y_pred_scaled,
    labels=class_labels,
    display_labels=class_labels,
    cmap=plt.cm.Blues,
    ax=axes[0],
)

# Accuracy on the scaled phone data. This must compare the *scaled*
# predictions with the *scaled* labels (the previous code compared the
# unrelated y_pred / y_test). Labels are converted to an array so the
# comparison is positional.
acc_phone = (y_pred_scaled == y_test_scaled.to_numpy()).mean()

# Reference accuracy on a hold-out split of the training dataset. New names
# keep X_train / y_train / X_test / y_test untouched, so the cell is re-runnable.
X_dm_train, X_dm_test, y_dm_train, y_dm_test = train_test_split(
    X_full, y_full, shuffle=True, random_state=42
)
tree_pipe.fit(X_dm_train, y_dm_train)
y_dm_pred = tree_pipe.predict(X_dm_test)
acc = (y_dm_pred == y_dm_test.to_numpy()).mean()

# Bar chart comparing the two accuracies, drawn next to the confusion matrix.
ax = sns.barplot(x=["DeviceMotion", "Phone data"], y=[acc, acc_phone], ax=axes[1])
ax.set(xlabel="Dataset", ylabel="Accuracy")