In [None]:
import warnings

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, classification_report,
                             confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc)
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import label_binarize
from sklearn.tree import DecisionTreeClassifier

In [None]:
warnings.filterwarnings('ignore')

In [None]:
# Read the processed CSV into a DataFrame and preview its first five rows.
file_path = '01_Data_Processed.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
print(df.head(n=5))

In [None]:
# Convert the 'epoch (ms)' column to pandas datetimes in place; entries that
# cannot be parsed become NaT because of errors='coerce'.
# NOTE(review): if this column stores integer millisecond epochs (as its name
# suggests) pd.to_datetime needs unit='ms'; without it integers are read as
# nanoseconds. Confirm the format stored in the CSV.
df['epoch (ms)'] = pd.to_datetime(df['epoch (ms)'], errors='coerce')
print(df.head())
# ============================== Plotting all values ==============================

In [None]:
# One histogram per accelerometer axis, drawn side by side (x=red, y=green, z=blue).
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))
for ax, axis_name, colour in zip(axes, 'xyz', 'rgb'):
    sns.histplot(df[f'Accelerometer_{axis_name}'], bins=50, ax=ax, color=colour)
    ax.set_title(f'Accelerometer {axis_name.upper()}')
plt.tight_layout()
plt.show()

In [None]:
# One histogram per gyroscope axis, drawn side by side (x=red, y=green, z=blue).
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))
for ax, axis_name, colour in zip(axes, 'xyz', 'rgb'):
    sns.histplot(df[f'Gyroscope_{axis_name}'], bins=50, ax=ax, color=colour)
    ax.set_title(f'Gyroscope {axis_name.upper()}')
plt.tight_layout()
plt.show()

In [None]:
# Value counts of the three descriptive columns, one panel each; every panel
# is titled with the name of the column it shows.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))
for ax, column, colour in zip(axes, ('Participants', 'Label', 'Category'), 'rgb'):
    sns.histplot(df[column], bins=50, ax=ax, color=colour)
    ax.set_title(column)
plt.tight_layout()
plt.show()

In [None]:
# Histogram of the 'Set' column on the current pyplot figure.
plt.hist(x=df['Set'], bins=50, color='b')
plt.title(label="Set")
plt.show()

============================== CORRELATION MATRIX ==============================

In [None]:
# Restrict to numeric columns, then compute their pairwise Pearson correlations.
numeric_df = df.select_dtypes(include='number')
correlation_matrix = numeric_df.corr(method='pearson')

Plotting the heatmap

In [None]:
# Annotated heatmap of the correlation matrix (two-decimal cell labels).
plt.figure(figsize=(10, 8))
sns.heatmap(
    data=correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
)
plt.title(label='Correlation Heatmap')
plt.show()

In [None]:
# Section banner written to the cell output; marks the start of the Random Forest experiments.
print("############################# Random Forest ##################################")

Feature and label selection

In [None]:
# Model inputs: the six raw sensor channels. Target: the 'Label' column.
X = df[[
    'Accelerometer_x', 'Accelerometer_y', 'Accelerometer_z',
    'Gyroscope_x', 'Gyroscope_y', 'Gyroscope_z',
]]
y = df['Label']

Split the data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Train the Random Forest model

In [None]:
# Small forest (10 trees) fitted on the training split.
Random_forest_model = RandomForestClassifier(
    n_estimators=10,
    random_state=10,
)
Random_forest_model.fit(X=X_train, y=y_train)

Predict and evaluate

In [None]:
# Score the 10-tree forest on the held-out rows: overall accuracy plus the
# per-class precision/recall/F1 table.
y_pred = Random_forest_model.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy pierwsza próba: {accuracy:.2f}')
print(classification_report(y_true=y_test, y_pred=y_pred))

Confusion Matrix

In [None]:
# Tabulate true vs. predicted labels and draw the matrix.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.title(label='Confusion Matrix (Random Forest)')
plt.show()

ROC Curve for Multiclass

In [None]:
# One-vs-rest ROC analysis for the 10-tree Random Forest.
# Everything lives in one cell: the inline backend finalises a figure when its
# cell ends, so curves, diagonal, axis labels and legend must be drawn together.

# Binarize the test labels in the class order of the fitted model, so that
# column i of y_test_bin corresponds to column i of predict_proba.
y_test_bin = label_binarize(y_test, classes=Random_forest_model.classes_)
if y_test_bin.shape[1] == 1:
    # With exactly two classes label_binarize returns a single column;
    # expand it to one indicator column per class.
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])
n_classes = y_test_bin.shape[1]

# Predictions and probability estimates
y_prob = Random_forest_model.predict_proba(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot all ROC curves together with the chance diagonal
plt.figure()
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label=f'Class {i} ROC curve (area = {roc_auc[i]:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) - Random Forest')
plt.legend(loc="lower right")
plt.show()

Cross-validation

In [None]:
# 5-fold stratified, shuffled cross-validation of the forest on the full data set.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=Random_forest_model, X=X, y=y, scoring='accuracy', cv=cv
)
print(f'Cross-validation scores (Random Forest): {scores}')
print(f'Srednia dokladnosc cross-validation: {scores.mean():.2f}')

Train Random Forest model with more estimators

In [None]:
# Larger forest (100 trees); rebinds Random_forest_model to the new estimator.
Random_forest_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
Random_forest_model.fit(X=X_train, y=y_train)

Predict and evaluate

In [None]:
# Held-out accuracy and per-class report for the 100-tree forest.
y_pred = Random_forest_model.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_true=y_test, y_pred=y_pred))

Confusion Matrix for n_estimators=100

In [None]:
# Tabulate true vs. predicted labels for the 100-tree forest and draw the matrix.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.title(label='Confusion Matrix (Random Forest n_estimators=100)')
plt.show()

ROC Curve for Multiclass (Random Forest with n_estimators=100)

In [None]:
# One-vs-rest ROC analysis for the 100-tree Random Forest.
# Everything lives in one cell: the inline backend finalises a figure when its
# cell ends, so curves, diagonal, axis labels and legend must be drawn together.

# Binarize the test labels in the class order of the fitted model, so that
# column i of y_test_bin corresponds to column i of predict_proba.
y_test_bin = label_binarize(y_test, classes=Random_forest_model.classes_)
if y_test_bin.shape[1] == 1:
    # With exactly two classes label_binarize returns a single column;
    # expand it to one indicator column per class.
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])
n_classes = y_test_bin.shape[1]

# Predictions and probability estimates
y_prob = Random_forest_model.predict_proba(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot all ROC curves together with the chance diagonal
plt.figure()
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label=f'Class {i} ROC curve (area = {roc_auc[i]:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) - Random Forest n_estimators=100')
plt.legend(loc="lower right")
plt.show()

Cross-validation for n_estimators=100

In [None]:
# Cross-validate the 100-tree forest, reusing the fold splitter `cv` created earlier.
scores = cross_val_score(
    estimator=Random_forest_model, X=X, y=y, scoring='accuracy', cv=cv
)
print(f'Cross-validation scores (Random Forest n_estimators=100): {scores}')
print(f'Srednia dokladnosc cross-validation: {scores.mean():.2f}')

In [None]:
# Section banner written to the cell output; marks the start of the logistic-regression experiments.
print("############################# LOGISTIC REGRESSION ##################################")

Train logistic regression model

In [None]:
# Logistic regression with library-default settings, fitted on the training split.
Logistic_regression_model = LogisticRegression()
Logistic_regression_model.fit(X=X_train, y=y_train)

Predictions

In [None]:
# Predicted labels for the held-out rows.
y_pred = Logistic_regression_model.predict(X=X_test)

Accuracy

In [None]:
# Held-out accuracy and per-class report for the logistic-regression predictions.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_true=y_test, y_pred=y_pred))

Confusion Matrix

In [None]:
# Tabulate true vs. predicted labels for logistic regression and draw the matrix.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.title(label='Confusion Matrix (Logistic Regression)')
plt.show()

ROC Curve for Multiclass

In [None]:
# One-vs-rest ROC analysis for the default logistic-regression model.
# Everything lives in one cell: the inline backend finalises a figure when its
# cell ends, so curves, diagonal, axis labels and legend must be drawn together.

# Binarize the test labels in the class order of the fitted model, so that
# column i of y_test_bin corresponds to column i of predict_proba.
y_test_bin = label_binarize(y_test, classes=Logistic_regression_model.classes_)
if y_test_bin.shape[1] == 1:
    # With exactly two classes label_binarize returns a single column;
    # expand it to one indicator column per class.
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])
n_classes = y_test_bin.shape[1]

# Predictions and probability estimates
y_prob = Logistic_regression_model.predict_proba(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot all ROC curves together with the chance diagonal
plt.figure()
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label=f'Class {i} ROC curve (area = {roc_auc[i]:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) - Multiclass Logistic Regression')
plt.legend(loc="lower right")
plt.show()

Cross-validation

In [None]:
# Cross-validate logistic regression, reusing the fold splitter `cv` created earlier.
scores = cross_val_score(
    estimator=Logistic_regression_model, X=X, y=y, scoring='accuracy', cv=cv
)
print(f'Cross-validation scores (Logistic Regression): {scores}')
print(f'Srednia dokladnosc cross-validation: {scores.mean():.2f}')

Train OVR model

In [None]:
# One-vs-rest logistic regression.
# LogisticRegression's `multi_class` argument is deprecated (scikit-learn 1.5)
# and scheduled for removal, so the OvR scheme is expressed with the
# OneVsRestClassifier wrapper, as the scikit-learn documentation recommends.
# The wrapper fits one binary lbfgs model per class and exposes the same
# fit / predict / predict_proba / classes_ interface used by the later cells.
Logistic_regression_model_ovr = OneVsRestClassifier(
    LogisticRegression(solver='lbfgs', random_state=100)
)
Logistic_regression_model_ovr.fit(X_train, y_train)

Predictions

In [None]:
# Held-out accuracy and per-class report for the one-vs-rest model.
y_pred_ovr = Logistic_regression_model_ovr.predict(X=X_test)
accuracy_ovr = accuracy_score(y_true=y_test, y_pred=y_pred_ovr)
print(f'Accuracy (OVR): {accuracy_ovr:.2f}')
print(classification_report(y_true=y_test, y_pred=y_pred_ovr))

Confusion Matrix for OVR

In [None]:
# Tabulate true vs. predicted labels for the one-vs-rest model and draw the matrix.
cm_ovr = confusion_matrix(y_true=y_test, y_pred=y_pred_ovr)
disp_ovr = ConfusionMatrixDisplay(cm_ovr)
disp_ovr.plot()
plt.title(label='Confusion Matrix (Logistic Regression OVR)')
plt.show()

ROC Curve for Multiclass (Logistic Regression OVR)

In [None]:
# One-vs-rest ROC analysis for the OVR logistic-regression model.
# Everything lives in one cell: the inline backend finalises a figure when its
# cell ends, so curves, diagonal, axis labels and legend must be drawn together.

# Recompute the binarized test labels here instead of relying on values left
# over from an earlier cell; the class order is taken from the fitted model so
# column i of y_test_bin corresponds to column i of predict_proba.
y_test_bin = label_binarize(y_test, classes=Logistic_regression_model_ovr.classes_)
if y_test_bin.shape[1] == 1:
    # With exactly two classes label_binarize returns a single column;
    # expand it to one indicator column per class.
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])
n_classes = y_test_bin.shape[1]

# Predictions and probability estimates
y_prob_ovr = Logistic_regression_model_ovr.predict_proba(X_test)

# Compute ROC curve and ROC area for each class
fpr_ovr = dict()
tpr_ovr = dict()
roc_auc_ovr = dict()
for i in range(n_classes):
    fpr_ovr[i], tpr_ovr[i], _ = roc_curve(y_test_bin[:, i], y_prob_ovr[:, i])
    roc_auc_ovr[i] = auc(fpr_ovr[i], tpr_ovr[i])

# Plot all ROC curves together with the chance diagonal
plt.figure()
for i in range(n_classes):
    plt.plot(fpr_ovr[i], tpr_ovr[i], label=f'Class {i} ROC curve (area = {roc_auc_ovr[i]:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) - Logistic Regression OVR')
plt.legend(loc="lower right")
plt.show()

Cross-validation for OVR

In [None]:
# Cross-validate the one-vs-rest model, reusing the fold splitter `cv` created earlier.
scores_ovr = cross_val_score(
    estimator=Logistic_regression_model_ovr, X=X, y=y, scoring='accuracy', cv=cv
)
print(f'Cross-validation scores (Logistic Regression OVR): {scores_ovr}')
print(f'Srednia dokladnosc cross-validation OVR: {scores_ovr.mean():.2f}')

In [None]:
# Section banner written to the cell output; marks the start of the KNN experiments.
print("############################# KNN ##################################")

Feature and label selection

In [None]:
# Model inputs: the six raw sensor channels. Target: the 'Label' column.
X = df[[
    'Accelerometer_x', 'Accelerometer_y', 'Accelerometer_z',
    'Gyroscope_x', 'Gyroscope_y', 'Gyroscope_z',
]]
y = df['Label']

Split the data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Train KNN model

In [None]:
# K-nearest-neighbours classifier with library-default settings.
KNN_model = KNeighborsClassifier()
KNN_model.fit(X=X_train, y=y_train)

Predict and evaluate

In [None]:
# Held-out accuracy and per-class report for the default KNN model.
y_pred = KNN_model.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_true=y_test, y_pred=y_pred))

Confusion Matrix

In [None]:
# Tabulate true vs. predicted labels for the default KNN model and draw the matrix.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.title(label='Confusion Matrix (KNN)')
plt.show()

ROC Curve for Multiclass

In [None]:
# One-vs-rest ROC analysis for the default KNN model.
# Everything lives in one cell: the inline backend finalises a figure when its
# cell ends, so curves, diagonal, axis labels and legend must be drawn together.

# Binarize the test labels in the class order of the fitted model, so that
# column i of y_test_bin corresponds to column i of predict_proba.
y_test_bin = label_binarize(y_test, classes=KNN_model.classes_)
if y_test_bin.shape[1] == 1:
    # With exactly two classes label_binarize returns a single column;
    # expand it to one indicator column per class.
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])
n_classes = y_test_bin.shape[1]

# Predictions and probability estimates
y_prob = KNN_model.predict_proba(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot all ROC curves together with the chance diagonal
plt.figure()
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label=f'Class {i} ROC curve (area = {roc_auc[i]:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) - KNN')
plt.legend(loc="lower right")
plt.show()

Cross-validation

In [None]:
# 5-fold stratified, shuffled cross-validation of the default KNN model.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=KNN_model, X=X, y=y, scoring='accuracy', cv=cv
)
print(f'Cross-validation scores (KNN): {scores}')
print(f'Srednia dokladnosc cross-validation: {scores.mean():.2f}')

Train KNN model with n_neighbors=1

In [None]:
# KNN using only the single nearest neighbour; rebinds KNN_model to the new estimator.
KNN_model = KNeighborsClassifier(
    n_neighbors=1,
)
KNN_model.fit(X=X_train, y=y_train)

Predict and evaluate

In [None]:
# Held-out accuracy and per-class report for the 1-neighbour KNN model.
y_pred = KNN_model.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy (n_neighbors=1): {accuracy:.2f}')
print(classification_report(y_true=y_test, y_pred=y_pred))

Confusion Matrix for n_neighbors=1

In [None]:
# Tabulate true vs. predicted labels for the 1-neighbour KNN model and draw the matrix.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.title(label='Confusion Matrix (KNN n_neighbors=1)')
plt.show()

ROC Curve for Multiclass (KNN with n_neighbors=1)

In [None]:
# One-vs-rest ROC analysis for the 1-neighbour KNN model.
# Everything lives in one cell: the inline backend finalises a figure when its
# cell ends, so curves, diagonal, axis labels and legend must be drawn together.

# Recompute the binarized test labels here instead of relying on values left
# over from an earlier cell; the class order is taken from the fitted model so
# column i of y_test_bin corresponds to column i of predict_proba.
y_test_bin = label_binarize(y_test, classes=KNN_model.classes_)
if y_test_bin.shape[1] == 1:
    # With exactly two classes label_binarize returns a single column;
    # expand it to one indicator column per class.
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])
n_classes = y_test_bin.shape[1]

# Predictions and probability estimates
y_prob = KNN_model.predict_proba(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot all ROC curves together with the chance diagonal
plt.figure()
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label=f'Class {i} ROC curve (area = {roc_auc[i]:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) - KNN n_neighbors=1')
plt.legend(loc="lower right")
plt.show()

Cross-validation for n_neighbors=1

In [None]:
# Cross-validate the 1-neighbour KNN model, reusing the fold splitter `cv` created earlier.
scores = cross_val_score(
    estimator=KNN_model, X=X, y=y, scoring='accuracy', cv=cv
)
print(f'Cross-validation scores (KNN n_neighbors=1): {scores}')
print(f'Srednia dokladnosc cross-validation: {scores.mean():.2f}')

In [None]:
# Section banner written to the cell output; marks the start of the decision-tree experiments.
print("############################# Decision Tree Classifier ##################################")

Feature and label selection

In [None]:
# Model inputs: the six raw sensor channels. Target: the 'Label' column.
X = df[[
    'Accelerometer_x', 'Accelerometer_y', 'Accelerometer_z',
    'Gyroscope_x', 'Gyroscope_y', 'Gyroscope_z',
]]
y = df['Label']

Split the data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Train Decision Tree model (max_depth=1)

In [None]:
# Depth-1 tree (a decision stump) fitted on the training split.
# random_state is fixed, matching the deeper tree trained later in this
# notebook: scikit-learn permutes features at every split, so without a seed
# ties between equally good splits can be broken differently on each run.
Decision_tree = DecisionTreeClassifier(max_depth=1, random_state=42)
Decision_tree.fit(X_train, y_train)

Predict and evaluate

In [None]:
# Held-out accuracy and per-class report for the depth-1 tree.
y_pred = Decision_tree.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy (max_depth=1): {accuracy:.2f}')
print(classification_report(y_true=y_test, y_pred=y_pred))

Confusion Matrix

In [None]:
# Tabulate true vs. predicted labels for the depth-1 tree and draw the matrix.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.title(label='Confusion Matrix (Decision Tree max_depth=1)')
plt.show()

ROC Curve for Multiclass

In [None]:
# One-vs-rest ROC analysis for the depth-1 decision tree.
# Everything lives in one cell: the inline backend finalises a figure when its
# cell ends, so curves, diagonal, axis labels and legend must be drawn together.

# Binarize the test labels in the class order of the fitted model, so that
# column i of y_test_bin corresponds to column i of predict_proba.
y_test_bin = label_binarize(y_test, classes=Decision_tree.classes_)
if y_test_bin.shape[1] == 1:
    # With exactly two classes label_binarize returns a single column;
    # expand it to one indicator column per class.
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])
n_classes = y_test_bin.shape[1]

# Predictions and probability estimates
y_prob = Decision_tree.predict_proba(X_test)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot all ROC curves together with the chance diagonal
plt.figure()
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label=f'Class {i} ROC curve (area = {roc_auc[i]:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) - Decision Tree (max_depth=1)')
plt.legend(loc="lower right")
plt.show()

Cross-validation

In [None]:
# 5-fold stratified, shuffled cross-validation of the depth-1 tree.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=Decision_tree, X=X, y=y, scoring='accuracy', cv=cv
)
print(f'Cross-validation scores (Decision Tree max_depth=1): {scores}')
print(f'Srednia dokladnosc cross-validation: {scores.mean():.2f}')

Train Decision Tree model

In [None]:
# Deeper tree (up to 20 levels, at least 5 samples to split a node);
# rebinds Decision_tree to the new estimator.
Decision_tree = DecisionTreeClassifier(
    max_depth=20,
    min_samples_split=5,
    random_state=42,
)
Decision_tree.fit(X=X_train, y=y_train)

Predict and evaluate

In [None]:
# Held-out accuracy and per-class report for the depth-20 tree.
y_pred = Decision_tree.predict(X=X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy (max_depth=20): {accuracy:.2f}')
print(classification_report(y_true=y_test, y_pred=y_pred))

Confusion Matrix

In [None]:
# Tabulate true vs. predicted labels for the depth-20 tree and draw the matrix.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
disp = ConfusionMatrixDisplay(cm)
disp.plot()
plt.title(label='Confusion Matrix (Decision Tree max_depth=20)')
plt.show()

ROC Curve

In [None]:
# One-vs-rest ROC analysis for the depth-20 decision tree.
# Everything lives in one cell: the inline backend finalises a figure when its
# cell ends, so curves, diagonal, axis labels and legend must be drawn together.

# Recompute the binarized test labels here instead of relying on values left
# over from an earlier cell; the class order is taken from the fitted model so
# column i of y_test_bin corresponds to column i of predict_proba.
y_test_bin = label_binarize(y_test, classes=Decision_tree.classes_)
if y_test_bin.shape[1] == 1:
    # With exactly two classes label_binarize returns a single column;
    # expand it to one indicator column per class.
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])
n_classes = y_test_bin.shape[1]

# Predictions and probability estimates
y_prob = Decision_tree.predict_proba(X_test)

# Compute ROC curve and ROC area for each class, starting from empty
# containers so no entry from a previous model can survive.
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot all ROC curves together with the chance diagonal
plt.figure()
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label=f'Class {i} ROC curve (area = {roc_auc[i]:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) - Decision Tree (max_depth=20)')
plt.legend(loc="lower right")
plt.show()

Cross-validation

In [None]:
# Cross-validate the depth-20 tree, reusing the fold splitter `cv` created earlier.
scores = cross_val_score(
    estimator=Decision_tree, X=X, y=y, scoring='accuracy', cv=cv
)
print(f'Cross-validation scores (Decision Tree max_depth=20): {scores}')
print(f'Srednia dokladnosc cross-validation: {scores.mean():.2f}')