In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Sensor channels used by the later processing cells, written as X/Y/Z
# triplets (one row per channel group).
columns_to_analyze = [
    'AL-X', 'AL-Y', 'AL-Z',
    'AG-X', 'AG-Y', 'AG-Z',
    'Gravity-X', 'Gravity-Y', 'Gravity-Z',
    'RR-X', 'RR-Y', 'RR-Z',
    'RV-X', 'RV-Y', 'RV-Z',
]

# Read the recording from a CSV file; the path is relative to the notebook's
# working directory.
dataset = pd.read_csv("Gait1.csv")

# Last expression of the cell: renders the loaded frame as the cell output.
dataset

In [None]:
from scipy.ndimage import gaussian_filter

# Smooth each selected sensor channel with a Gaussian kernel, store the result
# in `denoised_dataset`, and plot every channel before and after smoothing.
sigma = 1  # standard deviation of the Gaussian kernel passed to gaussian_filter

dataset_to_denoise = dataset[columns_to_analyze]

# Filter an independent float copy of the data. gaussian_filter returns an
# array with the same dtype as its input, so filtering an integer-typed column
# directly would truncate the smoothed values; casting to float guards against
# that. copy=True guarantees `dataset` itself is never modified.
denoised_dataset_array = dataset_to_denoise.to_numpy(dtype=float, copy=True)
for i in range(denoised_dataset_array.shape[1]):
    # Each channel is filtered on its own (1-D), so channels never mix.
    denoised_dataset_array[:, i] = gaussian_filter(denoised_dataset_array[:, i], sigma=sigma)

# Keep every non-sensor column untouched and swap in the smoothed channels.
denoised_dataset = dataset.copy()
denoised_dataset[columns_to_analyze] = denoised_dataset_array

# One figure per channel: raw signal versus smoothed signal.
for column_name in columns_to_analyze:
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(dataset[column_name], label=f"Noisy {column_name}")
    ax.plot(denoised_dataset[column_name], label=f"Denoised {column_name}")
    ax.set_title(f"Noisy vs. Denoised {column_name}")
    ax.set_xlabel("Time")
    ax.set_ylabel("Value")
    ax.legend()
    ax.grid(True)
    plt.show()
    # Release the figure explicitly so the loop does not accumulate open figures.
    plt.close(fig)

# Preview the result as the cell's final expression. (A bare expression placed
# before the plotting loop is not rendered by the notebook.)
denoised_dataset.head()

In [None]:
from sklearn.ensemble import IsolationForest

# Detect and remove outlier samples with an Isolation Forest.
#
# The detector is fitted on the Gaussian-denoised data and the cleaned frame is
# derived from it as well, so the smoothing step actually feeds the model.
# NOTE(review): the original cell used the raw `dataset` here, which silently
# discarded the denoising result - confirm that using `denoised_dataset` is the
# intended pipeline.
contamination = 0.05  # expected outlier fraction; also defines the score cut-off

dataset_to_analyze = denoised_dataset[columns_to_analyze]

# A fixed random_state makes the set of removed rows reproducible across runs.
isolation_forest = IsolationForest(contamination=contamination, random_state=42)
isolation_forest.fit(dataset_to_analyze)

# score_samples: the lower the score, the more abnormal the sample.
outlier_scores = isolation_forest.score_samples(dataset_to_analyze)

# Cut-off at the `contamination` quantile of the scores.
threshold = np.percentile(outlier_scores, 100 * contamination)

plt.figure(figsize=(8, 4))
plt.plot(outlier_scores, marker='o', linestyle='', label='Outlier Score')
plt.axhline(y=threshold, color='r', linestyle='--', label='Threshold')
plt.title("Isolation Forest Outlier Detection")
plt.xlabel("Sample")
plt.ylabel("Outlier Score")
plt.legend()
plt.grid(True)
plt.show()

# np.where returns a 1-tuple of positional indices; element 0 is the index array.
potential_outliers_indices = np.where(outlier_scores < threshold)
print("Indices of potential outliers:", potential_outliers_indices)

# Translate positions into index labels before dropping the flagged rows.
dataset_cleaned = denoised_dataset.drop(denoised_dataset.index[potential_outliers_indices[0]])

In [None]:
# Separate the features from the classification target.
# X: every column except the 'User' label.
X = dataset_cleaned.drop(columns=['User'])

# y: the 'User' label, selected by name and kept as a one-column DataFrame so
# later cells can keep using y.apply(...) and y.values.ravel(). Selecting by
# dtype (select_dtypes(include=[int])) would also pull in any other
# integer-typed column and corrupt the target.
y = dataset_cleaned[['User']]

# Show the distinct user labels present after outlier removal.
y['User'].unique()

In [None]:
from sklearn import preprocessing
# Encode the target labels as consecutive integers starting at 0.
le = preprocessing.LabelEncoder()

# The encoder is fitted and applied column by column; `y` stays a DataFrame
# with the same index and column names.
y = y.apply(lambda label_column: le.fit_transform(label_column))

# Render the encoded target as the cell output.
y

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing. The fixed random_state makes the
# split - and every metric computed from it - reproducible across kernel
# restarts.
# NOTE(review): consider stratify=y if the per-user sample counts are
# imbalanced; not enabled here because it raises when a class has fewer than
# two samples.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features. The scaler learns its statistics from the training
# split only and is then applied unchanged to both splits.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import time
# Train an XGBoost classifier with default hyper-parameters and measure the
# wall-clock time of construction plus fitting.
time_start = time.perf_counter()
xgb = XGBClassifier()
# The target is a one-column DataFrame; flatten it to the 1-D array fit() expects.
xgb.fit(X_train, y_train.to_numpy().ravel())
time_elapsed = time.perf_counter() - time_start

# Elapsed seconds, rendered as the cell output.
time_elapsed

In [None]:
# Evaluate the fitted classifier on the held-out test split.
y_pred = xgb.predict(X_test)

# Rows are true labels, columns are predicted labels.
print(confusion_matrix(y_test, y_pred))

# Compute the per-class precision/recall/F1 report once and reuse it for
# printing instead of recomputing it.
report = classification_report(y_test, y_pred)
print(report)

# Overall accuracy as a percentage, rendered as the cell output.
accuracy = accuracy_score(y_test, y_pred) * 100
accuracy

In [None]:
from sklearn.model_selection import cross_validate
# 5-fold cross-validation of the classifier on the training split; training
# scores are kept as well so they can be compared with the validation scores.
cv = cross_validate(
    estimator=xgb,
    X=X_train,
    y=y_train.to_numpy().ravel(),
    cv=5,
    return_train_score=True,
)

# Per-fold validation scores followed by their mean.
fold_test_scores = cv['test_score']
print(fold_test_scores)
print(fold_test_scores.mean())

In [None]:
import matplotlib.pyplot as plt
# Grouped bar chart comparing training and validation accuracy for each of the
# five cross-validation folds.
fig, ax = plt.subplots(figsize=(12, 6))

labels = ["1st Fold", "2nd Fold", "3rd Fold", "4th Fold", "5th Fold"]
X_axis = np.arange(len(labels))

# Fix the y-range so the bars of all folds share the same scale.
ax.set_ylim(0.40000, 1)

# Two bars per fold, shifted left/right of the tick position.
ax.bar(X_axis - 0.2, cv['train_score'], 0.4, color='blue', label='Training')
ax.bar(X_axis + 0.2, cv['test_score'], 0.4, color='red', label='Validation')

ax.set_title("Accuracy scores in 5 Folds", fontsize=30)
ax.set_xticks(X_axis)
ax.set_xticklabels(labels)
ax.set_xlabel("Extreme Gradient Boost - Gaussian", fontsize=14)
ax.set_ylabel("Accuracy", fontsize=14)
ax.legend()
ax.grid(True)
plt.show()