### Loading the data

In [1]:
'''Reading the tabular data'''
import pickle
import gzip
import shutil
import os
import yaml
import warnings
import pandas as pd
import numpy as np


def load_pickle_from_parts(parts_dir):
    """Reassemble the split gzip archive found in ``parts_dir`` and unpickle it.

    The archive is expected as consecutively numbered files
    ``features_20s_part_000``, ``features_20s_part_001``, ...; numbering stops
    at the first missing index. The parts are concatenated into a temporary
    ``features_20s.gz`` inside ``parts_dir``, which is always deleted again,
    even when decompression or unpickling fails.

    Parameters
    ----------
    parts_dir : str
        Directory holding the part files. It must be writable because the
        combined archive is created there.

    Returns
    -------
    object
        The object stored in the pickled archive.

    Raises
    ------
    FileNotFoundError
        If ``features_20s_part_000`` does not exist in ``parts_dir``.
    """
    # Discover the parts first so a missing data set fails with a clear message
    # instead of an opaque error from decompressing an empty archive.
    part_paths = []
    while True:
        part_path = os.path.join(parts_dir, f'features_20s_part_{len(part_paths):03d}')
        if not os.path.exists(part_path):
            break
        part_paths.append(part_path)
    if not part_paths:
        raise FileNotFoundError(
            f"No 'features_20s_part_XXX' files found in {parts_dir!r}"
        )

    combined_path = os.path.join(parts_dir, 'features_20s.gz')
    try:
        # Combine the parts into a single compressed file
        with open(combined_path, 'wb') as combined_file:
            for part_path in part_paths:
                with open(part_path, 'rb') as part_file:
                    shutil.copyfileobj(part_file, combined_file)

        # Decompress the combined file and load the pickle data.
        # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
        with gzip.open(combined_path, 'rb') as f_in:
            return pickle.load(f_in)
    finally:
        # Never leave the temporary archive behind, whatever happened above
        if os.path.exists(combined_path):
            os.remove(combined_path)

# Load the feature dictionary from the `data` folder that sits next to the
# current working directory (i.e. inside its parent directory)
current_dir = os.getcwd()
parent_dir = os.path.split(current_dir)[0]
parts_dir = os.path.join(parent_dir, 'data')
data_dict = load_pickle_from_parts(parts_dir)


In [2]:
# --- Pipeline configuration ---

# Devices whose features take part in the pipeline
devices = [
    'corsano_wrist',
    'cosinuss_ear',
    'sensomative_back',
    'sensomative_bottom',
    'vivalink_patch',
    'zurichmove_wheel',
]
n_devices = len(devices)

# `parameters.yaml` is read from the parent of the current working directory
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
yaml_file_path = os.path.join(parent_dir, 'parameters.yaml')
with open(yaml_file_path, 'r') as f:
    params = yaml.safe_load(f)

# Unpack the settings used further down
seed_number, upsample_freq, activities_label_mapping = (
    params[name]
    for name in ('seed_number', 'upsample_freq', 'activities_label_mapping')
)

# Silence warnings that are expected while assembling and casting the table
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning, message='overflow encountered in cast')


# Convert the dict data to a single dataframe: one wide frame per subject
# (device features side by side, plus the label), stacked over all subjects.
# The device frames are selected through `devices`, so that list is the single
# source of truth for what enters the pipeline.
dfs = []
subjects = []
for i_subject, subject in enumerate(data_dict.keys()):
    subject_data = data_dict[subject]
    frames = [subject_data[device] for device in devices] + [subject_data['label']]
    df = pd.concat(frames, axis=1)
    df['subject'] = i_subject  # integer id = position of the subject in data_dict
    subjects.append(subject)
    dfs.append(df)
data_df = pd.concat(dfs)

# Cast everything to float32, then restore 'label' and 'subject' to integers
data_df = data_df.astype('float32')
data_df[['label', 'subject']] = data_df[['label', 'subject']].astype(int)

# Treat +/-inf as missing and drop every column that contains a missing value
data_df = data_df.replace([np.inf, -np.inf], np.nan).dropna(axis=1, how='any')

# For each device (in `devices` order) list the columns of `data_df` whose
# name contains the device name
device_columns = [
    [column for column in data_df.columns if device in column]
    for device in devices
]

# Optional cudf conversion of the pandas frame (currently disabled):
# data_df = cudf.DataFrame.from_pandas(data_df)

# --- Class-imbalance report ---
feature_frame = data_df.drop(columns=['label', 'subject'])
X = np.array(feature_frame)
y = np.array(data_df['label'])
subjects = np.array(data_df['subject'])

# Distinct class labels and the number of rows carrying each of them
classes, counts = np.unique(y, return_counts=True)

# One row per class: (label, activity name, count); the activity names are the
# keys of `activities_label_mapping`, taken in the mapping's own order
activity_names = list(activities_label_mapping.keys())
report_table = np.vstack((classes, activity_names, counts)).T

print("Class | Activity         | Count")
print("--------------------------------")
for class_id, activity, n_rows in report_table:
    print(f"{class_id:<5} | {activity:<16} | {n_rows:<5}")


Class | Activity         | Count
--------------------------------
0     | calmness         | 3646 
1     | selfpropulsion   | 1760 
2     | armraises        | 1022 
3     | transfer         | 2018 
4     | usingphone       | 1786 
5     | talking          | 2524 
6     | washhands        | 2246 
7     | eating           | 1618 
8     | assistedprop     | 1534 
9     | usingcomputer    | 2494 
10    | changingclothes  | 1126 
11    | pressurerelief   | 1398 


### Performing nested cross-validation (CV splits are made by subject)

In [8]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif, VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score

# Percentile-based clipping step that can be placed inside an sklearn Pipeline
class ClippingTransformer(BaseEstimator, TransformerMixin):
    """Clip each column to percentile bounds learned during ``fit``.

    Parameters
    ----------
    lower_percentile : float, default 1
        Percentile used as the per-column lower bound.
    upper_percentile : float, default 99
        Percentile used as the per-column upper bound.
    """

    def __init__(self, lower_percentile=1, upper_percentile=99):
        self.lower_percentile = lower_percentile
        self.upper_percentile = upper_percentile

    def fit(self, X, y=None):
        """Store the lower/upper percentile of ``X`` along axis 0; ``y`` is ignored."""
        lower_q, upper_q = self.lower_percentile, self.upper_percentile
        self.lower_bounds = np.percentile(X, lower_q, axis=0)
        self.upper_bounds = np.percentile(X, upper_q, axis=0)
        return self

    def transform(self, X, y=None):
        """Return ``X`` clipped to the bounds stored by ``fit``; ``y`` is ignored."""
        bounds = (self.lower_bounds, self.upper_bounds)
        return np.clip(X, *bounds)

# Define the subject-based k-fold splitting function
def subject_kfold(subjects, subject_col, n_test_subjects, seed=None):
    """Create k-fold splits in which every subject is held out exactly once.

    ``len(subjects) // n_test_subjects`` folds are drawn by sampling
    ``n_test_subjects`` subjects without replacement per fold; subjects left
    over after the last draw are added to the last fold.

    Parameters
    ----------
    subjects : sequence
        Unique subject identifiers to distribute over the folds.
    subject_col : array-like
        Subject identifier of every row; the returned indices are positions
        within this array.
    n_test_subjects : int
        Number of subjects sampled into each held-out fold.
    seed : int or None, default None
        When given, a private ``random.Random(seed)`` is used so the folds are
        reproducible regardless of global RNG state. When ``None`` the global
        ``random`` module is used (the original behaviour).

    Returns
    -------
    splits : list of (ndarray, ndarray)
        ``(train_indices, test_indices)`` for every fold.
    test_folds : list of list
        The subjects held out in every fold, in the same order as ``splits``.

    Raises
    ------
    ValueError
        If ``n_test_subjects`` is smaller than 1 or larger than
        ``len(subjects)`` (no fold could be built).
    """
    subjects = list(subjects)
    if n_test_subjects < 1:
        raise ValueError(
            f"n_test_subjects must be at least 1, got {n_test_subjects}"
        )
    if n_test_subjects > len(subjects):
        raise ValueError(
            f"n_test_subjects ({n_test_subjects}) exceeds the number of "
            f"subjects ({len(subjects)})"
        )

    rng = random if seed is None else random.Random(seed)

    # Draw the held-out subjects fold by fold without replacement
    num_folds = len(subjects) // n_test_subjects
    remaining = list(subjects)
    test_folds = []
    for _ in range(num_folds):
        test_fold = rng.sample(remaining, n_test_subjects)
        test_folds.append(test_fold)
        remaining = [elem for elem in remaining if elem not in test_fold]
    # Subjects that did not fill a whole fold join the last one
    test_folds[-1].extend(remaining)

    # Translate subject folds into row positions of `subject_col`
    splits = []
    for test_fold in test_folds:
        train_fold = [elem for elem in subjects if elem not in test_fold]
        test_indices = np.where(np.isin(subject_col, test_fold))[0]
        train_indices = np.where(np.isin(subject_col, train_fold))[0]
        splits.append((train_indices, test_indices))
    return splits, test_folds

# Separate `data_df` into features, labels, and groups (the subject of every row)
X = data_df.drop(columns=['subject', 'label'])
y = data_df['label']
subject_col = data_df['subject']
# Sorted so the fold assignment does not depend on set iteration order
subjects = sorted(set(subject_col))

# Handle missing values: fill with the column mean computed over all rows
X = X.fillna(X.mean())

# Number of subjects to be used in the test and validation sets for each fold.
# At least one subject each, so that fewer than five subjects do not yield a
# fold size of zero.
n_test_subjects = max(1, len(subjects) // 5)
n_val_subjects = max(1, (len(subjects) - n_test_subjects) // 5)

# Model pipeline (GPU): drop constant features -> clip to the 1st/99th
# percentile -> robust scaling -> mutual-information feature selection -> XGBoost
xgb_classifier = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss',
    tree_method="hist",
    device="cuda",
)
pipeline_steps = [
    ('remove_constant', VarianceThreshold(threshold=0.0)),
    ('clipper', ClippingTransformer(lower_percentile=1, upper_percentile=99)),
    ('scaler', RobustScaler()),
    ('selector', SelectKBest(score_func=mutual_info_classif)),
    ('classifier', xgb_classifier),
]
pipeline = Pipeline(pipeline_steps)

# CPU alternative (disabled): the same four preprocessing steps, with the
# classifier step written as
#     ('classifier', XGBClassifier(random_state=42, xgb_booster='gbtree'))

# Search space handed to RandomizedSearchCV; every entry currently holds a
# single candidate value. Keys follow the `<step name>__<parameter>` pattern.
param_grid = {
    # feature selection
    'selector__k': [500],
    # XGBoost
    'classifier__n_estimators': [150],
    'classifier__max_depth': [10],
    'classifier__learning_rate': [0.01],
    'classifier__subsample': [0.8],
    'classifier__colsample_bytree': [1.0],
    'classifier__gamma': [0.3],
}

outer_scores = []       # micro-averaged F1 of every outer test fold
all_predictions = []    # predictions pooled over all outer folds
all_true_labels = []    # matching ground-truth labels

# `subject_kfold` samples with the stdlib `random` module; seed it so the
# subject splits (and therefore the reported metrics) are reproducible
random.seed(seed_number)

# Perform nested cross-validation using subject-based splits
cv_outer, test_folds = subject_kfold(subjects, subject_col, n_test_subjects)

for i_test_fold in range(len(test_folds)):
    train_ix, test_ix = cv_outer[i_test_fold]
    X_train, X_test = X.iloc[train_ix], X.iloc[test_ix]
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
    subject_col_train = subject_col.iloc[train_ix]
    train_subjects = [elem for elem in subjects if elem not in test_folds[i_test_fold]]

    # Inner cross-validation: the indices are positions within X_train because
    # they are computed from `subject_col_train`
    cv_inner, valid_folds = subject_kfold(train_subjects, subject_col_train, n_val_subjects)
    inner_cv_splits = list(cv_inner)

    # RandomizedSearchCV for hyperparameter tuning
    random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_grid, cv=inner_cv_splits, n_iter=1, random_state=42, refit=True, scoring='f1_micro', n_jobs=-1)
    random_search.fit(X_train, y_train)

    # Evaluate the best model found by RandomizedSearchCV on the test set
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Store results (per-fold score uses the same metric as the inner search)
    outer_scores.append(f1_score(y_test, y_pred, average='micro'))
    all_predictions.extend(y_pred)
    all_true_labels.extend(y_test)

# --- Metrics over the predictions pooled from all outer folds ---
weighted = {'average': 'weighted'}
accuracy = accuracy_score(all_true_labels, all_predictions)
precision = precision_score(all_true_labels, all_predictions, **weighted)
recall = recall_score(all_true_labels, all_predictions, **weighted)
f1 = f1_score(all_true_labels, all_predictions, **weighted)
balanced_acc = balanced_accuracy_score(all_true_labels, all_predictions)

# Confusion matrix as a frame whose rows/columns are numbered 0..n-1
conf_matrix = confusion_matrix(all_true_labels, all_predictions)
class_positions = range(len(conf_matrix))
conf_matrix_df = pd.DataFrame(conf_matrix, index=class_positions, columns=class_positions)

# Per-class report: once as printable text, once as a frame for plotting
class_report = classification_report(all_true_labels, all_predictions)
report = classification_report(all_true_labels, all_predictions, output_dict=True)
report_df = pd.DataFrame(report).T

# Summary table of the headline metrics
metric_values = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-score': f1,
    'Balanced Accuracy': balanced_acc,
}
metrics_df = pd.DataFrame({
    'Metric': list(metric_values.keys()),
    'Value': list(metric_values.values()),
})

# Text output
print("Classification Metrics:")
print(metrics_df)

print("\nClassification Report:\n", class_report)

# Confusion-matrix heatmap (rows: true label, columns: predicted label)
cm_fig, cm_ax = plt.subplots(figsize=(10, 7))
sns.heatmap(conf_matrix_df, annot=True, fmt='d', cmap='Blues', cbar=False, ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
plt.show()

# Per-class bar charts, one panel per metric
metrics = ['precision', 'recall', 'f1-score']
# Keep only the per-class rows: the last three rows of the report are
# 'accuracy', 'macro avg' and 'weighted avg'
report_df = report_df.iloc[:-3]

fig, ax = plt.subplots(1, 3, figsize=(20, 5))

for panel, metric in zip(ax, metrics):
    metric_title = metric.capitalize()
    sns.barplot(x=report_df.index, y=report_df[metric], ax=panel)
    panel.set_title(f'{metric_title} by Class')
    panel.set_xlabel('Class')
    panel.set_ylabel(metric_title)
    panel.set_ylim(0, 1)

plt.tight_layout()
plt.show()
