### Load features



In [None]:
import datajoint as dj
from activity_tracker.pipeline import subject, models, ingestion
from sklearn.preprocessing import LabelEncoder

import pandas as pd
# Load the feature matrix and discard the columns this notebook does not use.
# NOTE(review): the meaning of the argument `1` is defined by
# get_feature_matrix, which is not visible here — confirm and name it.
# A chained .drop() (instead of inplace=True) leaves the object returned by
# get_feature_matrix untouched and keeps this cell a single expression.
df = (
    models.Feature.get_feature_matrix(1)
    .drop(columns=['group', 'race', 'total_steps_std', 'interval_length'])
)
df.head()

### Impute missing values


In [None]:
# Imputation values:
#   monthly_income -> median of the observed incomes
#   education      -> most frequent observed value (first one if several tie)
# NOTE(review): both statistics are computed over every row of df, i.e. also
# over rows that later end up in cross-validation test splits.
median_income = df['monthly_income'].median()
education_mode = df['education'].mode().iloc[0]

# Replace the gaps in both columns in one step; column order is unchanged.
df = df.assign(
    monthly_income=df['monthly_income'].fillna(median_income),
    education=df['education'].fillna(education_mode),
)

In [None]:
# Summarise the remaining gaps: one row per column that still contains NaNs,
# with the absolute count and the share of rows (in percent, 2 decimals),
# ordered from most to least affected.
total_rows = len(df)
missing_summary = (
    df.isnull()
    .sum()
    .to_frame(name='missing_count')
    .assign(missing_percent=lambda s: (s['missing_count'] / total_rows * 100).round(2))
    .loc[lambda s: s['missing_count'] > 0]
    .sort_values(by='missing_count', ascending=False)
)

print(missing_summary)

## Train


### RandomForest


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupShuffleSplit, cross_val_score, cross_validate

# Prepare features and target.
# Identifier/date columns and the label itself are excluded from the features.
X = df.drop(['subject_id', 'start_date', 'measurement_date', 'target_ffp_status'], axis=1)
subject_groups = df['subject_id']

# LabelEncoder assigns integer codes following the sorted order of the labels;
# class_names[i] is the original label behind code i, so report rows are named
# from the data instead of from a hard-coded mapping.
le = LabelEncoder()
y = le.fit_transform(df['target_ffp_status'])
class_names = [str(label) for label in le.classes_]
class_ids = list(range(len(class_names)))

# Encode categorical (object-dtype) features as integer codes.
# Missing values are stringified first and therefore become their own category.
for col in X.select_dtypes(include=['object']).columns:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# Model
clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# Subject-level splitting to prevent data leakage: all rows of one subject land
# on the same side of a split. GroupShuffleSplit draws 5 independent random
# 80/20 splits; the fixed random_state makes every call to split() identical.
gss = GroupShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# One cross-validation pass yields both metrics (previously the model was
# refit on the same splits once per metric).
cv_scores = cross_validate(
    clf, X, y, groups=subject_groups, cv=gss, scoring=['f1_macro', 'accuracy']
)
f1_macro_scores = cv_scores['test_f1_macro']
accuracy_scores = cv_scores['test_accuracy']

print("Random Forest: 5-fold cross-validation")
print("-"*50)
print("F1 score:", f1_macro_scores.mean().round(3))
print("Accuracy:", accuracy_scores.mean().round(3))

# Per-class performance tracking with subject-level splitting
metric_cols = ['precision', 'recall', 'f1-score']
reports = []

for fold, (train_idx, test_idx) in enumerate(gss.split(X, y, groups=subject_groups), 1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Verify no subject overlap — fail loudly instead of silently computing it.
    train_subjects = set(subject_groups.iloc[train_idx])
    test_subjects = set(subject_groups.iloc[test_idx])
    overlap = train_subjects & test_subjects
    assert not overlap, (
        f"fold {fold}: {len(overlap)} subject(s) appear in both train and test"
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Explicit labels/target_names keep one row per class even when a test
    # split happens to contain no sample of some class; zero_division=0 scores
    # such a class as 0 without emitting a warning.
    report = classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=class_names,
        output_dict=True,
        zero_division=0,
    )

    df_report = pd.DataFrame({name: report[name] for name in class_names}).T[metric_cols]
    df_report['fold'] = fold
    reports.append(df_report)

final_report_df = pd.concat(reports).reset_index().rename(columns={'index': 'class'})

# Average each class over the folds, append the unweighted mean across
# classes, and round once at the end so every row has the same precision.
avg_metrics = final_report_df.groupby('class')[metric_cols].mean()
avg_metrics.loc['average'] = avg_metrics.mean()
avg_metrics = avg_metrics.round(3)
print(avg_metrics)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Refit the classifier on the complete data set (alternative: keep the model
# from the last fold) and read its impurity-based feature importances.
clf.fit(X, y)
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]  # most important feature first

print("Feature ranking:")
for rank, idx in enumerate(indices, start=1):
    print(f"{rank}. {X.columns[idx]} ({importances[idx]:.4f})")

# Bar chart of the importances in ranked order, one bar per feature.
positions = range(X.shape[1])
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_title("Feature Importance (from random forest)")
ax.bar(positions, importances[indices], align="center")
ax.set_xticks(positions)
ax.set_xticklabels(X.columns[indices], rotation=90)
fig.tight_layout()
sns.despine(fig=fig)
plt.show()

In [None]:
# Pairwise correlation between all (already integer-encoded) feature columns,
# shown as an annotated heatmap.
corr = X.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", square=True, ax=ax)
ax.set_title("Feature Correlation Matrix")
fig.tight_layout()
plt.show()

### XGBoost

In [None]:
# Display the feature frame as it stands after column dropping and imputation.
# NOTE(review): looks like a leftover inspection cell — consider df.head() or removing it.
df

In [None]:
import xgboost as xgb
from sklearn.model_selection import cross_validate

# NOTE(review): scale_pos_weight=1 applies no class re-weighting, whereas the
# random forest cell uses class_weight='balanced' — confirm this is intended.
clf = xgb.XGBClassifier(
    n_estimators=50,
    max_depth=3,
    random_state=42,
    scale_pos_weight=1,
    eval_metric='logloss',
    n_jobs=-1,
)

subject_groups = df['subject_id']

# Row names for the per-class report come from the fitted target encoder:
# class_names[i] is the original label behind integer code i.
class_names = [str(label) for label in le.classes_]
class_ids = list(range(len(class_names)))

# Subject-level splitting to prevent data leakage: all rows of one subject land
# on the same side of a split. The fixed random_state makes every call to
# split() produce the same 5 random 80/20 splits.
gss = GroupShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

# One cross-validation pass yields both metrics (previously the model was
# refit on the same splits once per metric).
cv_scores = cross_validate(
    clf, X, y, groups=subject_groups, cv=gss, scoring=['f1_macro', 'accuracy']
)
f1_macro_scores = cv_scores['test_f1_macro']
accuracy_scores = cv_scores['test_accuracy']

print("XGBoost - 5-fold cross-validation")
print("-"*50)
print("F1 score:", f1_macro_scores.mean().round(3))
print("Accuracy:", accuracy_scores.mean().round(3))

# Per-class performance tracking with subject-level splitting
metric_cols = ['precision', 'recall', 'f1-score']
reports = []
for fold, (train_idx, test_idx) in enumerate(gss.split(X, y, groups=subject_groups), 1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Verify no subject overlap — fail loudly instead of silently computing it.
    train_subjects = set(subject_groups.iloc[train_idx])
    test_subjects = set(subject_groups.iloc[test_idx])
    overlap = train_subjects & test_subjects
    assert not overlap, (
        f"fold {fold}: {len(overlap)} subject(s) appear in both train and test"
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Explicit labels/target_names keep one row per class even when a test
    # split happens to contain no sample of some class; zero_division=0 scores
    # such a class as 0 without emitting a warning.
    report = classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=class_names,
        output_dict=True,
        zero_division=0,
    )
    df_report = pd.DataFrame({name: report[name] for name in class_names}).T[metric_cols]
    df_report['fold'] = fold
    reports.append(df_report)

final_report_df = pd.concat(reports).reset_index().rename(columns={'index': 'class'})

# Average each class over the folds, append the unweighted mean across
# classes, and round once at the end so every row has the same precision.
avg_metrics = final_report_df.groupby('class')[metric_cols].mean()
avg_metrics.loc['average'] = avg_metrics.mean()
avg_metrics = avg_metrics.round(3)
print(avg_metrics)

In [None]:
from sklearn.feature_selection import SelectFromModel

# Fit an XGBoost model on the full data set; its feature importances drive the
# selection below. `use_label_encoder` is not passed: it is deprecated in
# XGBoost, y is already integer-encoded, and the earlier XGBoost cell omits it too.
clf = xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')
clf.fit(X, y)

# Automatic feature selection: keep the features whose importance is at least
# the median importance. prefit=True reuses the model fitted above.
selector = SelectFromModel(clf, threshold="median", prefit=True)
X_selected = selector.transform(X)
selected_features = X.columns[selector.get_support()]

In [None]:
# Column names retained by SelectFromModel (the entries of X.columns where get_support() is True).
selected_features