### Load features



In [17]:
import pandas as pd
import pathlib
import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# Data loading
data_dir = pathlib.Path("../data")
featues = data_dir / "train/ml_feature_matrix.csv"
df = pd.read_csv(featues)

df.head()

Unnamed: 0,subject_id,start_date,measurement_date,target_ffp_status,prev_ffp_status,total_steps_sum,total_steps_mean,total_steps_std,days_exercised,interval_length,longest_active_streak,prop_days_exercised,visit_interval,cumulative_steps,group,sex,age,ethnicity,race,monthly_income,education,marital_status,living_situation
0,MDE001,2021-07-19,2021-08-23,frail,frail,358.0,9.944444,31.88994,9,36,2,0.25,1_to_2,358.0,control,m,83.0,non_hisp,black,3000.0,18.0,never_married,with_others
1,MDE001,2021-08-23,2021-09-15,no_frail,frail,639.0,26.625,87.482576,11,24,5,0.458333,2_to_3,997.0,control,m,83.0,non_hisp,black,3000.0,18.0,never_married,with_others
2,MDE001,2021-09-15,2021-10-07,frail,no_frail,235.0,10.217391,13.70785,10,23,6,0.434783,3_to_4,1232.0,control,m,83.0,non_hisp,black,3000.0,18.0,never_married,with_others
3,MDE001,2021-10-07,2022-01-19,frail,frail,1291.0,13.309278,29.51566,29,97,12,0.298969,4_to_5,2523.0,control,m,83.0,non_hisp,black,3000.0,18.0,never_married,with_others
4,MDE002,2021-07-21,2021-08-30,no_frail,no_frail,0.0,0.0,0.0,0,41,0,0.0,1_to_2,0.0,exercise,m,61.0,non_hisp,white,1033.0,3.0,never_married,alone


### Impute missing values


In [34]:
# Check missing values
total_rows = len(df)

missing_summary = df.isnull().sum().to_frame(name='missing_count')
missing_summary['missing_percent'] = (missing_summary['missing_count'] / total_rows * 100).round(2)
missing_summary = missing_summary[missing_summary['missing_count'] > 0].sort_values(by='missing_count', ascending=False)

print(missing_summary)

Empty DataFrame
Columns: [missing_count, missing_percent]
Index: []


In [32]:
# Fill missing total_steps_std with 0
df['total_steps_std'] = df['total_steps_std'].fillna(0)

# Fill missing monthly_income with median
median_income = df['monthly_income'].median()
df['monthly_income'] = df['monthly_income'].fillna(median_income)

# Fill missing education with mode
df['education'] = df['education'].fillna(df["education"].mode().iloc[0])

### Train


In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, cross_val_score

X = df.drop(['subject_id', 'start_date', 'measurement_date', 'target_ffp_status'], axis=1)
y = LabelEncoder().fit_transform(df['target_ffp_status'])

# Encode categorical features
for col in X.select_dtypes(include=['object']).columns:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# Stratified K-Fold CV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_macro_scores = cross_val_score(clf, X, y, cv=skf, scoring='f1_macro')
accuracy_scores = cross_val_score(clf, X, y, cv=skf, scoring='accuracy')

print("F1 Macro Scores:", f1_macro_scores)
print("Average F1 Macro:", f1_macro_scores.mean())
print("Accuracy Scores:", accuracy_scores)
print("Average Accuracy:", accuracy_scores.mean())

# Per-class performance tracking
reports = []

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    report = classification_report(y_test, y_pred, output_dict=True)
    df_report = pd.DataFrame(report).T.loc[['0', '1'], ['precision', 'recall', 'f1-score']]
    df_report['fold'] = fold
    reports.append(df_report)

# Combine and compute averages
final_report_df = pd.concat(reports).reset_index().rename(columns={'index': 'class'})
avg_metrics = final_report_df.groupby('class')[['precision', 'recall', 'f1-score']].mean().round(3)

print("\nAverage Per-Class Metrics Across Folds:")
print(avg_metrics)

F1 Macro Scores: [0.71102151 0.62591093 0.71428571 0.81524927 0.74187884]
Average F1 Macro: 0.7216692517573984
Accuracy Scores: [0.76744186 0.73809524 0.78571429 0.85714286 0.83333333]
Average Accuracy: 0.7963455149501661

Average Per-Class Metrics Across Folds:
       precision  recall  f1-score
class                             
0          0.735   0.491     0.578
1          0.818   0.920     0.865
