# Problem 1: Frailty

In [13]:
# importing libraries
import pandas as pd
import numpy as np
import os

## Data Ingestion

In [14]:
fname = 'frailty_data.csv'
df = pd.read_csv(fname)
df.head(10)

Unnamed: 0,Height_in,Weight_lb,Age_yr,Grip_kg,Frailty
0,65.8,112,30,30,N
1,71.5,136,19,31,N
2,69.4,153,45,29,N
3,68.2,142,22,28,Y
4,67.8,144,29,24,Y
5,68.7,123,50,26,N
6,69.8,141,51,22,Y
7,70.1,136,23,20,Y
8,67.9,112,17,19,N
9,66.8,120,39,31,N


## Data Preprocessing

In [15]:
# unit standardization
df['Height_m'] = df['Height_in'] * 0.0254
df['Weight_kg'] = df['Weight_lb'] * 0.45359237
df[['Height_in','Height_m','Weight_lb','Weight_kg']].head(10)

Unnamed: 0,Height_in,Height_m,Weight_lb,Weight_kg
0,65.8,1.67132,112,50.802345
1,71.5,1.8161,136,61.688562
2,69.4,1.76276,153,69.399633
3,68.2,1.73228,142,64.410117
4,67.8,1.72212,144,65.317301
5,68.7,1.74498,123,55.791862
6,69.8,1.77292,141,63.956524
7,70.1,1.78054,136,61.688562
8,67.9,1.72466,112,50.802345
9,66.8,1.69672,120,54.431084


In [16]:
# feature engineering
df['BMI'] = (df['Weight_kg'] / (df['Height_m'] ** 2)).round(2)
df['AgeGroup'] = df['Age_yr'].apply(lambda a: '<30' if a < 30 else ('30-45' if a <= 45 else ('46-60' if a <= 60 else '>60')))
df[['Age_yr','AgeGroup','BMI']].head(10)

Unnamed: 0,Age_yr,AgeGroup,BMI
0,30,30-45,18.19
1,19,<30,18.7
2,45,30-45,22.33
3,22,<30,21.46
4,29,<30,22.02
5,50,46-60,18.32
6,51,46-60,20.35
7,23,<30,19.46
8,17,<30,17.08
9,39,30-45,18.91


In [17]:
# categorical to numeric encoding
df['Frailty_binary'] = df['Frailty'].map({'Y': 1, 'N': 0}).astype('int8')
age_dummies = pd.get_dummies(df['AgeGroup'], prefix='AgeGroup').astype('Int8')
expected_cols = ['AgeGroup_<30', 'AgeGroup_30-45', 'AgeGroup_46-60', 'AgeGroup_>60']
for col in expected_cols:
    if col not in age_dummies.columns:
        age_dummies[col] = pd.Series(pd.NA, index=df.index, dtype='object')
age_dummies = age_dummies[expected_cols]
df = df.drop(columns=[c for c in df.columns if c in expected_cols], errors='ignore')
df = pd.concat([df, age_dummies], axis=1)
df.head(10)

Unnamed: 0,Height_in,Weight_lb,Age_yr,Grip_kg,Frailty,Height_m,Weight_kg,BMI,AgeGroup,Frailty_binary,AgeGroup_<30,AgeGroup_30-45,AgeGroup_46-60,AgeGroup_>60
0,65.8,112,30,30,N,1.67132,50.802345,18.19,30-45,0,0,1,0,
1,71.5,136,19,31,N,1.8161,61.688562,18.7,<30,0,1,0,0,
2,69.4,153,45,29,N,1.76276,69.399633,22.33,30-45,0,0,1,0,
3,68.2,142,22,28,Y,1.73228,64.410117,21.46,<30,1,1,0,0,
4,67.8,144,29,24,Y,1.72212,65.317301,22.02,<30,1,1,0,0,
5,68.7,123,50,26,N,1.74498,55.791862,18.32,46-60,0,0,0,1,
6,69.8,141,51,22,Y,1.77292,63.956524,20.35,46-60,1,0,0,1,
7,70.1,136,23,20,Y,1.78054,61.688562,19.46,<30,1,1,0,0,
8,67.9,112,17,19,N,1.72466,50.802345,17.08,<30,0,1,0,0,
9,66.8,120,39,31,N,1.69672,54.431084,18.91,30-45,0,0,1,0,


## Data Analysis


In [18]:
# EDA and reporting
numeric_cols = df.select_dtypes(include=[np.number]).columns
summary_tbl = df[numeric_cols].agg(['mean','median','std']).T
corr_val = df['Grip_kg'].corr(df['Frailty_binary'])
display(summary_tbl)
print('Correlation:', round(corr_val, 4))

Unnamed: 0,mean,median,std
Height_in,68.6,68.45,1.670662
Weight_lb,131.9,136.0,14.231811
Age_yr,32.5,29.5,12.860361
Grip_kg,26.0,27.0,4.521553
Height_m,1.74244,1.73863,0.042435
Weight_kg,59.828834,61.688562,6.455441
BMI,19.682,19.185,1.780972
Frailty_binary,0.4,0.0,0.516398
AgeGroup_<30,0.5,0.5,0.527046
AgeGroup_30-45,0.3,0.0,0.483046


Correlation: -0.4759


In [19]:
# outputting processed csv and findings
df.to_csv('frailty_data_processed.csv', index=False)
with open('findings.md', 'w', encoding='utf-8') as f:
    f.write('# Problem 1: Findings\n\n')
    f.write('## Summary Statistics\n\n')
    f.write(summary_tbl.to_markdown())
    f.write('\n\n## Grip vs Frailty Correlation\n\n')
    f.write(f'Pearson correlation between Grip_kg and Frailty_binary is {corr_val:.4f}\n')
print('Files saved successfully.')

Files saved successfully.
