In [19]:
# import pandas as pd
import numpy as np
# Load raw data
df = pd.read_csv('../data_raw/raw_data.csv')

# Unit standardization: Convert height and weight
df['Height_m'] = df['Height'] * 0.0254  # inches to meters
df['Weight_kg'] = df['Weight'] * 0.45359237  # pounds to kg

# BMI calculation: Vectorized calculation
df['BMI'] = (df['Weight_kg'] / (df['Height_m'] ** 2)).round(2)

# Age Grouping: Using pd.cut for efficient binning
age_bins = [0, 30, 45, 60, float('inf')]
age_labels = ['<30', '30-45', '46-60', '>60']
df['AgeGroup'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels)
print(pd.cut(df['Age'], bins=age_bins, labels=age_labels))
# Categorical to numeric encoding
df['Frailty_binary'] = df['Frailty'].map({'Y': 1, 'N': 0}).astype('int8')

# One-hot encoding AgeGroup directly
df = pd.concat([df, pd.get_dummies(df['AgeGroup'], prefix='AgeGroup', dtype=int)], axis=1)

# Drop unnecessary columns
df.drop(columns=['Height', 'Weight', 'Frailty', 'AgeGroup'], inplace=True)

# Save cleaned data
df.to_csv('../data_clean/clean_data.csv', index=False)

# Finishing self note
print(f"Cleaned data saved!")


0      <30
1      <30
2    30-45
3      <30
4      <30
5    46-60
6    46-60
7      <30
8      <30
9    30-45
Name: Age, dtype: category
Categories (4, object): ['<30' < '30-45' < '46-60' < '>60']
Cleaned data saved!


In [6]:
# >> This script handles Stage III: Data Analysis. <<

import pandas as pd

# Load cleaned data
df = pd.read_csv('../data_clean/clean_data.csv')

# Get numeric columns but exclude AgeGroup and Frailty_binary columns
numeric_cols = [col for col in df.select_dtypes(include=['number']).columns if not col.startswith('AgeGroup_') and col != 'Frailty_binary']
print(numeric_cols)
# Calculate mean, median, and standard deviation
summary = df[numeric_cols].agg(['mean', 'median', 'std']).transpose()
print(summary)
# Correlation between grip strength and binary frailty
correlation = df['Grip_strength'].corr(df['Frailty_binary'])

# Save the findings to markdown
with open('../reports/findings.md', 'w') as f:
    f.write('# Summary Statistics\n\n')
    f.write(summary.to_markdown())
    f.write('\n\n# Correlation between Grip Strength and Frailty\n\n')
    f.write(f'The correlation coefficient is {correlation:.4f}.\n')

# Finishing self note
print("Findings saved!")

['Age', 'Grip_strength', 'Height_m', 'Weight_kg', 'BMI']
                    mean     median        std
Age            32.500000  29.500000  12.860361
Grip_strength  26.000000  27.000000   4.521553
Height_m        1.742440   1.738630   0.042435
Weight_kg      59.828834  61.688562   6.455441
BMI            19.682000  19.185000   1.780972
Findings saved!
