In [87]:
# Import needed libraries
import pandas as pd
import numpy as np

In [88]:
# The dataset was exported to CSV and uploaded to GitHub; read it from the raw URL
FRAILTY_CSV_URL = 'https://raw.githubusercontent.com/RoobyDoobyDoo/CS5530-Assign1/refs/heads/main/Frailty/Frailty.csv'
frailty = pd.read_csv(FRAILTY_CSV_URL)

# Strip leading/trailing whitespace from the header names
frailty.columns = frailty.columns.str.strip()

# Apply the same whitespace cleanup to the values of every object-dtype (text) column
text_columns = [name for name in frailty.columns if frailty[name].dtype == 'object']
for name in text_columns:
    frailty[name] = frailty[name].str.strip()

frailty

Unnamed: 0,Height,Weight,Age,Grip strength,Frailty
0,65.8,112,30,30,N
1,71.5,136,19,31,N
2,69.4,153,45,29,N
3,68.2,142,22,28,Y
4,67.8,144,29,24,Y
5,68.7,123,50,26,N
6,69.8,141,51,22,Y
7,70.1,136,23,20,Y
8,67.9,112,17,19,N
9,66.8,120,39,31,N


In [89]:
# Coerce the measurement columns to numeric dtypes.
# errors='coerce' turns any value that cannot be parsed into NaN instead of raising.
numcol = ['Height', 'Weight', 'Age', 'Grip strength']
frailty = frailty.assign(
    **{name: pd.to_numeric(frailty[name], errors='coerce') for name in numcol}
)
frailty.dtypes

Unnamed: 0,0
Height,float64
Weight,int64
Age,int64
Grip strength,int64
Frailty,object


In [90]:
# Make the measurement unit part of each column name
unit_labels = {
    'Height': 'Height(in)',
    'Weight': 'Weight(lb)',
    'Grip strength': 'Grip(kg)',
}
frailty = frailty.rename(columns=unit_labels)
frailty

Unnamed: 0,Height(in),Weight(lb),Age,Grip(kg),Frailty
0,65.8,112,30,30,N
1,71.5,136,19,31,N
2,69.4,153,45,29,N
3,68.2,142,22,28,Y
4,67.8,144,29,24,Y
5,68.7,123,50,26,N
6,69.8,141,51,22,Y
7,70.1,136,23,20,Y
8,67.9,112,17,19,N
9,66.8,120,39,31,N


# Unit Standardization

In [91]:
# Switch height and weight to metric units, then discard the imperial columns
METERS_PER_INCH = 0.0254
KG_PER_POUND = 0.45359237

# Heights are rounded to 2 decimals and weights to 1 decimal
height_m = (frailty['Height(in)'] * METERS_PER_INCH).round(2)
weight_kg = (frailty['Weight(lb)'] * KG_PER_POUND).round(1)

frailty = (
    frailty
    .assign(**{'Height(m)': height_m, 'Weight(kg)': weight_kg})
    .drop(columns=['Height(in)', 'Weight(lb)'])
)

frailty

Unnamed: 0,Age,Grip(kg),Frailty,Height(m),Weight(kg)
0,30,30,N,1.67,50.8
1,19,31,N,1.82,61.7
2,45,29,N,1.76,69.4
3,22,28,Y,1.73,64.4
4,29,24,Y,1.72,65.3
5,50,26,N,1.74,55.8
6,51,22,Y,1.77,64.0
7,23,20,Y,1.78,61.7
8,17,19,N,1.72,50.8
9,39,31,N,1.7,54.4


# Feature Engineering

In [92]:
# Calculate BMI and break age into labelled categories

# BMI = weight (kg) divided by height (m) squared, rounded to 2 decimals
frailty['BMI'] = (frailty['Weight(kg)'] / (frailty['Height(m)'] ** 2)).round(2)

# pd.cut assigns each age to a bin delimited by the given edges.
# right=False makes every bin half-open as [lower, upper), so the edges match the labels:
#   [0, 30) -> '<30', [30, 46) -> '30-45', [46, 61) -> '46-60', [61, inf) -> '>60'
# (With the default right=True and edges 29/46/61, age 46 was labelled '30-45'
#  and age 61 was labelled '46-60'.)
# np.inf as the last edge keeps ages above 100 in '>60' instead of turning them into NaN.
frailty['Age_Group'] = pd.cut(frailty['Age'],
                              bins=[0, 30, 46, 61, np.inf],
                              right=False,
                              labels=['<30', '30-45', '46-60', '>60']
                              )

# Weight(kg), Height(m) and Age are intentionally kept in case the last step needs them

frailty

Unnamed: 0,Age,Grip(kg),Frailty,Height(m),Weight(kg),BMI,Age_Group
0,30,30,N,1.67,50.8,18.22,30-45
1,19,31,N,1.82,61.7,18.63,<30
2,45,29,N,1.76,69.4,22.4,30-45
3,22,28,Y,1.73,64.4,21.52,<30
4,29,24,Y,1.72,65.3,22.07,<30
5,50,26,N,1.74,55.8,18.43,46-60
6,51,22,Y,1.77,64.0,20.43,46-60
7,23,20,Y,1.78,61.7,19.47,<30
8,17,19,N,1.72,50.8,17.17,<30
9,39,31,N,1.7,54.4,18.82,30-45


# Categorical --> Numerical Encoding

In [93]:
# Encode the Y/N frailty flag as 1/0; the nullable Int8 dtype keeps storage small
frailty_flag = frailty['Frailty'].map({'Y': 1, 'N': 0}).astype('Int8')

# One-hot encode the age groups. get_dummies returns booleans by default, so
# integer output is requested explicitly; the default prefix separator is "_"
age_dummies = pd.get_dummies(
    frailty['Age_Group'],
    prefix='AgeGroup',
    dtype=int,
).astype('Int8')

# Attach the encoded columns and drop the originals, which the last step does not need
frailty = (
    frailty
    .assign(Frailty_binary=frailty_flag)
    .join(age_dummies)
    .drop(columns=['Frailty', 'Age_Group'])
)

frailty

Unnamed: 0,Age,Grip(kg),Height(m),Weight(kg),BMI,Frailty_binary,AgeGroup_<30,AgeGroup_30-45,AgeGroup_46-60,AgeGroup_>60
0,30,30,1.67,50.8,18.22,0,0,1,0,0
1,19,31,1.82,61.7,18.63,0,1,0,0,0
2,45,29,1.76,69.4,22.4,0,0,1,0,0
3,22,28,1.73,64.4,21.52,1,1,0,0,0
4,29,24,1.72,65.3,22.07,1,1,0,0,0
5,50,26,1.74,55.8,18.43,0,0,0,1,0
6,51,22,1.77,64.0,20.43,1,0,0,1,0
7,23,20,1.78,61.7,19.47,1,1,0,0,0
8,17,19,1.72,50.8,17.17,0,1,0,0,0
9,39,31,1.7,54.4,18.82,0,0,1,0,0


# EDA & Reporting

In [94]:
# Summarise the numeric columns with .describe()
summary_stats = frailty.describe()

# Render the summary as a markdown-formatted plain-text table
mdsum = summary_stats.to_markdown()

# Write the table to the report file
with open('reports.md', mode='w') as report_file:
    report_file.write(mdsum)