In [2]:
import pandas as pd
import numpy as np

# 1. Flag long-stay patients so their number and share can be reported.
# Load the patient table from the CSV sitting next to the notebook.
data = pd.read_csv('1_patient_data.csv')

# A row is labelled 'HighRiskPatients' when DaysInHospital is strictly greater
# than 7; every row for which that comparison is False gets 'LowRiskPatients'.
# NOTE(review): a missing DaysInHospital presumably compares False and would
# therefore be counted as low risk - confirm the column has no gaps.
stayed_over_a_week = data['DaysInHospital'].gt(7)
data['RiskLevel'] = np.where(stayed_over_a_week, "HighRiskPatients", "LowRiskPatients")

# Number of rows per risk label (value_counts orders by descending frequency).
risk_counts = data['RiskLevel'].value_counts()

In [3]:
# Report how many patients fall in each risk class and what share of all
# patients each class represents.
#
# value_counts() omits labels that never occur, so plain [] indexing would
# raise KeyError if, for example, nobody stayed longer than 7 days.
# Series.get(label, 0) reports such a class as 0 instead.
total_patients = len(data)
high_risk_count = risk_counts.get('HighRiskPatients', 0)
low_risk_count = risk_counts.get('LowRiskPatients', 0)

# Guard the division so an empty table yields NaN rather than ZeroDivisionError.
high_risk_ratio = high_risk_count / total_patients if total_patients else float('nan')
low_risk_ratio = low_risk_count / total_patients if total_patients else float('nan')

# NOTE: the values labelled "Percentage" are fractions in [0, 1] (0.413 means
# 41.3%); the labels are kept as-is so the printed output does not change.
print("Number of high-risk patients:", high_risk_count)
print("Number of low-risk patients:", low_risk_count)
print("Percentage of high-risk patients:", high_risk_ratio)
print("Percentage of low-risk patients:", low_risk_ratio)

Number of high-risk patients: 413
Number of low-risk patients: 587
Percentage of high-risk patients: 0.413
Percentage of low-risk patients: 0.587


In [4]:
# 2. Patient counts and high-risk share within each BMI band.
# right=False makes every band left-closed / right-open:
#   Underweight [0, 18.5), Normal [18.5, 24), Overweight [24, 28), Obese [28, inf)
bmi_labels = ['Underweight', 'Normal', 'Overweight', 'Obese']
bmi_bins = [0, 18.5, 24, 28, np.inf]
data['BMIRange'] = pd.cut(
    data['BMI'],
    bins=bmi_bins,
    labels=bmi_labels,
    right=False,
)

# The mean of a boolean mask is the fraction of True values, i.e. the share of
# high-risk patients in each band. observed=False keeps bands with no patients.
is_high_risk = data['RiskLevel'].eq('HighRiskPatients')
bmi_risk_rate = is_high_risk.groupby(data['BMIRange'], observed=False).mean()

# Head-count per band; value_counts orders by frequency, not by band order.
bmi_patient_count = data['BMIRange'].value_counts()

print(
    "High-risk patient percentage and count across BMI ranges:",
    bmi_risk_rate,
    bmi_patient_count,
)

High-risk patient percentage and count across BMI ranges: BMIRange
Underweight    0.444444
Normal         0.406699
Overweight     0.388235
Obese          0.415094
Name: RiskLevel, dtype: float64 BMIRange
Obese          477
Normal         209
Overweight     170
Underweight    144
Name: count, dtype: int64


In [5]:
# 3. Patient counts and high-risk share within each age group.
# right=False makes every group left-closed / right-open, so the edges
# 0, 26, 36, 46, 56, 66 give [0, 26), [26, 36), ..., [66, inf).
# NOTE(review): the labels ('<=25', '26-35', ...) match those intervals only
# for whole-number ages - presumably Age is stored as an integer; verify.
age_labels = ['<=25', '26-35', '36-45', '46-55', '56-65', '>65']
age_bins = [0, 26, 36, 46, 56, 66, np.inf]
data['AgeRange'] = pd.cut(
    data['Age'],
    bins=age_bins,
    labels=age_labels,
    right=False,
)

# The mean of a boolean mask is the fraction of True values, i.e. the share of
# high-risk patients in each group. observed=False keeps groups with no patients.
is_high_risk = data['RiskLevel'].eq('HighRiskPatients')
age_risk_rate = is_high_risk.groupby(data['AgeRange'], observed=False).mean()

# Head-count per group; value_counts orders by frequency, not by age order.
age_patient_count = data['AgeRange'].value_counts()

print(
    'High-risk patient percentage and count across Age ranges:',
    age_risk_rate,
    age_patient_count,
)

High-risk patient percentage and count across Age ranges: AgeRange
<=25     0.456693
26-35    0.398496
36-45    0.386364
46-55    0.444444
56-65    0.401575
>65      0.401254
Name: RiskLevel, dtype: float64 AgeRange
>65      319
46-55    162
26-35    133
36-45    132
<=25     127
56-65    127
Name: count, dtype: int64
