In [1]:
# patient_data_cleaning.py
# Example: clean a mock patient dataset and produce summary statistics
import pandas as pd

In [2]:
# Create mock data
data = {
    'patient_id': [1,2,3,4,5,6],
    'age': [34, 67, None, 45, 29, 80],
    'bp_systolic': [120, 145, 130, None, 110, 160],
    'bp_diastolic': [80, 90, 85, 88, None, 95],
    'medication_given': ['Yes', 'No', 'Yes', 'Yes', 'No', 'Yes'],
    'visit_date': ['2024-01-10','2024-01-15','2024-01-20','2024-02-02','2024-02-10','2024-02-11']
}
df = pd.DataFrame(data)
df['visit_date'] = pd.to_datetime(df['visit_date'])


In [3]:
# 1. Overview
print("Original Data:\n", df)


Original Data:
    patient_id   age  bp_systolic  bp_diastolic medication_given visit_date
0           1  34.0        120.0          80.0              Yes 2024-01-10
1           2  67.0        145.0          90.0               No 2024-01-15
2           3   NaN        130.0          85.0              Yes 2024-01-20
3           4  45.0          NaN          88.0              Yes 2024-02-02
4           5  29.0        110.0           NaN               No 2024-02-10
5           6  80.0        160.0          95.0              Yes 2024-02-11


In [4]:
# 2. Clean: fill missing ages with median
df['age'] = df['age'].fillna(df['age'].median())


In [5]:
# 3. Fill blood pressure with forward fill then median
df['bp_systolic'] = df['bp_systolic'].fillna(method='ffill').fillna(df['bp_systolic'].median())
df['bp_diastolic'] = df['bp_diastolic'].fillna(method='ffill').fillna(df['bp_diastolic'].median())


  df['bp_systolic'] = df['bp_systolic'].fillna(method='ffill').fillna(df['bp_systolic'].median())
  df['bp_diastolic'] = df['bp_diastolic'].fillna(method='ffill').fillna(df['bp_diastolic'].median())


In [6]:
# 4. Standardise medication column
df['medication_given'] = df['medication_given'].str.capitalize()

In [7]:
# 4. Standardise medication column
df['medication_given'] = df['medication_given'].str.capitalize()

In [8]:
# 5. Create summary stats
summary = df.describe(include='all')
print("\nCleaned Data:\n", df)
print("\nSummary:\n", summary)


Cleaned Data:
    patient_id   age  bp_systolic  bp_diastolic medication_given visit_date
0           1  34.0        120.0          80.0              Yes 2024-01-10
1           2  67.0        145.0          90.0               No 2024-01-15
2           3  45.0        130.0          85.0              Yes 2024-01-20
3           4  45.0        130.0          88.0              Yes 2024-02-02
4           5  29.0        110.0          88.0               No 2024-02-10
5           6  80.0        160.0          95.0              Yes 2024-02-11

Summary:
         patient_id        age  bp_systolic  bp_diastolic medication_given  \
count     6.000000   6.000000      6.00000      6.000000                6   
unique         NaN        NaN          NaN           NaN                2   
top            NaN        NaN          NaN           NaN              Yes   
freq           NaN        NaN          NaN           NaN                4   
mean      3.500000  50.000000    132.50000     87.666667       

In [9]:
# 6. Export cleaned file
df.to_csv('cleaned_patient_data.csv', index=False)
print('\nCleaned data saved to cleaned_patient_data.csv')


Cleaned data saved to cleaned_patient_data.csv
