# Assignment 5, Question 5: Missing Data Analysis

**Points: 15**

Apply and compare different missing data strategies on the clinical trial dataset.

## Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import utilities from Q3
from q3_data_utils import load_data, detect_missing, fill_missing

# Load the data
df = load_data('data/clinical_trial_raw.csv')
print(f"Loaded {len(df)} patients")

# Prewritten visualization function for missing data
def visualize_missing_data(missing_counts):
    """
    Create a bar chart of missing values by column.
    
    Args:
        missing_counts: pandas Series with missing value counts per column
    """
    plt.figure(figsize=(10, 6))
    missing_counts.plot(kind='bar')
    plt.title('Missing Values by Column')
    plt.xticks(rotation=45)
    plt.ylabel('Number of Missing Values')
    plt.tight_layout()
    plt.show()

Loaded 10000 patients


## Part 1: Detect Missing Data (3 points)

1. Use the `detect_missing()` utility to find missing values
2. Visualize missing data with a bar plot
3. Calculate the percentage of missing values per column

In [2]:
# TODO: Detect and analyze missing data
# 1. Use detect_missing(df) to get missing value counts
# 2. Calculate percentage of missing values per column  
# 3. Print both counts and percentages
# 4. Identify which columns have missing data
missing_counts = detect_missing(df)
missing_percentages = (missing_counts / len(df)) * 100
print("Missing Value Counts:\n", missing_counts)
print("\nMissing Value Percentages:\n", missing_percentages)

# Optional: Use the visualization function above to create a bar chart
# visualize_missing_data(missing_counts)


Missing Value Counts:
 patient_id               0
age                      0
sex                      0
bmi                    438
enrollment_date          0
systolic_bp            414
diastolic_bp           414
cholesterol_total      554
cholesterol_hdl        554
cholesterol_ldl        554
glucose_fasting        369
site                     0
intervention_group       0
follow_up_months         0
adverse_events           0
outcome_cvd              0
adherence_pct         1467
dropout                  0
dtype: int64

Missing Value Percentages:
 patient_id             0.00
age                    0.00
sex                    0.00
bmi                    4.38
enrollment_date        0.00
systolic_bp            4.14
diastolic_bp           4.14
cholesterol_total      5.54
cholesterol_hdl        5.54
cholesterol_ldl        5.54
glucose_fasting        3.69
site                   0.00
intervention_group     0.00
follow_up_months       0.00
adverse_events         0.00
outcome_cvd            0.00
a

## Part 2: Compare Imputation Strategies (6 points)

For the 'cholesterol_total' column (which has missing values):

1. Fill with mean using `fill_missing()` utility
2. Fill with median using `fill_missing()` utility  
3. Forward fill using pandas `.fillna(method='ffill')`
4. Compare the three strategies - create a summary table showing:
   - Original mean/median
   - Mean/median after each strategy
   - How many values were filled

In [3]:
# TODO: Compare imputation strategies
cholesterol_mean = fill_missing(df, 'cholesterol_total', strategy='mean')
cholesterol_median = fill_missing(df, 'cholesterol_total', strategy='median')
cholesterol_ffill = df.fillna(method='ffill')
#make summary table with original mean/median and imputed means/medians

table1 = pd.DataFrame({
    'Strategy': ['Original', 'Mean', 'Median', 'Forward Fill'],
    'Mean': [df['cholesterol_total'].mean(), cholesterol_mean['cholesterol_total'].mean(),
                cholesterol_median['cholesterol_total'].mean(),
                cholesterol_ffill['cholesterol_total'].mean()],
    'Median': [df['cholesterol_total'].median(), cholesterol_mean['cholesterol_total'].median(), 
               cholesterol_median['cholesterol_total'].median(), 
               cholesterol_ffill['cholesterol_total'].median()],
    'Total Filled Values': [0, missing_counts['cholesterol_total'], 
                            missing_counts['cholesterol_total'],
                            missing_counts['cholesterol_total']]
})
display(table1)

  cholesterol_ffill = df.fillna(method='ffill')


Unnamed: 0,Strategy,Mean,Median,Total Filled Values
0,Original,178.039488,178.039488,0
1,Mean,178.039488,178.039488,554
2,Median,178.039488,178.039488,554
3,Forward Fill,178.039488,178.039488,554


## Part 3: Dropping Missing Data (3 points)

1. Drop rows where ANY column has missing data - how many rows remain?
2. Drop rows where specific columns have missing data (e.g., only 'age' or 'bmi')
3. Which approach loses less data?

In [4]:
# TODO: Drop missing rows with different strategies
drop_any = df.dropna(how='any')

drop_subset = df.dropna(subset = ['age', 'enrollment_date', 'patient_id'])

print(len(drop_any), len(drop_subset))

#We can see that dropping rows through subset loses less data.

7133 10000


## Part 4: Create Clean Dataset (3 points)

Apply your chosen strategy to create a clean dataset:
1. Choose appropriate imputation for numeric columns
2. Drop rows with missing critical values (e.g., patient_id, age)
3. Save to `output/q5_cleaned_data.csv`
4. Save a missing data report to `output/q5_missing_report.txt`

In [5]:
# TODO: Create and save clean dataset
df_clean = df.copy()
df_clean = df.dropna(subset = ['patient_id', 'age', 'enrollment_date'])
num_rows_removed = len(df) - len(df_clean)

crit_removed = detect_missing(df_clean)
display(crit_removed)

df_cols = df_clean.select_dtypes(include = 'number')
df_clean = fill_missing(df_clean, df_cols.columns, strategy='median')
display(df_clean)

after_fill = detect_missing(df_clean)
display(after_fill)

df_clean.to_csv('output/q5_cleaned_data.csv', index=False)
crit_removed.to_csv('data/critical_missing_counts.csv', index=True, header=['Missing Values'])

with open('output/q5_missing_report.txt', 'w') as f:
    df_string = crit_removed.to_string(header=True, index=True)
    f.write('number of rows removed for critical missing data: ' + str(num_rows_removed))
    f.write('\nValues before filling missing data:\n')
    f.write(df_string)
    f.write('\nValues after filling missing data:\n')
    df_string2 = after_fill.to_string(header=True, index=True)
    f.write(df_string2)


patient_id               0
age                      0
sex                      0
bmi                    438
enrollment_date          0
systolic_bp            414
diastolic_bp           414
cholesterol_total        0
cholesterol_hdl        554
cholesterol_ldl        554
glucose_fasting        369
site                     0
intervention_group       0
follow_up_months         0
adverse_events           0
outcome_cvd              0
adherence_pct         1467
dropout                  0
dtype: int64

Unnamed: 0,patient_id,age,sex,bmi,enrollment_date,systolic_bp,diastolic_bp,cholesterol_total,cholesterol_hdl,cholesterol_ldl,glucose_fasting,site,intervention_group,follow_up_months,adverse_events,outcome_cvd,adherence_pct,dropout
0,P00001,80,F,29.3,2022-05-01,123.0,80.0,120.0,55.0,41.0,118.0,site b,Control,20,0,No,24.0,No
1,P00002,80,Female,26.0,2022-01-06,139.0,81.0,206.0,58.0,107.0,79.0,Site A,CONTROL,24,0,No,77.0,No
2,P00003,82,Female,-1.0,2023-11-04,123.0,86.0,172.0,56.0,82.0,77.0,SITE C,treatment b,2,0,Yes,70.0,No
3,P00004,95,Female,25.4,2022-08-15,116.0,77.0,200.0,56.0,104.0,115.0,Site D,treatment b,17,0,No,62.0,No
4,P00005,95,M,26.0,2023-04-17,97.0,71.0,185.0,78.0,75.0,113.0,site e,Treatmen A,9,0,yes,62.0,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,P09996,72,Male,23.2,2022-04-11,122.0,73.0,182.0,54.0,92.0,97.0,site c,TREATMENT B,21,0,No,20.0,No
9996,P09997,100,Female,28.9,2023-02-10,124.0,78.0,157.0,56.0,70.0,102.0,Site C,control,11,0,No,57.0,No
9997,P09998,78,F,23.8,2023-11-05,110.0,63.0,154.0,69.0,71.0,114.0,Site C,treatment a,2,1,No,77.0,No
9998,P09999,86,F,27.0,2022-08-27,139.0,98.0,196.0,38.0,119.0,126.0,SITE A,CONTROL,16,0,no,63.0,No


patient_id            0
age                   0
sex                   0
bmi                   0
enrollment_date       0
systolic_bp           0
diastolic_bp          0
cholesterol_total     0
cholesterol_hdl       0
cholesterol_ldl       0
glucose_fasting       0
site                  0
intervention_group    0
follow_up_months      0
adverse_events        0
outcome_cvd           0
adherence_pct         0
dropout               0
dtype: int64

## Reflection

Which imputation strategy would you recommend for this dataset and why?

**Your answer:**

TODO: Explain your strategy choice


We select the fill method of median, as there are some noticeable outliers in the dataset that can affect the mean. The median is less affected by outliers and thus provides a more stable estimation for NaNs.