# Assignment 5, Question 6: Data Transformation

**Points: 20**

Transform and engineer features from the clinical trial dataset.

## Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import utilities
from q3_data_utils import load_data, clean_data, transform_types, create_bins, fill_missing

# Read the cleaned CSV from the output folder and report how many rows were loaded.
CLEANED_DATA_PATH = 'output/q5_cleaned_data.csv'
df = load_data(CLEANED_DATA_PATH)
n_patients = len(df)
print(f"Loaded {n_patients} patients")

# Prewritten visualization functions for transformation analysis
def plot_distribution(series, title, figsize=(10, 6)):
    """
    Draw a 30-bin histogram of a numeric series and show it.

    Args:
        series: pandas Series with numeric data
        title: Chart title
        figsize: Figure size tuple passed to matplotlib
    """
    # Work on an explicit figure/axes pair rather than the implicit "current" figure.
    fig, ax = plt.subplots(figsize=figsize)
    series.hist(bins=30, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()

def plot_value_counts(series, title, figsize=(10, 6)):
    """
    Draw a bar chart with one bar per entry of the series and show it.

    Args:
        series: pandas Series with value counts
        title: Chart title
        figsize: Figure size tuple passed to matplotlib
    """
    # Work on an explicit figure/axes pair rather than the implicit "current" figure.
    fig, ax = plt.subplots(figsize=figsize)
    series.plot(kind='bar', ax=ax)
    ax.set_title(title)
    # Tilt the category labels so longer names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    plt.show()

Loaded 10000 patients


In [2]:
# Preview the first 25 rows of the loaded data.
df.head(n=25)

Unnamed: 0.1,Unnamed: 0,patient_id,age,sex,bmi,enrollment_date,systolic_bp,diastolic_bp,cholesterol_total,cholesterol_hdl,cholesterol_ldl,glucose_fasting,site,intervention_group,follow_up_months,adverse_events,outcome_cvd,adherence_pct,dropout
0,0,P00001,80,F,29.3,2022-05-01,123.0,80.0,120.0,55.0,41.0,118.0,site b,Control,20,0,No,24.0,No
1,1,P00002,80,Female,29.3,2022-01-06,139.0,81.0,206.0,58.0,107.0,79.0,Site A,CONTROL,24,0,No,77.0,No
2,2,P00003,82,Female,-1.0,2023-11-04,123.0,86.0,172.0,56.0,82.0,77.0,SITE C,treatment b,2,0,Yes,70.0,No
3,3,P00004,95,Female,25.4,2022-08-15,116.0,77.0,200.0,56.0,104.0,115.0,Site D,treatment b,17,0,No,62.0,No
4,4,P00005,95,M,25.4,2023-04-17,97.0,71.0,185.0,78.0,75.0,113.0,site e,Treatmen A,9,0,yes,62.0,Yes
5,5,P00006,78,F,26.8,2023-08-29,116.0,66.0,164.0,54.0,99.0,99.0,Site A,TreatmentA,4,0,yes,62.0,Yes
6,6,P00007,84,F,25.4,2022-05-12,133.0,100.0,215.0,62.0,113.0,70.0,site a,treatment a,20,1,No,76.0,No
7,7,P00008,70,Male,24.7,2022-06-04,111.0,72.0,174.0,60.0,94.0,109.0,SITE B,TREATMENT A,19,0,No,53.0,No
8,8,P00009,92,Female,26.9,2022-04-06,111.0,72.0,189.0,62.0,89.0,103.0,site a,Control,21,0,yes,53.0,No
9,9,P00010,75,Male,21.1,2023-12-14,128.0,76.0,218.0,77.0,97.0,96.0,SITE A,Treatment B,1,0,No,50.0,No


## Part 1: Type Conversions (5 points)

1. Convert 'enrollment_date' to datetime using the `transform_types()` utility
2. Convert categorical columns ('site', 'intervention_group', 'sex') to category dtype
3. Ensure all numeric columns are proper numeric types
4. Display the updated dtypes

In [3]:
# Type conversions, all done through the transform_types() utility:
#   - enrollment_date                  -> datetime
#   - site / intervention_group / sex  -> category
#   - age and every measurement column -> numeric
type_map = {
    "enrollment_date": "datetime",
    "site": "category",
    "intervention_group": "category",
    "sex": "category",
    "age": "numeric",
    "bmi": "numeric",
    "systolic_bp": "numeric",
    "diastolic_bp": "numeric",
    "cholesterol_total": "numeric",
    "cholesterol_hdl": "numeric",
    "cholesterol_ldl": "numeric",
    "glucose_fasting": "numeric",
    "follow_up_months": "numeric",
    "adverse_events": "numeric",
    "adherence_pct": "numeric",
}

# Assign the result back to `df`: transform_types() hands back the converted frame, and
# storing it in a throwaway variable would leave `df` (and everything derived from it
# in later cells, including the saved file) with the unconverted dtypes.
df = transform_types(df, type_map)
display(df.dtypes)






  df_copy[key] = pd.to_datetime(df_copy[key], errors='coerce', infer_datetime_format=True)


Unnamed: 0                     int64
patient_id                    object
age                            int64
sex                         category
bmi                          float64
enrollment_date       datetime64[ns]
systolic_bp                  float64
diastolic_bp                 float64
cholesterol_total            float64
cholesterol_hdl              float64
cholesterol_ldl              float64
glucose_fasting              float64
site                        category
intervention_group          category
follow_up_months               int64
adverse_events                 int64
outcome_cvd                   object
adherence_pct                float64
dropout                       object
dtype: object

## Part 2: Feature Engineering (8 points)

Create these new calculated columns:

1. `cholesterol_ratio` = cholesterol_ldl / cholesterol_hdl
2. `bp_category` = categorize systolic BP:
   - 'Normal': < 120
   - 'Elevated': 120-129
   - 'High': >= 130
3. `age_group` using `create_bins()` utility:
   - Bins: [0, 40, 55, 70, 100]
   - Labels: ['<40', '40-54', '55-69', '70+']
4. `bmi_category` using standard BMI categories:
   - Underweight: <18.5
   - Normal: 18.5-24.9
   - Overweight: 25-29.9
   - Obese: >=30

In [4]:
# Cholesterol ratio = LDL divided by HDL, computed element-wise per patient.
ldl = df["cholesterol_ldl"]
hdl = df["cholesterol_hdl"]
df["cholesterol_ratio"] = ldl.div(hdl)

In [5]:
# Categorize systolic blood pressure: Normal < 120, Elevated 120-129, High >= 130.
#
# right=False makes every bin closed on the left: [0, 120), [120, 130), [130, inf).
# That puts the cut points exactly where the spec has them, including for non-integer
# readings. Right-closed bins with edges at 119/129 would send 119.5 to 'Elevated'
# and 129.5 to 'High'.
# Missing and negative readings fall outside every bin and stay NaN; a reading of
# exactly 0 now lands in 'Normal' because the first bin includes its left edge.
df["bp_category"] = pd.cut(
    df["systolic_bp"],
    bins=[0, 120, 130, float("inf")],
    labels=["Normal", "Elevated", "High"],
    right=False,
)

**Note:** The `create_bins()` function has an optional `new_column` parameter. If you don't specify it, the new column will be named `{original_column}_binned`. You can use `new_column='age_group'` to give it a custom name.


In [6]:
# Bucket ages into the four reporting groups with the create_bins() utility, storing
# the result in a new 'age_group' column.
# NOTE(review): in the preview output a patient aged 70 is labelled '55-69', which
# suggests create_bins treats bin edges as right-inclusive - confirm against
# q3_data_utils before relying on the boundary ages (40, 55, 70).
age_edges = [0, 40, 55, 70, 100]
age_labels = ["<40", "40-54", "55-69", "70+"]
df = create_bins(df, "age", age_edges, age_labels, new_column="age_group")

In [7]:
# Standard BMI categories:
#   Underweight < 18.5 | Normal 18.5-24.9 | Overweight 25-29.9 | Obese >= 30
#
# right=False makes every bin closed on the left: [0, 18.5), [18.5, 25), [25, 30),
# [30, inf). With right-closed bins a BMI of exactly 18.5 would be 'Underweight'
# instead of 'Normal', and values between 29.9 and 30 would be 'Obese'.
# Negative sentinel values such as -1 fall outside every bin and stay NaN; a BMI of
# exactly 0 now lands in 'Underweight' because the first bin includes its left edge.
df["bmi_category"] = pd.cut(
    df["bmi"],
    bins=[0, 18.5, 25, 30, float("inf")],
    labels=["Underweight", "Normal", "Overweight", "Obese"],
    right=False,
)

In [8]:
# Preview the first 20 rows, now including the engineered feature columns.
df.head(n=20)

Unnamed: 0.1,Unnamed: 0,patient_id,age,sex,bmi,enrollment_date,systolic_bp,diastolic_bp,cholesterol_total,cholesterol_hdl,...,intervention_group,follow_up_months,adverse_events,outcome_cvd,adherence_pct,dropout,cholesterol_ratio,bp_category,age_group,bmi_category
0,0,P00001,80,F,29.3,2022-05-01,123.0,80.0,120.0,55.0,...,Control,20,0,No,24.0,No,0.745455,Elevated,70+,Overweight
1,1,P00002,80,Female,29.3,2022-01-06,139.0,81.0,206.0,58.0,...,CONTROL,24,0,No,77.0,No,1.844828,High,70+,Overweight
2,2,P00003,82,Female,-1.0,2023-11-04,123.0,86.0,172.0,56.0,...,treatment b,2,0,Yes,70.0,No,1.464286,Elevated,70+,
3,3,P00004,95,Female,25.4,2022-08-15,116.0,77.0,200.0,56.0,...,treatment b,17,0,No,62.0,No,1.857143,Normal,70+,Overweight
4,4,P00005,95,M,25.4,2023-04-17,97.0,71.0,185.0,78.0,...,Treatmen A,9,0,yes,62.0,Yes,0.961538,Normal,70+,Overweight
5,5,P00006,78,F,26.8,2023-08-29,116.0,66.0,164.0,54.0,...,TreatmentA,4,0,yes,62.0,Yes,1.833333,Normal,70+,Overweight
6,6,P00007,84,F,25.4,2022-05-12,133.0,100.0,215.0,62.0,...,treatment a,20,1,No,76.0,No,1.822581,High,70+,Overweight
7,7,P00008,70,Male,24.7,2022-06-04,111.0,72.0,174.0,60.0,...,TREATMENT A,19,0,No,53.0,No,1.566667,Normal,55-69,Normal
8,8,P00009,92,Female,26.9,2022-04-06,111.0,72.0,189.0,62.0,...,Control,21,0,yes,53.0,No,1.435484,Normal,70+,Overweight
9,9,P00010,75,Male,21.1,2023-12-14,128.0,76.0,218.0,77.0,...,Treatment B,1,0,No,50.0,No,1.25974,Elevated,70+,Normal


## Part 3: String Cleaning (2 points)

If there are any string columns that need cleaning:
1. Convert to lowercase
2. Strip whitespace
3. Replace any placeholder values

In [9]:
# String cleaning: lowercase and strip every text column, then repair known variants.
#
# Replacements run AFTER lowercasing, so only lowercase keys can ever match. Mixed-case
# keys ("MALE", "Control", "SITE A", "Yes", ...) are unreachable and are not listed.

def _normalize_text(series, fixes=None):
    """Lowercase and strip a string Series, then apply exact-match replacements.

    Args:
        series: pandas Series of strings.
        fixes: optional dict mapping an already lowercased and stripped value to
            its canonical spelling.

    Returns:
        A new Series with the cleaned values.
    """
    cleaned = series.str.lower().str.strip()
    return cleaned.replace(fixes) if fixes else cleaned


# Per-column corrections; None means case/whitespace normalization is all that is needed.
TEXT_FIXES = {
    "sex": {"male": "m", "female": "f"},
    "intervention_group": {
        "contrl": "control",
        "treatmen a": "treatment a",
        "treatmenta": "treatment a",
        "treatment  b": "treatment b",
    },
    "site": {"site_d": "site d", "site  a": "site a"},
    "outcome_cvd": None,
    "dropout": None,
}

for column, fixes in TEXT_FIXES.items():
    df[column] = _normalize_text(df[column], fixes)

# NOTE(review): an earlier pass tried to recode outcome_cvd "No"/"Yes" to 0/1 after
# lowercasing. Those keys could never match, so the column has always remained
# 'no'/'yes'; that no-op is dropped here. If a numeric flag is actually wanted, map
# {"no": 0, "yes": 1} explicitly - confirm with whatever consumes the saved file.






In [10]:
# Preview the first five rows after string cleaning.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,patient_id,age,sex,bmi,enrollment_date,systolic_bp,diastolic_bp,cholesterol_total,cholesterol_hdl,...,intervention_group,follow_up_months,adverse_events,outcome_cvd,adherence_pct,dropout,cholesterol_ratio,bp_category,age_group,bmi_category
0,0,P00001,80,f,29.3,2022-05-01,123.0,80.0,120.0,55.0,...,control,20,0,no,24.0,no,0.745455,Elevated,70+,Overweight
1,1,P00002,80,f,29.3,2022-01-06,139.0,81.0,206.0,58.0,...,control,24,0,no,77.0,no,1.844828,High,70+,Overweight
2,2,P00003,82,f,-1.0,2023-11-04,123.0,86.0,172.0,56.0,...,treatment b,2,0,yes,70.0,no,1.464286,Elevated,70+,
3,3,P00004,95,f,25.4,2022-08-15,116.0,77.0,200.0,56.0,...,treatment b,17,0,no,62.0,no,1.857143,Normal,70+,Overweight
4,4,P00005,95,m,25.4,2023-04-17,97.0,71.0,185.0,78.0,...,treatment a,9,0,yes,62.0,yes,0.961538,Normal,70+,Overweight


## Part 4: One-Hot Encoding (5 points)

Create dummy variables for categorical columns:
1. One-hot encode 'intervention_group' using `pd.get_dummies()`
2. One-hot encode 'site'
3. Drop the original categorical columns
4. Show the new shape and column names

In [11]:
# One-hot encode the two categorical columns. get_dummies replaces each encoded source
# column with its indicator columns, so `encoded` no longer contains the originals.
categorical_cols = ["intervention_group", "site"]
encoded = pd.get_dummies(
    data=df,
    columns=categorical_cols,
    prefix=categorical_cols,
    drop_first=False,
)
print("New shape:", encoded.shape)
print("Columns:", list(encoded.columns))


New shape: (10000, 29)
Columns: ['Unnamed: 0', 'patient_id', 'age', 'sex', 'bmi', 'enrollment_date', 'systolic_bp', 'diastolic_bp', 'cholesterol_total', 'cholesterol_hdl', 'cholesterol_ldl', 'glucose_fasting', 'follow_up_months', 'adverse_events', 'outcome_cvd', 'adherence_pct', 'dropout', 'cholesterol_ratio', 'bp_category', 'age_group', 'bmi_category', 'intervention_group_control', 'intervention_group_treatment a', 'intervention_group_treatment b', 'site_site a', 'site_site b', 'site_site c', 'site_site d', 'site_site e']


## Part 5: Save Transformed Data

Save the fully transformed dataset to `output/q6_transformed_data.csv`

In [12]:
# Save the fully transformed dataset, i.e. the one-hot encoded frame built in Part 4.
# pd.get_dummies returned a new frame (`encoded`) and left `df` untouched, so writing
# `df` here would silently drop the encoding step from the saved file.
# index=False keeps the row index out of the CSV so it does not come back as an
# extra "Unnamed" column on the next load.
encoded.to_csv('output/q6_transformed_data.csv', index=False)