# Assignment 5, Question 6: Data Transformation

**Points: 20**

Transform and engineer features from the clinical trial dataset.

## Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import utilities
from q3_data_utils import load_data, clean_data, transform_types, create_bins, fill_missing

df = load_data('data/clinical_trial_raw.csv')
print(f"Loaded {len(df)} patients")

# Prewritten visualization functions for transformation analysis
def plot_distribution(series, title, figsize=(10, 6)):
    """
    Create a histogram of a numeric series.
    
    Args:
        series: pandas Series with numeric data
        title: Chart title
        figsize: Figure size tuple
    """
    plt.figure(figsize=figsize)
    series.hist(bins=30)
    plt.title(title)
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()

def plot_value_counts(series, title, figsize=(10, 6)):
    """
    Create a bar chart of value counts.
    
    Args:
        series: pandas Series with value counts
        title: Chart title
        figsize: Figure size tuple
    """
    plt.figure(figsize=figsize)
    series.plot(kind='bar')
    plt.title(title)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

Loaded 10000 patients


## Part 1: Type Conversions (5 points)

1. Convert 'enrollment_date' to datetime using the `transform_types()` utility
2. Convert categorical columns ('site', 'intervention_group', 'sex') to category dtype
3. Ensure all numeric columns are proper numeric types
4. Display the updated dtypes

In [2]:
# TODO: Type conversions
# 1. Use transform_types() to convert enrollment_date to datetime
try:
    df = transform_types(df)
except TypeError:
    df["enrollment_date"] = pd.to_datetime(df["enrollment_date"], errors="coerce")
except NameError:
    df["enrollment_date"] = pd.to_datetime(df["enrollment_date"], errors="coerce")
# 2. Convert categorical columns ('site', 'intervention_group', 'sex') to category dtype
cat_cols = ["site", "intervention_group", "sex"]
for c in cat_cols:
    if c in df.columns:
        df[c] = df[c].astype("category")
# 3. Ensure all numeric columns are proper numeric types
numeric_like = ["cholesterol_ldl", "cholesterol_hdl", "systolic_bp", "age", "bmi"]
for c in numeric_like:
    if c in df.columns and df[c].dtype == "object":
        df[c] = pd.to_numeric(df[c].str.replace(",", "").str.strip(), errors="coerce")
# 4. Display the updated dtypes using df.dtypes
df.dtypes

patient_id                    object
age                            int64
sex                         category
bmi                          float64
enrollment_date       datetime64[ns]
systolic_bp                  float64
diastolic_bp                 float64
cholesterol_total            float64
cholesterol_hdl              float64
cholesterol_ldl              float64
glucose_fasting              float64
site                        category
intervention_group          category
follow_up_months               int64
adverse_events                 int64
outcome_cvd                   object
adherence_pct                float64
dropout                       object
dtype: object

## Part 2: Feature Engineering (8 points)

Create these new calculated columns:

1. `cholesterol_ratio` = cholesterol_ldl / cholesterol_hdl
2. `bp_category` = categorize systolic BP:
   - 'Normal': < 120
   - 'Elevated': 120-129
   - 'High': >= 130
3. `age_group` using `create_bins()` utility:
   - Bins: [0, 40, 55, 70, 100]
   - Labels: ['<40', '40-54', '55-69', '70+']
4. `bmi_category` using standard BMI categories:
   - Underweight: <18.5
   - Normal: 18.5-24.9
   - Overweight: 25-29.9
   - Obese: >=30

In [3]:
# TODO: Calculate cholesterol ratio
if {"cholesterol_ldl", "cholesterol_hdl"}.issubset(df.columns):
    df["cholesterol_ratio"] = df["cholesterol_ldl"] / df["cholesterol_hdl"]

In [4]:
# TODO: Categorize blood pressure
def bp_bucket(x):
    if pd.isna(x):
        return np.nan
    if x < 120:
        return "Normal"
    if 120 <= x <= 129:
        return "Elevated"
    return "High"

if "systolic_bp" in df.columns:
    df["bp_category"] = df["systolic_bp"].apply(bp_bucket).astype("category")

**Note:** The `create_bins()` function has an optional `new_column` parameter. If you don't specify it, the new column will be named `{original_column}_binned`. You can use `new_column='age_group'` to give it a custom name.


In [5]:
# TODO: Create age groups
age_bins = [0, 40, 55, 70, 100]
age_labels = ["<40", "40-54", "55-69", "70+"]
if "create_bins" in globals():
    if "age" in df.columns:
        df = create_bins(df, column="age", bins=age_bins, labels=age_labels, new_column="age_group")
else:
    if "age" in df.columns:
        df["age_group"] = pd.cut(df["age"], bins=age_bins, labels=age_labels, right=False)

if "age_group" in df.columns:
    df["age_group"] = df["age_group"].astype("category")

In [6]:
# TODO: Create BMI categories
bmi_bins = [0, 18.5, 25, 30, 100]
bmi_labels = ["Underweight", "Normal", "Overweight", "Obese"]
if "bmi" in df.columns:
    df["bmi_category"] = pd.cut(df["bmi"], bins=bmi_bins, labels=bmi_labels, right=False)
    df["bmi_category"] = df["bmi_category"].astype("category")

## Part 3: String Cleaning (2 points)

If there are any string columns that need cleaning:
1. Convert to lowercase
2. Strip whitespace
3. Replace any placeholder values

In [7]:
# TODO: String cleaning
placeholders = {"", "na", "n/a", "none", "-", "null"}
obj_cols = df.select_dtypes(include="object").columns
for c in obj_cols:
    df[c] = (
        df[c]
        .astype(str)
        .str.strip()
        .str.lower()
        .map(lambda x: np.nan if x in placeholders else x)
    )

## Part 4: One-Hot Encoding (5 points)

Create dummy variables for categorical columns:
1. One-hot encode 'intervention_group' using `pd.get_dummies()`
2. One-hot encode 'site'
3. Drop the original categorical columns
4. Show the new shape and column names

In [8]:
# TODO: One-hot encoding
before = len(df)
df = df.drop_duplicates().copy()
after = len(df)
print(f"Removed {before - after} duplicate rows.")

ohe_cols = [c for c in ["intervention_group", "site"] if c in df.columns]
df = pd.get_dummies(df, columns=ohe_cols, prefix=ohe_cols, drop_first=False)

print(df.shape)
df.columns.tolist()

Removed 0 duplicate rows.
(10000, 80)


['patient_id',
 'age',
 'sex',
 'bmi',
 'enrollment_date',
 'systolic_bp',
 'diastolic_bp',
 'cholesterol_total',
 'cholesterol_hdl',
 'cholesterol_ldl',
 'glucose_fasting',
 'follow_up_months',
 'adverse_events',
 'outcome_cvd',
 'adherence_pct',
 'dropout',
 'cholesterol_ratio',
 'bp_category',
 'age_group',
 'bmi_category',
 'intervention_group_  CONTROL  ',
 'intervention_group_  Contrl  ',
 'intervention_group_  Control  ',
 'intervention_group_  TREATMENT A  ',
 'intervention_group_  TREATMENT B  ',
 'intervention_group_  Treatmen A  ',
 'intervention_group_  Treatment  B  ',
 'intervention_group_  Treatment A  ',
 'intervention_group_  Treatment B  ',
 'intervention_group_  TreatmentA  ',
 'intervention_group_  control  ',
 'intervention_group_  treatment a  ',
 'intervention_group_  treatment b  ',
 'intervention_group_CONTROL',
 'intervention_group_Contrl',
 'intervention_group_Control',
 'intervention_group_TREATMENT A',
 'intervention_group_TREATMENT B',
 'intervention_group

## Part 5: Save Transformed Data

Save the fully transformed dataset to `output/q6_transformed_data.csv`

In [9]:
# TODO: Save transformed data
# df_transformed.to_csv('output/q6_transformed_data.csv', index=False)
output_path = "output/q6_transformed_data.csv"
import os
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print(f"Saved → {output_path}")

Saved → output/q6_transformed_data.csv
