# Assignment 5, Question 6: Data Transformation

**Points: 20**

Transform and engineer features from the clinical trial dataset.

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import utilities
from q3_data_utils import (
    load_data,
    clean_data,
    transform_types,
    create_bins,
    fill_missing,
)

df = load_data("data/clinical_trial_raw.csv")
print(f"Loaded {len(df)} patients")


# Prewritten visualization functions for transformation analysis
def plot_distribution(series, title, figsize=(10, 6)):
    """
    Create a histogram of a numeric series.

    Args:
        series: pandas Series with numeric data
        title: Chart title
        figsize: Figure size tuple
    """
    plt.figure(figsize=figsize)
    series.hist(bins=30)
    plt.title(title)
    plt.xlabel("Value")
    plt.ylabel("Frequency")
    plt.tight_layout()
    plt.show()


def plot_value_counts(series, title, figsize=(10, 6)):
    """
    Create a bar chart of value counts.

    Args:
        series: pandas Series with value counts
        title: Chart title
        figsize: Figure size tuple
    """
    plt.figure(figsize=figsize)
    series.plot(kind="bar")
    plt.title(title)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

Loaded 10000 patients


## Part 1: Type Conversions (5 points)

1. Convert 'enrollment_date' to datetime using the `transform_types()` utility
2. Convert categorical columns ('site', 'intervention_group', 'sex') to category dtype
3. Ensure all numeric columns are proper numeric types
4. Display the updated dtypes

In [None]:
# TODO: Type conversions
# 1. Use transform_types() to convert enrollment_date to datetime
# 2. Convert categorical columns ('site', 'intervention_group', 'sex') to category dtype
# 3. Ensure all numeric columns are proper numeric types
# 4. Display the updated dtypes using df.dtypes
type_map = {
    "enrollment_date": "datetime",  # use 'datetime' to match transform_types()
    "site": "category",
    "intervention_group": "category",
    "sex": "category",
}

df = transform_types(df, type_map)

cols = df.select_dtypes(include=["object"]).columns
for col in cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

print("Updated Data Types:")
print(df.dtypes)

Updated Data Types:
patient_id                   float64
age                            int64
sex                         category
bmi                          float64
enrollment_date       datetime64[ns]
systolic_bp                  float64
diastolic_bp                 float64
cholesterol_total            float64
cholesterol_hdl              float64
cholesterol_ldl              float64
glucose_fasting              float64
site                        category
intervention_group          category
follow_up_months               int64
adverse_events                 int64
outcome_cvd                  float64
adherence_pct                float64
dropout                      float64
dtype: object


## Part 2: Feature Engineering (8 points)

Create these new calculated columns:

1. `cholesterol_ratio` = cholesterol_ldl / cholesterol_hdl
2. `bp_category` = categorize systolic BP:
   - 'Normal': < 120
   - 'Elevated': 120-129
   - 'High': >= 130
3. `age_group` using `create_bins()` utility:
   - Bins: [0, 40, 55, 70, 100]
   - Labels: ['<40', '40-54', '55-69', '70+']
4. `bmi_category` using standard BMI categories:
   - Underweight: <18.5
   - Normal: 18.5-24.9
   - Overweight: 25-29.9
   - Obese: >=30

In [None]:
# TODO: Calculate cholesterol ratio
df["cholesterol_ratio"] = df["cholesterol_ldl"] / df["cholesterol_hdl"]
print(df[["cholesterol_ratio"]].head())

   cholesterol_ratio
0           0.745455
1           1.844828
2           1.464286
3           1.857143
4           0.961538


In [None]:
# TODO: Categorize blood pressure
df["bp_category"] = np.where(
    df["systolic_bp"] < 120,
    "Normal",
    np.where(df["systolic_bp"] < 130, "Elevated", "High"),
)
print(df[["systolic_bp", "bp_category"]].head())

   systolic_bp bp_category
0        123.0    Elevated
1        139.0        High
2        123.0    Elevated
3        116.0      Normal
4         97.0      Normal


**Note:** The `create_bins()` function has an optional `new_column` parameter. If you don't specify it, the new column will be named `{original_column}_binned`. You can use `new_column='age_group'` to give it a custom name.


In [None]:
# TODO: Create age groups
df = create_bins(
    df,
    column="age",
    bins=[0, 40, 55, 70, 100],
    labels=["<40", "40-54", "55-69", "70+"],
    new_column="age_group",
)

print(df[["age", "age_group"]].head())

   age age_group
0   80       70+
1   80       70+
2   82       70+
3   95       70+
4   95       70+


In [None]:
# TODO: Create BMI categories
df = clean_data(df, sentinel_value=-999)  # Clean sentinel values
df["bmi"] = df["bmi"].replace(["NA", "None", -1, 0], np.nan)
df = fill_missing(df, "bmi", strategy="median")

df = create_bins(
    df,
    column="bmi",
    bins=[0, 18.5, 24.9, 29.9, 100],
    labels=["Underweight", "Normal", "Overweight", "Obese"],
    new_column="bmi_category",
)

print(df[["bmi", "bmi_category"]].head())

    bmi bmi_category
0  29.3   Overweight
1  26.2   Overweight
2  26.2   Overweight
3  25.4   Overweight
4  26.2   Overweight


## Part 3: String Cleaning (2 points)

If there are any string columns that need cleaning:
1. Convert to lowercase
2. Strip whitespace
3. Replace any placeholder values

In [53]:
# TODO: String cleaning
string_cols = df.select_dtypes(include=["object"]).columns

for col in string_cols:
    df[col] = df[col].str.lower()
    df[col] = df[col].str.strip()
    df[col] = df[col].replace(
        ["na", "none", "missing", "n/a", "null", "nan", ""], np.nan
    )

print("String Columns Cleaned:", list(string_cols))

String Columns Cleaned: ['bp_category']


## Part 4: One-Hot Encoding (5 points)

Create dummy variables for categorical columns:
1. One-hot encode 'intervention_group' using `pd.get_dummies()`
2. One-hot encode 'site'
3. Drop the original categorical columns
4. Show the new shape and column names

In [None]:
# TODO: One-hot encoding
categorical_cols = ["intervention_group", "site"]
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=False)

print("New DataFrame shape:", df_encoded.shape)
print("New columns after encoding:")
print(df_encoded.columns.tolist()[:15], "...")

New DataFrame shape: (10000, 80)
New columns after encoding:
['patient_id', 'age', 'sex', 'bmi', 'enrollment_date', 'systolic_bp', 'diastolic_bp', 'cholesterol_total', 'cholesterol_hdl', 'cholesterol_ldl', 'glucose_fasting', 'follow_up_months', 'adverse_events', 'outcome_cvd', 'adherence_pct'] ...


## Part 5: Save Transformed Data

Save the fully transformed dataset to `output/q6_transformed_data.csv`

In [56]:
# TODO: Save transformed data
# df_transformed.to_csv('output/q6_transformed_data.csv', index=False)
df_transformed = df_encoded
df_transformed.to_csv("output/q6_transformed_data.csv", index=False)