# Assignment 5, Question 6: Data Transformation

**Points: 20**

Transform and engineer features from the clinical trial dataset.

## Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import utilities
from q3_data_utils import load_data, clean_data, transform_types, create_bins, fill_missing

df = load_data('data/clinical_trial_raw.csv')
print(f"Loaded {len(df)} patients")

# Prewritten visualization functions for transformation analysis
def plot_distribution(series, title, figsize=(10, 6)):
    """
    Create a histogram of a numeric series.
    
    Args:
        series: pandas Series with numeric data
        title: Chart title
        figsize: Figure size tuple
    """
    plt.figure(figsize=figsize)
    series.hist(bins=30)
    plt.title(title)
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()

def plot_value_counts(series, title, figsize=(10, 6)):
    """
    Create a bar chart of value counts.
    
    Args:
        series: pandas Series with value counts
        title: Chart title
        figsize: Figure size tuple
    """
    plt.figure(figsize=figsize)
    series.plot(kind='bar')
    plt.title(title)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

Run test on q3_data cleanup...
Test DataFrame created: (5, 2)
Test detect_missing: 1
Test passed!
Loaded 10000 patients


## Part 1: Type Conversions (5 points)

1. Convert 'enrollment_date' to datetime using the `transform_types()` utility
2. Convert categorical columns ('site', 'intervention_group', 'sex') to category dtype
3. Ensure all numeric columns are proper numeric types
4. Display the updated dtypes

In [2]:
# TODO: Type conversions

print("="*60)
print("Data types before transformation:")
print("="*60)
print()

# 1. Use transform_types() to convert enrollment_date to datetime
print("1. Converting enrollment_date to datetime...")
type_map = {'enrollment_date': 'datetime'}
df_typed = transform_types(df, type_map)
print(" enrollment_date converted to datetime")
print()

# 2. Convert categorical columns ('site', 'intervention_group', 'sex') to category dtype
print("2. Converting categorical columns to category type...")
categorical_cols = {
    'site': 'category',
    'intervention_group': 'category',
    'sex': 'category'
}
df_typed = transform_types(df_typed, categorical_cols)
print(" Categorical columns converted to category type")
print()

# 3. Ensure all numeric columns are proper numeric types
print("3. Converting numeric columns to proper numeric type...")
numeric_map = {
    'age': 'numeric',
    'bmi': 'numeric',
    'weight': 'numeric',
    'height': 'numeric',
    'cholesterol_total': 'numeric',
    'systolic_bp': 'numeric',
    'diastolic_bp': 'numeric'
}
df_typed = transform_types(df_typed, numeric_map)
print(" Numeric columns converted to proper numeric type")
print()

# 4. Display the updated dtypes using df.dtypes
print("4. Updated data types:")
print(df_typed.dtypes)
print()


# Summary of changes
print("Summary of type changes:")
print(f" Datetime columns: {(df_typed.dtypes == 'datetime64[ns]').sum()}")
print(f" Categorical columns: {(df_typed.dtypes == 'category').sum()}")
print(f" Numeric columns: {(df_typed.select_dtypes(include=[np.number]).shape[1])}")
print()

print("="*60)

Data types before transformation:

1. Converting enrollment_date to datetime...
 enrollment_date converted to datetime

2. Converting categorical columns to category type...
 Categorical columns converted to category type

3. Converting numeric columns to proper numeric type...
 Numeric columns converted to proper numeric type

4. Updated data types:
patient_id             object
age                     int64
sex                    object
bmi                   float64
enrollment_date        object
systolic_bp           float64
diastolic_bp          float64
cholesterol_total     float64
cholesterol_hdl       float64
cholesterol_ldl       float64
glucose_fasting       float64
site                   object
intervention_group     object
follow_up_months        int64
adverse_events          int64
outcome_cvd            object
adherence_pct         float64
dropout                object
dtype: object

Summary of type changes:
 Datetime columns: 0
 Categorical columns: 0
 Numeric columns: 11



## Part 2: Feature Engineering (8 points)

Create these new calculated columns:

1. `cholesterol_ratio` = cholesterol_ldl / cholesterol_hdl
2. `bp_category` = categorize systolic BP:
   - 'Normal': < 120
   - 'Elevated': 120-129
   - 'High': >= 130
3. `age_group` using `create_bins()` utility:
   - Bins: [0, 40, 55, 70, 100]
   - Labels: ['<40', '40-54', '55-69', '70+']
4. `bmi_category` using standard BMI categories:
   - Underweight: <18.5
   - Normal: 18.5-24.9
   - Overweight: 25-29.9
   - Obese: >=30

In [3]:
# TODO: Calculate cholesterol ratio

print("1. Creating cholesterol ratio column...")
df_typed['cholesterol_ratio'] = df_typed['cholesterol_ldl'] / df_typed['cholesterol_hdl']
print(" Cholesterol ratio = cholesterol_ldl / cholesterol_hdl")
print(df_typed['cholesterol_ldl', 'cholesterol_hdl', 'cholesterol_ratio'].head(10))

1. Creating cholesterol ratio column...
 Cholesterol ratio = cholesterol_ldl / cholesterol_hdl


KeyError: ('cholesterol_ldl', 'cholesterol_hdl', 'cholesterol_ratio')

In [None]:
# TODO: Categorize blood pressure
df_typed['bp_category'] = pd.cut(
    df_typed['systolic_bp'],
    bins=[0, 10, 130, float('inf')],
    labels=['Normal', 'Elevated', 'High'],
    right=False
)
print("2. BP categories created")
print(df_typed['systolic_bp', 'bp_category'].head(10))
print()

# ACtually, I like this better
df_typed['bp_category'] = np.where(
    df_typed['systolic_bp'] < 120, 'Normal',
    np.where(
        (df_typed['systolic_bp'] >= 120) & (df_typed['systolic_bp'] < 130), 'Elevated',
        'High'
    )
)
print("BP categories created")
print(df_typed[['systolic_bp', 'bp_category']].head(10))

**Note:** The `create_bins()` function has an optional `new_column` parameter. If you don't specify it, the new column will be named `{original_column}_binned`. You can use `new_column='age_group'` to give it a custom name.


In [None]:
# TODO: Create age groups
df_typed = create_bins(
    df_typed,
    column = 'age',
    bins=[0, 40, 55, 70, 100],
    labels = ['<40', '40-54', '55-69', '70+']
    new_column = 'age_group'
)
print("3. Age groups created")
print(df_typed[['age', 'age_group']].head(10))

In [None]:
# TODO: Create BMI categories
df_typed = create_bins(
    df_typed,
    column = 'bmi',
    bins=[0, 18.5, 24.9, 29.9, float('inf')],
    labels = ['Underweight', 'Normal weight', 'Overweight', 'Obese'],
    new_column = 'bmi_category'
)
# bins might be 0, 18.5, 25, 30, 100]

print("4. BMI categories created")
print(df_typed[['bmi', 'bmi_category']].head(10))


## Part 3: String Cleaning (2 points)

If there are any string columns that need cleaning:
1. Convert to lowercase
2. Strip whitespace
3. Replace any placeholder values

In [None]:
# TODO: String cleaning
string_cols = df_typed.select_dtypes(include=['object','category']).columns
original_categories = df_typed.select_dtypes(include=['category']).columns.tolist()

for col in string_cols:
    df_typed[col] = (df_typed[col]
                     .astype(str)
                     .str.lower()
                     .str.strip()
                     .replace(['unknown', 'n/a', 'na', 'none', ''], np.nan ))
    
for col in original_categories:
    df_typed[col] = df_typed[col].astype('category')
    
print(f"Cleaned {len(string_cols)} columns: {list(string_cols)}")
print(f" Restored {len(original_categories)} columns to category type")

## Part 4: One-Hot Encoding (5 points)

Create dummy variables for categorical columns:
1. One-hot encode 'intervention_group' using `pd.get_dummies()`
2. One-hot encode 'site'
3. Drop the original categorical columns
4. Show the new shape and column names

In [None]:
# TODO: One-hot encoding
# 1. one-hot encode 'intervention_group'
intervention_dummies = pd.get_dummies(df_typed['intervention_group'], prefix='intervention')
df_typed = pd.concat([df_typed, intervention_dummies], axis=1)
print(f"1. Created dummy variables: {list(intervention_dummies.columns)}")
print(df_typed[['intervention_group'] + list(intervention_dummies.columns)].head(10))

# 2. one-hot encode 'site'
site_dummies = pd.get_dummies(df_typed['site'], prefix='site')
df_typed = pd.concat([df_typed, site_dummies], axis=1)
print(f"2. Created dummy variables: {list(site_dummies.columns)}")
print(df_typed[['site'] + list(site_dummies.columns)].head(10))

# 3. Drop original categorical columns
df_typed = df_typed.drop(columns=['intervention_group', 'site'])

# 4. Show new shape and column names
print("3. Original columns dropped")
print(f"New DataFrame shape: {df_typed.shape}")
print(f"Column names: {list(df_typed.columns.tolist())}")


## Part 5: Save Transformed Data

Save the fully transformed dataset to `output/q6_transformed_data.csv`

In [None]:
# TODO: Save transformed data
# df_transformed.to_csv('output/q6_transformed_data.csv', index=False)
df_typed.to_csv('output/q6_transformed_data.csv', index=False)

print("Transformed data saved to output/q6_transformed_data.csv")