# Assignment 5, Question 6: Data Transformation

**Points: 20**

Transform and engineer features from the clinical trial dataset.

## Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import utilities
from q3_data_utils import load_data, clean_data, transform_types, create_bins, fill_missing

df = load_data('data/clinical_trial_raw.csv')
print(f"Loaded {len(df)} patients")

# Prewritten visualization functions for transformation analysis
def plot_distribution(series, title, figsize=(10, 6)):
    """
    Create a histogram of a numeric series.
    
    Args:
        series: pandas Series with numeric data
        title: Chart title
        figsize: Figure size tuple
    """
    plt.figure(figsize=figsize)
    series.hist(bins=30)
    plt.title(title)
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()

def plot_value_counts(series, title, figsize=(10, 6)):
    """
    Create a bar chart of value counts.
    
    Args:
        series: pandas Series with value counts
        title: Chart title
        figsize: Figure size tuple
    """
    plt.figure(figsize=figsize)
    series.plot(kind='bar')
    plt.title(title)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

Loaded 10000 patients


## Part 1: Type Conversions (5 points)

1. Convert 'enrollment_date' to datetime using the `transform_types()` utility
2. Convert categorical columns ('site', 'intervention_group', 'sex') to category dtype
3. Ensure all numeric columns are proper numeric types
4. Display the updated dtypes

In [2]:
# TODO: Type conversions
# 1. Use transform_types() to convert enrollment_date to datetime
# 2. Convert categorical columns ('site', 'intervention_group', 'sex') to category dtype
# 3. Ensure all numeric columns are proper numeric types
# 4. Display the updated dtypes using df.dtypes

# Converting enrollment_date and categorical columns
df_copy = df.copy()
type_map = {
    'enrollment_date' : 'datetime',
    'site' : 'category', 
    'intervention_group': 'category',
    'sex': 'category'
}
df_copy = transform_types(df_copy, type_map)

# Ensure all numeric columns are proper numeric types 
numeric_cols = df_copy.select_dtypes(include='number').columns
print('Numeric columns:', numeric_cols)
print(df_copy.dtypes)



Numeric columns: Index(['age', 'bmi', 'systolic_bp', 'diastolic_bp', 'cholesterol_total',
       'cholesterol_hdl', 'cholesterol_ldl', 'glucose_fasting',
       'follow_up_months', 'adverse_events', 'adherence_pct'],
      dtype='object')
patient_id                    object
age                            int64
sex                         category
bmi                          float64
enrollment_date       datetime64[ns]
systolic_bp                  float64
diastolic_bp                 float64
cholesterol_total            float64
cholesterol_hdl              float64
cholesterol_ldl              float64
glucose_fasting              float64
site                        category
intervention_group          category
follow_up_months               int64
adverse_events                 int64
outcome_cvd                   object
adherence_pct                float64
dropout                       object
dtype: object


## Part 2: Feature Engineering (8 points)

Create these new calculated columns:

1. `cholesterol_ratio` = cholesterol_ldl / cholesterol_hdl
2. `bp_category` = categorize systolic BP:
   - 'Normal': < 120
   - 'Elevated': 120-129
   - 'High': >= 130
3. `age_group` using `create_bins()` utility:
   - Bins: [0, 40, 55, 70, 100]
   - Labels: ['<40', '40-54', '55-69', '70+']
4. `bmi_category` using standard BMI categories:
   - Underweight: <18.5
   - Normal: 18.5-24.9
   - Overweight: 25-29.9
   - Obese: >=30

In [3]:
# TODO: Calculate cholesterol ratio

df_copy['cholesterol_ratio'] = df_copy['cholesterol_ldl'] / df_copy['cholesterol_hdl']
df_copy[['cholesterol_ratio']].head()


Unnamed: 0,cholesterol_ratio
0,0.745455
1,1.844828
2,1.464286
3,1.857143
4,0.961538


In [4]:
# TODO: Categorize blood pressure

df_copy['bp_category'] = np.where(
    df_copy['systolic_bp'] < 120, 'Normal',
    np.where(df_copy['systolic_bp'] <=129, 'Elevated', 'High')
)
df_copy[['systolic_bp', 'bp_category']].head()



Unnamed: 0,systolic_bp,bp_category
0,123.0,Elevated
1,139.0,High
2,123.0,Elevated
3,116.0,Normal
4,97.0,Normal


**Note:** The `create_bins()` function has an optional `new_column` parameter. If you don't specify it, the new column will be named `{original_column}_binned`. You can use `new_column='age_group'` to give it a custom name.


In [5]:
# TODO: Create age groups
df_copy = create_bins(
    df = df_copy,
    column = 'age',
    bins = [0, 40, 55, 70, 100],
    labels = ['<40', '40-54', '55-69', '70+'],
    new_column = 'age_group'
)
df_copy[['age', 'age_group']].head()

Unnamed: 0,age,age_group
0,80,70+
1,80,70+
2,82,70+
3,95,70+
4,95,70+


In [6]:
# TODO: Create BMI categories
df_copy['bmi_category'] = np.where(
    df_copy['bmi'] < 18.5, 'Underweight',
    np.where(df_copy['bmi'] <=24.9, 'Normal',
    np.where(df_copy['bmi'] <=29.9, 'Overweight', 'Obese'))
)
df_copy[['bmi', 'bmi_category']].head()


Unnamed: 0,bmi,bmi_category
0,29.3,Overweight
1,,Obese
2,-1.0,Underweight
3,25.4,Overweight
4,,Obese


## Part 3: String Cleaning (2 points)

If there are any string columns that need cleaning:
1. Convert to lowercase
2. Strip whitespace
3. Replace any placeholder values

In [7]:
# TODO: String cleaning
placeholders = ['unknown', 'n/a', 'na', 'missing']

for col in df_copy.select_dtypes(include = 'object').columns:
    df_copy[col] = df_copy[col].str.lower()
    df_copy[col] = df_copy[col].str.strip
    df_copy[col] = df_copy[col].replace(placeholders, pd.NA)

## Part 4: One-Hot Encoding (5 points)

Create dummy variables for categorical columns:
1. One-hot encode 'intervention_group' using `pd.get_dummies()`
2. One-hot encode 'site'
3. Drop the original categorical columns
4. Show the new shape and column names

In [8]:
# TODO: One-hot encoding
dummies_intervention_group = pd.get_dummies(df_copy['intervention_group'], prefix = 'intervention')
dummies_site = pd.get_dummies(df_copy['site'], prefix = 'site')
df_copy = pd.concat([df_copy, dummies_intervention_group, dummies_site], axis=1)
df_copy = df_copy.drop(['intervention_group', 'site'], axis=1)
print('New Shape:', df_copy.shape)
print('Column Names:', df_copy.columns.tolist())

New Shape: (10000, 80)
Column Names: ['patient_id', 'age', 'sex', 'bmi', 'enrollment_date', 'systolic_bp', 'diastolic_bp', 'cholesterol_total', 'cholesterol_hdl', 'cholesterol_ldl', 'glucose_fasting', 'follow_up_months', 'adverse_events', 'outcome_cvd', 'adherence_pct', 'dropout', 'cholesterol_ratio', 'bp_category', 'age_group', 'bmi_category', 'intervention_  CONTROL  ', 'intervention_  Contrl  ', 'intervention_  Control  ', 'intervention_  TREATMENT A  ', 'intervention_  TREATMENT B  ', 'intervention_  Treatmen A  ', 'intervention_  Treatment  B  ', 'intervention_  Treatment A  ', 'intervention_  Treatment B  ', 'intervention_  TreatmentA  ', 'intervention_  control  ', 'intervention_  treatment a  ', 'intervention_  treatment b  ', 'intervention_CONTROL', 'intervention_Contrl', 'intervention_Control', 'intervention_TREATMENT A', 'intervention_TREATMENT B', 'intervention_Treatmen A', 'intervention_Treatment  B', 'intervention_Treatment A', 'intervention_Treatment B', 'intervention_Tr

## Part 5: Save Transformed Data

Save the fully transformed dataset to `output/q6_transformed_data.csv`

In [9]:
# TODO: Save transformed data
# df_transformed.to_csv('output/q6_transformed_data.csv', index=False)

df_copy.to_csv('output/q6_transformed_data.csv', index = False)
