# Assignment 5, Question 6: Data Transformation

**Points: 20**

Transform and engineer features from the clinical trial dataset.

## Setup

In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Import utilities
from q3_data_utils import load_data, clean_data, transform_types, create_bins, fill_missing

# Read the raw clinical-trial CSV through the shared loader and report the row count.
RAW_DATA_PATH = 'data/clinical_trial_raw.csv'
df = load_data(RAW_DATA_PATH)
print(f"Loaded {len(df)} patients")

# Prewritten visualization functions for transformation analysis
def plot_distribution(series, title, figsize=(10, 6)):
    """
    Draw a 30-bin histogram of a numeric series and display it.

    Args:
        series: pandas Series with numeric data
        title: Chart title
        figsize: Figure size tuple (width, height)
    """
    # Explicit figure/axes handles instead of the implicit pyplot "current figure".
    fig, ax = plt.subplots(figsize=figsize)
    series.hist(bins=30, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()

def plot_value_counts(series, title, figsize=(10, 6)):
    """
    Draw a bar chart of a series (one bar per index entry) and display it.

    Args:
        series: pandas Series with value counts
        title: Chart title
        figsize: Figure size tuple (width, height)
    """
    # Explicit figure/axes handles instead of the implicit pyplot "current figure".
    fig, ax = plt.subplots(figsize=figsize)
    series.plot(kind='bar', ax=ax)
    ax.set_title(title)
    # Tilt the x tick labels so longer category names stay readable.
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    plt.show()

Loaded 10000 patients


## Part 1: Type Conversions (5 points)

1. Convert 'enrollment_date' to datetime using the `transform_types()` utility
2. Convert categorical columns ('site', 'intervention_group', 'sex') to category dtype
3. Ensure all numeric columns are proper numeric types
4. Display the updated dtypes

In [2]:
# Type conversions
# 1. Use transform_types() to convert enrollment_date to datetime
df = transform_types(df, {'enrollment_date': 'datetime'})
# 2. Convert categorical columns ('site', 'intervention_group', 'sex') to category dtype
df = transform_types(df, {'site': 'category', 'intervention_group': 'category', 'sex': 'category'})
# 3. Ensure all numeric columns are proper numeric types.
#    Enforce this explicitly rather than relying on how the CSV happened to parse.
#    errors='coerce' turns any unparseable entry into NaN instead of raising;
#    columns that are already numeric pass through unchanged.
numeric_columns = [
    'age', 'bmi', 'systolic_bp', 'diastolic_bp',
    'cholesterol_total', 'cholesterol_hdl', 'cholesterol_ldl',
    'glucose_fasting', 'follow_up_months', 'adverse_events', 'adherence_pct',
]
for numeric_column in numeric_columns:
    df[numeric_column] = pd.to_numeric(df[numeric_column], errors='coerce')
# 4. Display the updated dtypes using df.dtypes
print(df.dtypes)

patient_id                    object
age                            int64
sex                         category
bmi                          float64
enrollment_date       datetime64[ns]
systolic_bp                  float64
diastolic_bp                 float64
cholesterol_total            float64
cholesterol_hdl              float64
cholesterol_ldl              float64
glucose_fasting              float64
site                        category
intervention_group          category
follow_up_months               int64
adverse_events                 int64
outcome_cvd                   object
adherence_pct                float64
dropout                       object
dtype: object


## Part 2: Feature Engineering (8 points)

Create these new calculated columns:

1. `cholesterol_ratio` = cholesterol_ldl / cholesterol_hdl
2. `bp_category` = categorize systolic BP:
   - 'Normal': < 120
   - 'Elevated': 120-129
   - 'High': >= 130
3. `age_group` using `create_bins()` utility:
   - Bins: [0, 40, 55, 70, 100]
   - Labels: ['<40', '40-54', '55-69', '70+']
4. `bmi_category` using standard BMI categories:
   - Underweight: <18.5
   - Normal: 18.5-24.9
   - Overweight: 25-29.9
   - Obese: >=30

In [3]:
# Cholesterol ratio: LDL divided by HDL, computed element-wise per patient.
ldl_values = df['cholesterol_ldl']
hdl_values = df['cholesterol_hdl']
df['cholesterol_ratio'] = ldl_values.div(hdl_values)

# Show the two inputs next to the derived ratio for a quick visual check.
ratio_preview_columns = ['cholesterol_ldl', 'cholesterol_hdl', 'cholesterol_ratio']
print(df[ratio_preview_columns].head())




   cholesterol_ldl  cholesterol_hdl  cholesterol_ratio
0             41.0             55.0           0.745455
1            107.0             58.0           1.844828
2             82.0             56.0           1.464286
3            104.0             56.0           1.857143
4             75.0             78.0           0.961538


In [4]:
# Categorize systolic blood pressure:
#   'Normal'   : < 120
#   'Elevated' : 120-129
#   'High'     : >= 130
# pd.cut is called directly with right=False so every bin is [low, high):
# a reading of exactly 120 is 'Elevated' and exactly 130 is 'High'.
# The top edge is np.inf rather than a value taken from the data, so the
# bins do not depend on the observed maximum and no reading can fall above
# the last bin. Missing or negative readings stay NaN.
df['bp_category'] = pd.cut(
    df['systolic_bp'],
    bins=[0, 120, 130, np.inf],
    labels=['Normal', 'Elevated', 'High'],
    right=False,
)

print(df[['systolic_bp', 'bp_category']].head())

   systolic_bp bp_category
0        123.0    Elevated
1        139.0        High
2        123.0    Elevated
3        116.0      Normal
4         97.0      Normal


**Note:** The `create_bins()` function has an optional `new_column` parameter. If you don't specify it, the new column will be named `{original_column}_binned`. You can use `new_column='age_group'` to give it a custom name.


In [5]:
# Age groups: bucket `age` with the shared create_bins() helper, writing the
# result to a custom-named column instead of the default `age_binned`.
age_bin_edges = [0, 40, 55, 70, 100]
age_bin_labels = ['<40', '40-54', '55-69', '70+']
df = create_bins(
    df,
    column="age",
    bins=age_bin_edges,
    labels=age_bin_labels,
    new_column="age_group",
)
print(df[['age', 'age_group']].head())


   age age_group
0   80       70+
1   80       70+
2   82       70+
3   95       70+
4   95       70+


In [6]:
# BMI categories (standard cut-offs):
#   Underweight   : < 18.5
#   Normal weight : 18.5-24.9
#   Overweight    : 25-29.9
#   Obesity       : >= 30
# pd.cut is called directly with right=False so every bin is [low, high):
# 18.5 is 'Normal weight', 25 is 'Overweight' and 30 is 'Obesity'.
# The top edge is np.inf rather than a value taken from the data, so the
# bins do not depend on the observed maximum. Missing BMI and invalid
# negative values (e.g. -1.0) fall outside every bin and stay NaN.
# NOTE(review): the assignment text names the classes 'Normal' and 'Obese';
# the labels below are kept as originally written - confirm which spelling
# downstream steps expect.
df['bmi_category'] = pd.cut(
    df['bmi'],
    bins=[0, 18.5, 25, 30, np.inf],
    labels=['Underweight', 'Normal weight', 'Overweight', 'Obesity'],
    right=False,
)
print(df[['bmi', 'bmi_category']].head())


    bmi bmi_category
0  29.3   Overweight
1   NaN          NaN
2  -1.0          NaN
3  25.4   Overweight
4   NaN          NaN


## Part 3: String Cleaning (2 points)

If there are any string columns that need cleaning:
1. Convert to lowercase
2. Strip whitespace
3. Replace any placeholder values

In [7]:
# String cleaning: lowercase each column, normalise its labels, then strip whitespace.

# sex -> single-letter codes. "female" is replaced before "male" because
# "male" is a substring of "female".
sex_lowered = df['sex'].str.lower()
sex_coded = sex_lowered.str.replace("female", "f").str.replace("male", "m")
df['sex'] = sex_coded.str.strip()
print(df['sex'].unique())

# site -> blank out the "site" prefix and any underscores, then strip the
# padding that the replacement leaves behind.
site_lowered = df['site'].str.lower()
site_unprefixed = site_lowered.str.replace(r"site|_", " ", regex=True)
df['site'] = site_unprefixed.str.strip()
print(df['site'].unique())

# intervention_group -> repair the "contrl" misspelling, then blank out the
# "treatment" / "treatmen" prefix and strip the leftover padding.
group_lowered = df['intervention_group'].str.lower()
group_spelling_fixed = group_lowered.str.replace("contrl", "control")
group_unprefixed = group_spelling_fixed.str.replace(r"treatment|treatmen", " ", regex=True)
df['intervention_group'] = group_unprefixed.str.strip()
print(df['intervention_group'].unique())


['f' 'm']
['b' 'a' 'c' 'd' 'e']
['control' 'b' 'a']


## Part 4: One-Hot Encoding (5 points)

Create dummy variables for categorical columns:
1. One-hot encode 'intervention_group' using `pd.get_dummies()`
2. One-hot encode 'site'
3. Drop the original categorical columns
4. Show the new shape and column names

In [8]:
# One-hot encoding
# pd.get_dummies(..., columns=[...]) adds one indicator column per category
# and drops the original 'site' and 'intervention_group' columns in the result.
df_transformed = pd.get_dummies(df, columns=['site', 'intervention_group'])
print(df_transformed.head())

# Show the new shape and column names.
print(f"New shape: {df_transformed.shape}")
print(f"Columns: {df_transformed.columns.tolist()}")

# info() writes its report itself and returns None, so it is called directly
# rather than wrapped in print() (which would emit a stray "None" line).
df_transformed.info()


  patient_id  age sex   bmi enrollment_date  systolic_bp  diastolic_bp  \
0     P00001   80   f  29.3      2022-05-01        123.0          80.0   
1     P00002   80   f   NaN      2022-01-06        139.0          81.0   
2     P00003   82   f  -1.0      2023-11-04        123.0          86.0   
3     P00004   95   f  25.4      2022-08-15        116.0          77.0   
4     P00005   95   m   NaN      2023-04-17         97.0          71.0   

   cholesterol_total  cholesterol_hdl  cholesterol_ldl  ...  age_group  \
0              120.0             55.0             41.0  ...        70+   
1              206.0             58.0            107.0  ...        70+   
2              172.0             56.0             82.0  ...        70+   
3              200.0             56.0            104.0  ...        70+   
4              185.0             78.0             75.0  ...        70+   

   bmi_category  site_a site_b  site_c site_d  site_e intervention_group_a  \
0    Overweight   False   True  

## Part 5: Save Transformed Data

Save the fully transformed dataset to `output/q6_transformed_data.csv`

In [10]:
# Save transformed data
# Create the output directory first so the save also works on a fresh
# checkout where `output/` does not exist yet; exist_ok=True makes this a
# no-op when the directory is already there.
output_path = Path('output/q6_transformed_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_transformed.to_csv(output_path, index=False)

