# Assignment 5, Question 6: Data Transformation

**Points: 20**

Transform and engineer features from the clinical trial dataset.

## Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import utilities
from q3_data_utils import load_data, clean_data, transform_types, create_bins, fill_missing

# Read the cleaned dataset written by Question 5 via the shared loader utility
cleaned_data_path = 'output/q5_cleaned_data.csv'
df = load_data(cleaned_data_path)
print(f"Loaded {len(df)} patients")

# Prewritten visualization functions for transformation analysis
def plot_distribution(series, title, figsize=(10, 6)):
    """
    Draw and show a 30-bin histogram of a numeric series.

    Args:
        series: pandas Series with numeric data
        title: Chart title
        figsize: Figure size tuple
    """
    # Explicit axes handle instead of relying on pyplot's "current figure"
    _, ax = plt.subplots(figsize=figsize)
    series.hist(bins=30, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    plt.tight_layout()
    plt.show()

def plot_value_counts(series, title, figsize=(10, 6)):
    """
    Draw and show a bar chart with one bar per entry of the series.

    Args:
        series: pandas Series with value counts
        title: Chart title
        figsize: Figure size tuple
    """
    # Explicit axes handle instead of relying on pyplot's "current figure"
    _, ax = plt.subplots(figsize=figsize)
    series.plot(kind='bar', ax=ax)
    ax.set_title(title)
    # Tilt the x tick labels so longer category names stay readable
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

Loaded 10000 patients


## Part 1: Type Conversions (5 points)

1. Convert 'enrollment_date' to datetime using the `transform_types()` utility
2. Convert categorical columns ('site', 'intervention_group', 'sex') to category dtype
3. Ensure all numeric columns are proper numeric types
4. Display the updated dtypes

In [2]:
# Steps 1-3: build the column -> target type mapping for transform_types().
# The numeric columns are already classified as numeric, so they are not listed.
# patient_id moves from object to string; outcome_cvd and dropout move from
# object to category alongside the three required categorical columns.
category_columns = ['site', 'intervention_group', 'sex', 'outcome_cvd', 'dropout']
type_map = {'enrollment_date': 'datetime'}
type_map.update({name: 'category' for name in category_columns})
type_map['patient_id'] = 'string'

# Step 4: apply the conversions, then display the updated dtypes
df = transform_types(df, type_map)
print(f"Updated data types\n{df.dtypes}")

Updated data types
patient_id            string[python]
age                            int64
sex                         category
bmi                          float64
enrollment_date       datetime64[ns]
systolic_bp                  float64
diastolic_bp                 float64
cholesterol_total            float64
cholesterol_hdl              float64
cholesterol_ldl              float64
glucose_fasting              float64
site                        category
intervention_group          category
follow_up_months               int64
adverse_events                 int64
outcome_cvd                 category
adherence_pct                float64
dropout                     category
dtype: object


## Part 2: Feature Engineering (8 points)

Create these new calculated columns:

1. `cholesterol_ratio` = cholesterol_ldl / cholesterol_hdl
2. `bp_category` = categorize systolic BP:
   - 'Normal': < 120
   - 'Elevated': 120-129
   - 'High': >= 130
3. `age_group` using `create_bins()` utility:
   - Bins: [0, 40, 55, 70, 100]
   - Labels: ['<40', '40-54', '55-69', '70+']
4. `bmi_category` using standard BMI categories:
   - Underweight: <18.5
   - Normal: 18.5-24.9
   - Overweight: 25-29.9
   - Obese: >=30

In [3]:
# Cholesterol ratio: LDL divided by HDL, computed element-wise for every row
ldl = df['cholesterol_ldl']
hdl = df['cholesterol_hdl']
df['cholesterol_ratio'] = ldl / hdl
print(df['cholesterol_ratio'])

0       0.745455
1       1.844828
2       1.464286
3       1.857143
4       0.961538
          ...   
9995    1.703704
9996    1.250000
9997    1.028986
9998    3.131579
9999    2.294118
Name: cholesterol_ratio, Length: 10000, dtype: float64


In [4]:
# Categorize systolic blood pressure.
# right=False makes every bin closed on the left and open on the right:
#   [0, 120) -> 'Normal', [120, 130) -> 'Elevated', [130, inf) -> 'High'.
# The top edge is np.inf so that every reading >= 130 is labelled 'High';
# a finite top edge of 200 would leave readings of 200 or more outside all
# bins, and pd.cut would silently return NaN for them.
df['bp_category'] = pd.cut(
    df['systolic_bp'],
    bins = [0, 120, 130, np.inf],
    labels = ['Normal', 'Elevated', 'High'],
    right = False
)
print(df['bp_category'])

0       Elevated
1           High
2       Elevated
3         Normal
4         Normal
          ...   
9995    Elevated
9996    Elevated
9997      Normal
9998        High
9999        High
Name: bp_category, Length: 10000, dtype: category
Categories (3, object): ['Normal' < 'Elevated' < 'High']


**Note:** The `create_bins()` function has an optional `new_column` parameter. If you don't specify it, the new column will be named `{original_column}_binned`. You can use `new_column='age_group'` to give it a custom name.


In [5]:
# Bin 'age' into the four required groups with the create_bins() utility,
# storing the result in a custom-named 'age_group' column
age_edges = [0, 40, 55, 70, 100]
age_labels = ['<40', '40-54', '55-69', '70+']
df = create_bins(df, column='age', bins=age_edges, labels=age_labels, new_column='age_group')
print(df['age_group'])

0         70+
1         70+
2         70+
3         70+
4         70+
        ...  
9995      70+
9996      NaN
9997      70+
9998      70+
9999    55-69
Name: age_group, Length: 10000, dtype: category
Categories (4, object): ['<40' < '40-54' < '55-69' < '70+']


In [6]:
# Create BMI categories.
# right=False makes every bin closed on the left and open on the right:
#   [0, 18.5) -> 'Underweight', [18.5, 25) -> 'Normal',
#   [25, 30) -> 'Overweight',   [30, inf) -> 'Obese'.
# The top edge is np.inf so that every value >= 30 is labelled 'Obese', as the
# task specifies; a finite top edge of 200 would turn any value of 200 or more
# into NaN. Rows with a missing BMI still come out as NaN.
df['bmi_category'] = pd.cut(
    df['bmi'],
    bins = [0, 18.5, 25, 30, np.inf],
    labels = ['Underweight', 'Normal', 'Overweight', 'Obese'],
    right = False
)
print(df['bmi_category'])

0       Overweight
1       Overweight
2              NaN
3       Overweight
4       Overweight
           ...    
9995        Normal
9996    Overweight
9997        Normal
9998    Overweight
9999    Overweight
Name: bmi_category, Length: 10000, dtype: category
Categories (4, object): ['Underweight' < 'Normal' < 'Overweight' < 'Obese']


## Part 3: String Cleaning (2 points)

If there are any string columns that need cleaning:
1. Convert to lowercase
2. Strip whitespace
3. Replace any placeholder values

In [7]:
# Clean string and categorical columns: lowercase and strip whitespace.
# The binned columns created in Part 2 already carry fixed, ordered labels,
# so they are skipped to keep their labels and ordering intact.
derived_columns = {'age_group', 'bp_category', 'bmi_category'}
text_columns = [
    name
    for name in df.select_dtypes(include = ['string[python]', 'category']).columns
    if name not in derived_columns
]
for name in text_columns:
    normalized = df[name].astype(str).str.lower().str.strip()
    # astype(str) turns missing values into literal text such as 'nan'; mask
    # those rows back to missing so they do not become a bogus category
    # (and, later, a bogus dummy column).
    df[name] = normalized.where(df[name].notna())

# Column-specific fixes for abbreviations and misspellings
df['sex'] = df['sex'].replace({'f': 'female', 'm': 'male'})
df['site'] = df['site'].replace({'site_d': 'site d', 'site  a': 'site a'})
df['intervention_group'] = df['intervention_group'].replace({
    'treatmen a': 'treatment a',
    'treatmenta': 'treatment a',
    'treatment  b': 'treatment b',
    'contrl': 'control',
})

# Convert the cleaned text columns back to category dtype
for column in ['sex', 'site', 'intervention_group', 'outcome_cvd', 'dropout']:
    df[column] = df[column].astype('category')


## Part 4: One-Hot Encoding (5 points)

Create dummy variables for categorical columns:
1. One-hot encode 'intervention_group' using `pd.get_dummies()`
2. One-hot encode 'site'
3. Drop the original categorical columns
4. Show the new shape and column names

In [8]:
# One-hot encode the two categorical columns. get_dummies swaps each listed
# column for indicator columns; drop_first=True omits the first level of each.
encoded_columns = ['intervention_group', 'site']
df = pd.get_dummies(df, columns=encoded_columns, drop_first=True)
print(f"Shape of new data frame: {df.shape}")
print(f"Column names of new daa frame: {df.columns}")


Shape of new data frame: (10000, 26)
Column names of new daa frame: Index(['patient_id', 'age', 'sex', 'bmi', 'enrollment_date', 'systolic_bp',
       'diastolic_bp', 'cholesterol_total', 'cholesterol_hdl',
       'cholesterol_ldl', 'glucose_fasting', 'follow_up_months',
       'adverse_events', 'outcome_cvd', 'adherence_pct', 'dropout',
       'cholesterol_ratio', 'bp_category', 'age_group', 'bmi_category',
       'intervention_group_treatment a', 'intervention_group_treatment b',
       'site_site b', 'site_site c', 'site_site d', 'site_site e'],
      dtype='object')


## Part 5: Save Transformed Data

Save the fully transformed dataset to `output/q6_transformed_data.csv`

In [9]:
# Write the fully transformed frame to CSV, leaving out the row index
transformed_data_path = 'output/q6_transformed_data.csv'
df.to_csv(transformed_data_path, index=False)
