# Assignment 5, Question 6: Data Transformation

**Points: 20**

Transform and engineer features from the clinical trial dataset.

## Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import utilities
from q3_data_utils import load_data, clean_data, transform_types, create_bins, fill_missing

# Read the raw trial export through the shared loader and report its row count.
df = load_data('data/clinical_trial_raw.csv')
patient_total = len(df)
print(f"Loaded {patient_total} patients")

# Prewritten visualization functions for transformation analysis
def plot_distribution(series, title, figsize=(10, 6)):
    """
    Draw a 30-bin histogram of a numeric series and show it.

    Args:
        series: pandas Series with numeric data
        title: Chart title
        figsize: Figure size tuple
    """
    # Explicit figure/axes pair; the histogram is drawn onto this axes.
    fig, ax = plt.subplots(figsize=figsize)
    series.hist(bins=30, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()

def plot_value_counts(series, title, figsize=(10, 6)):
    """
    Render a series as a bar chart with slanted x tick labels and show it.

    Args:
        series: pandas Series with value counts
        title: Chart title
        figsize: Figure size tuple
    """
    fig, ax = plt.subplots(figsize=figsize)
    series.plot(kind='bar', ax=ax)
    ax.set_title(title)
    # The new axes is the current one, so pyplot's xticks targets it.
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()

Loaded 10000 patients


## Part 1: Type Conversions (5 points)

1. Convert 'enrollment_date' to datetime using the `transform_types()` utility
2. Convert categorical columns ('site', 'intervention_group', 'sex') to category dtype
3. Ensure all numeric columns are proper numeric types
4. Display the updated dtypes

In [2]:
from q3_data_utils import transform_types

# Load cleaned dataset from Q5
df = pd.read_csv('output/q5_cleaned_data.csv')
print(f"Loaded {df.shape[0]} rows, {df.shape[1]} columns")

# 1. Convert enrollment_date to datetime
df = transform_types(df, {'enrollment_date': 'datetime'})

# 2. Convert categorical columns if present
cat_cols = ['site', 'intervention_group', 'sex']
existing_cat_cols = [c for c in cat_cols if c in df.columns]
df = transform_types(df, {c: 'category' for c in existing_cat_cols})

# 3. Coerce every numeric measurement column to a numeric dtype.
# The names below are the ones the dataset actually uses (cholesterol_hdl /
# cholesterol_ldl rather than hdl / ldl), plus the remaining numeric fields
# that the previous list left out. The short 'ldl' / 'hdl' aliases are kept
# only so a file that does use them is still handled; absent names are skipped.
possible_numeric = [
    'age', 'bmi',
    'systolic_bp', 'diastolic_bp',
    'cholesterol_total', 'cholesterol_hdl', 'cholesterol_ldl',
    'glucose_fasting',
    'follow_up_months', 'adverse_events', 'adherence_pct',
    'ldl', 'hdl',
]

existing_numeric_cols = [c for c in possible_numeric if c in df.columns]

for col in existing_numeric_cols:
    # errors='coerce' turns unparseable entries into NaN instead of raising
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 4. Show the resulting dtypes
print("‚úÖ Updated dtypes:")
display(df.dtypes)



Loaded 10000 rows, 18 columns
‚úÖ Updated dtypes:


patient_id                    object
age                            int64
bmi                          float64
enrollment_date       datetime64[ns]
systolic_bp                  float64
diastolic_bp                 float64
cholesterol_total            float64
cholesterol_hdl              float64
cholesterol_ldl              float64
glucose_fasting              float64
follow_up_months               int64
adverse_events                 int64
outcome_cvd                   object
adherence_pct                float64
dropout                       object
site                        category
intervention_group          category
sex                         category
dtype: object

## Part 2: Feature Engineering (8 points)

Create these new calculated columns:

1. `cholesterol_ratio` = cholesterol_ldl / cholesterol_hdl
2. `bp_category` = categorize systolic BP:
   - 'Normal': < 120
   - 'Elevated': 120-129
   - 'High': >= 130
3. `age_group` using `create_bins()` utility:
   - Bins: [0, 40, 55, 70, 100]
   - Labels: ['<40', '40-54', '55-69', '70+']
4. `bmi_category` using standard BMI categories:
   - Underweight: <18.5
   - Normal: 18.5-24.9
   - Overweight: 25-29.9
   - Obese: >=30

In [3]:
# ‚úÖ Calculate cholesterol ratio (LDL / HDL)
# Build the LDL-to-HDL ratio only when both source columns are available.
ratio_inputs = {'cholesterol_ldl', 'cholesterol_hdl'}
if ratio_inputs.issubset(df.columns):
    df['cholesterol_ratio'] = df['cholesterol_ldl'].div(df['cholesterol_hdl'])
else:
    print("‚ö†Ô∏è cholesterol_ratio not created ‚Äî missing LDL or HDL columns")


In [4]:
# ‚úÖ Categorize systolic BP
# Bands required by the task: Normal < 120, Elevated 120-129, High >= 130.
# Left-closed bins ([0, 120), [120, 130), [130, inf)) put a reading of exactly
# 120 in 'Elevated' and exactly 130 in 'High'; readings between 129 and 130
# (e.g. 129.5) stay in 'Elevated' instead of leaking into 'High'.
if 'systolic_bp' in df.columns:
    df['bp_category'] = pd.cut(
        df['systolic_bp'],
        bins=[0, 120, 130, float('inf')],
        labels=['Normal', 'Elevated', 'High'],
        right=False
    )
else:
    print("‚ö†Ô∏è bp_category not created ‚Äî missing systolic_bp column")


**Note:** The `create_bins()` function has an optional `new_column` parameter. If you don't specify it, the new column will be named `{original_column}_binned`. You can use `new_column='age_group'` to give it a custom name.


In [5]:
# ‚úÖ Create age group bins using Q3 utility
# Age bands for the 'age_group' column, built with the shared Q3 binning helper.
# NOTE(review): which band an age of exactly 40 / 55 / 70 lands in depends on how
# create_bins treats bin edges — confirm it matches the '<40', '40-54', ... labels.
age_edges = [0, 40, 55, 70, 100]
age_labels = ['<40', '40-54', '55-69', '70+']

if 'age' not in df.columns:
    print("‚ö†Ô∏è age_group not created ‚Äî missing age column")
else:
    df = create_bins(
        df,
        column='age',
        bins=age_edges,
        labels=age_labels,
        new_column='age_group'
    )


In [6]:
# ‚úÖ Create BMI category bins using Q3 utility
# BMI bands for the 'bmi_category' column, built with the shared Q3 binning helper.
# NOTE(review): edge handling (e.g. a BMI of exactly 25 or 30, or anything above
# the last edge of 100) is decided inside create_bins — confirm against the spec.
bmi_edges = [0, 18.5, 25, 30, 100]
bmi_labels = ['Underweight', 'Normal', 'Overweight', 'Obese']

if 'bmi' not in df.columns:
    print("‚ö†Ô∏è bmi_category not created ‚Äî missing bmi column")
else:
    df = create_bins(
        df,
        column='bmi',
        bins=bmi_edges,
        labels=bmi_labels,
        new_column='bmi_category'
    )


## Part 3: String Cleaning (2 points)

If there are any string columns that need cleaning:
1. Convert to lowercase
2. Strip whitespace
3. Replace any placeholder values

In [7]:
# String cleaning for text-based columns

# Normalise the column labels themselves: lowercase, no surrounding whitespace
df.columns = df.columns.str.lower().str.strip()

# Object-dtype columns are treated as the text columns to clean
string_cols = df.select_dtypes(include=['object']).columns

# Clean string values: lowercase + strip whitespace
for col in string_cols:
    cleaned = df[col].astype(str).str.lower().str.strip()
    # astype(str) turns missing entries into the literal text 'nan', which would
    # then look like a real value; restore the original missing entries instead.
    df[col] = cleaned.where(df[col].notna(), df[col])

print("‚úÖ String cleaning completed!")
print("Cleaned string columns:", list(string_cols))


‚úÖ String cleaning completed!
Cleaned string columns: ['patient_id', 'outcome_cvd', 'dropout']


In [8]:
# Confirm that site, intervention_group and sex still have the category dtype
# and that the engineered columns are present.
dtype_overview = df.dtypes
display(dtype_overview)


patient_id                    object
age                            int64
bmi                          float64
enrollment_date       datetime64[ns]
systolic_bp                  float64
diastolic_bp                 float64
cholesterol_total            float64
cholesterol_hdl              float64
cholesterol_ldl              float64
glucose_fasting              float64
follow_up_months               int64
adverse_events                 int64
outcome_cvd                   object
adherence_pct                float64
dropout                       object
site                        category
intervention_group          category
sex                         category
cholesterol_ratio            float64
bp_category                 category
age_group                   category
bmi_category                category
dtype: object

## Part 4: One-Hot Encoding (5 points)

Create dummy variables for categorical columns:
1. One-hot encode 'intervention_group' using `pd.get_dummies()`
2. One-hot encode 'site'
3. Drop the original categorical columns
4. Show the new shape and column names

In [9]:
# ‚úÖ 1. One-hot encode 'intervention_group'
# 1 + 2. One-hot encode 'intervention_group' and 'site' in a single pass.
# The prefix mapping gives each source column its own dummy-column prefix, and the
# dummies are appended in the order the columns are listed here.
dummy_prefixes = {"intervention_group": "intervention", "site": "site"}
df_encoded = pd.get_dummies(
    df,
    columns=list(dummy_prefixes),
    prefix=dummy_prefixes,
)

# 3. Drop original categorical columns.
# get_dummies(columns=...) has already removed them; errors="ignore" makes this
# explicit step a harmless no-op in that case.
df_encoded = df_encoded.drop(columns=list(dummy_prefixes), errors="ignore")

# 4. Show the new shape and column names with types
encoded_shape = df_encoded.shape
print("‚úÖ New DataFrame shape:", encoded_shape)
print("\nüìã Column names and data types:\n")
print(df_encoded.dtypes)

‚úÖ New DataFrame shape: (10000, 28)

üìã Column names and data types:

patient_id                          object
age                                  int64
bmi                                float64
enrollment_date             datetime64[ns]
systolic_bp                        float64
diastolic_bp                       float64
cholesterol_total                  float64
cholesterol_hdl                    float64
cholesterol_ldl                    float64
glucose_fasting                    float64
follow_up_months                     int64
adverse_events                       int64
outcome_cvd                         object
adherence_pct                      float64
dropout                             object
sex                               category
cholesterol_ratio                  float64
bp_category                       category
age_group                         category
bmi_category                      category
intervention_control                  bool
intervention_treatment a

## Part 5: Save Transformed Data

Save the fully transformed dataset to `output/q6_transformed_data.csv`

In [10]:
# Save the fully transformed dataset.
# df_encoded is the frame produced by the one-hot encoding step, so it carries
# every transformation in this notebook; writing `df` would silently drop the
# dummy columns from the saved file.
df_encoded.to_csv('output/q6_transformed_data.csv', index=False)

print(" Saved transformed dataset ‚Üí output/q6_transformed_data.csv")


 Saved transformed dataset ‚Üí output/q6_transformed_data.csv


In [11]:
# Plain-text listing of each column's dtype in the encoded frame.
encoded_dtype_listing = df_encoded.dtypes.to_string()
print(encoded_dtype_listing)


patient_id                          object
age                                  int64
bmi                                float64
enrollment_date             datetime64[ns]
systolic_bp                        float64
diastolic_bp                       float64
cholesterol_total                  float64
cholesterol_hdl                    float64
cholesterol_ldl                    float64
glucose_fasting                    float64
follow_up_months                     int64
adverse_events                       int64
outcome_cvd                         object
adherence_pct                      float64
dropout                             object
sex                               category
cholesterol_ratio                  float64
bp_category                       category
age_group                         category
bmi_category                      category
intervention_control                  bool
intervention_treatment a              bool
intervention_treatment b              bool
site_site a