# Assignment 5, Question 6: Data Transformation

**Points: 20**

Transform and engineer features from the clinical trial dataset.

## Setup

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Import utilities
from q3_data_utils import load_data, clean_data, transform_types, create_bins, fill_missing, detect_missing

# Read the q5 cleaned-data CSV through the shared loader and report the row count.
df = load_data('output/q5_cleaned_data.csv')
patient_count = len(df)
print(f"Loaded {patient_count} patients")

# Prewritten visualization functions for transformation analysis
def plot_distribution(series, title, figsize=(10, 6)):
    """
    Draw a 30-bin histogram of a numeric series and display it.

    Args:
        series: pandas Series with numeric data
        title: Chart title
        figsize: Figure size tuple passed to matplotlib
    """
    # Work on an explicit figure/axes pair rather than pyplot's implicit
    # "current figure" so every call below targets this chart.
    fig, ax = plt.subplots(figsize=figsize)
    series.hist(bins=30, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    plt.show()

def plot_value_counts(series, title, figsize=(10, 6)):
    """
    Draw a bar chart with one bar per entry of the series and display it.

    Args:
        series: pandas Series with value counts
        title: Chart title
        figsize: Figure size tuple passed to matplotlib
    """
    fig, ax = plt.subplots(figsize=figsize)
    series.plot(kind='bar', ax=ax)
    ax.set_title(title)
    # Tilt the category labels on the x axis by 45 degrees.
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    plt.show()

Loaded 9800 patients


## Part 1: Type Conversions (5 points)

1. Convert 'enrollment_date' to datetime using the `transform_types()` utility
2. Convert categorical columns ('site', 'intervention_group', 'sex') to category dtype
3. Ensure all numeric columns are proper numeric types
4. Display the updated dtypes

In [11]:
# Part 1: type conversions
# 1. Use transform_types() to convert enrollment_date to datetime
# 2. Convert categorical columns ('site', 'intervention_group', 'sex') to category dtype
# 3. Numeric columns are left as loaded; the dtype listing below is the check
# 4. Display the updated dtypes using df.dtypes
print("Data types before transformation:")
print(df.dtypes)

# Supported types: 'datetime', 'numeric', 'category', 'string'
type_mappings = {
    'enrollment_date': 'datetime',
    'site': 'category',
    'intervention_group': 'category',
    'sex': 'category'
}

# Apply type transformations
df = transform_types(df, type_mappings)
print("Data types after transformation:")
display(df.dtypes)

# Check for missing values introduced by the type conversion. The count in the
# message is computed from the data instead of being typed in by hand, so the
# message stays correct if the input file changes.
print(detect_missing(df))
missing_dates = int(df['enrollment_date'].isna().sum())
print("\n", f"{missing_dates} missing values found after type transformation in enrollment_date column")

# Drop rows with missing enrollment_date
# NOTE(review): in the recorded run this discards 1472 of the 9800 loaded rows.
# That many unparseable dates may mean transform_types() is not handling a
# second date format rather than the dates being truly absent - confirm before
# relying on the reduced dataset.
df = df.dropna(subset=['enrollment_date'])

# Check for any remaining missing values and report what is actually there.
print(detect_missing(df))
remaining_missing = int(df.isna().sum().sum())
if remaining_missing == 0:
    print("\n", "No missing values found after dropping rows with missing enrollment_date")
else:
    print("\n", f"{remaining_missing} missing values remain after dropping rows with missing enrollment_date")

Data types before transformation:
Unnamed: 0              int64
patient_id             object
age                   float64
sex                    object
bmi                   float64
enrollment_date        object
systolic_bp           float64
diastolic_bp          float64
cholesterol_total     float64
cholesterol_hdl       float64
cholesterol_ldl       float64
glucose_fasting       float64
site                   object
intervention_group     object
follow_up_months        int64
adverse_events          int64
outcome_cvd            object
adherence_pct         float64
dropout                object
dtype: object
Data types after transformation:


Unnamed: 0                     int64
patient_id                    object
age                          float64
sex                         category
bmi                          float64
enrollment_date       datetime64[ns]
systolic_bp                  float64
diastolic_bp                 float64
cholesterol_total            float64
cholesterol_hdl              float64
cholesterol_ldl              float64
glucose_fasting              float64
site                        category
intervention_group          category
follow_up_months               int64
adverse_events                 int64
outcome_cvd                   object
adherence_pct                float64
dropout                       object
dtype: object

Unnamed: 0               0
patient_id               0
age                      0
sex                      0
bmi                      0
enrollment_date       1472
systolic_bp              0
diastolic_bp             0
cholesterol_total        0
cholesterol_hdl          0
cholesterol_ldl          0
glucose_fasting          0
site                     0
intervention_group       0
follow_up_months         0
adverse_events           0
outcome_cvd              0
adherence_pct            0
dropout                  0
dtype: int64

 1472 missing values found after type transformation in enrollment_date column
Unnamed: 0            0
patient_id            0
age                   0
sex                   0
bmi                   0
enrollment_date       0
systolic_bp           0
diastolic_bp          0
cholesterol_total     0
cholesterol_hdl       0
cholesterol_ldl       0
glucose_fasting       0
site                  0
intervention_group    0
follow_up_months      0
adverse_events        0
outcome_cv

## Part 2: Feature Engineering (8 points)

Create these new calculated columns:

1. `cholesterol_ratio` = cholesterol_ldl / cholesterol_hdl
2. `bp_category` = categorize systolic BP:
   - 'Normal': < 120
   - 'Elevated': 120-129
   - 'High': >= 130
3. `age_group` using `create_bins()` utility:
   - Bins: [0, 40, 55, 70, 100]
   - Labels: ['<40', '40-54', '55-69', '70+']
4. `bmi_category` using standard BMI categories:
   - Underweight: <18.5
   - Normal: 18.5-24.9
   - Overweight: 25-29.9
   - Obese: >=30

In [12]:
# Feature: cholesterol_ratio = cholesterol_ldl / cholesterol_hdl, element-wise per row.
df['cholesterol_ratio'] = df['cholesterol_ldl'].div(df['cholesterol_hdl'])

In [13]:
# Categorize systolic blood pressure:
#   'Normal'   : < 120
#   'Elevated' : 120-129
#   'High'     : >= 130
# The bins must be closed on the LEFT (right=False) so that readings of exactly
# 120 and 130 fall into 'Elevated' and 'High'. With right-closed intervals
# (pd.cut's default) 120 would be labelled 'Normal' and 130 'Elevated'.
# NOTE(review): create_bins() appears to bin right-closed (the recorded
# df.head() output labels age 70.0 as '55-69'), so pd.cut is called directly
# here to control the boundary side - confirm against q3_data_utils.
df = df.assign(
    bp_category=pd.cut(
        df['systolic_bp'],
        bins=[0, 120, 130, np.inf],
        labels=['Normal', 'Elevated', 'High'],
        right=False,
    )
)

**Note:** The `create_bins()` function has an optional `new_column` parameter. If you don't specify it, the new column will be named `{original_column}_binned`. You can use `new_column='age_group'` to give it a custom name.


In [14]:
# Bucket patient age into the four reporting groups with the create_bins() utility.
# NOTE(review): the recorded df.head() output shows age 70.0 labelled '55-69',
# which suggests create_bins() closes each interval on the right, so ages of
# exactly 40, 55 and 70 land one group lower than the labels imply. The
# assignment requires create_bins() here, so the boundary side should be
# confirmed (and, if needed, fixed) in q3_data_utils.
age_edges = [0, 40, 55, 70, 100]
age_labels = ['<40', '40-54', '55-69', '70+']
df = create_bins(df, column='age', bins=age_edges, labels=age_labels, new_column='age_group')

In [15]:
# Standard BMI categories:
#   'Underweight' : < 18.5
#   'Normal'      : 18.5-24.9
#   'Overweight'  : 25-29.9
#   'Obese'       : >= 30
# The bins must be closed on the LEFT (right=False) so that values of exactly
# 18.5, 25 and 30 fall into 'Normal', 'Overweight' and 'Obese'. With
# right-closed intervals (pd.cut's default) each of those boundary values
# would be assigned to the category below it.
# NOTE(review): create_bins() appears to bin right-closed (the recorded
# df.head() output labels age 70.0 as '55-69'), so pd.cut is called directly
# here to control the boundary side - confirm against q3_data_utils.
df = df.assign(
    bmi_category=pd.cut(
        df['bmi'],
        bins=[0, 18.5, 25, 30, np.inf],
        labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
        right=False,
    )
)

## Part 3: String Cleaning (2 points)

If there are any string columns that need cleaning:
1. Convert to lowercase
2. Strip whitespace
3. Replace any placeholder values

In [16]:
# Part 3: string cleaning - strip whitespace, lowercase, and expand the
# single-letter sex codes so each concept has exactly one spelling.
pd.set_option('display.max_columns', None)
display(df.head(20))

# Columns whose distinct values are printed before and after each step.
inspect_cols = ['sex', 'site', 'intervention_group', 'outcome_cvd', 'dropout']

print("Before cleaning:", "\n")
for col in inspect_cols:
    # Every column, including 'dropout', is printed with a trailing "\n" inside
    # the print() call. Previously the dropout line read `print(...), "\n"`,
    # which built a throwaway tuple and never printed the separator.
    print(df[col].unique(), "\n")

# Clean sex, outcome_cvd, dropout columns
for col in ['sex', 'outcome_cvd', 'dropout']:
    df[col] = (df[col]
               .str.strip()  # removes leading/trailing whitespace
               .str.lower()  # converts to lowercase
              )

print("After cleaning:")
for col in inspect_cols:
    print(df[col].unique(), "\n")

# replace "m" and "f" with "male" and "female"
df['sex'] = df['sex'].replace({'m': 'male', 'f': 'female'})

print("Final check:")
for col in inspect_cols:
    print(df[col].unique(), "\n")


Unnamed: 0.1,Unnamed: 0,patient_id,age,sex,bmi,enrollment_date,systolic_bp,diastolic_bp,cholesterol_total,cholesterol_hdl,cholesterol_ldl,glucose_fasting,site,intervention_group,follow_up_months,adverse_events,outcome_cvd,adherence_pct,dropout,cholesterol_ratio,bp_category,age_group,bmi_category
0,0,P00001,80.0,F,29.3,2022-05-01,123.0,80.0,120.0,55.0,41.0,118.0,site b,control,20,0,No,24.0,No,0.745455,Elevated,70+,Overweight
1,1,P00002,80.0,Female,26.2,2022-01-06,139.0,81.0,206.0,58.0,107.0,79.0,site a,control,24,0,No,77.0,No,1.844828,High,70+,Overweight
2,2,P00003,82.0,Female,26.2,2023-11-04,123.0,86.0,172.0,56.0,82.0,77.0,site c,treatment b,2,0,Yes,70.0,No,1.464286,Elevated,70+,Overweight
3,3,P00004,95.0,Female,25.4,2022-08-15,116.0,77.0,200.0,56.0,104.0,115.0,site d,treatment b,17,0,No,62.0,No,1.857143,Normal,70+,Overweight
4,4,P00005,95.0,M,26.2,2023-04-17,97.0,71.0,185.0,78.0,75.0,113.0,site e,treatment a,9,0,yes,62.0,Yes,0.961538,Normal,70+,Overweight
5,5,P00006,78.0,F,26.8,2023-08-29,116.0,66.0,164.0,54.0,99.0,99.0,site a,treatment a,4,0,yes,62.0,Yes,1.833333,Normal,70+,Overweight
6,6,P00007,84.0,F,25.4,2022-05-12,133.0,100.0,215.0,62.0,113.0,70.0,site a,treatment a,20,1,No,76.0,No,1.822581,High,70+,Overweight
7,7,P00008,70.0,Male,24.7,2022-06-04,111.0,72.0,174.0,60.0,94.0,109.0,site b,treatment a,19,0,No,53.0,No,1.566667,Normal,55-69,Normal
8,8,P00009,92.0,Female,26.9,2022-04-06,117.0,73.0,189.0,62.0,89.0,103.0,site a,control,21,0,yes,53.0,No,1.435484,Normal,70+,Overweight
9,9,P00010,75.0,Male,21.1,2023-12-14,128.0,76.0,218.0,77.0,97.0,96.0,site a,treatment b,1,0,No,50.0,No,1.25974,Elevated,70+,Normal


Before cleaning: 

['F', '  Female  ', 'Female', 'M', 'Male', '  M  ', '  F  ', '  Male  ']
Categories (8, object): ['  F  ', '  Female  ', '  M  ', '  Male  ', 'F', 'Female', 'M', 'Male'] 

['site b', 'site a', 'site c', 'site d', 'site e']
Categories (5, object): ['site a', 'site b', 'site c', 'site d', 'site e'] 

['control', 'treatment b', 'treatment a']
Categories (3, object): ['control', 'treatment a', 'treatment b'] 

['No' 'Yes' 'yes' 'no'] 

['No' 'Yes']
After cleaning:
['f' 'female' 'm' 'male'] 

['site b', 'site a', 'site c', 'site d', 'site e']
Categories (5, object): ['site a', 'site b', 'site c', 'site d', 'site e'] 

['control', 'treatment b', 'treatment a']
Categories (3, object): ['control', 'treatment a', 'treatment b'] 

['no' 'yes'] 

['no' 'yes'] 

Final check:
['female' 'male'] 

['site b', 'site a', 'site c', 'site d', 'site e']
Categories (5, object): ['site a', 'site b', 'site c', 'site d', 'site e'] 

['control', 'treatment b', 'treatment a']
Categories (3, ob

## Part 4: One-Hot Encoding (5 points)

Create dummy variables for categorical columns:
1. One-hot encode 'intervention_group' using `pd.get_dummies()`
2. One-hot encode 'site'
3. Drop the original categorical columns
4. Show the new shape and column names

In [17]:
# One-hot encode 'intervention_group' and 'site' as int64 indicator columns.
# drop_first=True omits the first category of each variable, so that level is
# represented by all of its indicator columns being 0.
encode_opts = {'drop_first': True, 'dtype': 'int64'}
intervention_dummies = pd.get_dummies(df['intervention_group'], prefix="intervention", **encode_opts)
site_dummies = pd.get_dummies(df['site'], prefix="site", **encode_opts)
df = pd.concat([df, intervention_dummies, site_dummies], axis=1)

# Snapshot that still carries the original 'intervention_group' and 'site'
# columns alongside the new indicator columns.
df.to_csv('data/q6_transformed_data_no_drop_site_or_intervention.csv', index=False)

# Remove the source categorical columns now that they are encoded.
df = df.drop(columns=['intervention_group', 'site'])
print("After one-hot encoding:")
print(df.shape)
print(df.columns)
display(df.head(5))

After one-hot encoding:
(8328, 27)
Index(['Unnamed: 0', 'patient_id', 'age', 'sex', 'bmi', 'enrollment_date',
       'systolic_bp', 'diastolic_bp', 'cholesterol_total', 'cholesterol_hdl',
       'cholesterol_ldl', 'glucose_fasting', 'follow_up_months',
       'adverse_events', 'outcome_cvd', 'adherence_pct', 'dropout',
       'cholesterol_ratio', 'bp_category', 'age_group', 'bmi_category',
       'intervention_treatment a', 'intervention_treatment b', 'site_site b',
       'site_site c', 'site_site d', 'site_site e'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,patient_id,age,sex,bmi,enrollment_date,systolic_bp,diastolic_bp,cholesterol_total,cholesterol_hdl,cholesterol_ldl,glucose_fasting,follow_up_months,adverse_events,outcome_cvd,adherence_pct,dropout,cholesterol_ratio,bp_category,age_group,bmi_category,intervention_treatment a,intervention_treatment b,site_site b,site_site c,site_site d,site_site e
0,0,P00001,80.0,female,29.3,2022-05-01,123.0,80.0,120.0,55.0,41.0,118.0,20,0,no,24.0,no,0.745455,Elevated,70+,Overweight,0,0,1,0,0,0
1,1,P00002,80.0,female,26.2,2022-01-06,139.0,81.0,206.0,58.0,107.0,79.0,24,0,no,77.0,no,1.844828,High,70+,Overweight,0,0,0,0,0,0
2,2,P00003,82.0,female,26.2,2023-11-04,123.0,86.0,172.0,56.0,82.0,77.0,2,0,yes,70.0,no,1.464286,Elevated,70+,Overweight,0,1,0,1,0,0
3,3,P00004,95.0,female,25.4,2022-08-15,116.0,77.0,200.0,56.0,104.0,115.0,17,0,no,62.0,no,1.857143,Normal,70+,Overweight,0,1,0,0,1,0
4,4,P00005,95.0,male,26.2,2023-04-17,97.0,71.0,185.0,78.0,75.0,113.0,9,0,yes,62.0,yes,0.961538,Normal,70+,Overweight,1,0,0,0,0,1


## Part 5: Save Transformed Data

Save the fully transformed dataset to `output/q6_transformed_data.csv`

In [18]:
# Write the fully transformed frame to the Part 5 output path, without the index.
df_transformed = df
df_transformed.to_csv(path_or_buf='output/q6_transformed_data.csv', index=False)