# Transform data
In this notebook we apply the transformations needed to derive the features used for modelling.

In [0]:
# Set up dependencies
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.ml.feature import Bucketizer
from pyspark.sql.functions import when, col, count, isnan
from pyspark.sql import DataFrame

In [0]:
# Get the active Spark session, or create one named "PopHealthRisk" if none exists.
spark = (
    SparkSession.builder
    .appName("PopHealthRisk")
    .getOrCreate()
)

In [0]:
# Load the LLCP2024 dataset. Parquet files carry their own schema and have no
# header row, so the CSV-style reader options (header / inferSchema) are not passed.
df = spark.read.parquet('/Volumes/pophealthrisk/pophealthrisk/pophealthrisk/LLCP2024.parquet')

# Transform data

In [0]:
# Create the HLTHRISK target by bucketing POORHLTH.
#
# POORHLTH is a coded survey answer rather than a plain day count: per the BRFSS
# codebook, 1-30 are days, 88 means "none", and 77 / 99 mean "don't know" /
# "refused". Bucketing the raw codes puts all three sentinels in the top
# [31, inf) bucket, i.e. respondents with no poor-health days are mixed with
# the unknowns. Recode first: 88 -> 0 days, 77 / 99 -> null (there is no
# `otherwise`, so rows matching neither branch become null). The same 88 / 99
# convention is applied to CHILDREN in this notebook.
poorhlth_days = (
    when(col("POORHLTH") == 88, 0.0)
    .when(~col("POORHLTH").isin(77, 99), col("POORHLTH").cast("double"))
)

# Bucket indices: 0 -> no days, 1 -> 1-4 days, 2 -> 5-30 days. The open-ended
# top split is kept so the bucket layout is unchanged from the original splits.
bucketizer = Bucketizer(
    splits=[0, 1, 5, 31, float("Inf")],
    inputCol="POORHLTH_days",
    outputCol="HLTHRISK",
)

# "keep" routes NaN inputs to an extra bucket instead of raising; null inputs
# yield a null HLTHRISK. The helper column is dropped so the schema only gains
# HLTHRISK.
df = (
    bucketizer.setHandleInvalid("keep")
    .transform(df.withColumn("POORHLTH_days", poorhlth_days))
    .drop("POORHLTH_days")
)

In [0]:
# Condition indicator columns; a value of 1 is counted as "condition present".
condition_cols = [
    'CVDINFR4', 'CVDCRHD4', 'CVDSTRK3', 'ASTHMA3', 'CHCSCNC1', 'CHCOCNC1',
    'CHCCOPD3', 'ADDEPEV3', 'CHCKDNY2', 'HAVARTH4', 'DIABETE4',
]

# One 0/1 flag per condition: 1 when the column equals 1, otherwise 0
# (any other code, and null, fall through to 0).
condition_flags = [
    when(col(name) == 1, 1).otherwise(0)
    for name in condition_cols
]

# Python's built-in sum folds the flag Columns together with `+`.
df = df.withColumn('num_conditions', sum(condition_flags))

In [0]:
# Recode CHILDREN into CHILDREN_mod. Branches are tested in order:
#   88 -> 0, 99 -> NaN, anything above 8 -> 8, everything else unchanged.
children = col('CHILDREN')
df = df.withColumn(
    'CHILDREN_mod',
    when(children == 88, 0)
    .when(children == 99, np.nan)
    .when(children > 8, 8)
    .otherwise(children),
)

In [0]:
# Recode HHADULT into ADULT_mod. Branches are tested in order:
#   0 / 77 / 88 / 99 -> NaN, anything above 9 -> 9, everything else unchanged.
adults = col('HHADULT')
df = df.withColumn(
    'ADULT_mod',
    when(adults.isin([0, 77, 88, 99]), np.nan)
    .when(adults > 9, 9)
    .otherwise(adults),
)

In [0]:
# Load the poverty-threshold lookup table and rename its columns to the join
# keys used on the main frame.
thresholds_pd = pd.read_csv('/Volumes/pophealthrisk/pophealthrisk/pophealthrisk/thresh24.csv')
thresholds_pd = thresholds_pd.rename(
    columns={
        'household': 'ADULT_mod',
        'children': 'CHILDREN_mod',
        'threshold': 'poverty_threshold',
    }
)

# Bracket edges; bracket k (1..11) covers (bins[k-1], bins[k]].
bins = [0, 10000, 15000, 20000, 25000, 35000, 50000, 75000, 100000, 150000, 200000, np.inf]
bracket = pd.cut(thresholds_pd['poverty_threshold'], bins, labels=list(range(1, 12)))

# A threshold outside (0, inf) cannot be assigned a bracket; fail loudly rather
# than letting a placeholder code leak into the lookup table.
if bracket.isna().any():
    raise ValueError(
        "thresh24.csv contains poverty thresholds that fall outside the income brackets"
    )

# Use the bracket label itself as the code so it is on the same 1-11 scale it
# is later subtracted from (INCOME3_mod). pd.factorize is deliberately avoided:
# it numbers values by order of first appearance in the file, not by bracket.
thresholds_pd['poverty_threshold_conv'] = bracket.astype('int64')

poverty_df = spark.createDataFrame(thresholds_pd)  # convert to pyspark df for join

In [0]:
# Attach the poverty-threshold columns to each row. With a left join every row
# of df is kept; rows whose key pair has no match in poverty_df get nulls in
# the threshold columns.
# NOTE(review): the lookup's 'household' column was renamed to ADULT_mod and is
# matched here against an adult count derived from HHADULT — confirm the CSV
# column is a number of adults and not a total household size.
threshold_keys = ['ADULT_mod', 'CHILDREN_mod']
df = df.join(poverty_df, on=threshold_keys, how='left')

In [0]:
# Recode INCOME3 into INCOME3_mod: codes 77 and 99 become NaN, every other
# value (including null) passes through unchanged.
income_code = col('INCOME3')
income_recoded = when(income_code.isin([77, 99]), np.nan).otherwise(income_code)
df = df.withColumn('INCOME3_mod', income_recoded)

In [0]:
# Income relative to the poverty threshold: recoded income bracket minus the
# threshold's bracket code. The result is missing whenever either input is.
income_gap = col('INCOME3_mod') - col('poverty_threshold_conv')
df = df.withColumn('income_adj_pov', income_gap)

# Select features and filter

In [0]:
# Count missing values (NaN or null) in the income-related columns.
# Each column is listed once; repeating a name only adds a duplicate output column.
audit_cols = ['INCOME3_mod', 'poverty_threshold_conv', 'HHADULT', 'CHILDREN', 'INCOME3']
df.select([
    count(when(isnan(c) | col(c).isNull(), c)).alias(c)
    for c in audit_cols
]).show()

In [0]:
# Keep only the modelling columns (five inputs plus the HLTHRISK target).
model_cols = ["_AGEG5YR", "EDUCA", '_BMI5', 'num_conditions', 'income_adj_pov', 'HLTHRISK']
df_min = df.select(model_cols)

# Per-column count of missing values (null or NaN) in the reduced frame.
missing_counts = [
    count(when(col(name).isNull() | isnan(name), name)).alias(name)
    for name in df_min.columns
]
df_min.select(missing_counts).show()

In [0]:
# Total number of rows in the reduced frame, before dropping incomplete rows.
df_min.count()

In [0]:
# Number of rows that remain when every column must be present
# (how="any" drops a row as soon as one value is missing).
df_min.na.drop(how="any").count()

We lose a significant proportion of rows (2/3) when we require all values to be present. The two major contributors are HLTHRISK and income_adj_pov, the latter mostly because of missing INCOME3 values. HLTHRISK is the target variable and we expect income to be a key input to the model, so we will leave this as is for now. A more thorough treatment could involve imputing income or using other columns such as GENHEALTH to inform HLTHRISK.

In [0]:
# Keep only complete rows: any row with a missing value in any column is dropped.
df_min = df_min.na.drop(how="any")