In [5]:
import pandas as pd

In [6]:
# Load the preprocessed MEPS table and discard the PANEL and PERSONWT columns
# in the same expression, so `data` is only ever bound to the trimmed frame.
data = (
    pd.read_csv("./data/MEPS_data_preprocessed.csv")
    .drop(columns=["PANEL", "PERSONWT"])
)

In [7]:
# Report the size of the loaded frame: row count first, then column count.
print(
    f"Number of rows: {len(data)}",
    f"Number of columns: {len(data.columns)}",
    sep="\n",
)

Number of rows: 18350
Number of columns: 44


In [8]:
# List every column name remaining in `data`.
print(f"Categories: {data.columns.to_numpy()}")

Categories: ['REGION' 'AGE31X' 'GENDER' 'RACE3' 'MARRY31X' 'EDRECODE' 'FTSTU31X'
 'ACTDTY31' 'HONRDC31' 'RTHLTH31' 'MNHLTH31' 'HIBPDX' 'CHDDX' 'ANGIDX'
 'MIDX' 'OHRTDX' 'STRKDX' 'EMPHDX' 'CHBRON31' 'CHOLDX' 'CANCERDX' 'DIABDX'
 'JTPAIN31' 'ARTHDX' 'ARTHTYPE' 'ASTHDX' 'ADHDADDX' 'PREGNT31' 'WLKLIM31'
 'ACTLIM31' 'SOCLIM31' 'COGLIM31' 'DFHEAR42' 'DFSEE42' 'ADSMOK42' 'PCS42'
 'MCS42' 'K6SUM42' 'PHQ242' 'EMPST31' 'POVCAT15' 'INSCOV15' 'INCOME_M'
 'HEALTHEXP']


In [67]:
import plotly.express as px
import plotly.graph_objects as go

# Shared Plotly layout settings; the plotting helpers in this notebook pass
# this mapping to fig.update_layout().
plot_style = dict(
    # Fully transparent plot area and surrounding paper (alpha = 0).
    plot_bgcolor="rgba(0,0,0,0)",
    paper_bgcolor="rgba(0,0,0,0)",
    # Title placed at the horizontal midpoint, anchored at its centre/top.
    title=dict(x=0.5, xanchor="center", yanchor="top"),
    # Same near-black colour for text, axes and grid lines.
    font=dict(color="rgba(20,20,20,1)", size=18),
    xaxis=dict(gridcolor="rgba(20,20,20,1)", color="rgba(20,20,20,1)"),
    yaxis=dict(gridcolor="rgba(20,20,20,1)", color="rgba(20,20,20,1)"),
)

In [70]:
def plot_boxplot(df, column: str, title: str):
    """Display a horizontal box plot of ``df[column]``.

    Args:
        df: Data passed to ``px.box``.
        column: Name of the column plotted along the x-axis.
        title: Figure title.

    The figure is styled with the notebook-wide ``plot_style`` and shown
    immediately; nothing is returned.
    """
    # update_layout() returns the figure, so build, style and show can be chained.
    px.box(df, x=column, title=title).update_layout(plot_style).show()

In [71]:
# Box plot of the HEALTHEXP column.
plot_boxplot(df=data, column="HEALTHEXP", title="Health Expenditure")

In [68]:
def plot_histogram(
    df,
    column: str,
    title: str,
    x_range=(0, 250000),
    bar_color: str = "#7d5ffe",
):
    """Display a histogram of ``df[column]`` with a logarithmic count axis.

    Args:
        df: Data passed to ``px.histogram``.
        column: Name of the column binned along the x-axis.
        title: Figure title.
        x_range: ``(low, high)`` limits for the x-axis, or ``None`` to leave
            the axis range to Plotly. Defaults to ``(0, 250000)``, the window
            this helper previously hard-coded.
        bar_color: Fill colour of the histogram bars.

    The figure is styled with the notebook-wide ``plot_style`` and shown
    immediately; nothing is returned.
    """
    fig = px.histogram(df, x=column, title=title, log_y=True)
    fig.update_layout(plot_style)
    fig.update_yaxes(title_text="Number of People")
    if x_range is not None:
        fig.update_xaxes(range=list(x_range))
    # Colour the bars themselves; grid colours are set by plot_style.
    fig.update_traces(marker_color=bar_color)
    fig.show()

In [69]:
# Histogram of the HEALTHEXP column.
plot_histogram(df=data, column="HEALTHEXP", title="Health Expenditure")

In [14]:
def plot_correlation_matrix(df, title: str, width: int = 2000, height: int = 2000):
    """Display a heatmap of the pairwise correlations between ``df``'s columns.

    Args:
        df: DataFrame whose ``corr()`` matrix is plotted.
        title: Figure title.
        width: Layout width of the figure (default 2000, the value that was
            previously hard-coded).
        height: Layout height of the figure (default 2000, likewise).

    The figure is styled with the notebook-wide ``plot_style`` and shown
    immediately; nothing is returned.
    """
    corr = df.corr()
    fig = go.Figure(data=go.Heatmap(
        z=corr.to_numpy(),
        # Take the tick labels from the correlation matrix itself rather than
        # from df.columns, so they stay aligned with z even when corr() does
        # not return every column of df (e.g. non-numeric columns being
        # dropped by some pandas versions).
        x=corr.columns,
        y=corr.index,
        colorscale='Purples',
    ))
    fig.update_layout(width=width, height=height)
    fig.update_layout(plot_style)
    # Update only the title text. Passing title=<str> assigns a new title
    # object, which drops the x/xanchor/yanchor placement that plot_style
    # has just applied.
    fig.update_layout(title_text=title)
    fig.show()

In [15]:
# Correlation heatmap over all columns of `data`.
plot_correlation_matrix(df=data, title="Correlation Matrix")

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; the fixed random_state makes the
# split repeatable.
train, test = train_test_split(
    data,
    test_size=0.3,
    random_state=123,
)

# Write each partition to its own CSV, without the index column.
for split_frame, csv_path in (
    (train, "./data/MEPS_data_preprocessed_train.csv"),
    (test, "./data/MEPS_data_preprocessed_test.csv"),
):
    split_frame.to_csv(csv_path, index=False)