<a href="https://colab.research.google.com/github/SeenaKhosravi/NASS/blob/main/NASS_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Socioeconomic and Demographic Drivers of Ambulatory Surgery Usage
### HCUP NASS 2020 – Reproducible Pipeline (Python + R)

Author : Seena Khosravi, MD   •   Last build : `9-1-2025`

DUA compliant — **use the simulated file unless you have purchased the dataset from HCUP; in that case the notebook loads the full dataset from your Google Drive.**

-----------------------------------------------------------------
## Design notes
#### > Python cells handle “plumbing” (file I/O, drive-mounting, rpy2 set-up, small pandas previews).
#### > R cells (prefixed by `%%R`) reproduce/extend your original analysis: survey weights, Census look-ups, multilevel models, plots, etc.
#### > The notebook runs against the public, 1 GB simulated file at
  https://github.com/SeenaKhosravi/NASS/releases/download/v1.0.0/nass_2020_simulated.csv
#### > A single flag lets you switch to a locally mounted Drive copy of the full 7.8 M-row HCUP release.
#### > All package installs are isolated in their respective language cells; data.table syntax follows current recommendations [cran.r-project.org](https://cran.r-project.org/web/packages/data.table/refman/data.table.html).
-----------------------------------------------------------------


-----------------------------------------------------------------
# 2  Runtime selector (Python)

In [2]:
#@title ▶️ CONFIG — choose data source & toggle verbose mode
# Data-source switch: leave False to work on the simulated CSV; set True to
# mount Google Drive and read the full HCUP files from there.
USE_DRIVE = False
# Preview switch: set False to suppress the head()/str() style previews.
VERBOSE_PRINTS = True

-----------------------------------------------------------------
# 3  Optional Drive mount (Python)

In [None]:
# Drive access is only requested when the full-dataset flag is switched on;
# the google.colab import stays inside the branch so it is skipped otherwise.
if USE_DRIVE:
    from google.colab import drive

    drive.mount('/content/drive')

-----------------------------------------------------------------
# 4  Python | Grab the CSV (streaming-friendly)

In [3]:
import pathlib, subprocess, textwrap, os, pandas as pd, numpy as np, gzip, shutil, sys, json, io, requests

# Resolve where the dataset lives: the public simulated CSV under /content,
# or the full file on the mounted Google Drive when USE_DRIVE is set.
_data_file = pathlib.Path('/content') / (
    'drive/MyDrive/hcup/nass_2020_full.csv' if USE_DRIVE
    else 'nass_2020_simulated.csv'
)

if not _data_file.exists():
    if USE_DRIVE:
        # The Drive copy is never downloaded by this notebook, so a missing
        # file must stop the run here instead of reporting "Data ready".
        raise FileNotFoundError(
            f"Full dataset not found at {_data_file}. Mount Google Drive and "
            "place the file there, or set USE_DRIVE = False."
        )

    url = "https://github.com/SeenaKhosravi/NASS/releases/download/v1.0.0/nass_2020_simulated.csv"
    # Download under a scratch name and rename only on success: `wget -O`
    # creates the output file even when the transfer fails, and a truncated
    # file at the final path would make later runs skip the download.
    _partial = _data_file.with_name(_data_file.name + ".part")
    try:
        # check=True turns a non-zero wget exit status into an exception.
        subprocess.run(
            ["wget", "-q", "--show-progress", "-O", str(_partial), url],
            check=True,
        )
        _partial.replace(_data_file)
    finally:
        # Remove any leftover scratch file (no-op after a successful rename).
        _partial.unlink(missing_ok=True)

# Publish the location as a plain string: pandas accepts it unchanged, and
# rpy2's `%%R -i DATA_PATH` has no conversion rule for pathlib.Path objects.
DATA_PATH = str(_data_file)

print(f"✔️ Data ready → {DATA_PATH}")

✔️ Data ready → /content/nass_2020_simulated.csv


-----------------------------------------------------------------
# 5  Python | Show 5 rows to verify

In [4]:
# Parse only the first five records so the sanity check stays cheap.
sample_df = pd.read_csv(DATA_PATH, nrows=5)
# The preview is optional and controlled by the verbose flag.
if VERBOSE_PRINTS:
    display(sample_df)

Unnamed: 0,KEY_NASS,HOSP_NASS,HOSP_TEACH,HOSP_LOCATION,HOSP_LOCTEACH,HOSP_REGION,HOSP_BEDSIZE_CAT,DISCWT,NASS_STRATUM,N_DISC_U,...,DXCCSR_SYM011,DXCCSR_SYM012,DXCCSR_SYM013,DXCCSR_SYM014,DXCCSR_SYM015,DXCCSR_SYM016,DXCCSR_SYM017,DXCCSR_VERSION,AGEGRP,AGEGRP2
0,90000001,10059,1,1,3,1,3,1.28818,58,381183,...,0,0,0,0,0,0,0,2022.1,0-17,0-17
1,90000002,21017,1,1,3,2,3,1.040243,17,984415,...,0,0,0,0,0,0,0,2022.1,65+,65-69
2,90000003,20184,1,1,3,2,3,1.040243,17,984415,...,0,0,0,0,0,0,0,2022.1,18-64,55-64
3,90000004,30860,0,1,2,3,1,1.689489,27,122282,...,0,0,0,0,0,0,0,2022.1,65+,65-69
4,90000005,20420,1,1,3,2,3,1.040243,17,984415,...,0,0,0,0,0,0,0,2022.1,65+,80+


-----------------------------------------------------------------
# 6  Load rpy2 (Python)

In [5]:
# Load the rpy2 IPython extension; the %%R cell magic used by the R cells of
# this notebook is provided by it.
%load_ext rpy2.ipython

-----------------------------------------------------------------
# 7  R | Package install (one-time) and load

In [None]:
%%R
# Packages the notebook attaches with library().
req_pkgs <- c("data.table","survey","dplyr","tidyverse","tidycensus",
              "ggplot2","gridExtra","pROC","broom","lme4")
# Packages only reached through `pkg::fun()` (knitr::kable, scales::percent).
# They must be installed, but are deliberately not attached.
ns_pkgs <- c("knitr", "scales")

# Install whatever is missing (one-time cost per runtime).
missing_pkgs <- setdiff(c(req_pkgs, ns_pkgs), rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs, repos = "https://cloud.r-project.org")
}

# Re-scan the library after installation: install.packages() can fail for
# individual packages without raising an error.
installed <- rownames(installed.packages())
is_available <- req_pkgs %in% installed
loaded_pkgs <- req_pkgs[is_available]
failed_pkgs <- c(req_pkgs[!is_available], setdiff(ns_pkgs, installed))

# Attach every package that is present.
for (pkg in loaded_pkgs) {
  library(pkg, character.only = TRUE)
}

# Report (but do not abort on) anything that could not be installed.
if (length(failed_pkgs) > 0) {
  cat("Warning: The following packages failed to install and were not loaded:", paste(failed_pkgs, collapse = ", "), "\n")
}

-----------------------------------------------------------------
# 8  R | Read the CSV with data.table (fast)

In [None]:
%%R -i DATA_PATH -i VERBOSE_PRINTS
# Reads the CSV into the data.table `NASS`, coerces a few columns to numeric
# and adds a 0/1 `WHITE` indicator.
# NOTE(review): DATA_PATH has to arrive from Python as a plain string;
# confirm that rpy2 can convert the Python object being passed in.
options(datatable.print.nrows = 10)

NASS <- fread(DATA_PATH)
# head() rather than NASS[1:10]: indexing past the last row of a data.table
# pads the preview with all-NA rows when the file has fewer than 10 records.
if (VERBOSE_PRINTS) print(head(NASS, 10))

# Light type coercion
num_cols  <- c("AGE","DISCWT","TOTCHG","TOTAL_AS_ENCOUNTERS")

# Fail with a readable message if the file lacks a column this cell needs.
absent_cols <- setdiff(c(num_cols, "RACE"), names(NASS))
if (length(absent_cols) > 0) {
  stop("Columns missing from ", DATA_PATH, ": ",
       paste(absent_cols, collapse = ", "), call. = FALSE)
}

# `(num_cols)` makes data.table treat the character vector as column names.
NASS[, (num_cols) := lapply(.SD, as.numeric), .SDcols = num_cols]

# Boolean helper: 1 when RACE == 1, 0 otherwise; a missing RACE stays NA.
NASS[, WHITE := fifelse(RACE == 1, 1, 0)]

-----------------------------------------------------------------
# 9  R | Replicate headline counts

In [None]:
%%R
# Dataset dimensions.
cat("Rows:", nrow(NASS), "  Cols:", ncol(NASS), "\n")
# Row count per CPTCCS1 code, most frequent first. head() keeps at most ten
# rows; `[1:10]` would pad the table with all-NA rows when fewer than ten
# codes exist, and those NAs would leak into anything built on `top10`.
top10 <- head(NASS[, .N, by = CPTCCS1][order(-N)], 10)
knitr::kable(top10, caption = "Top 10 CPTCCS1 counts (simulated)")

-----------------------------------------------------------------
# 10  R | Income quartile vs procedure barplot

In [None]:
%%R
# Share of each ZIP-income quartile within the most common procedure codes.
top_codes <- top10$CPTCCS1
plt_income <- NASS[CPTCCS1 %in% top_codes] |>
  # fct_infreq() only accepts factor/character input, so the code is converted
  # first. The quartile is wrapped in factor() because a numeric `fill` is a
  # continuous aesthetic and would not split the bars into one segment per
  # quartile.
  ggplot(aes(x = fct_infreq(as.character(CPTCCS1)),
             fill = factor(ZIPINC_QRTL))) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  labs(y = "Share within CPT", x = "CPTCCS1", fill = "ZIP Quartile",
       title = "Income distribution within 10 most-common procedures")
print(plt_income)

-----------------------------------------------------------------
# 11  Python | Environment variable for Census API

In [None]:
import getpass, os, json, textwrap

# Ask for the key without echoing it, and drop stray whitespace from a paste.
_census_key = getpass.getpass("Enter your Census API key (will not echo):").strip()
if not _census_key:
    # An empty key would only surface later as a failed Census request.
    raise ValueError("No Census API key entered; re-run this cell and paste your key.")
# Stored in the process environment under CENSUS_API_KEY.
os.environ["CENSUS_API_KEY"] = _census_key

-----------------------------------------------------------------
# 12  R | Set Census key & pull 2020 DHC totals

In [None]:
%%R -i VERBOSE_PRINTS
# Pulls 2020 Census (DHC) state totals and builds `census_prop`, one row per
# state with columns NAME, total_all, total_white and prop_white.
# The former `-i states_in_nass=character()` input was dropped: it does not
# name a Python variable and nothing in this cell used it.

# install = FALSE: the key is registered for this session only.
tidycensus::census_api_key(Sys.getenv("CENSUS_API_KEY"), overwrite = FALSE, install = FALSE)

# Build DHC variable names such as "P12_001N". `cells` selects which table
# cells to name; the default keeps the original behaviour (all 49).
get_vars <- function(base, cells = 1:49) sprintf("%s_%03dN", base, cells)

# Cell 001 of a P12-family table is already the table total. Summing all 49
# cells would add the total, the two sex subtotals and every age cell,
# i.e. three times the population.
vars_total <- get_vars("P12", 1)
vars_white <- get_vars("P12I", 1)

# Download the requested variables for every state and sum them per state.
pull_state_totals <- function(vars){
  get_decennial(geography = "state",
                variables = vars,
                year = 2020, sumfile = "dhc") |>
    group_by(NAME) |>
    summarise(total = sum(value), .groups = "drop")
}

total_pop  <- pull_state_totals(vars_total)
white_pop  <- pull_state_totals(vars_white)

# merge() on two tibbles returns a plain data.frame; convert it so the
# data.table `:=` assignment below is valid.
census_prop <- as.data.table(merge(total_pop, white_pop, by = "NAME",
                                   suffixes = c("_all","_white")))
census_prop[, prop_white := total_white / total_all]

if (VERBOSE_PRINTS) print(head(census_prop))

-----------------------------------------------------------------
# 13  R | Weighted vs unweighted proportion test

In [None]:
%%R
library(survey)

# Survey design using provided discharge weight
# NOTE(review): ids = ~1 declares no strata or clusters, so the point
# estimates are weighted but the interval ignores any sampling structure;
# confirm this is intended.
des <- svydesign(ids = ~1, weights = ~DISCWT, data = NASS)

# WHITE is NA wherever RACE is missing; without na.rm both estimates would
# collapse to NA as soon as a single record lacks RACE.
unweighted_hat <- mean(NASS$WHITE, na.rm = TRUE)
# coef() extracts the estimate from the svystat object as a plain number,
# which is what sprintf("%.3f") expects.
weighted_hat   <- coef(svymean(~WHITE, des, na.rm = TRUE))[[1]]

# Population-weighted average of the state-level Census proportions.
us_prop <- weighted.mean(census_prop$prop_white,
                         w = census_prop$total_all)

cat(sprintf("Unweighted NASS white %%: %.3f\n", unweighted_hat))
cat(sprintf("Weighted   NASS white %%: %.3f\n", weighted_hat))
cat(sprintf("2020 Census (all NASS states) white %%: %.3f\n", us_prop))

# 95% confidence interval for the weighted proportion.
svytest <- svyciprop(~WHITE, des,
                     method = "likelihood", level = 0.95)
print(svytest)

-----------------------------------------------------------------
# 14  R | Age-by-sex plot vs Census (adapted from `agesociodiv.r`)

In [None]:
%%R
# Upper edge of every age band (bands are right-closed), lowest band first.
age_breaks <- c(
  -Inf, 4, 9, 14, 17, 19, 20, 21, 24, 29, 34, 39,
  44, 49, 54, 59, 61, 64, 66, 69, 74, 79, 84, Inf
)
# One label per band, in the same order as the intervals above.
age_labels <- c(
  "U5", "5-9", "10-14", "15-17", "18-19", "20", "21", "22-24",
  "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59",
  "60-61", "62-64", "65-66", "67-69", "70-74", "75-79", "80-84", "85+"
)

# Tag each record with its age band (adds AGE_GROUP to NASS by reference).
NASS[, AGE_GROUP := cut(AGE,
                        breaks = age_breaks,
                        labels = age_labels,
                        right  = TRUE)]

# Per sex and age band: number of WHITE records, band size, and their ratio.
plot_df <- NASS[
  ,
  .(white = sum(WHITE), n = .N),
  by = .(SEX = factor(FEMALE, labels = c("Male", "Female")), AGE_GROUP)
]
plot_df[, prop := white / n]

# One line per sex across the age bands.
gg_gender <- ggplot(plot_df,
                    aes(AGE_GROUP, prop, colour = SEX, group = SEX)) +
  geom_line(linewidth = 1) +
  geom_point() +
  labs(title = "Crude white proportion by age & sex",
       x = "Age-group",
       y = "% White (NASS, simulated)") +
  scale_y_continuous(labels = scales::percent) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(gg_gender)

-----------------------------------------------------------------
# 15  R | Multilevel logistic models (hospital nested, 3 tiers)

In [None]:
%%R
# Modelling frame: outcome, patient-level and hospital-level predictors, plus
# the hospital identifier used as the random-intercept grouping factor.
features <- NASS[, .(WHITE,
                     FEMALE,
                     ZIPINC_QRTL,
                     PAY1,
                     CPTCCS1,
                     HOSP_LOCATION,
                     HOSP_TEACH,
                     HOSP_NASS)]

# Every column is treated as categorical, including the 0/1 outcome.
factor_cols <- names(features)
features[, (factor_cols) := lapply(.SD, as.factor), .SDcols = factor_cols]

# Three nested specifications, each with a random intercept per hospital.
formulas <- list(
  m1 = WHITE ~ FEMALE + (1|HOSP_NASS),
  m2 = WHITE ~ FEMALE + ZIPINC_QRTL + (1|HOSP_NASS),
  m3 = WHITE ~ FEMALE + ZIPINC_QRTL + PAY1 + CPTCCS1 +
                    HOSP_LOCATION + HOSP_TEACH + (1|HOSP_NASS)
)

fit <- lapply(formulas, glmer, family = binomial, data = features,
              control = glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e4)))

# First (up to) five fixed-effect rows per model, taken from lme4's own
# summary table. broom itself ships no tidier for glmer fits (those live in
# broom.mixed), and head() avoids the NA padding that `[1:5, ]` produces for
# models with fewer than five coefficients.
fixed_effects <- lapply(fit, function(m) head(coef(summary(m)), 5))
print(fixed_effects)

-----------------------------------------------------------------
# 16  R | Compare AUC across the three models

In [None]:
%%R
library(pROC)
# In-sample AUC for each fitted model.
auc_vals <- vapply(fit, function(m) {
  preds <- predict(m, type = "response")
  # Take the outcome from the model frame rather than from `features`: glmer
  # drops rows with missing values, so only the model frame is guaranteed to
  # line up with the fitted probabilities.
  observed <- model.frame(m)[["WHITE"]]
  as.numeric(roc(observed, preds, quiet = TRUE)$auc)
}, numeric(1))

# row.names = NULL keeps the model name from being printed twice.
knitr::kable(data.frame(model = names(auc_vals), AUC = auc_vals,
                        row.names = NULL),
             caption = "AUC (in-sample, simulated data)")

-----------------------------------------------------------------
# 17  Python | Teardown helper (optional)

In [None]:
# The closing notice is only shown when the downloaded CSV was used.
if not USE_DRIVE:
    print(
        "Done ✅ — runtime will auto-delete downloaded CSV when session ends."
    )