In [1]:
# Load dataset:
df_support <- read.csv("support2.csv", header = TRUE, sep = ",")

df_description <- read.csv("support-variables-description.csv", header = TRUE, sep = ",")


In [2]:
# Inspect dataset
head(df_support)


age,death,sex,hospdead,slos,d.time,dzgroup,dzclass,num.co,edu,...,crea,sod,ph,glucose,bun,urine,adlp,adls,sfdm2,adlsc
62.84998,0,male,0,5,2029,Lung Cancer,Cancer,0,11.0,...,1.1999512,141,7.459961,,,,7.0,7,,7
60.33899,1,female,1,4,4,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,...,5.5,132,7.25,,,,,1,<2 mo. follow-up,1
52.74698,1,female,0,17,47,Cirrhosis,COPD/CHF/Cirrhosis,2,12.0,...,2.0,134,7.459961,,,,1.0,0,<2 mo. follow-up,0
42.38498,1,female,0,3,133,Lung Cancer,Cancer,2,11.0,...,0.7999268,139,,,,,0.0,0,no(M2 and SIP pres),0
79.88495,0,female,0,16,2029,ARF/MOSF w/Sepsis,ARF/MOSF,1,,...,0.7999268,143,7.509766,,,,,2,no(M2 and SIP pres),2
93.01599,1,male,1,4,4,Coma,Coma,1,14.0,...,0.6999512,140,7.65918,,,,,1,<2 mo. follow-up,1


In [3]:
# Check structure of dataset
str(df_support)
dim(df_support)


'data.frame':	9105 obs. of  47 variables:
 $ age     : num  62.8 60.3 52.7 42.4 79.9 ...
 $ death   : int  0 1 1 1 0 1 1 1 1 1 ...
 $ sex     : Factor w/ 2 levels "female","male": 2 1 1 1 1 2 2 2 2 1 ...
 $ hospdead: int  0 1 0 0 0 1 0 0 0 0 ...
 $ slos    : int  5 4 17 3 16 4 9 7 12 8 ...
 $ d.time  : int  2029 4 47 133 2029 4 659 142 63 370 ...
 $ dzgroup : Factor w/ 8 levels "ARF/MOSF w/Sepsis",..: 7 3 3 7 1 5 2 2 7 4 ...
 $ dzclass : Factor w/ 4 levels "ARF/MOSF","Cancer",..: 2 4 4 2 1 3 4 4 2 2 ...
 $ num.co  : int  0 2 2 2 1 1 1 3 2 0 ...
 $ edu     : int  11 12 12 11 NA 14 14 NA 12 11 ...
 $ income  : Factor w/ 5 levels "","$11-$25k",..: 2 2 5 5 1 1 3 1 1 3 ...
 $ scoma   : int  0 44 0 0 26 55 0 26 26 0 ...
 $ charges : num  9715 34496 41094 3075 50127 ...
 $ totcst  : num  NA NA NA NA NA NA NA NA NA NA ...
 $ totmcst : num  NA NA NA NA NA NA NA NA NA NA ...
 $ avtisst : num  7 29 13 7 18.7 ...
 $ race    : Factor w/ 6 levels "","asian","black",..: 5 6 6 6 6 6 6 6 3 4 ...
 $ sps

In [4]:
# Group the categorical variables
cat_interest <- c("sex", "death", "hospdead", "dzgroup", "dzclass", "income", "race", "diabetes", "dementia", "ca", "dnr", "adlp", "adls", "sfdm2")

# Apply unique function to validate categorical variables and check their structure
unique_values <- sapply(df_support[cat_interest], unique)
unique_values

str(unique_values)


List of 14
 $ sex     : Factor w/ 2 levels "female","male": 2 1
 $ death   : int [1:2] 0 1
 $ hospdead: int [1:2] 0 1
 $ dzgroup : Factor w/ 8 levels "ARF/MOSF w/Sepsis",..: 7 3 1 5 2 4 6 8
 $ dzclass : Factor w/ 4 levels "ARF/MOSF","Cancer",..: 2 4 1 3
 $ income  : Factor w/ 5 levels "","$11-$25k",..: 2 5 1 3 4
 $ race    : Factor w/ 6 levels "","asian","black",..: 5 6 3 4 2 1
 $ diabetes: int [1:2] 0 1
 $ dementia: int [1:2] 0 1
 $ ca      : Factor w/ 3 levels "metastatic","no",..: 1 2 3
 $ dnr     : Factor w/ 4 levels "","dnr after sadm",..: 4 1 2 3
 $ adlp    : int [1:9] 7 NA 1 0 2 3 5 6 4
 $ adls    : int [1:9] 7 1 0 2 NA 5 4 6 3
 $ sfdm2   : Factor w/ 6 levels "","<2 mo. follow-up",..: 1 2 5 6 3 4


In [5]:
# After Inspecting closely, We see that some categorical variables datatype are not factors eg. $ sex : chr  "male" "female" # nolint (? check if num.co is categorical or ordinal)


In [6]:
# convert categorical and ordinal variables to factors  # nolint
df_support$sex <- factor(df_support$sex)

df_support$death <- factor(df_support$death)

df_support$hospdead <- factor(df_support$hospdead)

df_support$dzgroup <- factor(df_support$dzgroup)

df_support$dzclass <- factor(df_support$dzclass)

df_support$income <- factor(df_support$income, ordered = TRUE, levels = c("under $11k", "$11-$25k", "$25-$50k", ">$50k"))

df_support$race <- factor(df_support$race)

df_support$diabetes <- factor(df_support$diabetes)

df_support$dementia <- factor(df_support$dementia)

factor(c("metastatic", "no", "yes"),
    levels = c("no", "yes", "metastatic"),
    ordered = TRUE
)

df_support$adlp <- factor(df_support$adlp, ordered = TRUE, levels = c(1, 2, 3, 4, 5, 6, 7))

df_support$adls <- factor(df_support$adls, ordered = TRUE, levels = c(1, 2, 3, 4, 5, 6, 7))

df_support$sfdm2 <- factor(df_support$sfdm2, ordered = TRUE, levels = c("no(M2 and SIP pres)", "adl>=4 (>=5 if sur)", "SIP>=30", "Coma or Intub", "<2 mo. follow-up"))


In [9]:
# Check structure of dataset to confirm changes
str(df_support[cat_interest])


'data.frame':	9105 obs. of  14 variables:
 $ sex     : Factor w/ 2 levels "female","male": 2 1 1 1 1 2 2 2 2 1 ...
 $ death   : Factor w/ 2 levels "0","1": 1 2 2 2 1 2 2 2 2 2 ...
 $ hospdead: Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 1 1 1 ...
 $ dzgroup : Factor w/ 8 levels "ARF/MOSF w/Sepsis",..: 7 3 3 7 1 5 2 2 7 4 ...
 $ dzclass : Factor w/ 4 levels "ARF/MOSF","Cancer",..: 2 4 4 2 1 3 4 4 2 2 ...
 $ income  : Ord.factor w/ 4 levels "under $11k"<"$11-$25k"<..: 2 2 1 1 NA NA 3 NA NA 3 ...
 $ race    : Factor w/ 6 levels "","asian","black",..: 5 6 6 6 6 6 6 6 3 4 ...
 $ diabetes: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
 $ dementia: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 1 ...
 $ ca      : Factor w/ 3 levels "metastatic","no",..: 1 2 2 1 2 2 2 2 1 1 ...
 $ dnr     : Factor w/ 4 levels "","dnr after sadm",..: 4 1 4 4 4 4 4 4 2 4 ...
 $ adlp    : Ord.factor w/ 7 levels "1"<"2"<"3"<"4"<..: 7 NA 1 NA NA NA NA NA NA NA ...
 $ adls    : Ord.factor w/ 7 levels "1"<"2"<"3"<

In [None]:
# MISSING VALUES ANALYSIS

In [18]:
# Counting complete cases
num_complete_cases <- sum(complete.cases(df_support))
num_complete_cases


In [14]:
# Check for Missing Values:
sum(is.na(df_support))

# Check for Missing Values in each column:
na_counts_col <- colSums(is.na(df_support))
na_counts_col

