Permalink
Cannot retrieve contributors at this time
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
242 lines (205 sloc)
8.69 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ggplot2) | |
library(cowplot) | |
## NOTE: The data used in this demo comes from the UCI machine learning | |
## repository. | |
## http://archive.ics.uci.edu/ml/index.php | |
## Specifically, this is the heart disease data set. | |
## http://archive.ics.uci.edu/ml/datasets/Heart+Disease | |
url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data" | |
data <- read.csv(url, header=FALSE) | |
##################################### | |
## | |
## Reformat the data so that it is | |
## 1) Easy to use (add nice column names) | |
## 2) Interpreted correctly by glm().. | |
## | |
##################################### | |
head(data) # you see data, but no column names | |
colnames(data) <- c( | |
"age", | |
"sex",# 0 = female, 1 = male | |
"cp", # chest pain | |
# 1 = typical angina, | |
# 2 = atypical angina, | |
# 3 = non-anginal pain, | |
# 4 = asymptomatic | |
"trestbps", # resting blood pressure (in mm Hg) | |
"chol", # serum cholestoral in mg/dl | |
"fbs", # fasting blood sugar if less than 120 mg/dl, 1 = TRUE, 0 = FALSE | |
"restecg", # resting electrocardiographic results | |
# 1 = normal | |
# 2 = having ST-T wave abnormality | |
# 3 = showing probable or definite left ventricular hypertrophy | |
"thalach", # maximum heart rate achieved | |
"exang", # exercise induced angina, 1 = yes, 0 = no | |
"oldpeak", # ST depression induced by exercise relative to rest | |
"slope", # the slope of the peak exercise ST segment | |
# 1 = upsloping | |
# 2 = flat | |
# 3 = downsloping | |
"ca", # number of major vessels (0-3) colored by fluoroscopy | |
"thal", # this is short of thalium heart scan | |
# 3 = normal (no cold spots) | |
# 6 = fixed defect (cold spots during rest and exercise) | |
# 7 = reversible defect (when cold spots only appear during exercise) | |
"hd" # (the predicted attribute) - diagnosis of heart disease | |
# 0 if less than or equal to 50% diameter narrowing | |
# 1 if greater than 50% diameter narrowing | |
) | |
head(data) # now we have data and column names | |
str(data) # this shows that we need to tell R which columns contain factors | |
# it also shows us that there are some missing values. There are "?"s | |
# in the dataset. These are in the "ca" and "thal" columns... | |
## First, convert "?"s to NAs... | |
data[data == "?"] <- NA | |
## Now add factors for variables that are factors and clean up the factors | |
## that had missing data... | |
data[data$sex == 0,]$sex <- "F" | |
data[data$sex == 1,]$sex <- "M" | |
data$sex <- as.factor(data$sex) | |
data$cp <- as.factor(data$cp) | |
data$fbs <- as.factor(data$fbs) | |
data$restecg <- as.factor(data$restecg) | |
data$exang <- as.factor(data$exang) | |
data$slope <- as.factor(data$slope) | |
data$ca <- as.integer(data$ca) # since this column had "?"s in it | |
# R thinks that the levels for the factor are strings, but | |
# we know they are integers, so first convert the strings to integers... | |
data$ca <- as.factor(data$ca) # ...then convert the integers to factor levels | |
data$thal <- as.integer(data$thal) # "thal" also had "?"s in it. | |
data$thal <- as.factor(data$thal) | |
## This next line replaces 0 and 1 with "Healthy" and "Unhealthy" | |
data$hd <- ifelse(test=data$hd == 0, yes="Healthy", no="Unhealthy") | |
data$hd <- as.factor(data$hd) # Now convert to a factor | |
str(data) ## this shows that the correct columns are factors | |
## Now determine how many rows have "NA" (aka "Missing data"). If it's just | |
## a few, we can remove them from the dataset, otherwise we should consider | |
## imputing the values with a Random Forest or some other imputation method. | |
nrow(data[is.na(data$ca) | is.na(data$thal),]) | |
data[is.na(data$ca) | is.na(data$thal),] | |
## so 6 of the 303 rows of data have missing values. This isn't a large | |
## percentage (2%), so we can just remove them from the dataset | |
## NOTE: This is different from when we did machine learning with | |
## Random Forests. When we did that, we imputed values. | |
nrow(data) | |
data <- data[!(is.na(data$ca) | is.na(data$thal)),] | |
nrow(data) | |
##################################### | |
## | |
## Now we can do some quality control by making sure all of the factor | |
## levels are represented by people with and without heart disease (hd) | |
## | |
## NOTE: We also want to exclude variables that only have 1 or 2 samples in | |
## a category since +/- one or two samples can have a large effect on the | |
## odds/log(odds) | |
## | |
## | |
##################################### | |
xtabs(~ hd + sex, data=data) | |
xtabs(~ hd + cp, data=data) | |
xtabs(~ hd + fbs, data=data) | |
xtabs(~ hd + restecg, data=data) | |
xtabs(~ hd + exang, data=data) | |
xtabs(~ hd + slope, data=data) | |
xtabs(~ hd + ca, data=data) | |
xtabs(~ hd + thal, data=data) | |
##################################### | |
## | |
## Now we are ready for some logistic regression. First we'll create a very | |
## simple model that uses sex to predict heart disease | |
## | |
##################################### | |
## let's start super simple and see if sex (female/male) is a good | |
## predictor... | |
## First, let's just look at the raw data... | |
xtabs(~ hd + sex, data=data) | |
# sex | |
# hd F M | |
# Healthy 71 89 | |
# Unhealthy 25 112 | |
## Most of the females are healthy and most of the males are unhealthy. | |
## Being female is likely to decrease the odds in being unhealthy. | |
## In other words, if a sample is female, the odds are against it that it | |
## will be unhealthy | |
## Being male is likely to increase the odds in being unhealthy... | |
## In other words, if a sample is male, the odds are for it being unhealthy | |
########### | |
## | |
## Now do the actual logistic regression | |
## | |
########### | |
logistic <- glm(hd ~ sex, data=data, family="binomial") | |
summary(logistic) | |
## (Intercept) -1.0438 0.2326 -4.488 7.18e-06 *** | |
## sexM 1.2737 0.2725 4.674 2.95e-06 *** | |
## Let's start by going through the first coefficient... | |
## (Intercept) -1.0438 0.2326 -4.488 7.18e-06 *** | |
## | |
## The intercept is the log(odds) a female will be unhealthy. This is because | |
## female is the first factor in "sex" (the factors are ordered, | |
## alphabetically by default,"female", "male") | |
female.log.odds <- log(25 / 71) | |
female.log.odds | |
## Now let's look at the second coefficient... | |
## sexM 1.2737 0.2725 4.674 2.95e-06 *** | |
## | |
## sexM is the log(odds ratio) that tells us that if a sample has sex=M, the | |
## odds of being unhealthy are, on a log scale, 1.27 times greater than if | |
## a sample has sex=F. | |
male.log.odds.ratio <- log((112 / 89) / (25/71)) | |
male.log.odds.ratio | |
## Now calculate the overall "Pseudo R-squared" and its p-value | |
## NOTE: Since we are doing logistic regression... | |
## Null devaince = 2*(0 - LogLikelihood(null model)) | |
## = -2*LogLikihood(null model) | |
## Residual deviacne = 2*(0 - LogLikelihood(proposed model)) | |
## = -2*LogLikelihood(proposed model) | |
ll.null <- logistic$null.deviance/-2 | |
ll.proposed <- logistic$deviance/-2 | |
## McFadden's Pseudo R^2 = [ LL(Null) - LL(Proposed) ] / LL(Null) | |
(ll.null - ll.proposed) / ll.null | |
## chi-square value = 2*(LL(Proposed) - LL(Null)) | |
## p-value = 1 - pchisq(chi-square value, df = 2-1) | |
1 - pchisq(2*(ll.proposed - ll.null), df=1) | |
1 - pchisq((logistic$null.deviance - logistic$deviance), df=1) | |
## Lastly, let's see what this logistic regression predicts, given | |
## that a patient is either female or male (and no other data about them). | |
predicted.data <- data.frame( | |
probability.of.hd=logistic$fitted.values, | |
sex=data$sex) | |
## We can plot the data... | |
ggplot(data=predicted.data, aes(x=sex, y=probability.of.hd)) + | |
geom_point(aes(color=sex), size=5) + | |
xlab("Sex") + | |
ylab("Predicted probability of getting heart disease") | |
## Since there are only two probabilities (one for females and one for males), | |
## we can use a table to summarize the predicted probabilities. | |
xtabs(~ probability.of.hd + sex, data=predicted.data) | |
##################################### | |
## | |
## Now we will use all of the data available to predict heart disease | |
## | |
##################################### | |
logistic <- glm(hd ~ ., data=data, family="binomial") | |
summary(logistic) | |
## Now calculate the overall "Pseudo R-squared" and its p-value | |
ll.null <- logistic$null.deviance/-2 | |
ll.proposed <- logistic$deviance/-2 | |
## McFadden's Pseudo R^2 = [ LL(Null) - LL(Proposed) ] / LL(Null) | |
(ll.null - ll.proposed) / ll.null | |
## The p-value for the R^2 | |
1 - pchisq(2*(ll.proposed - ll.null), df=(length(logistic$coefficients)-1)) | |
## now we can plot the data | |
predicted.data <- data.frame( | |
probability.of.hd=logistic$fitted.values, | |
hd=data$hd) | |
predicted.data <- predicted.data[ | |
order(predicted.data$probability.of.hd, decreasing=FALSE),] | |
predicted.data$rank <- 1:nrow(predicted.data) | |
## Lastly, we can plot the predicted probabilities for each sample having | |
## heart disease and color by whether or not they actually had heart disease | |
ggplot(data=predicted.data, aes(x=rank, y=probability.of.hd)) + | |
geom_point(aes(color=hd), alpha=1, shape=4, stroke=2) + | |
xlab("Index") + | |
ylab("Predicted probability of getting heart disease") | |
ggsave("heart_disease_probabilities.pdf") |