-
Notifications
You must be signed in to change notification settings - Fork 134
/
semiconductor.R
152 lines (123 loc) · 4.67 KB
/
semiconductor.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
SC <- read.csv("semiconductor.csv")
## full model
full <- glm(FAIL ~ ., data=SC, family=binomial)
1 - full$deviance/full$null.deviance
## grab p-values
pvals <- summary(full)$coef[-1,4] #-1 to drop the intercept
## plot them: it looks like we have some signal here
hist(pvals, xlab="p-value", main="", col="lightblue")
## At 10% FDR, we get 25 `signif'
fdr_cut <- function(pvals, q=0.1){
pvals <- sort(pvals[!is.na(pvals)])
N <- length(pvals)
k <- rank(pvals, ties.method="min")
alpha <- max(pvals[ pvals<= (q*k/(N+1)) ])
plot(pvals, log="xy", xlab="order", main=sprintf("FDR of %g",q),
ylab="p-value", bty="n", col=c(8,2)[(pvals<=alpha) + 1], pch=20)
lines(1:N, q*(1:N)/(N+1))
return(alpha)
}
fdr_cut(pvals)
## Re-run a cut regression using only these 25
( signif <- which(pvals <= 0.0122) )
cut <- glm(FAIL ~ ., data=SC[,c("FAIL", names(signif))], family="binomial")
1 - cut$deviance/cut$null.deviance # new in-sample R2
## Out of sample prediction experiment
## first, define the deviance and R2 functions
## pred must be probabilities (0<pred<1) for binomial
deviance <- function(y, pred, family=c("gaussian","binomial")){
family <- match.arg(family)
if(family=="gaussian"){
return( sum( (y-pred)^2 ) )
}else{
if(is.factor(y)) y <- as.numeric(y)>1
return( -2*sum( y*log(pred) + (1-y)*log(1-pred) ) )
}
}
## get null devaince too, and return R2
R2 <- function(y, pred, family=c("gaussian","binomial")){
fam <- match.arg(family)
if(fam=="binomial"){
if(is.factor(y)){ y <- as.numeric(y)>1 }
}
dev <- deviance(y, pred, family=fam)
dev0 <- deviance(y, mean(y), family=fam)
return(1-dev/dev0)
}
# setup the experiment
n <- nrow(SC) # the number of observations
K <- 10 # the number of `folds'
# create a vector of fold memberships (random order)
foldid <- rep(1:K,each=ceiling(n/K))[sample(1:n)]
# create an empty dataframe of results
OOS <- data.frame(full=rep(NA,K), cut=rep(NA,K))
# use a for loop to run the experiment
for(k in 1:K){
train <- which(foldid!=k) # train on all but fold `k'
## fit the two regressions
rfull <- glm(FAIL~., data=SC, subset=train, family=binomial)
rcut <- glm(FAIL~., data=SC[,cutvar], subset=train, family=binomial)
## get predictions: type=response so we have probabilities
predfull <- predict(rfull, newdata=SC[-train,], type="response")
predcut <- predict(rcut, newdata=SC[-train,], type="response")
## calculate and log R2
OOS$full[k] <- R2(y=SC$FAIL[-train], pred=predfull, family="binomial")
OOS$cut[k] <- R2(y=SC$FAIL[-train], pred=predcut, family="binomial")
## print progress
cat(k, " ")
}
## plot it in plum
par(mai=c(.9,.9,.1,.1))
boxplot(OOS, col="plum", ylab="R2", xlab="model", bty="n")
## what are the average OOS R2?
colMeans(OOS) # WOW! Full model really sucks.
## A forward stepwise procedure
# null model
null <- glm(FAIL~1, data=SC)
# forward stepwise: it takes a long time!
system.time(fwd <- step(null, scope=formula(full), dir="forward"))
length(coef(fwd)) # chooses around 70 coef
#### lasso (glmnet does L1-L2, gamlr does L0-L1)
library(gamlr)
# for gamlr, and most other functions, you need to create your own numeric
# design matrix. We'll do this as a sparse `simple triplet matrix' using
# the sparse.model.matrix function.
scx <- sparse.model.matrix(FAIL ~ ., data=SC)[,-1] # do -1 to drop intercept!
# here, we could have also just done x <- as.matrix(SC[,-1]).
# but sparse.model.matrix is a good way of doing things if you have factors.
scy <- SC$FAIL # pull out `y' too just for convenience
# fit a single lasso
sclasso <- gamlr(scx, scy, family="binomial")
plot(sclasso) # the ubiquitous path plot
# AICc selected coef
scbeta <- coef(sclasso)
log(sclasso$lambda[which.min(AICc(sclasso))])
sum(scbeta!=0) # chooses 30 (+intercept) @ log(lambda) = -4.5
# alt: BIC selected coef
BICseg <- which.min(BIC(sclasso))
scb.bic <- coef(sclasso, s=BICseg)
sum(scb.bic!=0) # sets all coef to zero: just the intercept!
## cross validated lasso (verb just prints progress)
sccvl <- cv.gamlr(scx, scy, family="binomial", verb=TRUE)
plot(sccvl, bty="n")
## CV min deviance selection
scb.min <- coef(sccvl, select="min")
log(sccvl$lambda.min)
sum(scb.min!=0) ## around 65-70 with log(lam) -4.8 (its random!)
## CV 1se selection (the default)
scb.1se <- coef(sccvl)
log(sccvl$lambda.1se)
sum(scb.1se!=0) ## usually selects all zeros (just the intercept)
## comparing AICc, BIC, and the CV error
plot(sccvl, bty="n")
lines(log(sclasso$lambda),AICc(sclasso)/n, col="green", lwd=2)
lines(log(sclasso$lambda),BIC(sclasso)/n, col="maroon", lwd=2)
legend("top", fill=c("blue","green","maroon"),
legend=c("CV","AICc","BIC"), bty="n")
## BIC, AIC, AICc on full vs cut (they all prefer cut)
BIC(full)
BIC(cut)
AIC(full)
AIC(cut)
AICc(full)
AICc(cut)