-
Notifications
You must be signed in to change notification settings - Fork 1
/
DataHelpers.R
94 lines (81 loc) · 3.63 KB
/
DataHelpers.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# Utility functions
# Inverse-variance weighted mean of several polls.
#
# For every row of fullDataEntries the per-party offset stored for that row's
# pollster (paramList[[Pollster]][[Party]], presumably the pollster's house
# effect -- confirm against the model code) is subtracted from Vote. The
# adjusted values are then averaged with weights 1/variance, where the variance
# comes from pollsterVarianceInElectorate() applied to the pollster's
# 'NoiseVariance' and the row's Electorate.
#
# fullDataEntries: data frame with columns Vote, Pollster, Party, Electorate.
# paramList:       list keyed by pollster name; each entry is a list holding
#                  'NoiseVariance' plus one element per party name.
# Returns a single number, or NA_real_ when fullDataEntries has no rows.
weightedMeanOfPolls <- function(fullDataEntries, paramList){
  nPolls <- nrow(fullDataEntries)
  if(nPolls == 0){
    # Nothing to average. (1:nrow() would have looped over c(1, 0) here.)
    return(NA_real_)
  }
  x <- fullDataEntries$Vote
  sigma2 <- rep(0, nPolls)
  for(i in seq_len(nPolls)){
    pollsterName <- as.character(fullDataEntries$Pollster[i])
    # as.character() guards against factor columns, which would otherwise be
    # used as integer positions by `[[`.
    partyName <- as.character(fullDataEntries$Party[i])
    x[i] <- x[i] - as.numeric(paramList[[pollsterName]][[partyName]])
    sigma2[i] <- pollsterVarianceInElectorate(paramList[[pollsterName]][['NoiseVariance']],
                                              as.character(fullDataEntries$Electorate[i]))
  }
  return( sum(x/sigma2)/sum(1/sigma2) )
}
# Variance of the inverse-variance weighted mean of several polls.
#
# Each row's variance is pollsterVarianceInElectorate() applied to the
# pollster's 'NoiseVariance' (from paramList) and the row's Electorate. For
# weights 1/sigma2_i the variance of the weighted mean is 1 / sum(1/sigma2_i).
# The previous form prod(sigma2)/sum(sigma2) equals this only for exactly two
# polls and is wrong for three or more.
#
# fullDataEntries: data frame with columns Pollster and Electorate.
# paramList:       list keyed by pollster name; each entry holds 'NoiseVariance'.
# Returns a single number; with zero rows this evaluates to Inf.
varianceOfWeightedMean <- function(fullDataEntries, paramList){
  nPolls <- nrow(fullDataEntries)
  sigma2 <- rep(0, nPolls)
  for(i in seq_len(nPolls)){
    pollsterName <- as.character(fullDataEntries$Pollster[i])
    sigma2[i] <- pollsterVarianceInElectorate(paramList[[pollsterName]][['NoiseVariance']],
                                              as.character(fullDataEntries$Electorate[i]))
  }
  return( 1/sum(1/sigma2) )
}
# Noise variance of a pollster's estimate for one electorate.
#
# A national ("AUS") estimate keeps the pollster's Australia-wide variance.
# For any other electorate the variance is divided by that electorate's entry
# in popweights (a lookup defined outside this function). The intent is that a
# population-weighted average of the pollster's state-by-state estimates has
# the same variance as their reported national estimate.
pollsterVarianceInElectorate <- function(australiaWideVariance, thisElectorate){
  isNational <- thisElectorate == "AUS"
  if(!isNational){
    return(australiaWideVariance / popweights[[thisElectorate]])
  }
  australiaWideVariance
}
# Low-level data operations
# Collapse all observations that share one week and observation column into a
# single value for the data matrix.
#
# * If any row comes from the 'Election' pseudo-pollster, the election Vote is
#   returned unchanged.
# * A single poll is returned as Vote minus the per-party offset stored for
#   its pollster in paramList.
# * Otherwise the rows are combined by weightedMeanOfPolls().
#
# fullDataEntries: data frame with columns Vote, Pollster, Party (and whatever
#                  weightedMeanOfPolls() needs).
# paramList:       list keyed by pollster name; each entry has one element per
#                  party name.
makeDataMatrixEntry <- function(fullDataEntries, paramList){
  isElection <- fullDataEntries$Pollster == 'Election'
  if(any(isElection)){
    return(fullDataEntries$Vote[which(isElection)])
  }
  if(nrow(fullDataEntries) == 1){
    pollsterName <- as.character(fullDataEntries$Pollster[1])
    # Party must be looked up by name: a factor passed straight to `[[` would
    # be treated as its integer level code and select the wrong element.
    partyName <- as.character(fullDataEntries$Party[1])
    return(fullDataEntries$Vote - as.numeric(paramList[[pollsterName]][[partyName]]))
  }
  return( weightedMeanOfPolls(fullDataEntries, paramList) )
}
# Observation-noise variance matching the value produced by
# makeDataMatrixEntry() for the same group of rows.
#
# fullDataEntries: data frame with columns Pollster and Electorate.
# paramList:       list keyed by pollster name; each entry holds 'NoiseVariance'.
makeH <- function(fullDataEntries, paramList){
  if(any(fullDataEntries$Pollster == 'Election')){
    # Elections only have rounding error
    return(0.045^2)
  }
  if(nrow(fullDataEntries) != 1){
    # Several polls were pooled, so use the variance of their weighted mean.
    return(varianceOfWeightedMean(fullDataEntries, paramList))
  }
  # Exactly one poll: its pollster's variance, rescaled for the electorate.
  pollsterName <- as.character(fullDataEntries$Pollster[1])
  pollsterVarianceInElectorate(paramList[[pollsterName]][['NoiseVariance']],
                               as.character(fullDataEntries$Electorate[1]))
}
# Build one row of the data matrix and its diagonal noise matrix.
#
# fullDataRow holds every observation belonging to a single row; its
# ObservationColumn values are used directly as column indices. Each distinct
# column gets the value from makeDataMatrixEntry() and the variance from
# makeH(); columns with no observation stay NA in both outputs. The number of
# columns is nrow(observationTypes), where observationTypes is defined outside
# this function.
#
# Returns list(Row = 1 x nTypes matrix, H = nTypes x nTypes diagonal matrix).
makeDataMatrixRowAndH <- function(fullDataRow, paramList){
  nTypes <- nrow(observationTypes)
  rowOutput <- matrix(NA, nrow = 1, ncol = nTypes)
  diagH <- rep(NA, nTypes)
  for(column in unique(fullDataRow$ObservationColumn)){
    inColumn <- which(fullDataRow$ObservationColumn == column)
    relevantData <- fullDataRow[inColumn, ]
    rowOutput[1, column] <- makeDataMatrixEntry(relevantData, paramList)
    diagH[column] <- makeH(relevantData, paramList)
  }
  # nrow is passed explicitly so a length-one diagH still yields a 1 x 1 matrix.
  list(Row = rowOutput, H = diag(diagH, nrow = nTypes))
}
# Assemble the full observation matrix and its noise covariances.
#
# fullData may hold several observations for the same week (same RowNumber).
# They are merged into single observations with time-varying uncertainty,
# conditional on the accuracy of individual pollsters as given in paramList.
#
# Returns list(Y, H):
#   Y  max(RowNumber) x nTypes matrix; rows without data stay NA.
#   H  nTypes x nTypes x max(RowNumber) array; slices without data stay zero.
# nTypes is nrow(observationTypes), defined outside this function.
makeDataMatrix <- function(fullData, paramList){
  rowNumbers <- unique(fullData$RowNumber)
  nTypes <- nrow(observationTypes)
  nObservations <- max(rowNumbers)
  Y <- matrix(NA, nrow = nObservations, ncol = nTypes)
  H <- array(0, dim = c(nTypes, nTypes, nObservations))
  for(thisRow in rowNumbers){
    inThisRow <- which(fullData$RowNumber == thisRow)
    rowResult <- makeDataMatrixRowAndH(fullData[inThisRow, ], paramList)
    Y[thisRow, ] <- rowResult[['Row']]
    H[, , thisRow] <- rowResult[['H']]
  }
  list(Y = Y, H = H)
}