-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
adding comp_Lib.R back in with my commenting analysis for #14
- Loading branch information
Showing
2 changed files
with
293 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,290 @@ | ||
# A library of functions | ||
library(dplyr,verbose = FALSE) | ||
|
||
#Creates a set of data-frames representing a graph of sequences, with the edges between those sequences representing the TN93 Distance. | ||
#Sequences must be dated with the date separated from the id by '_'. | ||
impTN93 <- function(iFile, minNS=63){ | ||
#@param iFile: The name/path of the input file (expecting tn93 output csv) | ||
#@param minNS: The minimum number of acceptible new Sequences. By default we keep this high. | ||
#@return: A list of 3 Data frames. An edge list (weighted by TN93 genetic distance), a vertex list, | ||
# and a list of minimum edges, for the future establishment of a timepoint-based model | ||
|
||
#Reading input file as a data frame. This will essentially act as an edgelist | ||
idf <- read.csv(iFile, stringsAsFactors = F) | ||
|
||
# accession | ||
temp1 <- sapply(idf$ID1, function(x) (strsplit(x,'_')[[1]])[[1]]) | ||
# collection date | ||
temp2 <- sapply(idf$ID1, function(x) (strsplit(x,'_')[[1]])[[2]]) | ||
|
||
temp3 <- sapply(idf$ID2, function(x) (strsplit(x,'_')[[1]])[[1]]) | ||
temp4 <- sapply(idf$ID2, function(x) (strsplit(x,'_')[[1]])[[2]]) | ||
|
||
#Create a data frame from the imported edge list. | ||
el <- data.frame(ID1=as.character(temp1), t1=year(as.Date(temp2)), | ||
ID2=as.character(temp3), t2=year(as.Date(temp4)), | ||
Distance = as.numeric(idf$Distance), stringsAsFactors= F) | ||
|
||
#Obtain the maximum time and time difference between the head and tail of each edge | ||
el$tMax <- pmax(el$t1, el$t2) | ||
el$tDiff <- abs(el$t1-el$t2) | ||
|
||
#Create a list of vertices based on the edge list. Original sequences with no edges will not be considered. | ||
vl <- unique(data.frame(ID = c(el$ID1, el$ID2), Time = c(el$t1, el$t2), stringsAsFactors=F)) | ||
|
||
#Order edges and vertices by time point and sort them into a single, larger list | ||
#If the newest timepoint contains a small number of sequences, we remove the newest year from consideration | ||
g <- list(v=vl[order(vl$Time),], e=el[order(el$tMax),], f=el[order(el$tMax),]) | ||
while(nrow(subset(g$v,Time==max(Time)))<=minNS) { | ||
keepT <- head(as.numeric(names(table(g$v$Time))),-1) | ||
g <- tFilt(g, keepT) | ||
} | ||
|
||
#Permanently remove edges from the new year such that only the closest edge between new vertices and old vertices remains | ||
#Obtain new vertices and remove any internal edges within the new vertices | ||
#We are not interested in completely new clustering | ||
nV <- subset(g$v, Time==max(Time)) | ||
|
||
#The subset of vertices excluding those at the oldest year | ||
subV <- subset(g$v, Time>min(Time)) | ||
|
||
#Obtain the closest retrospective edge of every vertex beyond the oldest year | ||
minE <- bind_rows(lapply(1:nrow(subV), function(i){ | ||
v <- subV[i,] | ||
incE <- subset(g$e, (ID1%in%v$ID)|(ID2%in%v$ID)) | ||
retE <- subset(incE, (tMax==v$Time)&(tDiff>0)) | ||
retE[which(retE$Distance==min(retE$Distance))[[1]],] | ||
})) | ||
|
||
#Only closest retrospective edges are kept for edges from new cases. | ||
g$e <- subset(g$e, tMax!=max(tMax)) | ||
g$e <- rbind(g$e, subset(minE, tMax==max(tMax))) | ||
g$f <- subset(minE, tMax < max(tMax)) | ||
|
||
return(g) | ||
} | ||
|
||
#Create clusters based on component clustering by some measure of genetic distance | ||
compClu <- function(iG) { | ||
#@param iG: The inputted graph. Expecting all vertices, but some edges filtered by distance. | ||
#@return: The inputted graph, annotated with a cluster size summary and case membership in the vertices section | ||
|
||
#Simplify the list of unsorted vertices (just id's) and edges (just head and tail id's) | ||
vid <- iG$v[,"ID"] | ||
adj <- iG$e[,c("ID1","ID2")] | ||
|
||
#Initialize the first cluster name and a column for cluster membership. c0 will be reserved for all singletons if sing=T | ||
iG$v$Cluster <- vector(mode="numeric", length=nrow(iG$v)) | ||
|
||
#The search vertex becomes the first member of the first cluster and is removed from the searchable set of cluster names | ||
i <- 1 | ||
srchV <- vid[1] | ||
memV <- srchV | ||
vid <- setdiff(vid, memV) | ||
|
||
#Assigning Cluster Membership | ||
repeat { | ||
|
||
#Remove edges internal to search query and list outgoing edges | ||
adj <- subset(adj, !(ID1%in%srchV & ID2%in%srchV)) | ||
exE <- subset(adj, ID1%in%srchV | ID2%in%srchV) | ||
|
||
#Find all neighbouring vertices to the search vertex (or search vertices) through external edges | ||
#These are then added to the list of member vertices and removed from the list of searchable vertices | ||
nbV <- setdiff(c(exE$ID1,exE$ID2), srchV) | ||
memV <- c(memV, nbV) | ||
vid <- setdiff(vid, nbV) | ||
|
||
#If there are no more neigbours to the search vertices, the cluster is completed and we reset the search parameters | ||
if (length(nbV)==0) { | ||
|
||
iG$v$Cluster[iG$v$ID%in%memV] <- i | ||
|
||
#The end condition, catching the event that there are no vertices to assign to clusters | ||
if (length(vid)==0) {break} | ||
|
||
#Reset search parameters | ||
i <- i+1 | ||
srchV <- vid[1] | ||
memV <- srchV | ||
vid <- setdiff(vid, memV) | ||
|
||
next | ||
} | ||
|
||
#Remove all edges within the current cluster from the adjacency list | ||
adj <- subset(adj, !(ID1%in%srchV | ID2%in%srchV)) | ||
srchV <- nbV | ||
} | ||
|
||
#Add some summary information regarding clusters | ||
iG$c <- table(iG$v$Cluster) | ||
|
||
return(iG) | ||
} | ||
|
||
#A simple function, removing edges that sit above a maximum reporting distance (@param:maxD). | ||
dFilt <- function(iG, maxD) { | ||
iG$e <- subset(iG$e, Distance<=maxD) | ||
return(iG) | ||
} | ||
|
||
#A simple function, removing vertices that sit above a maximum time point (@param: maxT) | ||
tFilt <- function(iG, keepT) { | ||
iG$v <- subset(iG$v, Time%in%keepT) | ||
iG$e <- subset(iG$e, tMax%in%keepT) | ||
return(iG) | ||
} | ||
|
||
#Simulate the growth of clusters, showing the difference in cluster size between the newest and the penultimate time point | ||
#The frame of reference for clusters is the penultimate year, simulating one making forcasting decisions based on one time point and validating them with the next | ||
simGrow <- function(iG) { | ||
#@param: The inputted graph. Expecting all vertices, but some edges filtered by distance. | ||
#@return: The same cluster annotated with the actual growth and cluster information | ||
|
||
#Obtain clusters at the new time point, after removing singletons | ||
nG <- iG | ||
|
||
# subset of new nodes that have no edges to known cases | ||
nSing <- subset(nG$v, (!ID%in%c(nG$e$ID1,nG$e$ID2) & Time==max(Time))) | ||
|
||
# subset of new nodes that DO have edges to known cases, AND all known cases | ||
nG$v <- subset(nG$v, !(!ID%in%c(nG$e$ID1,nG$e$ID2) & Time==max(Time))) | ||
nG <- compClu(nG) # assign nodes to connected components | ||
|
||
#obtain clusters at an old time point | ||
keepT <- head(as.numeric(names(table(iG$v$Time))),-1) | ||
oG <- compClu(tFilt(iG, keepT)) | ||
|
||
#Define growth as the difference in cluster size between new and old graphs | ||
#After clsFilter(), nG will have the same number of clusters as oG, and similar membership | ||
iG$g <- nG$c-oG$c | ||
iG$c <- oG$c | ||
|
||
#Re-Add the singletons, citing new singletons as members of the cluster 0 | ||
if (nrow(nSing)>0){ | ||
nSing$Cluster <- 0 | ||
} | ||
iG$v <- rbind(nG$v, nSing) | ||
|
||
return(iG) | ||
} | ||
|
||
#Obtains some likelihood data in order to weight cases based on their recency | ||
likData <- function(iG) { | ||
#@param iG: The inputted graph. Expecting the entire Graph with new year included. | ||
#@return: A data frame of "Positives" (related cases) between one time point and another, annotated with the number of possible total positives. | ||
# Each positive also carries it's Genetic Distance measurement and time point difference (between time points) | ||
|
||
#Take in total graph without the newest time point (otherwise, we include the validation set in or data set) | ||
keepT <- tail(head(as.numeric(names(table(iG$v$Time))),-1),-1) | ||
subG <- tFilt(iG, keepT) | ||
|
||
#Obtain the closest retrospective edge of every vertex | ||
f <- bind_rows(lapply(2:nrow(subG$v), function(i){ | ||
|
||
v <- subG$v[i,] | ||
|
||
incE <- subset(iG$e, (ID1%in%v$ID)|(ID2%in%v$ID)) | ||
retE <- subset(incE, (tMax==v$Time)&(tDiff>0)) | ||
minE <- retE[which(retE$Distance==min(retE$Distance))[[1]],] | ||
df <- data.frame(Distance=minE$Distance, tMax=minE$tMax, tDiff=minE$tDiff, vTotal=nrow(subset(iG$v, Time==v$Time))) | ||
|
||
return(df) | ||
|
||
})) | ||
|
||
return(f) | ||
} | ||
|
||
#Analyze a given Clustered Graph to establish the difference between the performance of two different models | ||
#Performance is defined as the ability for cluster growth to fit a predictive model. | ||
compAnalyze <- function(subG) { | ||
#@param subG: A subGraph cut based on a threshold distance, expecting a member of the multiGraph set | ||
#@return: A graph annotated with growth, cluster info and level of predictive performance (measured through GAIC) | ||
|
||
#Obtain some summarized information from the sub-Graph | ||
dMax <- max(subG$e$Distance) # e is edge list filtered by threshold | ||
|
||
# f is unfiltered edge list, excluding edges to new cases and between cases from time | ||
tTab <- table(subG$f$tMax) # table of sampling times of most recent node for every edge in $f | ||
|
||
# v is node list derived from unfiltered edge list | ||
vTab <- table(subG$v$Time) # table of sampling times, including most recent nodes | ||
eTab <- sapply(as.numeric(names(vTab)), function(t){ | ||
nrow(subset(subG$e, (t1==t|t2==t))) # how many filtered edges associated with each sampling time? | ||
}) | ||
names(eTab) <- names(vTab) | ||
|
||
#Take the total edge frequency data from the graph and format this information into successes and attempts | ||
#An edge to the newest year falling below the max distance is considered a success | ||
ageD <- bind_rows(lapply(as.numeric(names(tTab)), function(t) { | ||
# for every sampling time (not including earliest) | ||
temp <- subset(subG$f, tMax==t) # get associated bipartite edges | ||
dfs <- split(temp, temp$tDiff) # split by start times | ||
|
||
# how many edges in bipartite graph meet distance threshold? | ||
Positive <- sapply(dfs, function(df){length(which(df$Distance<=dMax))}) | ||
vTotal <- rep((vTab[[as.character(t)]]),length(dfs)) # FIXME: unused? | ||
tDiff <- as.numeric(names(Positive)) | ||
oeDens <- sapply(tDiff, function(tD){ | ||
# to calculate the average degree (out-edge density) of known cases from the same time point | ||
oTime <- t-tD # start time for this bipartite graph | ||
|
||
# return the number of filtered edges with a node from time oTime | ||
# divide by total number of nodes from oTime | ||
return(eTab[as.character(oTime)]/vTab[as.character(oTime)]) | ||
}) | ||
|
||
res <- data.frame(Positive=as.numeric(Positive), vTotal=vTab[[as.character(t)]], oeDens=oeDens, tDiff) | ||
return(res) | ||
})) | ||
|
||
#Obtain a model of case connection frequency to new cases as predicted by individual case age | ||
#Use this to weight cases by age | ||
mod <- glm(cbind(Positive, vTotal) ~ tDiff+oeDens, data=ageD, family='binomial') | ||
subG$v$Weight <- predict(mod, type='response', | ||
data.frame(tDiff=max(subG$v$Time)-subG$v$Time, | ||
oeDens=as.numeric(eTab[as.character(subG$v$Time)]/vTab[as.character(subG$v$Time)]))) | ||
# subG$v$Weight <- sapply(subG$v$ID, function(id) {as.numeric(substr(id,8,8))}) | ||
|
||
#Create clusters for this subgraph and measure growth | ||
subG <- simGrow(subG) | ||
cPred <- subset(subG$v, Time<max(Time))[,c("Weight", "Cluster")] | ||
|
||
#Create two data frames from two predictive models, one based on absolute size (NULL) and our date-informed model | ||
df1 <- data.frame(Growth = as.numeric(subG$g), Pred = sapply(names(subG$c), function(x) { sum(subset(cPred, Cluster==as.numeric(x))$Weight) })) | ||
df2 <- data.frame(Growth = as.numeric(subG$g), Pred = as.numeric(subG$c) * (sum(as.numeric(subG$g))/sum(as.numeric(subG$c)))) | ||
fit1 <- glm(Growth ~ Pred, data = df1, family = "poisson") | ||
fit2 <- glm(Growth ~ Pred, data = df2, family = "poisson") | ||
|
||
#Save, gaic, model and age data as part of the output | ||
subG$gaic <- fit1$aic-fit2$aic | ||
subG$ageMod <- mod | ||
subG$ageFit <- fit1 | ||
subG$nullFit <- fit2 | ||
subG$f <- ageD | ||
|
||
return(subG) | ||
} | ||
|
||
#Run across a set of several subGraphs created at various filters, analyzing GAIC at each with clusterAnalyze | ||
gaicRun <- function(iG, cutoffs=NA) { | ||
#@param iG: Expecting the entire Graph, but in some cases may take a subset | ||
#@return: A data frame of each runs cluster information (clusterAnalyze output) | ||
|
||
#Initialize a set of cutoffs to observe (based on the genetic distance distribution) | ||
if (is.na(cutoffs)) { | ||
steps <- head(hist(subset(iG$e, Distance<0.05)$Distance, plot=FALSE)$breaks, -5) | ||
cutoffs <- seq(0 , max(steps), max(steps)/50) | ||
} | ||
|
||
#A set of several graphs created at different cutoffs | ||
gs <- lapply(cutoffs, function(d) {dFilt(iG, d)}) | ||
|
||
#Generate cluster data for each subGraph in gs | ||
res <- lapply(gs, function(subG) {compAnalyze(subG)}) | ||
names(res) <- cutoffs | ||
|
||
return(res) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters