-
Notifications
You must be signed in to change notification settings - Fork 1
/
Extracting_living_taxa.R
executable file
·116 lines (97 loc) · 4.61 KB
/
Extracting_living_taxa.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#Extracting the living taxa from all the matrices
#Load functions
message("Loading/testing functions:", appendLF=FALSE)
source("functions.R")
load.functions(test=FALSE) #Set test=FALSE to speed up the loading
message("Done\n", appendLF=FALSE)
#Fixing binomial names
#system("sh ../Functions/fix_binomial.sh ../Data/Matrices_binomial/ ../Data/Matrices_binomial_change.csv")
#List of matrices
matrices_list<-list.files("../Data/Matrices_binomial/Matrices/")
#transform verbose warnings in read.nexus.data in messages
#Renaming the function (backup)
read.nex.dat<-read.nexus.data
#Substitute element 19 (stop message if wrong number of taxa)
body(read.nex.dat)[[19]]<-substitute(
if (tot.ntax != 0) {
message(paste("ntax:", ntax, "differ from actual number of taxa in file?", sep=" "), appendLF=FALSE)
stop("nexus parser did not read names correctly (tot.ntax!=0)")
}
)
#Substitute element 20 (stop message if wrong number of characters)
body(read.nex.dat)[[20]]<-substitute(
for (i in 1:length(Obj)) {
if (length(Obj[[i]]) != nchar) {
message(paste(names(Obj[i]), "has", length(Obj[[i]]), "characters", sep=" "), appendLF=FALSE)
stop("nchar differ from sequence length (length(Obj[[i]])!=nchar)")
}
}
)
#List of references
#Fritz tree
tree_list<-read.table("../Data/Taxon_References/Bininda-emonds_taxa.txt", header=F, stringsAsFactors=F)
#Wilson Reeder's Mammals Species of the World
WR_list<-read.csv("../Data/Taxon_References/WilsonReederMSW.csv", header=T, stringsAsFactors=F)
#Palaeobiology database
Paleo_list<-read.csv("../Data/Taxon_References/pdb_taxa.csv", header=T, stringsAsFactors=F)
#Combining the references
Reference_list<-list(tree_list, WR_list, Paleo_list)
names(Reference_list)<-c("Bininda-emonds Tree", "Wilson & Reeder", "Paleobio database")
#Building the empty table
extraction_table<-data.frame("Matrix"=NA,"Taxa"=NA, "Characters"=NA, "Living"=NA,"Source"=NA,"Tax.level"=NA)
#Build the table for each matrix
for(matrix in 1:length(matrices_list)){
#trial mode (continue if one matrix is bugged)
#be verbose!
message(paste(matrices_list[matrix], ":",sep=""), appendLF=FALSE)
#Try to load the matrix
current_matrix<-NULL
try(current_matrix<-read.nex.dat(paste("../Data/Matrices_binomial/Matrices/", matrices_list[matrix], sep="")), silent=TRUE)
if(!is.null(current_matrix)) {
#If successfully, proceed:
#Extracting the taxa
extraction_table_tmp<-extract.names(current_matrix, Reference_list, verbose=TRUE)
#Saving the matrix name
extraction_table_tmp$Matrix<-rep(matrices_list[matrix], length(current_matrix))
#Binding with the previous table
extraction_table<-rbind(extraction_table, extraction_table_tmp)
message("Done\n", appendLF=FALSE)
} else {
#Else, save the matrix name and continue
extraction_table_tmp<-data.frame("Matrix"=matrices_list[matrix],"Taxa"="FAILURE", "Characters"="FAILURE", "Living"="FAILURE","Source"="FAILURE","Tax.level"="FAILURE")
extraction_table<-rbind(extraction_table, extraction_table_tmp)
message("... FAILED\n", appendLF=FALSE)
}
}
#Removing the first (dummy) line of the extraction_table
extraction_table<-extraction_table[-1,]
#DOUBLE CHECK THE NAs
#Extract NAs
NA_species<-extraction_table[which(is.na(extraction_table$Living)),]
NA_taxa<-NA_species$Taxa
#Checking if the NAs are not in the tree
doubleCheck_tree<-match(NA_taxa, Reference_list[[1]])
doubleCheck_tree<-which(!is.na(doubleCheck_tree))
#Checking if the NAs are not in Wilson Reeder's list
doubleCheck_WR<-vector()
message("\nChecking:", appendLF=FALSE)
for(taxa in 1:length(NA_taxa)) {
doubleCheck_WR[taxa]<-check.NA(NA_taxa[taxa], Reference_list[[2]])
message(".", appendLF=FALSE)
}
message("Done.\n", appendLF=FALSE)
#If any taxa are not NA, replace them in the table by living==TRUE and tax.level
#For the Wilson Reeder
NA_species$Living[which(!is.na(doubleCheck_WR))]<-TRUE
NA_species$Source[which(!is.na(doubleCheck_WR))]<-"WR"
NA_species$Tax.level[which(!is.na(doubleCheck_WR))]<-doubleCheck_WR[which(!is.na(doubleCheck_WR))]
#For the tree
NA_species$Living[which(!is.na(doubleCheck_tree))]<-TRUE
NA_species$Source[which(!is.na(doubleCheck_tree))]<-"Bininda-emonds"
NA_species$Tax.level[which(!is.na(doubleCheck_tree))]<-"Species"
#Replace NA_species back in the table
extraction_table[which(is.na(extraction_table$Living)),]<-NA_species
#Save the results as .rda
save(extraction_table, file="../Data/List_of_matching_taxa/List_of_matching_taxa.Rda")
#Save the result as csv
write.csv(extraction_table, file="../Data/List_of_matching_taxa/List_of_matching_taxa.csv")