-
Notifications
You must be signed in to change notification settings - Fork 3
/
cqp.R
164 lines (152 loc) · 5.7 KB
/
cqp.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#' Initialize Corpus Query Processor (CQP).
#'
#' CQP needs to know where to look for CWB indexed corpora. To initialize CQP,
#' call \code{cqp_initialize}. To reset the registry, use the function
#' \code{cqp_reset_registry}. To get the registry used by CQP, use
#' \code{cqp_get_registry}. To get the initialization status, use
#' \code{cqp_is_initialized}
#'
#' @param registry the registry directory
#' @export cqp_initialize
#' @rdname cqp_initialize
#' @author Andreas Blaette, Bernard Desgraupes, Sylvain Loiseau
#' @examples
#' cqp_is_initialized() # check initialization status
#' if (!cqp_is_initialized()) cqp_initialize()
#' cqp_is_initialized() # check initialization status (TRUE now?)
#' cqp_get_registry() # get registry dir used by CQP
#'
#' registry <- if (!check_pkg_registry_files()) use_tmp_registry() else get_pkg_registry()
#' if (cqp_get_registry() != registry) cqp_reset_registry(registry = registry)
#' cqp_list_corpora() # get list of corpora
cqp_initialize <- function(registry = Sys.getenv("CORPUS_REGISTRY")){
registry_new <- registry
# registry # necessary to capture Sys.getenv() assignment
if (cqp_is_initialized()){
warning("CQP has already been initialized. Re-initialization is not possible. ",
"Only resetting registry.")
} else {
# workaround to ensure that global registry variable in dynamic
# library will have 255 characters. Without starting with a very long (fake)
# initial registry, there is a bug when resetting the registry dir to a dir
# that is longer than the initial dir
dummy_superdir <- tempdir()
dir.create(dummy_superdir, showWarnings = FALSE)
if (.Platform$OS.type == "windows"){
dummy_superdir <- normalizePath(dummy_superdir, winslash = "/")
}
# the times argument is 247 for Windows compatibility
dummy_regdir <- file.path(
dummy_superdir,
paste0(
rep("x", times = 246 - nchar(dummy_superdir)),
collapse = ""
)
)
dir.create(dummy_regdir, showWarnings = FALSE)
Sys.setenv(CORPUS_REGISTRY = dummy_regdir)
.init_cqp()
}
check_registry(registry_new)
Sys.setenv(CORPUS_REGISTRY = registry_new)
cqp_reset_registry()
return( cqp_is_initialized() )
}
#' @export cqp_is_initialized
#' @rdname cqp_initialize
cqp_is_initialized <- function(){
if (.cqp_get_status() == 0) return(FALSE) else return(TRUE)
}
#' @export cqp_get_registry
#' @rdname cqp_initialize
cqp_get_registry <- function() .cqp_get_registry()
#' @export cqp_reset_registry
#' @rdname cqp_initialize
cqp_reset_registry <- function(registry = Sys.getenv("CORPUS_REGISTRY")){
registry_dir <- registry
if (!cqp_is_initialized()){
warning("cannot reset registry, cqp has not yet been initialized!")
return( FALSE )
} else {
check_registry(registry_dir)
Sys.setenv(CORPUS_REGISTRY = registry_dir)
if (nchar(registry_dir) > 255){
stop("cannot assign new registry: maximum nchar(registry) is 255")
} else {
.cqp_set_registry(registry_dir = registry_dir)
return( TRUE )
}
}
}
#' List Available CWB Corpora.
#'
#' List the corpora described by the registry files in the registry directory
#' that is currently set.
#'
#' @export cqp_list_corpora
#' @examples
#' if (!cqp_is_initialized()){
#' registry <- system.file(package = "RcppCWB", "extdata", "cwb", "registry")
#' cqp_initialize(registry)
#' }
#' cqp_list_corpora()
#' @author Andreas Blaette, Bernard Desgraupes, Sylvain Loiseau
cqp_list_corpora <- function() .cqp_list_corpora()
#' Execute CQP Query and Retrieve Results.
#'
#' Using CQP queries requires a two-step procedure: At first, you execute a
#' query using \code{cqp_query}. Then, \code{cqp_dump_subcorpus} will return a
#' matrix with the regions of the matches for the query.
#'
#' The \code{cqp_query} function executes a CQP query. The
#' \code{cqp_subcorpus_size} function returns the number of matches for the CQP
#' query. The \code{cqp_dump_subcorpus} function will return a two-column matrix
#' with the left and right corpus positions of the matches for the CQP query.
#'
#' @param corpus a CWB corpus
#' @param query a CQP query
#' @param subcorpus subcorpus name
#' @export cqp_query
#' @rdname cqp_query
#' @references
#' Evert, S. 2005. The CQP Query Language Tutorial. Available online at
#' \url{http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial.pdf}
#' @examples
#' registry <- if (!check_pkg_registry_files()) use_tmp_registry() else get_pkg_registry()
#'
#' if (!cqp_is_initialized()){
#' cqp_initialize(registry = registry)
#' } else {
#' if (cqp_get_registry() != registry) cqp_reset_registry(registry)
#' }
#' cqp_query(corpus = "REUTERS", query = '"oil";')
#' cqp_subcorpus_size("REUTERS")
#' cqp_dump_subcorpus("REUTERS")
#'
#' cqp_query(corpus = "REUTERS", query = '"crude" "oil";')
#' cqp_subcorpus_size("REUTERS", subcorpus = "QUERY")
#' cqp_dump_subcorpus("REUTERS")
#' @author Andreas Blaette, Bernard Desgraupes, Sylvain Loiseau
cqp_query <- function(corpus, query, subcorpus = "QUERY"){
stopifnot(corpus %in% cqp_list_corpora())
query <- check_cqp_query(query)
.cqp_query(corpus = corpus, subcorpus = subcorpus, query = query)
}
#' @export cqp_dump_subcorpus
#' @rdname cqp_query
cqp_dump_subcorpus <- function(corpus, subcorpus = "QUERY"){
stopifnot(corpus %in% cqp_list_corpora())
.cqp_dump_subcorpus(paste(corpus, subcorpus, sep = ":"))
}
#' @export cqp_subcorpus_size
#' @rdname cqp_query
cqp_subcorpus_size <- function(corpus, subcorpus = "QUERY"){
stopifnot(corpus %in% cqp_list_corpora())
.cqp_subcorpus_size(scorpus = paste(corpus, subcorpus, sep = ":"))
}
#' @export cqp_list_subcorpora
#' @rdname cqp_query
cqp_list_subcorpora <- function(corpus){
stopifnot(corpus %in% cqp_list_corpora())
.cqp_list_subcorpora(inCorpus = corpus)
}