Skip to content

Commit

Permalink
dev branch merged into master
Browse files Browse the repository at this point in the history
  • Loading branch information
ablaette committed Jun 16, 2018
2 parents 029ce3a + 199ebe6 commit bb1b4e1
Show file tree
Hide file tree
Showing 614 changed files with 80,962 additions and 75,489 deletions.
5 changes: 4 additions & 1 deletion .travis.yml
Expand Up @@ -7,18 +7,21 @@ matrix:
dist: trusty
- os: osx
osx_image: xcode9.1
brew_packages: pkg-config glib pcre

addons:
apt:
packages:
- libpcre3-dev
- libglib2.0-dev

r_packages:
- Rcpp
- covr
- digest
- testthat

warnings_are_errors: false
warnings_are_errors: true

notifications:
email:
Expand Down
6 changes: 4 additions & 2 deletions DESCRIPTION
@@ -1,8 +1,8 @@
Package: RcppCWB
Type: Package
Title: 'Rcpp' Bindings for the 'Corpus Workbench' ('CWB')
Version: 0.2.3
Date: 2018-05-13
Version: 0.2.4
Date: 2018-06-15
Authors@R: c(
person("Andreas", "Blaette", role = c("aut", "cre"), email = "andreas.blaette@uni-due.de"),
person("Bernard", "Desgraupes", role = "aut"),
Expand Down Expand Up @@ -45,11 +45,13 @@ Collate:
'RcppCWB_package.R'
'cl.R'
'cqp.R'
'cwb.R'
'checks.R'
'count.R'
'RcppExports.R'
'decode.R'
'cbow.R'
'region_matrix.R'
'misc.R'
'zzz.R'

6 changes: 6 additions & 0 deletions NAMESPACE
Expand Up @@ -5,6 +5,7 @@ export(check_cpos)
export(check_cqp_query)
export(check_id)
export(check_p_attribute)
export(check_pkg_registry_files)
export(check_region_matrix)
export(check_registry)
export(check_s_attribute)
Expand All @@ -18,12 +19,17 @@ export(cqp_list_subcorpora)
export(cqp_query)
export(cqp_reset_registry)
export(cqp_subcorpus_size)
export(cwb_compress_rdx)
export(cwb_huffcode)
export(cwb_makeall)
export(get_cbow_matrix)
export(get_count_vector)
export(get_pkg_registry)
export(get_region_matrix)
export(region_matrix_to_count_matrix)
export(region_matrix_to_ids)
export(s_attribute_decode)
export(use_tmp_registry)
exportPattern("^[[:alpha:]]+")
importFrom(Rcpp,evalCpp)
useDynLib(RcppCWB, .registration = TRUE)
8 changes: 8 additions & 0 deletions NEWS.md
@@ -1,3 +1,11 @@
# v0.2.4
* for linux and macOS, CWB 3.4.14 included, so that UTF-8 support is realized
* bug removed in check_cqp_query that would prevent special characters from working
in CQP queries
* check_strucs, check_cpos and check_id are checking for NAs now to avoid crashes
* cwb command line tools cwb-makeall, cwb-huffcode and cwb-compress-rdx exposed
as cwb_makeall, cwb_huffcode and cwb_compress_rdx

# v0.2.3
* when loading the package, a check is performed to make sure that paths in the
registry files point to the data files of the sample data (issues may occur when
Expand Down
8 changes: 4 additions & 4 deletions R/RcppCWB_package.R
Expand Up @@ -118,15 +118,15 @@
#' # functions of the corpus library (starting with cl) expose the low-level
#' # access to the CWB corpus library (CL)
#'
#' regdir <- system.file(package = "RcppCWB", "extdata", "cwb", "registry")
#' ids <- cl_cpos2id("REUTERS", cpos = 1:20, p_attribute = "word", registry = regdir)
#' tokens <- cl_id2str("REUTERS", id = ids, p_attribute = "word", registry = regdir)
#' registry <- if (!check_pkg_registry_files()) use_tmp_registry() else get_pkg_registry()
#' ids <- cl_cpos2id("REUTERS", cpos = 1:20, p_attribute = "word", registry = registry)
#' tokens <- cl_id2str("REUTERS", id = ids, p_attribute = "word", registry = registry)
#' print(paste(tokens, collapse = " "))
#'
#' # To use the corpus query processor (CQP) and its syntax, it is necessary first
#' # to initialize CQP (example: get concordances of 'oil')
#'
#' cqp_initialize(regdir)
#' cqp_initialize(registry)
#' cqp_query("REUTERS", query = '[]{5} "oil" []{5}')
#' cpos_matrix <- cqp_dump_subcorpus("REUTERS")
#' concordances_oil <- apply(
Expand Down
12 changes: 12 additions & 0 deletions R/RcppExports.R
Expand Up @@ -125,3 +125,15 @@
.Call(`_RcppCWB_region_matrix_to_count_matrix`, corpus, p_attribute, registry, matrix)
}

.cwb_makeall <- function(x, registry_dir, p_attribute) {
.Call(`_RcppCWB_cwb_makeall`, x, registry_dir, p_attribute)
}

.cwb_huffcode <- function(x, registry_dir, p_attribute) {
.Call(`_RcppCWB_cwb_huffcode`, x, registry_dir, p_attribute)
}

.cwb_compress_rdx <- function(x, registry_dir, p_attribute) {
.Call(`_RcppCWB_cwb_compress_rdx`, x, registry_dir, p_attribute)
}

3 changes: 2 additions & 1 deletion R/cbow.R
Expand Up @@ -11,7 +11,8 @@
#' @rdname get_cbow_matrix
#' @export get_cbow_matrix
#' @examples
#' registry <- system.file(package = "RcppCWB", "extdata", "cwb", "registry")
#' registry <- if (!check_pkg_registry_files()) use_tmp_registry() else get_pkg_registry()
#'
#' m <- get_region_matrix(
#' corpus = "REUTERS", s_attribute = "places",
#' strucs = 0L:5L, registry = registry
Expand Down
79 changes: 69 additions & 10 deletions R/checks.R
Expand Up @@ -79,6 +79,8 @@ check_strucs <- function(corpus, s_attribute, strucs, registry){
stop("all values of vector strucs need to be >= 0")
if (max(strucs) > (cl_attribute_size(corpus, attribute = s_attribute, "s", registry = registry) - 1))
stop("highest value of strucs may not be larger than size of structural attribute")
if (any(is.na(strucs)))
stop("there is an NA value among strucs")
return( TRUE )
}

Expand All @@ -94,8 +96,12 @@ check_region_matrix <- function(region_matrix){
#' @export check_cqp_query
#' @rdname checks
check_cqp_query <- function(query){
if (!substr(query, start = length(query), stop = length(query)) == ";"){
return( paste0(query, ";", sep = "") )
if (!substr(query, start = nchar(query), stop = nchar(query)) == ";"){
encoding_query <- Encoding(query)
retval <- paste0(query, ";", sep = "")
if (Encoding(retval) != encoding_query)
retval <- iconv(retval, from = Encoding(retval), to = encoding_query)
return( retval )
} else {
return( query )
}
Expand All @@ -105,24 +111,77 @@ check_cqp_query <- function(query){
#' @rdname checks
check_cpos <- function(corpus, p_attribute = "word", cpos, registry = Sys.getenv("CORPUS_REGISTRY")){
attr_max <- cl_attribute_size(corpus = corpus, attribute = p_attribute, attribute_type = "p", registry = registry)
if (min(cpos) < 0){
if (min(cpos) < 0)
stop("all corpus positions (cpos) need to be >= 0, not TRUE")
}
if (max(cpos) > attr_max){
if (max(cpos) > attr_max)
stop("all corpus positions (cpos) need to be <= attribute size, not TRUE")
}
if (any(is.na(cpos)))
stop("there are NA values among the corpus positions")
return( TRUE )
}

#' @export check_id
#' @rdname checks
check_id <- function(corpus, p_attribute, id, registry = Sys.getenv("CORPUS_REGISTRY")){
lexicon_size <- cl_lexicon_size(corpus = corpus, p_attribute = p_attribute, registry = registry)
if (min(id) < 0){
if (min(id) < 0)
stop("all corpus positions (cpos) need to be >= 0, not TRUE")
}
if (max(id) > lexicon_size){
if (max(id) > lexicon_size)
stop("all corpus positions (cpos) need to be <= attribute size, not TRUE")
}
if (any(is.na(id)))
stop("there are NA values among the corpus positions")
return( TRUE )
}

#' Check Paths in Registry Files
#'
#' @param pkg Full path to package directory
#' @param set Logical, whether
#' @return Logical value, whether home directories are set correctly.
#' @export check_pkg_registry_files
check_pkg_registry_files <- function(pkg = system.file(package = "RcppCWB"), set = FALSE){
pkg_cwb_dir <- file.path(pkg, "extdata", "cwb")
pkg_registry_dir <- file.path(pkg_cwb_dir, "registry")
pkg_indexed_corpora_dir <- file.path(pkg_cwb_dir, "indexed_corpora")
is_set <- lapply(
list.files(pkg_registry_dir),
function(corpus){
registry_file <- file.path(pkg_registry_dir, corpus)
registry <- readLines(registry_file)
home_line_no <- grep("^HOME", registry)
info_line_no <- grep("^INFO", registry)
registry_home_dir <- gsub("^HOME\\s+\"*(.*?)\"*\\s*$", "\\1", registry[home_line_no])
registry_info_file <- gsub("^INFO\\s+\"*(.*?)\"*\\s*$", "\\1", registry[info_line_no])
pkg_home_dir <- file.path(pkg_indexed_corpora_dir, corpus)
if (!identical(x = registry_home_dir, y = pkg_home_dir)) {
if (set){
message(sprintf("... adjusting data directory in registry for corpus '%s' in package '%s'", corpus, pkg))
info_file_new <- file.path(pkg_home_dir, basename(registry_info_file), fsep = "/")
if (.Platform$OS.type == "windows") {
registry[home_line_no] <- sprintf("HOME \"%s\"", pkg_home_dir)
registry[info_line_no] <- sprintf("INFO \"%s\"", info_file_new)
} else {
if (grepl("\\s+", pkg_home_dir)) {
registry[grep("^HOME", registry)] <- sprintf("HOME \"%s\"", pkg_home_dir)
registry[info_line_no] <- sprintf("INFO \"%s\"", info_file_new)
} else {
registry[grep("^HOME", registry)] <- sprintf("HOME %s", pkg_home_dir)
registry[info_line_no] <- sprintf("INFO %s", info_file_new)
}
}
if (file.access(registry_file, mode = 2) == -1){
warning(sprintf("Not sufficient permissions to modify registry file %s", registry_file),
" which would be necessary to have access to sample corpora in package. ",
"Consider loading package with admin rights one."
)
}
writeLines(text = registry, con = registry_file, sep = "\n")
return(TRUE)
} else {
return(FALSE)
}
}
}
)
all(unlist(is_set))
}
10 changes: 5 additions & 5 deletions R/cl.R
Expand Up @@ -12,7 +12,8 @@
#' @param registry path to the registry directory, defaults to the value of the
#' environment variable CORPUS_REGISTRY
#' @examples
#' registry <- system.file(package = "RcppCWB", "extdata", "cwb", "registry")
#' registry <- if (!check_pkg_registry_files()) use_tmp_registry() else get_pkg_registry()
#'
#' Sys.setenv(CORPUS_REGISTRY = registry)
#' token_no <- cl_attribute_size("REUTERS", attribute = "word", attribute_type = "p")
#' corpus_positions <- seq.int(from = 0, to = token_no - 1)
Expand All @@ -37,7 +38,7 @@ cl_attribute_size <- function(corpus, attribute, attribute_type, registry = Sys.
#' environment variable CORPUS_REGISTRY
#' @rdname cl_lexicon_size
#' @examples
#' registry <- system.file(package = "RcppCWB", "extdata", "cwb", "registry")
#' registry <- if (!check_pkg_registry_files()) use_tmp_registry() else get_pkg_registry()
#' Sys.setenv(CORPUS_REGISTRY = registry)
#' lexicon_size <- cl_lexicon_size("REUTERS", p_attribute = "word")
#' token_ids <- seq.int(from = 0, to = lexicon_size - 1)
Expand Down Expand Up @@ -65,8 +66,7 @@ cl_lexicon_size <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_R
#' @rdname s_attributes
#' @name CL: s_attributes
#' @examples
#' # get registry directory
#' registry <- system.file(package = "RcppCWB", "extdata", "cwb", "registry")
#' registry <- if (!check_pkg_registry_files()) use_tmp_registry() else get_pkg_registry()
#'
#' # get metadata for matches of token
#' # scenario: id of the texts with occurrence of 'oil'
Expand Down Expand Up @@ -157,7 +157,7 @@ cl_cpos2rbound <- function(corpus, s_attribute, cpos, registry = Sys.getenv("COR
#' @name CL: p_attributes
#' @examples
#' # registry directory and cpos_total will be needed in examples
#' registry <- system.file(package = "RcppCWB", "extdata", "cwb", "registry")
#' registry <- if (!check_pkg_registry_files()) use_tmp_registry() else get_pkg_registry()
#' Sys.setenv(CORPUS_REGISTRY = registry)
#' cpos_total <- cl_attribute_size(
#' corpus = "REUTERS", attribute = "word",
Expand Down
2 changes: 1 addition & 1 deletion R/count.R
Expand Up @@ -11,7 +11,7 @@
#' @rdname get_count_vector
#' @export get_count_vector
#' @examples
#' registry <- system.file(package = "RcppCWB", "extdata", "cwb", "registry")
#' registry <- if (!check_pkg_registry_files()) use_tmp_registry() else get_pkg_registry()
#' y <- get_count_vector(
#' corpus = "REUTERS", p_attribute = "word",
#' registry = registry
Expand Down
7 changes: 4 additions & 3 deletions R/cqp.R
Expand Up @@ -15,8 +15,9 @@
#' if (!cqp_is_initialized()) cqp_initialize()
#' cqp_is_initialized() # check initialization status (TRUE now?)
#' cqp_get_registry() # get registry dir used by CQP
#' regdir <- system.file(package = "RcppCWB", "extdata", "cwb", "registry")
#' if (cqp_get_registry() != regdir) cqp_reset_registry(registry = regdir)
#'
#' registry <- if (!check_pkg_registry_files()) use_tmp_registry() else get_pkg_registry()
#' if (cqp_get_registry() != registry) cqp_reset_registry(registry = registry)
#' cqp_list_corpora() # get list of corpora
cqp_initialize <- function(registry = Sys.getenv("CORPUS_REGISTRY")){
registry_new <- registry
Expand Down Expand Up @@ -119,7 +120,7 @@ cqp_list_corpora <- function() .cqp_list_corpora()
#' Evert, S. 2005. The CQP Query Language Tutorial. Available online at
#' \url{http://cwb.sourceforge.net/files/CWB_Encoding_Tutorial.pdf}
#' @examples
#' registry <- system.file(package = "RcppCWB", "extdata", "cwb", "registry")
#' registry <- if (!check_pkg_registry_files()) use_tmp_registry() else get_pkg_registry()
#'
#' if (!cqp_is_initialized()){
#' cqp_initialize(registry = registry)
Expand Down
79 changes: 79 additions & 0 deletions R/cwb.R
@@ -0,0 +1,79 @@
#' CWB Tools for Creating Corpora
#'
#' Wrappers for the CWB tools (\code{cwb-makeall}, \code{cwb-huffcode},
#' \code{cwb-compress-rdx}). Unlike the 'original' command line tools, these
#' wrappers will always perform a specific indexing/compression step on one
#' positional attribute, and produce all components.
#'
#' @rdname cwb_utils
#' @param corpus name of a CWB corpus (upper case)
#' @param p_attribute name p-attribute
#' @param registry path to the registry directory, defaults to the value of the
#' environment variable CORPUS_REGISTRY
#' @examples
#' # The package includes and 'unfinished' corpus of debates in the UN General
#' # Assembly ("UNGA"), i.e. it does not yet include the reverse index, and it is
#' # not compressed.
#' #
#' # The first step in the following example is to copy the raw
#' # corpus to a temporary place.
#'
#' registry <- if (!check_pkg_registry_files()) use_tmp_registry() else get_pkg_registry()
#' home_dir <- system.file(package = "RcppCWB", "extdata", "cwb", "indexed_corpora", "unga")
#'
#' tmpdir <- tempdir()
#' win <- if (Sys.info()[["sysname"]] == "Windows") TRUE else FALSE
#' if (win) tmpdir <- normalizePath(tmpdir)
#' tmp_regdir <- file.path(tmpdir, "registry", fsep = if (win) "\\" else "/")
#' tmp_data_dir <- file.path(tmpdir, "indexed_corpora", fsep = if (win) "\\" else "/")
#' tmp_unga_dir <- file.path(tmp_data_dir, "unga", fsep = if (win) "\\" else "/")
#' if (!file.exists(tmp_regdir)) dir.create(tmp_regdir)
#' if (!file.exists(tmp_data_dir)) dir.create(tmp_data_dir)
#' if (!file.exists(tmp_unga_dir)){
#' dir.create(tmp_unga_dir)
#' } else {
#' file.remove(list.files(tmp_unga_dir, full.names = TRUE))
#' }
#' regfile <- readLines(file.path(registry, "unga"))
#' homedir_line <- grep("^HOME", regfile)
#' regfile[homedir_line] <- sprintf('HOME "%s"', tmp_unga_dir)
#' writeLines(text = regfile, con = file.path(tmp_regdir, "unga"))
#' for (x in list.files(home_dir, full.names = TRUE)){
#' file.copy(from = x, to = tmp_unga_dir)
#' }
#'
#' # perform cwb_makeall (equivalent to cwb-makeall command line utility)
#' cwb_makeall(corpus = "UNGA", p_attribute = "word", registry = tmp_regdir)
#'
#' # see whether it works
#' ids_sentence_1 <- cl_cpos2id(
#' corpus = "UNGA", p_attribute = "word", registry = tmp_regdir,
#' cpos = 0:83
#' )
#' tokens_sentence_1 <- cl_id2str(
#' corpus = "UNGA", p_attribute = "word",
#' registry = tmp_regdir, id = ids_sentence_1
#' )
#' sentence <- gsub("\\s+([\\.,])", "\\1", paste(tokens_sentence_1, collapse = " "))
#' @rdname cwb_utils
#' @export cwb_makeall
cwb_makeall <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY")){
.cwb_makeall(x = corpus, p_attribute = p_attribute, registry_dir = registry)
}


#' @rdname cwb_utils
#' @export cwb_huffcode
#' @examples
#' cwb_huffcode(corpus = "UNGA", p_attribute = "word", registry = tmp_regdir)
cwb_huffcode <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY")){
.cwb_huffcode(x = corpus, p_attribute = p_attribute, registry_dir = registry)
}

#' @rdname cwb_utils
#' @export cwb_compress_rdx
#' @examples
#' cwb_compress_rdx(corpus = "UNGA", p_attribute = "word", registry = tmp_regdir)
cwb_compress_rdx <- function(corpus, p_attribute, registry = Sys.getenv("CORPUS_REGISTRY")){
.cwb_compress_rdx(x = corpus, p_attribute = p_attribute, registry_dir = registry)
}
2 changes: 1 addition & 1 deletion R/decode.R
Expand Up @@ -20,7 +20,7 @@
#' @rdname s_attribute_decode
#' @return a character vector
#' @examples
#' registry <- system.file(package = "RcppCWB", "extdata", "cwb", "registry")
#' registry <- if (!check_pkg_registry_files()) use_tmp_registry() else get_pkg_registry()
#' Sys.setenv(CORPUS_REGISTRY = registry)
#'
#' # pure R implementation (Rcpp implementation fails on Windows in vanilla mode)
Expand Down

0 comments on commit bb1b4e1

Please sign in to comment.