diff --git a/NAMESPACE b/NAMESPACE index 7719f46..beb025d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,12 +5,12 @@ export(as_annotation) export(as_subcorpus) export(dbpedia_get_wikidata_uris) export(dbpedia_spotlight_status) -export(map_types_to_class) export(namespaced_xpath) export(sparql_query) export(wikidata_query) export(xml_enrich) exportMethods(add_wikidata_uris) +exportMethods(entity_types_map) exportMethods(get_annotation_table) exportMethods(get_dbpedia_uris) import(methods) diff --git a/R/dbpedia.R b/R/dbpedia.R index dfbdd7d..363fc23 100644 --- a/R/dbpedia.R +++ b/R/dbpedia.R @@ -354,7 +354,19 @@ setGeneric("get_dbpedia_uris", function(x, ...) standardGeneric("get_dbpedia_uri #' types = "Company", #' api = "http://api.dbpedia-spotlight.org/en/annotate" #' ) -setMethod("get_dbpedia_uris", "character", function(x, language = getOption("dbpedia.lang"), max_len = 5600L, confidence = 0.35, api = getOption("dbpedia.endpoint"), types = character(), support = 20, verbose = TRUE){ +setMethod( + "get_dbpedia_uris", "character", + function( + x, + language = getOption("dbpedia.lang"), + max_len = 5600L, + confidence = 0.35, + api = getOption("dbpedia.endpoint"), + types = character(), + support = 20, + types_src = c("DBpedia", "Wikidata"), + verbose = TRUE + ){ if (nchar(x) > max_len){ if (verbose) cli_alert_warning( @@ -425,6 +437,28 @@ setMethod("get_dbpedia_uris", "character", function(x, language = getOption("dbp ) } )] + + if (length(types_src) > 0L){ + src_all <- unique(unlist(lapply(resources_min[["types"]], names))) + src_unused <- setdiff(src_all, types_src) + if (length(src_unused) > 0L) + cli_alert_info( + "dropping available types from: {paste(src_unused, collapse = ' / ')}" + ) + for (src in types_src){ + types_vec <- unlist(lapply( + lapply(resources_min[["types"]], `[[`, src), + function(x){ + if (is.null(x)) + NA_character_ + else + sprintf("|%s|", paste(x, collapse = "|")) + } + )) + + resources_min[, (paste(src, "type", sep = "_")) := types_vec] 
+ } + } resources_min }) @@ -475,6 +509,11 @@ setMethod("get_dbpedia_uris", "AnnotatedPlainTextDocument", function(x, language #' vector is empty (default), no restrictions are applied. #' @param support The number of indegrees at Wikidata. Useful for limiting the #' the number of results by excluding insignificant entities. +#' @param types_src A `character` vector specifying knowledge bases as sources +#' for entity types. If provided, columns following the pattern '(src)_type' +#' (e.g. "DBpedia_type") with entity types (`NA` if not available) will be +#' added to the table. Values are wrapped and separated by vertical bars. +#' `types_src` defaults to "DBpedia" and "Wikidata". #' @param verbose A `logical` value - whether to display messages. #' @param progress A `logical` value - whether to show progress. #' @param s_attribute A length-one `character` vector indicating a s-attribute. @@ -765,7 +804,8 @@ setMethod( verbose = if (progress) FALSE else verbose )[, "doc" := docname] } - ) + ), + fill = TRUE ) if (progress) cli_progress_done(.envir = env) diff --git a/R/entity_types.R b/R/entity_types.R index b9b72eb..be8cee5 100644 --- a/R/entity_types.R +++ b/R/entity_types.R @@ -12,19 +12,24 @@ #' @param other a `character vector` with the name of the class of all types not #' matched by the `mapping_vector`. #' @param verbose A `logical` value - whether to display messages. +#' @param ... Further arguments. #' @importFrom data.table is.data.table #' @importFrom cli format_error cli_alert_info #' @details If there is more than one match between the retrieved types and the #' `mapping vector`, unique classes are sorted alphabetically and collapsed. #' @return Function adds classes to input data.table by reference. -#' @exportMethod generic +#' @exportMethod entity_types_map #' @rdname entity_types_map -setGeneric("entity_types_map", function(x, ...) - standardGeneric("entity_types_map")) +setGeneric( + "entity_types_map", + function(x, ...) 
standardGeneric("entity_types_map") +) #' @rdname entity_types_map #' @examples +#' library(quanteda) +#' #' inaugural_paragraphs <- data_corpus_inaugural %>% #' corpus_subset(Year == 2009) %>% # limit to Barack Obama 2009 #' corpus_reshape(to = "paragraphs") @@ -116,4 +121,4 @@ setMethod( x[, class := entity_types_map(x = x[["types"]])] x -} +}) diff --git a/man/map_types_to_class.Rd b/man/entity_types_map.Rd similarity index 51% rename from man/map_types_to_class.Rd rename to man/entity_types_map.Rd index 953bec7..377d176 100644 --- a/man/map_types_to_class.Rd +++ b/man/entity_types_map.Rd @@ -1,14 +1,22 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\name{map_types_to_class} -\alias{map_types_to_class} +% Please edit documentation in R/entity_types.R +\name{entity_types_map} +\alias{entity_types_map} +\alias{entity_types_map,list-method} +\alias{entity_types_map,data.table-method} \title{Map types returned by DBpedia Spotlight to a limited set of classes} \usage{ -map_types_to_class(x, mapping_vector, other = "MISC", verbose = TRUE) +entity_types_map(x, ...) + +\S4method{entity_types_map}{list}(x, mapping_vector, other = "MISC", verbose = TRUE) + +\S4method{entity_types_map}{data.table}(x, mapping_vector, other = "MISC", verbose = TRUE) } \arguments{ \item{x}{A \code{data.table} with DBpedia URIs.} +\item{...}{Further arguments.} + \item{mapping_vector}{A \verb{named character vector} with desired class names (as names) and types from the DBpedia ontology as values. For example: c("PERSON" = "DBpedia:Person"). Can contain more than one pair of class and @@ -31,3 +39,31 @@ function is to reduce the number of types to a limited set of classes. If there is more than one match between the retrieved types and the \verb{mapping vector}, unique classes are sorted alphabetically and collapsed. 
} +\examples{ +library(quanteda) + +inaugural_paragraphs <- data_corpus_inaugural \%>\% + corpus_subset(Year == 2009) \%>\% # limit to Barack Obama 2009 + corpus_reshape(to = "paragraphs") + +uritab_paragraphs <- get_dbpedia_uris( + x = inaugural_paragraphs, + language = "en", + max_len = 5600L, + confidence = 0.5, + api = "http://api.dbpedia-spotlight.org/en/annotate", + verbose = FALSE, + progress = TRUE +) + +mapping_vector = c( + "PERSON" = "DBpedia:Person", + "ORGANIZATION" = "DBpedia:Organisation", + "LOCATION" = "DBpedia:Place" +) + +entity_types_map( + uritab_paragraphs[["types"]], + mapping_vector = mapping_vector +) +} diff --git a/man/get_dbpedia_uris.Rd b/man/get_dbpedia_uris.Rd index dc1488d..9a709e2 100644 --- a/man/get_dbpedia_uris.Rd +++ b/man/get_dbpedia_uris.Rd @@ -21,6 +21,7 @@ get_dbpedia_uris(x, ...) api = getOption("dbpedia.endpoint"), types = character(), support = 20, + types_src = c("DBpedia", "Wikidata"), verbose = TRUE ) @@ -119,6 +120,12 @@ vector is empty (default), no restrictions are applied.} \item{support}{The number of indegrees at Wikidata. Useful for limiting the the number of results by excluding insignificant entities.} +\item{types_src}{A \code{character} vector specifying knowledge bases as sources +for entity types. If provided, columns following the pattern '(src)_type' +(e.g. "DBpedia_type") with entity types (\code{NA} if not available) will be +added to the table. Values are wrapped and separated by vertical bars. +\code{types_src} defaults to "DBpedia" and "Wikidata".} + \item{verbose}{A \code{logical} value - whether to display messages.} \item{p_attribute}{The p-attribute used for decoding a \code{subcorpus} object.}