In [35]:
library("readxl")
library(rvest) # scrap web pages
library(reticulate)  # R & python interchange
library(tidyverse)

## Import BIB data
- https://bibliometrix.org/documents/bibliometrix_Report.html
- Place your BibTeX files in the `bibs` folder

In [21]:
# Locate the BibTeX exports.
# Assumes the working directory is the one containing this notebook, so that
# the relative path "bibs/" resolves.
library(bibliometrix)
# Collect every file whose name ends in "bib" under bibs/, descending into
# subdirectories and keeping the directory prefix in each returned path.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
bibs <- list.files(
  "bibs/",
  pattern = "bib$",
  recursive = TRUE,
  full.names = TRUE
)
bibs # print the paths found; R indexes from 1

In [None]:
# Parse the second file listed in `bibs` with convert2df(), declaring it as a
# BibTeX export from the "isi" source.
M <- convert2df(
  file = bibs[2],
  dbsource = "isi",
  format = "bibtex"
)

## Import xlsx data
Spreadsheet exported from the StArt tool.

In [None]:
# Load the spreadsheet and keep only the rows marked as accepted.
my_data <- read_excel("bibs/arquivos.xlsx")
# `%in%` is used instead of `==`: `==` yields NA for rows whose status cell is
# missing, and indexing rows with NA inserts all-NA rows into the result.
# `%in%` returns FALSE for those rows, so they are simply dropped.
sub_data <- my_data[my_data$`Status/Selection` %in% "ACCEPTED", ]
names(sub_data) # show the column names

# Export to document
Working

# Manipulating the data

In [28]:
# Add a `fog` column holding title, abstract and keywords joined by spaces.
# Columns are referenced by bare name so that mutate() reads them from the data
# flowing through the pipe rather than from the global `sub_data` object.
# NOTE(review): paste() renders a missing value as the literal text "NA";
# confirm whether that is acceptable for the text analysis that follows.
sub_data <- sub_data %>%
  mutate(fog = paste(Title, Abstract, Keywords))

In [29]:
# Narrow the table to the identifying columns plus the combined text field.
temp_subdata <- select(sub_data, Title, DOI, Year, fog)

### Topic Modeling

In [53]:
# Turn the combined `fog` text column into a document-feature matrix.
library(quanteda)
# Build a corpus whose text comes from `fog`; the other columns of
# `temp_subdata` travel along as document variables.
data.dtm <- corpus(temp_subdata, text_field = "fog")
# Tokenise explicitly before building the matrix: passing a corpus straight to
# dfm() has been deprecated since quanteda 3 and is not supported in later
# releases. tokens() is called with its defaults, so no extra cleaning
# (punctuation or stop-word removal) is applied here.
data.dtm <- dfm(tokens(data.dtm))
data.dtm

Document-feature matrix of: 204 documents, 7,363 features (98.3% sparse) and 3 docvars.
       features
docs    from high-level modeling toward efficient and trustworthy circuits
  text1    1          1        1      1         2  10           1        1
  text2    2          0        0      0         0   9           0        0
  text3    0          0        1      0         0   5           0        0
  text4    0          0        0      0         0   5           0        0
  text5    0          0        0      0         0   2           0        0
  text6    0          0        0      0         0   7           0        0
       features
docs    behavior-interaction-priority (
  text1                             1 3
  text2                             0 2
  text3                             0 1
  text4                             0 0
  text5                             0 2
  text6                             0 1
[ reached max_ndoc ... 198 more documents, reached max_nfeat ... 7,353 more

In [61]:
library(tidytext)
# Convert the matrix to tidy form and re-cast it with cast_dtm(), using the
# `document`, `term` and `count` columns. The result replaces `data.dtm` and is
# what FindTopicsNumber() receives in a later cell.
data.dtm <- data.dtm %>%
  tidy() %>%
  cast_dtm(document, term, count)

In [64]:
library(ldatuning)
library(topicmodels)
# Score candidate topic counts 2 through 7 with four metrics, using the Gibbs
# method with a fixed seed and a single core, and printing progress.
tunningresult <- FindTopicsNumber(
  data.dtm,
  topics = seq(from = 2, to = 7, by = 1),
  method = "Gibbs",
  metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
  control = list(seed = 77), # fixed seed
  mc.cores = 1L,
  verbose = TRUE
)

fit models... done.
calculate metrics:
  Griffiths2004... done.
  CaoJuan2009... done.
  Arun2010... done.
  Deveaud2014... done.


### Stemming with Spacy
- https://spacy.io - substantive subordinate operations

In [30]:
# Import the Python `spacy` module and load its "en_core_web_sm" pipeline.
spacy <- import("spacy")
nlp <- spacy$load("en_core_web_sm")