.Rhistory

# ---- Clean the corpus ----
# NOTE(review): `document` must already exist when these lines run; its
# creation is not visible above this point.
document <- tm_map(document, removeNumbers) # remove numbers
document <- tm_map(document, removeWords, stopwords("english")) # remove English stopwords
document <- tm_map(
  document, removePunctuation,
  preserve_intra_word_dashes = TRUE
)
document <- tm_map(document, stripWhitespace) # collapse extra whitespace

# ---- Document-term matrix ----
dtm <- DocumentTermMatrix(document)

# ---- Fit an LDA model with 8 topics (seed fixed at 1234) ----
model_lda <- LDA(dtm, k = 8, control = list(seed = 1234))
model_lda

# ---- beta: probability of a word being associated with a topic ----
beta_topics <- tidy(model_lda, matrix = "beta")
beta_topics

# Keep the 10 highest-beta terms of every topic, ordered within topic.
beta_top_terms <- beta_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# One bar-chart facet per topic showing its top terms.
beta_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

# Author's reading of the topics. Only four are described although k = 8 here.
# NOTE(review): presumably carried over from a 4-topic fit; re-check.
#   topic 1: students' mental limits, like maths difficulty
#   topic 2: tech classroom
#   topic 3: tech education
#   topic 4: education, craft, tech subject & curriculum

# ---- gamma: per-document-per-topic probability ----
# Example from the author's notes: document 1 has gamma 6.4 on topic 1 and
# document 4 has 9.99 on topic 1, the highest.
gamma_documents <- tidy(model_lda, matrix = "gamma")

# Data frame with the gamma results plus a numeric document index.
doc_gamma.df <- data.frame(gamma_documents)
# Repeat the document index once per fitted topic so the vector always matches
# the number of gamma rows, whatever k is (assumes tidy() returns nrow(dtm)
# rows per topic, as the earlier hard-coded rep(..., 4) did).
doc_gamma.df$chapter <- rep(seq_len(nrow(dtm)), times = model_lda@k)

# Plot gamma per document, one panel per topic.
ggplot(
  data = doc_gamma.df,
  aes(x = chapter, y = gamma, group = factor(topic), color = factor(topic))
) +
  geom_line() +
  facet_wrap(~ factor(topic), ncol = 1)
# Create a model with 10 topics (seed fixed at 1234)
model_lda <- LDA(dtm, k = 10, control = list(seed = 1234))
# ---- Packages ----
# library() stops immediately when a package is missing; require() would only
# warn and let a later call fail instead.
library(topicmodels)
library(pdftools)
library(tm)
library(tidytext)
library(ggplot2)
library(dplyr)

# ---- Load all PDF files found in the working directory ----
All_Files <- list.files(pattern = "pdf$")
All_opinions <- lapply(All_Files, pdf_text)

# ---- Create the corpus ----
# NOTE(review): All_opinions is a list holding one character vector per PDF;
# confirm that Corpus(VectorSource()) handles that shape as intended.
document <- Corpus(VectorSource(All_opinions))

# ---- Clean the corpus ----
document <- tm_map(document, content_transformer(tolower)) # lower-case all text
document <- tm_map(document, removeNumbers) # remove numbers
document <- tm_map(document, removeWords, stopwords("english")) # remove English stopwords
document <- tm_map(
  document, removePunctuation,
  preserve_intra_word_dashes = TRUE
)
document <- tm_map(document, stripWhitespace) # collapse extra whitespace

# ---- Document-term matrix ----
dtm <- DocumentTermMatrix(document)

# ---- Fit an LDA model with 10 topics (seed fixed at 1234) ----
model_lda <- LDA(dtm, k = 10, control = list(seed = 1234))
model_lda

# ---- beta: probability of a word being associated with a topic ----
beta_topics <- tidy(model_lda, matrix = "beta")
beta_topics

# Keep the 10 highest-beta terms of every topic, ordered within topic.
beta_top_terms <- beta_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# One bar-chart facet per topic showing its top terms.
beta_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

# Author's reading of the topics. Only four are described although k = 10 here.
# NOTE(review): presumably carried over from a 4-topic fit; re-check.
#   topic 1: students' mental limits, like maths difficulty
#   topic 2: tech classroom
#   topic 3: tech education
#   topic 4: education, craft, tech subject & curriculum

# ---- gamma: per-document-per-topic probability ----
# Example from the author's notes: document 1 has gamma 6.4 on topic 1 and
# document 4 has 9.99 on topic 1, the highest.
gamma_documents <- tidy(model_lda, matrix = "gamma")

# Data frame with the gamma results plus a numeric document index.
doc_gamma.df <- data.frame(gamma_documents)
# Repeat the document index once per fitted topic. A hard-coded 4 yields
# 4 * nrow(dtm) values, which cannot fill the 10 * nrow(dtm) gamma rows of this
# model (assumes tidy() returns nrow(dtm) rows per topic).
doc_gamma.df$chapter <- rep(seq_len(nrow(dtm)), times = model_lda@k)

# Plot gamma per document, one panel per topic.
ggplot(
  data = doc_gamma.df,
  aes(x = chapter, y = gamma, group = factor(topic), color = factor(topic))
) +
  geom_line() +
  facet_wrap(~ factor(topic), ncol = 1)
# ---- Packages ----
# Install keras only when it is missing, and do so before attaching it.
if (!requireNamespace("keras", quietly = TRUE)) {
  install.packages("keras")
}
library(topicmodels)
library(pdftools)
library(tm)
library(tidytext)
library(ggplot2)
library(dplyr)
library(keras)

# ---- Load all PDF files found in the working directory ----
All_Files <- list.files(pattern = "pdf$")
All_opinions <- lapply(All_Files, pdf_text)

# ---- Create the corpus ----
document <- Corpus(VectorSource(All_opinions))
document

# ---- Build the word index ----
# The tokenizer must be stored under the name `tokenizer`, because the lines
# below read tokenizer$word_index.
# NOTE(review): fit_text_tokenizer() is handed the tm corpus object directly;
# confirm keras accepts it, or pass a character vector of texts instead.
tokenizer <- text_tokenizer(num_words = 100) %>%
  fit_text_tokenizer(document)

# Named list mapping words to their rank/index.
word_index <- tokenizer$word_index
print(t(word_index))
cat("Found", length(word_index), "unique tokens\n")
# ---- Packages ----
# tokenizers is attached here, before count_words() is first called.
library(topicmodels)
library(pdftools)
library(tm)
library(tidytext)
library(ggplot2)
library(dplyr)
library(keras)
library(tokenizers)

# ---- Load all PDF files found in the working directory ----
All_Files <- list.files(pattern = "pdf$")
All_opinions <- lapply(All_Files, pdf_text)

# ---- Create the corpus ----
document <- Corpus(VectorSource(All_opinions))

# ---- Word counts ----
# NOTE(review): `mobydick` is presumably the sample text shipped with
# tokenizers; it is not created anywhere in this script.
count_words(mobydick)
# NOTE(review): count_words() receives the tm corpus object here; confirm it
# accepts that, or pass the underlying character vector instead.
count_words(document)

# Limit how much is printed from here on.
options(max.print = 25)

# Flatten the corpus to character strings and show the result.
suman <- paste0(document)
suman
# Suman Das
# We collected data from 22 central universities.

# ---- Packages ----
# library() stops immediately when a package is missing; require() would only
# warn and let a later call fail instead.
library(topicmodels)
library(pdftools)
library(tm)
library(tidytext)
library(ggplot2)
library(dplyr)
library(wordcloud)

# ---- Load all PDF files found in the working directory ----
All_Files <- list.files(pattern = "pdf$")
All_opinions <- lapply(All_Files, pdf_text)

# ---- Create the corpus ----
# NOTE(review): All_opinions is a list holding one character vector per PDF;
# confirm that Corpus(VectorSource()) handles that shape as intended.
document <- Corpus(VectorSource(All_opinions))
#document<- removeNumbers(document)
#document <- removeWords(document, Corpus("ufb","need","mnon"))

# ---- Clean the corpus ----
document <- tm_map(document, content_transformer(tolower)) # lower-case all text
document <- tm_map(document, removeNumbers) # remove numbers
document <- tm_map(document, removeWords, stopwords("english")) # remove English stopwords
document <- tm_map(
  document, removePunctuation,
  preserve_intra_word_dashes = TRUE
)
document <- tm_map(document, stripWhitespace) # collapse extra whitespace

# ---- Document-term matrix ----
dtm <- DocumentTermMatrix(document)
#inspect(dtm)
findFreqTerms(dtm, 5)
inspect(removeSparseTerms(dtm, 0.4))

# ---- Fit an LDA model with 4 topics (seed fixed at 1234) ----
model_lda <- LDA(dtm, k = 4, control = list(seed = 1234))
model_lda

# ---- beta: probability of a word being associated with a topic ----
beta_topics <- tidy(model_lda, matrix = "beta")
beta_topics

# Keep the 10 highest-beta terms of every topic, ordered within topic.
beta_top_terms <- beta_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# One bar-chart facet per topic showing its top terms.
beta_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

# ---- gamma: per-document-per-topic probability ----
# Example from the author's notes: document 1 has gamma 6.4 on topic 1 and
# document 4 has 9.99 on topic 1, the highest.
gamma_documents <- tidy(model_lda, matrix = "gamma")

# Data frame with the gamma results plus a numeric document index.
doc_gamma.df <- data.frame(gamma_documents)
# Repeat the document index once per fitted topic so this line keeps working
# if k changes (assumes tidy() returns nrow(dtm) rows per topic).
doc_gamma.df$chapter <- rep(seq_len(nrow(dtm)), times = model_lda@k)

# Plot gamma per document, one panel per topic.
ggplot(
  data = doc_gamma.df,
  aes(x = chapter, y = gamma, group = factor(topic), color = factor(topic))
) +
  geom_line() +
  facet_wrap(~ factor(topic), ncol = 1)

# ---- Raw model slots ----
# The assignment needs a target name; a bare `<-terms(...)` does not parse.
top_terms <- terms(model_lda, 5)
top_terms
head(model_lda@gamma)
model_lda@beta[, 1]
# beta appears to be stored on the log scale (hence the exp() below), so the
# first sum adds log values and the second adds probabilities.
sum(model_lda@beta[1, ])
sum(exp(model_lda@beta[1, ]))
##topic <- 1
##df <- data.frame(term = model_lda@terms, p = exp(model_lda@beta[topic,]))
##head(df[order(-df$p),])

# ---- Word cloud of the most frequent words ----
library(wesanderson)
wordcloud(
  document,
  scale = c(4.5, 0.5),
  max.words = 8,
  min.freq = 100,
  random.order = FALSE,
  rot.per = 0.40
)
# Build `n` colours whose hues are evenly spaced from 15 to 375 degrees, at
# fixed luminance 65 and chroma 100.
#
# n: number of colours wanted.
# Returns a character vector of `n` colour codes (empty when n is 0).
gColors <- function(n) {
  # n + 1 hues are generated and the last one dropped, because 375 degrees
  # wraps around to the same hue as 15 degrees.
  hues <- seq(15, 375, length.out = n + 1)
  # seq_len(n) yields an empty index for n = 0, where 1:n would yield c(1, 0).
  hcl(h = hues, l = 65, c = 100)[seq_len(n)]
}

# The script also calls this helper as ggColors(); keep that spelling working.
ggColors <- gColors
# Eight colours from gColors(), the palette helper defined earlier in this
# script.
gg.cols <- gColors(8)

# Palette for the word cloud: three named colours plus the 8-colour
# "Dark2" brewer palette.
bp.cols <- c("light blue", "cornflowerblue", "coral2", brewer.pal(8, "Dark2"))

wordcloud(
  document,
  scale = c(5.5, 0.5),
  max.words = 200,
  min.freq = 50,
  random.order = FALSE,
  rot.per = 0.40,
  use.r.layout = FALSE,
  random.color = TRUE,
  colors = bp.cols
  #colors=brewer.pal(8, "Dark2")
)

# Top Words
# NOTE(review): Topics() is not a function of the attached packages; the
# heading suggests the top terms per topic were wanted, so terms() is used.
# Confirm this is the intended output.
terms(model_lda, 10)
# ---- Packages ----
# library() stops immediately when a package is missing; require() would only
# warn and let a later call fail instead.
library(topicmodels)
library(pdftools)
library(tm)
library(tidytext)
library(ggplot2)
library(dplyr)
library(keras)

# ---- Load all PDF files found in the working directory ----
All_Files <- list.files(pattern = "pdf$")
All_opinions <- lapply(All_Files, pdf_text)

# ---- Create the corpus ----
# NOTE(review): All_opinions is a list holding one character vector per PDF;
# confirm that Corpus(VectorSource()) handles that shape as intended.
document <- Corpus(VectorSource(All_opinions))

# ---- Clean the corpus ----
document <- tm_map(document, content_transformer(tolower)) # lower-case all text
document <- tm_map(document, removeNumbers) # remove numbers
document <- tm_map(document, removeWords, stopwords("english")) # remove English stopwords
document <- tm_map(
  document, removePunctuation,
  preserve_intra_word_dashes = TRUE
)
document <- tm_map(document, stripWhitespace) # collapse extra whitespace

# ---- Document-term matrix ----
dtm <- DocumentTermMatrix(document)

# ---- Fit an LDA model with 8 topics (seed fixed at 1234) ----
model_lda <- LDA(dtm, k = 8, control = list(seed = 1234))
model_lda

# ---- beta: probability of a word being associated with a topic ----
beta_topics <- tidy(model_lda, matrix = "beta")
beta_topics

# Keep the 10 highest-beta terms of every topic, ordered within topic.
beta_top_terms <- beta_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# One bar-chart facet per topic showing its top terms.
beta_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

# Author's reading of the topics. Only four are described although k = 8 here.
# NOTE(review): presumably carried over from a 4-topic fit; re-check.
#   topic 1: students' mental limits, like maths difficulty
#   topic 2: tech classroom
#   topic 3: tech education
#   topic 4: education, craft, tech subject & curriculum

# ---- gamma: per-document-per-topic probability ----
# Example from the author's notes: document 1 has gamma 6.4 on topic 1 and
# document 4 has 9.99 on topic 1, the highest.
gamma_documents <- tidy(model_lda, matrix = "gamma")

# Data frame with the gamma results plus a numeric document index.
doc_gamma.df <- data.frame(gamma_documents)
# Repeat the document index once per fitted topic so the vector always matches
# the number of gamma rows, whatever k is (assumes tidy() returns nrow(dtm)
# rows per topic, as the earlier hard-coded rep(..., 4) did).
doc_gamma.df$chapter <- rep(seq_len(nrow(dtm)), times = model_lda@k)

# Plot gamma per document, one panel per topic.
ggplot(
  data = doc_gamma.df,
  aes(x = chapter, y = gamma, group = factor(topic), color = factor(topic))
) +
  geom_line() +
  facet_wrap(~ factor(topic), ncol = 1)
# Install LDAvis only when it is missing, and do so before attaching it.
if (!requireNamespace("LDAvis", quietly = TRUE)) {
  install.packages("LDAvis")
}
library(LDAvis)

# NOTE(review): to_ldavis(), `model` and `pool` are not defined anywhere in the
# visible script; this call fails until they are supplied.
to_ldavis(model, pool$corpus, pool$document_term_matrix)

#' Twenty Newsgroups Data
#'
#' @format A list of elements extracted from a topic model fit to this data
#' \describe{
#'   \item{phi}{phi, a matrix with the topic-term distributions}
#'   \item{theta}{theta, a matrix with the document-topic distributions}
#'   \item{doc.length}{doc.length, a numeric vector with token counts for each document}
#'   \item{vocab}{vocab, a character vector containing the terms}
#'   \item{term.frequency}{term.frequency, a numeric vector of observed term frequencies}
#' }
#' @source \url{http://qwone.com/~jason/20Newsgroups/}
"TwentyNewsgroups"

library(tm)
library(tidytext)
# Suman Das
# We collected data from 22 central universities.

# ---- Packages ----
# library() stops immediately when a package is missing; require() would only
# warn and let a later call fail instead.
library(topicmodels)
library(pdftools)
library(tm)
library(tidytext)
library(ggplot2)
library(dplyr)
library(wordcloud)

# ---- Load all PDF files found in the working directory ----
All_Files <- list.files(pattern = "pdf$")
All_opinions <- lapply(All_Files, pdf_text)

# ---- Create the corpus ----
# NOTE(review): All_opinions is a list holding one character vector per PDF;
# confirm that Corpus(VectorSource()) handles that shape as intended.
document <- Corpus(VectorSource(All_opinions))
#document<- removeNumbers(document)
#document <- removeWords(document, Corpus("ufb","need","mnon"))

# ---- Clean the corpus ----
document <- tm_map(document, content_transformer(tolower)) # lower-case all text
document <- tm_map(document, removeNumbers) # remove numbers
document <- tm_map(document, removeWords, stopwords("english")) # remove English stopwords
document <- tm_map(
  document, removePunctuation,
  preserve_intra_word_dashes = TRUE
)
document <- tm_map(document, stripWhitespace) # collapse extra whitespace

# ---- Document-term matrix ----
dtm <- DocumentTermMatrix(document)
#inspect(dtm)
findFreqTerms(dtm, 5)
inspect(removeSparseTerms(dtm, 0.4))

# ---- Fit an LDA model with 4 topics (seed fixed at 1234) ----
model_lda <- LDA(dtm, k = 4, control = list(seed = 1234))
model_lda

# ---- beta: probability of a word being associated with a topic ----
beta_topics <- tidy(model_lda, matrix = "beta")
beta_topics

# Keep the 10 highest-beta terms of every topic, ordered within topic.
beta_top_terms <- beta_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# One bar-chart facet per topic showing its top terms.
beta_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

# ---- gamma: per-document-per-topic probability ----
# Example from the author's notes: document 1 has gamma 6.4 on topic 1 and
# document 4 has 9.99 on topic 1, the highest.
gamma_documents <- tidy(model_lda, matrix = "gamma")

# Data frame with the gamma results plus a numeric document index.
doc_gamma.df <- data.frame(gamma_documents)
# Repeat the document index once per fitted topic so this line keeps working
# if k changes (assumes tidy() returns nrow(dtm) rows per topic).
doc_gamma.df$chapter <- rep(seq_len(nrow(dtm)), times = model_lda@k)

# Plot gamma per document, one panel per topic.
ggplot(
  data = doc_gamma.df,
  aes(x = chapter, y = gamma, group = factor(topic), color = factor(topic))
) +
  geom_line() +
  facet_wrap(~ factor(topic), ncol = 1)

# ---- Raw model slots ----
# The assignment needs a target name; a bare `<-terms(...)` does not parse.
top_terms <- terms(model_lda, 5)
top_terms
head(model_lda@gamma)
model_lda@beta[, 1]
# beta appears to be stored on the log scale (hence the exp() below), so the
# first sum adds log values and the second adds probabilities.
sum(model_lda@beta[1, ])
sum(exp(model_lda@beta[1, ]))
##topic <- 1
##df <- data.frame(term = model_lda@terms, p = exp(model_lda@beta[topic,]))
##head(df[order(-df$p),])

# ---- Word cloud of the most frequent words ----
library(wesanderson)
wordcloud(
  document,
  scale = c(4.5, 0.5),
  max.words = 8,
  min.freq = 100,
  random.order = FALSE,
  rot.per = 0.40
)
# Build `n` colours whose hues are evenly spaced from 15 to 375 degrees, at
# fixed luminance 65 and chroma 100.
#
# n: number of colours wanted.
# Returns a character vector of `n` colour codes (empty when n is 0).
gColors <- function(n) {
  # n + 1 hues are generated and the last one dropped, because 375 degrees
  # wraps around to the same hue as 15 degrees.
  hues <- seq(15, 375, length.out = n + 1)
  # seq_len(n) yields an empty index for n = 0, where 1:n would yield c(1, 0).
  hcl(h = hues, l = 65, c = 100)[seq_len(n)]
}

# The script also calls this helper as ggColors(); keep that spelling working.
ggColors <- gColors
# Eight colours from gColors(), the palette helper defined earlier in this
# script.
gg.cols <- gColors(8)

# Palette for the word cloud: three named colours plus the 8-colour
# "Dark2" brewer palette.
bp.cols <- c("light blue", "cornflowerblue", "coral2", brewer.pal(8, "Dark2"))

wordcloud(
  document,
  scale = c(5.5, 0.5),
  max.words = 200,
  min.freq = 50,
  random.order = FALSE,
  rot.per = 0.40,
  use.r.layout = FALSE,
  random.color = TRUE,
  colors = bp.cols
  #colors=brewer.pal(8, "Dark2")
)

# Top Words
# NOTE(review): Topics() is not a function of the attached packages; the
# heading suggests the top terms per topic were wanted, so terms() is used.
# Confirm this is the intended output.
terms(model_lda, 10)
# ---- Packages ----
# library() stops immediately when a package is missing; require() would only
# warn and let a later call fail instead.
library(topicmodels)
library(pdftools)
library(tm)
library(tidytext)
library(ggplot2)
library(dplyr)
library(keras)

# ---- Load all PDF files found in the working directory ----
All_Files <- list.files(pattern = "pdf$")
All_opinions <- lapply(All_Files, pdf_text)

# ---- Create the corpus ----
# NOTE(review): All_opinions is a list holding one character vector per PDF;
# confirm that Corpus(VectorSource()) handles that shape as intended.
document <- Corpus(VectorSource(All_opinions))

# ---- Clean the corpus ----
document <- tm_map(document, content_transformer(tolower)) # lower-case all text
document <- tm_map(document, removeNumbers) # remove numbers
document <- tm_map(document, removeWords, stopwords("english")) # remove English stopwords
document <- tm_map(
  document, removePunctuation,
  preserve_intra_word_dashes = TRUE
)
document <- tm_map(document, stripWhitespace) # collapse extra whitespace

# ---- Document-term matrix ----
dtm <- DocumentTermMatrix(document)

# ---- Fit an LDA model with 8 topics (seed fixed at 1234) ----
model_lda <- LDA(dtm, k = 8, control = list(seed = 1234))
model_lda

# ---- beta: probability of a word being associated with a topic ----
beta_topics <- tidy(model_lda, matrix = "beta")
beta_topics

# Keep the 10 highest-beta terms of every topic, ordered within topic.
beta_top_terms <- beta_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# One bar-chart facet per topic showing its top terms.
beta_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

# Author's reading of the topics. Only four are described although k = 8 here.
# NOTE(review): presumably carried over from a 4-topic fit; re-check.
#   topic 1: students' mental limits, like maths difficulty
#   topic 2: tech classroom
#   topic 3: tech education
#   topic 4: education, craft, tech subject & curriculum

# ---- gamma: per-document-per-topic probability ----
# Example from the author's notes: document 1 has gamma 6.4 on topic 1 and
# document 4 has 9.99 on topic 1, the highest.
gamma_documents <- tidy(model_lda, matrix = "gamma")

# Data frame with the gamma results plus a numeric document index.
doc_gamma.df <- data.frame(gamma_documents)
# Repeat the document index once per fitted topic so the vector always matches
# the number of gamma rows, whatever k is (assumes tidy() returns nrow(dtm)
# rows per topic, as the earlier hard-coded rep(..., 4) did).
doc_gamma.df$chapter <- rep(seq_len(nrow(dtm)), times = model_lda@k)

# Plot gamma per document, one panel per topic.
ggplot(
  data = doc_gamma.df,
  aes(x = chapter, y = gamma, group = factor(topic), color = factor(topic))
) +
  geom_line() +
  facet_wrap(~ factor(topic), ncol = 1)

# ---- Raw model slots ----
# The assignment needs a target name; a bare `<-terms(...)` does not parse.
top_terms <- terms(model_lda, 5)
top_terms
head(model_lda@gamma)
model_lda@beta[, 1]
# beta appears to be stored on the log scale (hence the exp() below), so the
# first sum adds log values and the second adds probabilities.
sum(model_lda@beta[1, ])
sum(exp(model_lda@beta[1, ]))
##topic <- 1
##df <- data.frame(term = model_lda@terms, p = exp(model_lda@beta[topic,]))
##head(df[order(-df$p),])

# ---- Word cloud of the most frequent words ----
library(wesanderson)
wordcloud(
  document,
  scale = c(4.5, 0.5),
  max.words = 8,
  min.freq = 100,
  random.order = FALSE,
  rot.per = 0.40
)
# Build `n` colours whose hues are evenly spaced from 15 to 375 degrees, at
# fixed luminance 65 and chroma 100.
#
# n: number of colours wanted.
# Returns a character vector of `n` colour codes (empty when n is 0).
gColors <- function(n) {
  # n + 1 hues are generated and the last one dropped, because 375 degrees
  # wraps around to the same hue as 15 degrees.
  hues <- seq(15, 375, length.out = n + 1)
  # seq_len(n) yields an empty index for n = 0, where 1:n would yield c(1, 0).
  hcl(h = hues, l = 65, c = 100)[seq_len(n)]
}

# The script also calls this helper as ggColors(); keep that spelling working.
ggColors <- gColors
# Eight colours from gColors(), the palette helper defined earlier in this
# script.
gg.cols <- gColors(8)

# Palette for the word cloud: three named colours plus the 8-colour
# "Dark2" brewer palette.
bp.cols <- c("light blue", "cornflowerblue", "coral2", brewer.pal(8, "Dark2"))

wordcloud(
  document,
  scale = c(5.5, 0.5),
  max.words = 200,
  min.freq = 50,
  random.order = FALSE,
  rot.per = 0.40,
  use.r.layout = FALSE,
  random.color = TRUE,
  colors = bp.cols
  #colors=brewer.pal(8, "Dark2")
)

# Top Words
# NOTE(review): Topics() is not a function of the attached packages; the
# heading suggests the top terms per topic were wanted, so terms() is used.
# Confirm this is the intended output.
terms(model_lda, 10)