In [None]:
# System library needed when running the code in Google Colab; without it some
# of the packages below won't work. "-y" pre-answers apt-get's confirmation
# prompt, which cannot be answered from a non-interactive notebook session.
system2("sudo", c("apt-get", "install", "-y", "libgsl0-dev"))

# Install all required CRAN packages in a single call.
install.packages(c(
  "data.table",
  "tm",
  "topicmodels",
  "tidytext",
  "gutenbergr",
  "reshape2",
  "factoextra",
  "kohonen"
))
library(gutenbergr)
library(curl)
library(dplyr)
library(ggplot2)
library(data.table)
library(tm)
library(topicmodels)
library(tidytext)
library(tidyr)
library(stringr)
library(scales)
library(reshape2)
library(rmarkdown)
library(tidyverse)
library(factoextra)
library(kohonen)

In [2]:
# Build the list of fictional books from the gutenbergr metadata.

# Catalogue of works, keeping only the entries that have a title.
gutdata <- gutenberg_works()
gutData <- gutdata[!is.na(gutdata$title), ]

# Subject metadata without the "lcc" subject type, collapsed so that every
# (gutenberg_id, subject_type) pair has one comma-separated string (column V1).
subData <- gutenberg_subjects
subData <- subData[subData$subject_type != "lcc", ]
subData <- as.data.table(subData)[
  ,
  toString(subject),
  by = list(gutenberg_id, subject_type)
]

# Subject keywords that mark a book as fictional.
fiction_pattern <- "fiction|Fiction|fantasy|Adventure|Fables|Fairy tales"

# Just the books that are fictional by subject.
ficBooks <- dplyr::filter(subData, grepl(fiction_pattern, V1))

# Attach the catalogue details to the fictional subjects and drop the rows
# for which no titled catalogue entry exists.
ficBooksExtra <- merge(
  x = gutData,
  y = ficBooks,
  by = "gutenberg_id",
  all.y = TRUE
)
ficBooksExtra <- ficBooksExtra[!is.na(ficBooksExtra$title), ]

# What is left in subData are the non-fiction subject rows.
subData <- dplyr::filter(subData, !grepl(fiction_pattern, V1))

# 13k records, the main list of all the fictional classical books from the
# gutenbergr website.
write.csv(ficBooksExtra, "fictionalBooks.csv")

In [3]:
# Take a random sample of books from the main classical books dataset.
# More than one thousand books are selected to account for occasional errors
# when downloading the book text.
fictionalBooksP1 <- read.csv("fictionalBooks.csv")

# Fix the RNG seed so the same books are drawn on every run, and never ask
# sample_n() for more rows than the dataset holds (it raises an error then).
set.seed(100)
sample <- sample_n(fictionalBooksP1, min(1100, nrow(fictionalBooksP1)))


In [None]:
# Download the book text for all of the books within the 'sample' dataset.
# Every book is fetched on its own so that a download error is reported as a
# warning and that book is skipped, instead of aborting the whole run.
gutenberg_mirror <- "http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/"

downloaded <- lapply(sample$gutenberg_id, function(id) {
  tryCatch(
    gutenberg_download(id, meta_fields = "title", mirror = gutenberg_mirror),
    error = function(e) {
      warning(
        "Download failed for Gutenberg id ", id, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
})

# Combine once at the end rather than growing a data frame inside a loop;
# bind_rows() ignores the NULL entries left by failed downloads.
bookText <- bind_rows(downloaded)

write.csv(bookText, "bookText.csv")


In [None]:
# Dataset of common names, lower-cased; it is used to remove the majority of
# the names along with the stopwords.
names <- read.csv(file = "names.csv")
names$name <- tolower(x = names$name)


In [None]:
# Dataset of additional stopwords to be removed, lower-cased.
additionalWords <- read.csv(file = "AdditionalWordsToRemove.csv")
additionalWords$Words <- tolower(x = additionalWords$Words)


In [None]:
# Go through the book text, split it into single words and count, per book,
# every word that is neither a stopword, a common name nor an additional word.

bookText <- read.csv("bookText.csv")

# Running chapter number within each book: it goes up by one on every line
# that starts with "chapter " (case-insensitive).
chapter_heading <- regex("^chapter ", ignore_case = TRUE)

by_chapter <- bookText %>%
  group_by(title) %>%
  mutate(chapter = cumsum(str_detect(text, chapter_heading))) %>%
  ungroup()

# One row per word.
by_chapter_word <- unnest_tokens(by_chapter, word, text)

# Every word that must not be counted.
excluded_words <- c(stop_words$word, names$name, additionalWords$Words)

# Document-word counts, most frequent first.
word_counts <- by_chapter_word %>%
  filter(!(word %in% excluded_words)) %>%
  count(title, word, sort = TRUE)

word_counts

In [None]:
# Add TF-IDF scores to every (book, word) count, highest score first.
word_counts <- bind_tf_idf(word_counts, word, title, n)
word_counts <- arrange(word_counts, desc(tf_idf))

# Remove the words whose score is not above the average score.
avg <- mean(word_counts$tf_idf)
word_counts <- filter(word_counts, tf_idf > avg)

In [None]:
# Reduce the sample book list to one thousand. Initially more than one thousand
# were selected to account for occasional errors when downloading the text of
# some of the books. Having a number such as 100 or 1000 makes it easier to
# later set up a correctly-sized SOM grid.
BookList <- data.frame(title = unique(word_counts$title))

# sample_n() raises an error when asked for more rows than are available, so
# cap the request and warn when fewer books than intended are left.
target_books <- 1000
if (nrow(BookList) < target_books) {
  warning(
    "Only ", nrow(BookList), " books available; expected at least ",
    target_books, ".",
    call. = FALSE
  )
}
BookList <- sample_n(BookList, min(target_books, nrow(BookList)))

# Keep only the word counts that belong to the selected books.
word_counts <- merge(x = word_counts, y = BookList, by = "title", all.y = TRUE)

write.csv(word_counts, "ClassicalBookWordcounts.csv")

In [None]:
# List of all the books used for the analysis (one row per distinct title);
# the K-means and SOM steps attach their cluster labels to it.
book_titles <- unique(word_counts$title)
documents <- as.data.table(book_titles)

In [None]:
# Document-term matrix: one row per title, one column per word, cells hold n.
chapters_dtm <- cast_dtm(word_counts, title, word, n)

chapters_dtm

# Plain matrix version of the DTM for the K-means and SOM steps.
m2 <- as.matrix(chapters_dtm)

LDA

In [None]:
# Fit a 9-topic LDA model on the document-term matrix with a fixed seed.
lda_control <- list(seed = 12)
chapters_lda <- LDA(chapters_dtm, k = 9, control = lda_control)
chapters_lda

Visualisation for top N words for each topic

In [None]:
# Tidy "beta" matrix of the fitted LDA model, saved for later inspection.
chapter_topics <- tidy(x = chapters_lda, matrix = "beta")
chapter_topics

write.csv(chapter_topics, "ClassicalBookWordsforTopics.csv")


In [None]:
# The five highest-beta terms of every topic, ordered by topic and then by
# decreasing beta.
top_terms <- chapter_topics %>%
  group_by(topic) %>%
  top_n(n = 5, wt = beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

top_terms

In [None]:
# Save the top terms of every topic.
write.csv(x = top_terms, file = "ClassicalBookTopTermsByTopic.csv")

In [None]:
# Bar chart of the top terms, one facet per topic, written to 1.1.png.
# The plot is built before the device is opened so that a failure in the
# pipeline cannot leave a png device open.
top_terms_plot <- top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

png(file = "1.1.png", width = 4000, height = 4000, res = 300, bg = "white")
# Print explicitly: a ggplot is only drawn when printed, and relying on
# top-level auto-printing leaves the file empty when the code is run through
# source() or from inside a function.
print(top_terms_plot)
dev.off()

Assigning a topic for each of the books.

In [None]:
# Tidy "gamma" matrix of the fitted LDA model.
chapters_gamma <- tidy(chapters_lda, matrix = "gamma")
chapters_gamma

# The document-term matrix was cast with the book title as the document, so
# `document` holds a bare title, not "title_chapter". Splitting it on "_"
# would warn about a missing piece on every row and would truncate any title
# that itself contains an underscore. Keep the title intact and provide the
# `chapter` column (all NA, as the split produced) for the later steps that
# group by it.
chapters_gamma <- chapters_gamma %>%
  transmute(title = document, chapter = NA, topic, gamma)

chapters_gamma

In [None]:
# For every (title, chapter) pair keep the topic row(s) with the largest gamma.
chapter_classifications <- chapters_gamma %>%
  group_by(title, chapter) %>%
  slice_max(order_by = gamma, n = 1) %>%
  ungroup()

chapter_classifications

In [None]:
# Count how often each topic was chosen per title and keep, for every title,
# the topic(s) with the highest count.
book_topics <- chapter_classifications %>%
  count(title, topic) %>%
  group_by(title) %>%
  top_n(n = 1, wt = n) %>%
  ungroup() %>%
  select(consensus = title, topic)


In [None]:
# Give the two columns their final names and save the LDA topic of each book.
names(book_topics)[1:2] <- c("title", "LDAtopics")

write.csv(x = book_topics, file = "LDAtopics.csv")

K-means

In [None]:
# Fix the random number generator seed before the K-means runs below.
set.seed(seed = 100)

In [None]:
# Centre and scale every column of the document-term matrix, then preview it.
kmBooks2 <- scale(x = m2)
head(x = kmBooks2)

In [None]:
# K-means with 9 centres on the scaled matrix, using 25 random starts.
kBmean <- kmeans(x = kmBooks2, centers = 9, nstart = 25)

In [None]:
# Show the cluster label stored for every row of the scaled matrix.
kBmean[["cluster"]]

In [None]:
# Attach the K-means cluster labels to the list of books, grouped by cluster.
kmeans_labels <- kBmean$cluster
Kclusters <- group_by(
  mutate(documents, Cluster = kmeans_labels),
  Cluster
)

In [None]:
# Display the books together with their K-means cluster.
Kclusters

In [None]:
# Give the two columns their final names and save the K-means clusters.
names(Kclusters)[1:2] <- c("title", "KmeansClusters")

write.csv(x = Kclusters, file = "K-meansClusters.csv")

Some visualisations showing how the clusters are distributed for different values of k. Only suitable for small amounts of data.

In [None]:
# Cluster plot of the 9-centre K-means result.
fviz_cluster(object = kBmean, data = kmBooks2)

In [None]:
# K-means fits for several numbers of centres. The numeric suffix is only a
# running index; the number of centres is given in each call.
k2 <- kmeans(kmBooks2, centers = 9, nstart = 25)
k3 <- kmeans(kmBooks2, centers = 6, nstart = 25)
k4 <- kmeans(kmBooks2, centers = 3, nstart = 25)
k5 <- kmeans(kmBooks2, centers = 12, nstart = 25)

# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = kmBooks2) + ggtitle("k = 9")
p2 <- fviz_cluster(k3, geom = "point", data = kmBooks2) + ggtitle("k = 6")
p3 <- fviz_cluster(k4, geom = "point", data = kmBooks2) + ggtitle("k = 3")
p4 <- fviz_cluster(k5, geom = "point", data = kmBooks2) + ggtitle("k = 12")

# Save the four plots in a 2 x 2 layout.
png(file = "1.png", width = 4000, height = 4000, res = 300, bg = "white")
# grid.arrange() lives in gridExtra, which this file never attaches with
# library(); call it through its namespace so the function is found.
gridExtra::grid.arrange(p1, p2, p3, p4, nrow = 2)
dev.off()

SOM (Self Organising Maps)

In [None]:
# Hexagonal SOM grid of 100 x 10 units (1000), chosen to match the number of
# books; if a different sample size is used, the grid size should be adjusted
# to match it.
som.grid <- somgrid(
  xdim = 100,
  ydim = 10,
  topo = "hexagonal"
)

In [None]:
# Train the self-organising map on the document-term matrix.
som_input <- data.matrix(m2)
som.model <- som(som_input, grid = som.grid)

In [None]:
# First element of the trained map's `codes` list.
som.events <- som.model[["codes"]][[1]]

In [None]:
# Post SOM processing: K-means with 9 centres on the SOM codes; only the
# resulting cluster labels are kept.
som_kmeans <- kmeans(som.events, centers = 9, iter.max = 100, nstart = 10)
clusters <- som_kmeans$cluster

In [None]:
# Attach a SOM-based cluster to every book, grouped by cluster.
# `clusters` holds one label per SOM unit (it was computed on the map's
# codes), not one per book. Each book is therefore first mapped to its winning
# unit (som.model$unit.classif) and then to that unit's cluster. Assigning
# `clusters` by position would pair book i with unit i, which are unrelated.
clustersSOM <- documents %>%
  mutate(ClusterSOM = clusters[som.model$unit.classif]) %>%
  group_by(ClusterSOM)

In [None]:
# Give the two columns their final names and save the SOM clusters.
names(clustersSOM)[1:2] <- c("title", "SOMclusters")

write.csv(x = clustersSOM, file = "SOMClusters.csv")

Visuals that may not be suitable for large data samples

In [None]:
# "changes" plot of the trained SOM.
plot(x = som.model, type = "changes")

In [None]:
# "count" plot of the trained SOM.
plot(x = som.model, type = "count", main = "Node Counts")

In [None]:
# U-matrix visualisation: "dist.neighbours" plot of the trained SOM.
plot(
  x = som.model,
  type = "dist.neighbours",
  main = "SOM neighbour distances"
)