In [None]:
# Needed when running in Google Colab: without the libgsl0-dev system
# package some of the R packages used below won't work.
system2("sudo", args = c("apt-get", "install", "libgsl0-dev"))

In [None]:
# Install every package this notebook needs in one call.
install.packages(c(
  "gutenbergr",
  "tidytext",
  "data.table",
  "topicmodels",
  "reshape2"
))

In [None]:
library(dplyr)
library(data.table)
library(gutenbergr)
library(tidytext)
library(topicmodels)
library(tidyr)
library(reshape2)

In [None]:
# Modern books together with their summaries. The first column is renamed
# to "title" so it can serve as the join key in the merges below.
gBooks <- read.csv(file = "books.csv")
colnames(gBooks)[1] <- "title"

In [None]:
# Topics for the modern books, obtained by running them through the LDA
# model. Only the title and the assigned topic are kept.
GldaTopics <- read.csv(file = "LDAtopics.csv") %>%
  subset(select = c(title, LDAtopics))

In [None]:
# Attach the LDA topic to each modern book. all.y = TRUE keeps every row of
# GldaTopics, even when its title has no match in gBooks.
gBooks <- merge(gBooks, GldaTopics, by = "title", all.y = TRUE)

In [None]:
# Top 5 words for each topic, obtained through the LDA model. Only the topic
# number and the term are kept.
TopWbTopic <- read.csv(file = "SummaryTopTermsbyTopic.csv") %>%
  subset(select = c(topic, term))

In [None]:
# Collapse the top words of each topic into one comma-separated string
# (column V1), giving one row per topic.
TopWbTopic <- as.data.table(TopWbTopic)[, .(V1 = toString(term)), by = topic]

# Rename the topic column so it matches the key used in the books dataset.
setnames(TopWbTopic, 1, "LDAtopics")

In [None]:
# Merge the per-topic top words into the modern books dataset, so every book
# also carries the top words of its topic. all.y = TRUE keeps every book.
gBooks2 <- merge(TopWbTopic, gBooks, by = "LDAtopics", all.y = TRUE)

In [None]:
# Build a dummy user by drawing 5 random "favourite" books; the seed is fixed
# so the draw is reproducible.
set.seed(20)
SampleUser <- gBooks2 %>%
  sample_n(size = 5)

In [None]:
# Draw a reproducible sample of 1000 classical books.
fictionalBooks <- read.csv(file = "fictionalBooks.csv")

set.seed(20)
sample <- fictionalBooks %>%
  sample_n(size = 1000)

In [None]:
# Download the text of every sampled classical book, with its title attached
# as a metadata column.
#
# gutenberg_download() accepts a whole vector of ids, so a single call
# replaces the former loop. That loop grew `bookText` with rbind() on every
# iteration and iterated over 1:nrow, which misbehaves when the sample is
# empty.
bookText <- gutenberg_download(
  sample$gutenberg_id,
  meta_fields = "title",
  mirror = "http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/"
)

In [None]:
# Keep only the parts of the sample user's favourites that are needed later:
# the topic top words (V1), the title and the summary.
sampleUserBookData <- SampleUser %>%
  subset(select = c(V1, title, Summary))

In [None]:
# Join each favourite's summary and its topic top words into one `text`
# column, then keep just the title and that text.
sampleUserBookData <- sampleUserBookData %>%
  transform(text = paste(Summary, V1, sep = " ")) %>%
  subset(select = c(title, text))

# Reduce the classical book data to the same two columns.
bookText <- bookText %>%
  subset(select = c(text, title))

In [None]:
# Stack the classical book text and the sample user's favourites into one
# data frame. Both inputs were reduced to the columns `title` and `text`
# just above, although in a different column order.
# NOTE(review): this relies on rbind() matching data frame columns by name
# rather than by position -- confirm for the classes involved here.
FbookText <- rbind(bookText, sampleUserBookData)

In [None]:
# Common first names, lower-cased, to be removed alongside the stopwords.
names <- read.csv(file = "names.csv") %>%
  transform(name = tolower(name))

In [None]:
# Extra stopwords to remove for the classical books, lower-cased.
additionalWords <- read.csv(file = "AdditionalWordsToRemove.csv") %>%
  transform(Words = tolower(Words))

additionalWords


In [None]:
# Extra stopwords to remove for the modern books, lower-cased.
RemoveWords <- read.csv(file = "Remove.csv") %>%
  transform(Words = tolower(Words))

RemoveWords

In [None]:
# Prepare the combined book data for tokenisation. The former
# group_by(title) %>% ungroup() pair did no grouping work; its only effect
# was to turn the data into a tibble, so that conversion is now explicit.
by_chapter <- as_tibble(FbookText)

# Split the text into one lower-cased word per row, keeping the title.
by_chapter_word <- by_chapter %>%
  unnest_tokens(word, text)


In [None]:
# Count how often each word occurs in each book, after dropping the standard
# stopwords, the common names and both additional stopword lists.
word_counts <- by_chapter_word %>%
  filter(!(word %in% c(
    stop_words$word,
    names$name,
    additionalWords$Words,
    RemoveWords$Words
  ))) %>%
  count(title, word, sort = TRUE)


In [None]:
# Use TF-IDF to shrink the vocabulary: score every (book, word) pair and keep
# only the pairs whose tf-idf is above the overall average.
word_counts <- word_counts %>%
  bind_tf_idf(word, title, n) %>%
  arrange(desc(tf_idf))

avg <- mean(word_counts$tf_idf)

word_counts <- word_counts %>%
  filter(tf_idf > avg)

In [None]:
# All book titles that are still present in the current run.
documents <- word_counts$title %>%
  unique() %>%
  as.data.table()

In [None]:
# Document-term matrix: one row per book title, one column per word, with
# the word counts as values.
chapters_dtm <- cast_dtm(word_counts, title, word, n)

chapters_dtm

# Dense matrix copy of the document-term matrix.
m2 <- as.matrix(chapters_dtm)

In [None]:
# Fit an LDA topic model with 9 topics; the seed makes the fit reproducible.
chapters_lda <- LDA(x = chapters_dtm, k = 9, control = list(seed = 100))
chapters_lda

In [None]:
# Assign one topic to each book.

# Per-document topic probabilities (gamma): one row per (document, topic).
chapters_gamma <- tidy(chapters_lda, matrix = "gamma")
chapters_gamma

# The documents in the DTM are the book titles themselves, so the column only
# needs renaming. The previous separate(..., sep = "_", convert = TRUE) cut
# off any title at its first underscore and could change the column type,
# which broke the later joins on `title`.
chapters_gamma <- chapters_gamma %>%
  rename(title = document)

# For each book keep the single topic with the highest gamma.
# with_ties = FALSE guarantees exactly one row per title.
chapter_classifications <- chapters_gamma %>%
  group_by(title) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>%
  ungroup()

# One row per book: its title and the topic assigned to it.
book_topics <- chapter_classifications %>%
  select(title, LDAtopics = topic)

In [None]:
# Look up which topic was assigned to each book the user liked.
# all.y = TRUE keeps every liked book, even one without an assigned topic.
FavBtopics <- merge(book_topics, sampleUserBookData, by = "title", all.y = TRUE)

FavBtopics

In [None]:
# Vector of topic numbers, one entry per book in the sample user's list of
# favourite books.
Topic <- FavBtopics[["LDAtopics"]]

In [None]:
# Collect every book that shares a topic with at least one of the books the
# user liked.
#
# This replaces a loop that grew the result with rbind()/unique() on each
# pass, iterated over 1:length(Topic), and overwrote the global `sample`
# (the classical-book sample) with a temporary value.
TopicNum <- length(Topic)

SimilarBooksByTopic <- book_topics %>%
  filter(LDAtopics %in% Topic) %>%
  # Keep the loop's row order: topics in the order they first appear in
  # `Topic`, and books in their original order within each topic.
  arrange(match(LDAtopics, Topic)) %>%
  distinct()

In [None]:
# Add the word counts back onto the books that share a topic with the user's
# favourites. all.y = TRUE keeps every one of those books.
word_counts2 <- merge(word_counts, SimilarBooksByTopic, by = "title", all.y = TRUE)


In [None]:
# Document-term matrix restricted to the books that share a topic with the
# user's favourites.
similarB_dtm <- cast_dtm(word_counts2, title, word, n)

similarB_dtm

# Dense matrix copy used for the similarity computation.
m2Sim <- as.matrix(similarB_dtm)

In [None]:
# Cosine similarity matrix: scale every row (book) of the term matrix to unit
# Euclidean length, then take the dot product between every pair of rows.
sim <- sweep(m2Sim, 1, sqrt(rowSums(m2Sim^2)), "/")
sim <- sim %*% t(sim)

In [None]:
# Titles of the books 'liked' by the sample user.
LikedBooks <- sampleUserBookData[["title"]]

# How many liked books there are.
numB <- length(LikedBooks)


In [None]:
# For every book the user liked, take the 5 most similar candidate books and
# stack them into one data.table with the columns `titles` and `simBooksS22`
# (the cosine similarity to that liked book).
#
# Changes from the previous index loop:
#  * no 1:numB iteration, which misbehaves when there are no liked books;
#  * no rbind() growth inside a loop -- the pieces are built with lapply()
#    and bound once with rbindlist();
#  * a liked book with no column in `sim` is skipped with a warning instead
#    of aborting the run with a subscript error.

# Empty template so the result always has the expected columns and types.
recommendations <- data.table(titles = character(), simBooksS22 = numeric())

likedTitles <- as.character(LikedBooks)
missingLiked <- setdiff(likedTitles, colnames(sim))
if (length(missingLiked) > 0) {
  # NOTE(review): presumably these books lost all their words in the tf-idf
  # filter and so never reached the similarity matrix -- confirm.
  warning(
    "Liked books missing from the similarity matrix were skipped: ",
    toString(missingLiked),
    call. = FALSE
  )
}

perBook <- lapply(
  likedTitles[likedTitles %in% colnames(sim)],
  function(likedTitle) {
    # Similarity of every candidate book to this liked book, named by title.
    scores <- setNames(sim[, likedTitle], rownames(sim))

    # Never recommend a book the user already has in their favourites.
    scores <- scores[!names(scores) %in% sampleUserBookData$title]

    # Highest similarity first, then keep at most 5.
    scores <- scores[order(-scores)]
    top <- head(scores, 5)

    data.table(titles = names(top), simBooksS22 = unname(top))
  }
)

recommendations <- rbindlist(c(list(recommendations), perBook))


In [None]:
# Convert `recommendations` to a data.table by reference, asking for its row
# names to be stored in a column called "titles"; the trailing [] prints it.
# NOTE(review): the loop above already builds a `titles` column with setDT()
# for every piece it binds, so this call may be redundant -- confirm what
# class `recommendations` has at this point before relying on it.
setDT(recommendations, keep.rownames = "titles")[]

In [None]:
# Drop any recommended book that is already in the user's 'favourites' list.
recommendations <- recommendations %>%
  filter(!(titles %in% sampleUserBookData$title))

In [None]:
# Sort the remaining candidates so the highest similarity comes first.
#
# A candidate can be among the closest matches of several liked books and
# then appears more than once, which let the final top 5 contain duplicate
# titles. After sorting, only the first (best-scoring) row per title is kept.
toRec <- recommendations %>%
  arrange(desc(simBooksS22)) %>%
  distinct(titles, .keep_all = TRUE)

In [None]:
# Keep the 5 most similar books (fewer if fewer are available).
toRec <- head(toRec, 5)


In [None]:
# Final recommendations together with their similarity score, with the score
# column given a readable name.
fRec <- toRec %>%
  subset(select = c(titles, simBooksS22))
colnames(fRec)[2] <- "Similarity"
fRec

In [None]:
# Save the recommendations with their similarity scores.
write.csv(fRec, file = "recommendations&similarities5.csv")

In [None]:
# The final 5 recommendations, titles only.
finalRecommendations <- toRec %>%
  subset(select = titles)

finalRecommendations

In [None]:
# Save the final recommended titles.
write.csv(finalRecommendations, file = "recommendations5.csv")