-
Notifications
You must be signed in to change notification settings - Fork 1
/
Text Sectioning_and_Disaggregation (R).R
498 lines (382 loc) · 20 KB
/
Text Sectioning_and_Disaggregation (R).R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
#@ PackageList
#install.packages("beepr") # this beeps!
#install.packages(c("RCurl", "XML")) #XML stuff
#install.packages(c("dplyr", "ggplot2"))
#install.packages("githubinstall")
#install.packages("stringr")
### githubinstall("htmlToText") ### appear to be multiple packages named this
library(beepr) #@ this library is needed to use the beeps that indicate progress!
#@ these other libraries are helpful for extending this project
#library(RCurl)
#library(XML)
#library(dplyr)
#library(ggplot2)
#library(githubinstall)
#library(stringr)
######################################################################
#Library R Work-File updated 6/4/2019 ## Converting data
##### clean your workspace __ rm(list=ls())
#
########################################################################
## This creates the most important function: IMPORT THE TEXTS! ######
########################################################################
##### this function takes a vector of filenames and a directory path ###
### and returns a list of each item in the list as ordered words ####
########################################################################
make.file.word.v.l <- function(files.v, input.dir){
text.word.vector.l <- list()
for(i in 1:length(files.v)){
#@ read the files in the directory
text.v <- scan(paste(input.dir, files.v[i], sep="/"), what="character", sep="\n")
#@ convert to a single string
text.v <- paste(text.v, collapse=" ")
#@ we're making it one big line here - then taking it to lowercase, split on nonword chars
#@text.lower.v <- tolower(text.v) ## we do NOT want to 'tolower' this yet,
#@text.words.v <- strsplit(text.lower.v, "\\W") ## because we need uppercase 'CHAPTER's
text.words.v <- strsplit(text.v, "\\W")
#@ "\\W" finds everything that is a word
text.words.v <- unlist(text.words.v)
text.words.v <- text.words.v[which(text.words.v!="")]
#@ use index id as name of list
text.word.vector.l[[files.v[i]]] <- text.words.v
}
return(text.word.vector.l)}
########################################################################
########################################################################
############ The Text Processing Itself begins here:
########################################################################
#
#
########################################################################
############ Set your INITIAL PARAMETERS
## the initial directory calls work with SciFiPipeline folder located in DESKTOP directory
########################################################################
############################ SET THE INITIAL PARAMETERS
#### NOTE!! these three choices are your most CRITICAL
#@ decison making points!
use_subchapter <- TRUE
#@ this is a BOOLEAN value, to tell the code if you WANT subchapters or NOT
#@ if this is set as FALSE, the code will return disaggregated / bag of words
#@ text at the delimiter / chapter level
#@ if it is set as TRUE, the code will further break down delimiter sections /
#@ chapters into SMALLER sections of text, before then disordering the words
#@ in that subsection alphabetically
subch_threshold <- 1000
#@ the subch.thresh (subchapter LENGTH threshold)
#@ describes the longest chapter permitted (roughly) before sub-chapters will be created
sec_delimiter <- "CHAPTER"
#@ this is the initial delimiter for your sections - in standard BOOK projects,
#@ it is often by chapter, best (if possible) demarcated by *ALL CAPS* **CHAPTER**
###### !!!! be careful!!! If you need to use a delimiter other than the preset,
# be careful about any delimiter that might be used out of the delimiting context,
#@ or for random other reasons in the text itself! this will cause bad, mucked up output sections!
#
#
#
############################################ finally, remember(if you need) to (re) SET YOUR DIRECTORIES
proj.dir <- "~/Desktop/Extracted-Features-master/"
#@ where your project itself is (the parent folder for all others)
#@ This lin makes an object with the name of your directory,
#@ so that the directory can be reset to original location later on.
#@ working from the correct directories (folders) on your computer matter a lot for reading and writing files!
input.dir <- "text_data/" #@ change this if your data is in a differently named folder
#@ your input directory is the location of the raw data, starting from
#@ your project directory (since that's where the working directory is currently located)
#@ you are setting initial working directory to project directory, so this should be just inside
#@ the project directory
setwd(proj.dir)
#@ this sets our working directory (where R looks for and puts stuff) to the project directory
#@ Run the code as-is from your desktop (RECOMMENDED!!!)
#@ OR change the directory object to point to the
#@ location of the "SciFiPipeline" folder (replace 'Desktop' below as appropriate)
#@ NOTE!! Because of directories calls work differently.
#@ This is a platform interoperability issue, not an R issue
#dir.create("newFolder")
#@ this command creates new folders in the current directory (just in case that would help)
########################################################################
############ LOADING IN RAW DATA ##############
########################################################################
setwd(proj.dir) ### some lines of code (like this one) look redundant but
### are included to improve clarity
files.v <- dir(input.dir, "\\.txt$")
files.v #@ make sure you have the correct files - you can un-comment these lines to check your work
####@ **below is optional ** PICK JUST A FEW FILES instead of EVERY file (if you only want a subset)
### _ files.v <- files.v[c(2: 3)] ; files.v
################# 'my.corpus.l' is a LIST object is what contains THE TEXTS THEMSELVES
my.corpus.l <- make.file.word.v.l(files.v, input.dir)
beep("complete")
# files.v[1]
# my.corpus.l[[1]][1:20]
#@ view the first part of the first entry of your my.corpus.l
######### this loop uses a regular expression to remove '.txt' from file names
#@ (must do this **AFTER** reading data files, or you won't be able to read them!)
for(book in 1:length(files.v)){
files.v[book] <- gsub('.{4}$', '', files.v[book])}
#files.v
names(my.corpus.l) <- files.v
# names(my.corpus.l[1])
#@ DO NOT run this more than once! It will remove 4 characters EACH TIME!
#@ this could be improved for future code
########################################################################
####### Begin the proper DATA CLEANING ###########################
########################################################################
##########Starting with identifying instances of "CHAPTER" ##########
########################################################################
CH_inst.v <- c(1:200) #@ expected greatest # of chapters/book (no harm over-shooting this)
#files.v #@ your vector of file names
# (note that the files appear in your corpus in the same order
# so files.v will point to correct locations in your corpus)
CH_inst.a <- array(NA, dim=c(length(files.v), length(CH_inst.v)), dimnames=list(files.v, CH_inst.v))
#@ this is a (thusfar empty) matrix which will be used to store instances of CHAPTER in each book
#CH_inst.a[1:5,1:5] #@ code to visualize the corner of this to make sure it's ok
#### This portion actually Fills In the positions of each instance of a "CHAPTER" token
for (book in 1:length(files.v)){
beep("coin") # you get a beep for each book, as the loop starts on it
for (inst in 1:length(CH_inst.v)){
holder <- NA
holder <- which(my.corpus.l[[book]][] == sec_delimiter)
if (is.na(holder[1])){
holder <- 1}
CH_inst.a[book, 1:length(holder)] <- holder
}
if (length(holder) > 1){
print(paste(length(holder), "instances of Your Delimiter in", files.v[book]))}
else if (length(holder) == 1 & holder[1]==1){
print(paste("probably 0 instances of Your Delimiter in", files.v[book]))}
else{
print(paste("there is probably 1 Your Delimiter in", files.v[book]))}
}
beep("complete")
# CH_inst.a[,1:10]
#@ NOTES:
#@ if there are ZERO instances of CHAPTER in a book, the ENTIRE BOOK will be captured
#@ the code always gathers data through the last word at the end of the document
#@ if there is metadata at the END of a book, it should be put
#######@ at the beginning before **CHAPTER 1** instead OR removed entirely
########################################################################
########################################################################
############################# isolate CONSUMABLE CHAPTER contents #####
####### here we are "chunking out" the chapters based on the above #####
########################################################################
CH_id.v <- c("book", "ch#", "CHtext") #@ the header for data being stored
#CH_inst.v #@ (re-using this object from above) - expected greatest # chapters/book
#files.v #@ (re-using this object again, from the beginning) -- the vector of book titles
chapters.a <- array(NA,
dim=c(length(CH_id.v), length(CH_inst.v), length(files.v)),
dimnames=list(CH_id.v, CH_inst.v, files.v))
#@ remember, CH_inst.a (array data object created above)
#@ holds the positon of each "CHAPTER" token from each book
#@ we will be working from these data to actually isolate the chapters themselves
########## This multi-loop command determines (for each chapter within each book)
########## the start and end points of each chapter
########## and then isolates and stores the contents of each chapter
for(book in 1:length(files.v)){
beep("coin")
for(ch in 1:length(CH_inst.v)){
if(!is.na(CH_inst.a[book,ch])){
start<-NA
start <- CH_inst.a[book,ch]+2
if(!is.na(CH_inst.a[book,(ch+1)])){
end<-NA
end <- CH_inst.a[book, (ch+1)]-1
}else{
end<-NA
end <- length(my.corpus.l[[book]])
}
}else{break()}
#@ at this point we should have the start and end of the chapter of interest
chapters.a[1,ch,book] <- files.v[book]
chapters.a[2,ch,book] <- paste("ch", ch, sep="_")
holder <- NA
holder <- my.corpus.l[[book]][start:(end)]
holder <- tolower(holder)
ch_contents <- NA
ch_contents <- paste (holder, collapse=" ")
chapters.a[3,ch,book] <- ch_contents
}}
beep("complete")
#@ test to make sure the object was created correctly
#@ (the indexing here avoids printing chapter contents, which will fill the console!)
# chapters.a[1:2,1:3,1:2]
#@ see [book and chapter (NOT chapter contents), for first 3 chapters, for first 2 books]
## if you WANT to see a chapter (first one first book), use this: _ chapters.a[3,1,1]
####
########################################################################
############## isolate "Extracted Features" Bag of Words contents #####
### here we are "chunking out" bags of words based on the above ########
## this is almost the same loop as above, with a few small differences #
########################################################################
########################################################################
CH_id.v <- c("book", "ch#", "CHtext")
CH_inst.v #@ (again, re-using this) - expected greatest # chapters/book
files.v #@ (again, the vector of book titles)
disag.a <- array(NA, ## this new data object will store disaggregated contents
dim=c(length(CH_id.v), length(CH_inst.v), length(files.v)),
dimnames=list(CH_id.v, CH_inst.v, files.v))
####### The loop to isolate and capture bags of words per chapter
for(book in 1:length(files.v)){
beep("coin")
for(ch in 1:length(CH_inst.v)){
if(!is.na(CH_inst.a[book,ch])){
start<-NA
start <- CH_inst.a[book,ch]+2
if(!is.na(CH_inst.a[book,(ch+1)])){
end<-NA
end <- CH_inst.a[book, (ch+1)]
}else{
end<-NA
end <- length(my.corpus.l[[book]])
}
}else{break()}
#@ at this point we should have the start and end of the chapter of interest
disag.a[1,ch,book] <- files.v[book]
disag.a[2,ch,book] <- paste("ch", ch, sep="_")
holder <- NA
holder <- my.corpus.l[[book]][start:(end-1)]
low_holder <- tolower(holder)
alpha_holder <- sort(low_holder)
disag_contents <- NA
disag_contents <- paste (alpha_holder, collapse=" ")
disag.a[3,ch,book] <- disag_contents
}}
beep("complete")
# disag.a[1:2,1:3,1:2] #@ same indexing above; this should look identical
## if you WANT to see a bag of words (again, 1st book 1st chapter) :
#### _ disag.a[3,1,1]
########################################################################
########################################################################
######################## change 3d arrays to printable 2d matrices ####
########################################################################
########################################################################
# 3d arrays initially used because they are more flexible for capturing 3-dimensional data
# however, 2d matricies are needed if we want to print out a .csv
# file giving each book/chapter's name and its corresponding bag of words
# either would work for printing individual chapter bag of words files,
# we just do both using the 2d matrix for convenience
#@ the 3D Arrays we draw from (created above_:
# chapters.a
# disag.a
#@ 2D chapter matricies, listing book title and chapter # as a single string:
d2.chapters.a <- c("Book Title + Chapter #", "Consumable Content")
names(d2.chapters.a) <- c("Book+ch", "text")
d2.disag.a <- c("Book Title + Chapter #", "Disag Bag of Words")
names(d2.disag.a) <- c("Book+ch", "text")
#@ Filling out those 2d matrices
for(book in 1:length(files.v)){
beep("coin")
for(ch in 1:length(CH_inst.v)){
if(!is.na(chapters.a[1,ch,book])){
ch.holder <- NA
ch.holder <- chapters.a[,ch,book]
ch.holdera <- c(paste(ch.holder[1], ch.holder[2], sep="_"), ch.holder[3]) #@ put explanation of why 2 columns
d2.chapters.a <- rbind(d2.chapters.a, ch.holdera[])
}
if(!is.na(disag.a[1,ch,book])){
dg.holder <- NA
dg.holder <- disag.a[,ch,book]
dg.holdera <- c(paste(dg.holder[1], dg.holder[2], sep="_"), dg.holder[3])
d2.disag.a <- rbind(d2.disag.a, dg.holdera[])
}
}}
beep("complete")
#@ a good way to check that everythign worked correctly; ensure the dimensions are the same
# dim(d2.chapters.a); dim(d2.disag.a)
# d2.chapters.a[1:5,1] # this give the first 5 entries of the book+chapter# data
# d2.disag.a[1:5,1]
# dim(d2.chapters.a)
# dim(d2.disag.a) ## this is the TOTAL NUMBER OF CHAPTERS identified
# nrow(d2.disag.a) #@ contains one extra row -- the header!
########################################################################
########################################################################
############## take printable 2d CHAPTER matrix, makes subchapters ####
########################################################################
########################################################################
# View(head(d2.disag.a)) - for some reason, some 'text' may not display here in view setting. it's ok
maxSubCh <- 1000 #@ the maximum expected number of subchapters with wiggle room - make it big!
subch.a <- array(NA, dim=c(nrow(d2.disag.a), 2, maxSubCh))
subch.a[,,1] <- d2.chapters.a
subch.a <- subch.a[-1,,] #@ remove the unwanted "headers" from the 3dim data frame
##################################### Subchapter Chunk Loop
# using
#use_subchapter ### this only runs if TRUE
#@ this is a BOOLEAN value, to tell the code if you WANT subchapters or NOT
#subch_threshold #@ default value (above) is 1000, you can optimize this for your own needs
#@ NOTE -- we are only subsectioning chapters longer than **4/3 the threshold value**
#@ this works better for mathematical purposes, and means that subchapter lengths *fluctiate around*
#@ the length threshold, rather than always being lower than the threshold
if(use_subchapter){
for(ch in 1:nrow(subch.a)){
contents.l <- NA; cont.v <- NA
contents.l <- strsplit(subch.a[ch,2,1], "\\W")
cont.v <- unlist(contents.l)
if(length(cont.v) < (4/3)*subch_threshold){
print(paste(subch.a[ch,1,1] ,"is Shorter than", floor(subch_threshold*4/3), "words"),sep=" ")
#@ a 'short chapter' is one short enough that (based on threshold) will NOT be split
subch.a[ch,1,2] <- paste(subch.a[ch,1,1], "0", sep="_") ## NEW NAME, _0 indc. only subCh
cont.v <- sort(cont.v)## full contents
cont.v <- paste(cont.v, collapse = " ")
subch.a[ch,2,2] <- cont.v
}else{
print(paste(subch.a[ch,1,1] ,"is a ~~~~~~ Longer ~~ chapter"),sep=" ")
#@ longer chapters WILL be split into subsections
ch_leng <- NA; n_ss <- NA #chapter length; number subsections;
subsec <- NA # subsection sequence (used for chunking subsections)
ss_pos.a <- NA # will be a n_ss by 2 array of subsect start and stop points
subsec.v <- NA; alpha_subsec.v <- NA; subsection <- NA
ch_leng <- length(cont.v)
n_ss <- ceiling(ch_leng / subch_threshold)
subsec <- seq(1:n_ss)
ss_pos.a <- array(NA, dim=c(n_ss,2))
ss_pos.a[,1] <- ( (subsec-1)*ch_leng/n_ss )+1
ss_pos.a[,2] <- ( subsec*ch_leng/n_ss) #@ this and prior line fill in subsec start and stop vals
for(ss in 1:n_ss){
subsect.v <- NA; subsection <- NA # subsect.v - vector of words, subsection - pasted back together
subsect.v <- cont.v[ss_pos.a[ss,1]:ss_pos.a[ss,2]]
alpha_subsect.v <- sort(subsect.v)
subsection <- paste(alpha_subsect.v, collapse=" ")
subch.a[ch,1,ss+1] <- paste(subch.a[ch,1,1], ss, sep="_") ##NEW NAME, _ss#
subch.a[ch,2,ss+1] <- subsection ## the subsection contents
print(paste(" ssec", ss, "is named", subch.a[ch,1,ss+1], sep=" "))
}}
}}
# dim(subch.a[,,])
########################################################################
############## take new 3D subchapter matrix, make printable 2D form ####
########################################################################
disag_subsec.a <- c("Book Title, Chapter & SubSec #", "Disaggregated BOW Content")
names(disag_subsec.a) <- c("Book+ch+ss", "text")
for(ch in 1:nrow(subch.a)){
for(ss in 2:maxSubCh){ #@ this loop starts at 2 because the first layer is the ORIGINAL chapters
if(!is.na(subch.a[ch, 1, ss])){
ss.holder <- NA
ss.holder <- subch.a[ch,,ss]
disag_subsec.a <- rbind(disag_subsec.a, ss.holder)
}
}}
# nrow(disag_subsec.a)
# disag_subsec.a[2,]
########################################################################
########################################################################
######################## Write the Outputs! Both matrices and BOWs ####
########################################################################
########################################################################
dir.create("matrix_output", showWarnings = F)
?dir.create
matrix.dir <- "/matrix_output"
dir.create("bags_of_words", showWarnings = F)
BoW.dir <- "/bags_of_words"
setwd(paste(proj.dir,matrix.dir, sep=""))
############# The Outputs, based on if you are using subsections or not
if(use_subchapter){ #@ if we are using subchapters, give subchapters, otherwise give disag chapters
write.csv(disag_subsec.a, "Corpus_ExtractedFeatures_SUBSECTIONS_BagsOfWords.csv", row.names=F)
setwd(paste(proj.dir, BoW.dir, sep=""))
for(ss in 1:nrow(disag_subsec.a)){
write.table(disag_subsec.a[ss,2], file=paste(disag_subsec.a[ss, 1],"_SSExtFeat.txt",sep=""))}
}else{
#write.csv(d2.chapters.a, "Corpus_Chapters_Consumable.csv", row.names=F) #@ this writes in-order chapters!
write.csv(d2.disag.a, "Corpus_ExtractedFeatures_BagsOfWords.csv", row.names=F)
setwd(paste(proj.dir, BoW.dir, sep=""))
for(book_ch in 2:nrow(d2.disag.a)){
write.table(d2.disag.a[book_ch,2], file=paste(d2.disag.a[book_ch, 1],"_ExtFeat.txt",sep=""))}
}
#@ THE END