<a href="https://colab.research.google.com/github/Noelle-Pastor/Top-American-Authors-in-19th---21st-Century-Literary-Anthologies/blob/main/Querying_Author_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Querying Data From Source Files**
## Source Files
From each source PDF, two structured files were derived:
- **HIERARCHY** file: Represents the hierarchy structure of table of contents headings. Has fields: `Page Number`, `Author`, `Heading 1`, `Heading 2`, `Heading 3`, `Heading 4`, and `Heading 5`.  

- **METADATA** file: Stores info about the anthology itself. Fields: `Title`, `Author 1`, `Author 2`, `Author 3`, `Publish Date`, and `Pages`.

<br></br>
#### The following functions use the `tidyverse` library to load, clean, and query the source data into tables of author stats.

In [None]:
library(tidyverse)

In [None]:
# Folder that holds the data files (replace the placeholder with a real path)
DATA_FOLDER <- "insert_filepath_here"

# File name used when the author table is written to disk
OUTFILE <- "19th_century_author_freq_totalpgs.txt"

Standardize Author Names

In [None]:
# Lookup table used to standardize author names.
# Names (left) are spellings found in the source files; values (right) are
# the canonical spelling each one is replaced with.
# The lower-case alias `name_corrections` is kept so existing references to
# either name continue to work.
NAME_CORRECTIONS <- name_corrections <- c(
  "Abagail Smith Adams" = "Abagail Adams",
  "Abigail Adams To John Adams" = "Abagail Adams",
  "Alvar Nuñez Cabeza de Vaca" = "Alvar Núñez Cabeza de Vaca",
  "Álvar Núñez Cabeza de Vaca" = "Alvar Núñez Cabeza de Vaca",
  "Allen Ginsberg, Father Death Blues" = "Allen Ginsberg",
  "Andrew Jackson, From Message Of The President Of The United States To Both Houses Of Congress At The Commencement Of The Second Session Of The Twenty-First Congress, December 7, 1830" = "Andrew Jackson",
  "Anne Dudely Bradstreet" = "Anne Bradstreet",
  "Benjamin Banneker, Correspondence With Thomas Jefferson" = "Benjamin Banneker",
  "Benjamin Banneker To Thomas Jefferson" = "Benjamin Banneker",
  "Agustus B. Longstreet" = "Agustus Baldwin Longstreet",
  "Benjamin Tompson: New-Englands Crisis" = "Benjamin Tompson",
  "Caroline Stansbury Kirkland" = "Caroline Mathilda Kirkland",
  "Catharine Maria Sedgwick" = "Catharine Sedgwick",
  "Charles Chauncy" = "Charles Chauncey",
  "Countee Cullen" = "Countée Cullen",
  "Davy Crockett" = "David Crockett",
  "E.e. Cummings" = "E. E. Cummings",
  "Ebenezer Cooke" = "Ebenezer Cook",
  "Edward Coate Pinkney" = "Edward Coote Pinkney",
  "Elias Boudinot / Gallegina" = "Elias Boudinot",
  "Eugene O’neill" = "Eugene O'neill",
  "Flannery O’connor" = "Flannery O'connor",
  "Flannery Oconnor" = "Flannery O'connor",
  "Frank O’hara" = "Frank O'hara",
  "Gustavus Vassa [Olaudah Equiano]" = "Gustavus Vassa",
  "Gertrude Simmons Bonnin" = "Gertrude Bonnin",
  'Harriet Ann Jacobs, "Linda Brent"' = "Harriet Ann Jacobs",
  "Henry James, Jr" = "Henry James",
  # NOTE(review): Horace Greeley and Horace Gregory appear to be two
  # different people; confirm this mapping is intended before relying on it.
  "Horace Greeley" = "Horace Gregory",
  "J. Hector St. Jean De Crevecoeur" = "J. Hector St. Jean De Crèvecoeur",
  "John Don Passos" = "John Dos Passos",
  "James M. Whitfield" = "James Monroe Whitfield",
  "Jane Johnston Schoolcraft / Bamewawagezhikaquay" = "Jane Johnston Schoolcraft",
  "Mary White Rowlandson" = "Mary Rowlandson",
  "Martin Luther King, Jr" = "Martin Luther King Jr",
  "Martin Luther King" = "Martin Luther King Jr",
  "Mary Boykin Miller Chesnut" = "Mary Boykin Chesnut",
  "Mary E. Wilkins Freeman" = "Mary Wilkins Freeman",
  "Meridel Lesueur" = "Meridel Le Sueur",
  "Michael S. Harper" = "Michael Harper",
  "Michel Guillaume St. Jean De Crèvecoeur" = "Michel-Guillaume Jean De Crèvecoeur",
  "Michel-Guillaume Jean De Crèvecœur" = "Michel-Guillaume Jean De Crèvecoeur",
  "Mrs. Mary Rowlandson" = "Mary Rowlandson",
  "Mrs. Susanna Haswell Rowson" = "Susanna Haswell Rowson",
  "Richard Henry Dana" = "Richard Henry Dana, Jr",
  "Robert Lee Frost" = "Robert Frost",
  "Rolando Hinojosa-Smith" = "Rolando Hinojosa",
  "Samuel L. Clemens" = "Samuel Langhorne Clemens",
  "Sarah Wentworth Apthorp Morton" = "Sarah Wentworth Morton",
  "Sarah Winnemucca Hopkins" = "Sarah Winnemucca",
  "T.s. Eliot" = "T.S. Eliot",
  "Thomas Harriot" = "Thomas Hariot",
  "Tim O’brien" = "Tim O'brien",
  "Tomas Rivera" = "Tomás Rivera",
  "Tillie Lerner Olsen" = "Tillie Olsen",
  "W.e.b. Du Bois" = "W. E. B. Dubois",
  "W. E. B. Du Bois" = "W. E. B. Dubois",
  "William Apess" = "William Apes",
  "William Byrd, Ii" = "William Byrd Ii",
  'William Sydney Porter, "O. Henry"' = "William Sydney Porter"
)

Load Data into Tibble; Clean `author` field; and Identify Native American Literature

In [None]:
# Load one tab-delimited data file and return a cleaned tibble.
#
# data_filepath: path to the file; the code below requires it to contain
#   `author` and `Page Number` columns.
# Returns: the tibble with rows lacking an author or a numeric page number
#   removed, author names normalized, and rows sorted by `Page Number`.
load_and_clean <- function(data_filepath) {
  # Read every column as text; quote = "" keeps quotation marks inside
  # fields from being interpreted as field delimiters.
  data <- read_delim(
    data_filepath,
    delim = "\t",
    col_types = cols(.default = "c"),
    quote = ""
  )

  # Any author string containing one of these terms is relabelled as
  # "Native American Literature".
  native_pattern <- "Native|Indigenous|Indian|Cherokee|Iroquois|Choctaw|Potawatomi|Lakota|Navajo|Abenaki|Acoma|Apache|Aztec|Blackfeet|Cayuga|Cheyenne|Chippewa|Chinook|Creek|Diné|Ho-Chunk|Hopi|Hupa|Inca|Inuit|Kiowa|Koasati|Kootenai|Miwok|Mohawk|Mohegan|Mohican|Muscogee|Nahua|Nahuatl|Odawa|Ojibwe|Oneida|Onondaga|Osage|O'odham|Paiute|Pequot|Pomo|Ponca|Powhatan|Pueblo|Quechua|Salish|Sauk|Seneca|Shawnee|Sioux|Spokane|Wampanoag|Winnebago|Zuni|Wannuaucon|Winnemucca"

  # clean author and page columns
  data <- data %>%
    drop_na(author) %>%
    mutate(`Page Number` = as.numeric(`Page Number`)) %>%
    drop_na(`Page Number`) %>%
    # Drop a trailing period, then title-case the name
    mutate(author = str_to_title(str_remove(author, "\\.$"))) %>%
    # Strip parenthetical / bracketed notes, subtitles after ":", anything
    # after a comma, and " To <recipient>" suffixes
    mutate(author = str_remove(author, " \\(.*\\)")) %>%
    mutate(author = str_remove(author, " \\[.*\\]")) %>%
    mutate(author = str_remove(author, "\\s*:.*")) %>%
    mutate(author = str_remove(author, ",.*")) %>%
    mutate(author = str_remove(author, " To .*")) %>%
    # Exact whole-name lookup. A regex substring replacement would apply
    # every correction in sequence, so an already-canonical name such as
    # "Martin Luther King Jr" would be rewritten to "... Jr Jr", and "." or
    # "[" in a key would act as regex metacharacters.
    mutate(author = coalesce(unname(NAME_CORRECTIONS[author]), author)) %>%
    # Names are title-cased above, so compare against "Introduction"
    filter(author != "Introduction") %>%
    filter(author != "Anonymous") %>%
    mutate(author = if_else(
      str_detect(author, native_pattern),
      "Native American Literature",
      author)) %>%
    arrange(`Page Number`)

  # Only the last 30 characters of the path are shown in the log line
  print(paste("Loaded and cleaned ", str_sub(data_filepath, -30)))
  return(data)
}

### Get Author Stats:
Output columns: `total_num_pages`, `frequency`, and `proportion`

Get page count `num_pages` for one author by subtracting author's first page number from the next listed entry's page number.  Set author `frequency` to 1.

In [None]:
# Compute per-author page counts for a single data table.
#
# data_tib: table with an `author` column and a numeric `Page Number` column.
# Returns: one row per author with columns author, total_num_pages, frequency.
get_author_stats <- function(data_tib) {
  # Highest page number in the table; end point for the last author
  last_listed_page <- max(data_tib$`Page Number`)

  print("Getting author stats from data...")

  # First page on which each author appears, ordered by that page
  author_starts <- arrange(
    summarize(group_by(data_tib, author), `Page Number` = min(`Page Number`)),
    `Page Number`
  )

  current_authors_stats <- author_starts %>%
    mutate(
      # Pages = next author's first page minus this author's first page
      total_num_pages = lead(`Page Number`) - `Page Number`,
      # The last author has no successor: measure to the highest page instead
      total_num_pages = ifelse(is.na(total_num_pages),
                               last_listed_page - `Page Number`,
                               total_num_pages),
      # Authors starting on the same page get 1 page each rather than 0
      total_num_pages = ifelse(total_num_pages == 0, 1, total_num_pages),
      # Each author counts once per table
      frequency = 1
    ) %>%
    select(-`Page Number`) %>%
    arrange(desc(frequency), author)

  print("Stats extracted from data.")
  return(current_authors_stats)
}


Add one file's data to the master table. Sum page count `num_pages` and `frequency` for each author.

In [None]:
# Merge one table of author stats into the running totals.
#
# existing_tib: running totals so far (may be an empty tibble).
# new_tib: stats for the table being added.
# Both need author, frequency and total_num_pages columns when non-empty.
# Returns: one row per author with summed frequency and total_num_pages,
#   sorted by descending frequency, then descending total_num_pages.
combine_tibs <- function(existing_tib, new_tib) {
  print("Combining with master tibble...")

  # Stack both tables, then collapse duplicate authors by summing
  stacked <- bind_rows(existing_tib, new_tib)
  per_author <- summarize(
    group_by(stacked, author),
    frequency = sum(frequency),
    total_num_pages = sum(total_num_pages)
  )
  updated_tib <- arrange(per_author, desc(frequency), desc(total_num_pages))

  print("Master tibble updated.")
  return(updated_tib)
}

Iterate over list of HIERARCHY files which represent the table of contents hierarchy/structure. Get master author_stats table and calculate author proportion of total pages.

In [None]:
# Build the master author table from a set of data files.
#
# filepath_vector: character vector of file paths; must not be empty.
# Returns: tibble with columns author, frequency, total_num_pages and
#   proportion (total_num_pages divided by the sum of each file's highest
#   page number, rounded to 4 decimals), sorted by descending frequency and
#   then descending proportion.
# Stops with an error when no file paths are supplied.
get_master_tibble <- function(filepath_vector) {
  # Without any files the table would have no columns and the proportion
  # step below would fail with an unhelpful "object not found" error.
  if (length(filepath_vector) == 0) {
    stop("No data files supplied; check the data folder path and file pattern.",
         call. = FALSE)
  }

  print("Building master table...")

  master_table <- tibble()
  grand_total_pages <- 0

  for (file_path in filepath_vector) {

    print(paste("Now Processing:", str_sub(file_path, -30)))

    current_data_tib <- load_and_clean(file_path)
    # The highest page number in a file stands in for that file's page count
    grand_total_pages <- grand_total_pages + max(current_data_tib$`Page Number`)
    print(paste("Grand total Pages: ", grand_total_pages))
    current_author_stats <- get_author_stats(current_data_tib)

    master_table <- combine_tibs(master_table, current_author_stats)
  }

  print("Master table constructed; Performing final proportion calculations...")

  master_table <- master_table %>%
    mutate(proportion = round(total_num_pages / grand_total_pages, 4)) %>%
    select(author, frequency, total_num_pages, proportion) %>%
    arrange(desc(frequency), desc(proportion))

  print("Master table complete.")
  return(master_table)

}

Get filepath list from main DATA_FOLDER containing data files. Call functions to get master author_stats table and save it if specified.

In [None]:
# Build the master author table from every HIERARCHY file in a directory
# and optionally write it to disk.
#
# directory: folder searched (non-recursively) for files whose names end in
#   "HIERARCHY.txt".
# outfile_name: path of the tab-delimited file to write when savetxt is TRUE.
# savetxt: write the table to outfile_name? Defaults to TRUE.
# Returns: the master tibble produced by get_master_tibble().
get_master_table_from_directory <- function(directory, outfile_name, savetxt=TRUE){

  # Get a character vector of all files ending in HIERARCHY.txt in the folder.
  # The dot is escaped so it matches a literal "." only.
  print("Getting list of all files...")
  all_files <- list.files(path=directory,
                          pattern = "HIERARCHY\\.txt$",
                          full.names = TRUE)

  master_tibble <- get_master_tibble(all_files)

  View(master_tibble)

  if (savetxt) {
    print("Saving master table as .txt file...")
    # Write to the caller-supplied name rather than the global OUTFILE
    write_delim(master_tibble,
                file=outfile_name,
                delim = "\t")

    print(paste(outfile_name, " saved."))
  }

  return(master_tibble)

}

The "Main" function for this process

In [None]:
# Entry point: build the author table from DATA_FOLDER, passing OUTFILE as
# the output file name with saving enabled, and return the table.
auth_freq_totalpgs <- function(){
  author_table <- get_master_table_from_directory(
    directory = DATA_FOLDER,
    outfile_name = OUTFILE,
    savetxt = TRUE
  )
  author_table
}

In [None]:
# Run the pipeline: build the author table from DATA_FOLDER and save it
auth_freq_totalpgs()

### Get Data for Racing Bar Chart
Output: `year`, `author`, `cum_pages`

Call get_author_stats to get table with `author`, `num_pages`, and `frequency`; add field of anthology publish `year` from METADATA files.

In [None]:
# Compute per-year, per-author page totals for a set of anthologies.
#
# hier_files: character vector of HIERARCHY file paths.
# meta_files: character vector of METADATA file paths, in the same order, so
#   that meta_files[i] describes hier_files[i].
# Returns: an ungrouped tibble with columns year, author and total_num_pages,
#   or an empty tibble (with a warning) when no files are supplied.
# Stops with an error when the two vectors differ in length.
get_total_pages <- function(hier_files, meta_files){

  # Files are paired by position, so a length mismatch means a wrong pairing
  if (length(hier_files) != length(meta_files)) {
    stop("hier_files and meta_files must have the same length (",
         length(hier_files), " vs ", length(meta_files), ").",
         call. = FALSE)
  }

  # Nothing to process: return an empty table the caller can still bind
  if (length(hier_files) == 0) {
    warning("No HIERARCHY files supplied; returning an empty table.",
            call. = FALSE)
    return(tibble())
  }

  # For each HIER file: get distinct author, total_num_pages, and add publish year
  per_book <- vector("list", length(hier_files))
  for (i in seq_along(hier_files)){
    curr_hier <- load_and_clean(hier_files[i])
    curr_meta <- read_delim(meta_files[i], delim='\t')

    # NOTE(review): assumes each METADATA file has a single data row, so
    # `Publish Date` is one value — confirm against the data files.
    per_book[[i]] <- get_author_stats(curr_hier) %>%
      mutate(year = curr_meta$`Publish Date`) %>%
      select(year, author, total_num_pages)
  }
  master <- bind_rows(per_book)

  # Case of two anthologies published in same year:
  # For each year get distinct author and sum(total_num_pages)
  master <- master %>%
    group_by(year, author) %>%
    summarize(total_num_pages = sum(total_num_pages), .groups = "drop")

  return(master)

}

Get list of HIERARCHY and METADATA files to pass to `get_total_pages` function. Create master table with `year`, `author`, and `total_num_pages`. Find and add field `cum_pages` for cumulative number of pages for racing bar chart data.

In [None]:
# Build the long-format table behind the racing bar chart.
#
# Reads every folder under DIR_PATH whose name matches FOLDER_PATTERN, gets
# per-year page totals for each folder, and accumulates them per author.
# Returns: an ungrouped tibble with columns year, author and cum_pages
#   (running total of total_num_pages for that author, in year order).
# Stops with an error when no folder matches.
racing_bar_main <- function(){
  # NOTE(review): DIR_PATH and FOLDER_PATTERN are not defined in this section
  # of the file; they must already exist in the calling environment.
  data_folders <- sort(dir(path = DIR_PATH,
                           pattern = FOLDER_PATTERN,
                           full.names = TRUE))

  if (length(data_folders) == 0) {
    stop("No data folders matched FOLDER_PATTERN under DIR_PATH.",
         call. = FALSE)
  }

  master <- tibble()
  # for each century
  for (folder in data_folders){
    print(folder)
    # Dots are escaped so only names ending in a literal ".txt" match
    hier_files <- sort(list.files(path = folder,
                                  pattern = "HIERARCHY\\.txt$",
                                  full.names = TRUE))
    meta_files <- sort(list.files(path = folder,
                                  pattern = "METADATA\\.txt$",
                                  full.names = TRUE))
    curr_total_pages <- get_total_pages(hier_files, meta_files)

    master <- bind_rows(master, curr_total_pages)
  }

  # Sort by year, Get cumulative number of pages for each author
  cum_pages <- master %>%
    # Merge rows that share a year and author across folders, so each
    # (year, author) pair appears once
    group_by(year, author) %>%
    summarize(total_num_pages = sum(total_num_pages), .groups = "drop") %>%
    arrange(year) %>%
    group_by(author) %>%
    mutate(cum_pages = cumsum(total_num_pages)) %>%
    ungroup() %>%
    select(year, author, cum_pages)

  View(cum_pages)
  return(cum_pages)

}

In [None]:
# Reshape the cumulative page counts to wide form: one column per year
a <- pivot_wider(
  racing_bar_main(),
  names_from = year,
  values_from = cum_pages,
  values_fill = NULL
)

In [None]:
# Save the wide table stored in `a`; no object named `b` is created
# anywhere in this notebook.
write_tsv(a, "flourish1.tsv")