In [1]:
# Load necessary libraries
library(readr)
library(dplyr)
library(tidytext)
library(tm)
library(topicmodels)
library(ggplot2)
library(parallel) 
library(slam) 
library(Rmpfr)  
library(reshape2) 
library(scales) 
library(grid)
library(ldatuning)
library(textmineR)
library(SnowballC)
library(textstem)
library(textclean)
library(stringr)
library(qdapDictionaries)
library(lexicon)
library(here)
library(lubridate)
library(text2vec)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: NLP


Attaching package: ‘ggplot2’


The following object is masked from ‘package:NLP’:

    annotate


Loading required package: gmp


Attaching package: ‘gmp’


The following objects are masked from ‘package:base’:

    %*%, apply, crossprod, matrix, tcrossprod


C code of R package 'Rmpfr': GMP using 64 bits per limb



Attaching package: ‘Rmpfr’


The following object is masked from ‘package:gmp’:

    outer


The following objects are masked from ‘package:stats’:

    dbinom, dgamma, dnbinom, dnorm, dpois, dt, pnorm


The following objects are masked from ‘package:base’:

    cbind, pmax, pmin, rbind



Attaching package: ‘scales’


The following object is masked from ‘package:readr’:

    col_factor


Loading required package: Matrix


Attaching package: ‘t

In [2]:
i_am("here.txt")
here()

here() starts at /home/ec2-user/SageMaker/esa_analysis



In [3]:
clean_corpus <- read.csv(
    here("data", "processed", "clean_text_corpus.csv"))

str(clean_corpus)

'data.frame':	5239 obs. of  6 variables:
 $ X.1  : int  1 2 3 4 5 6 7 8 9 10 ...
 $ X    : int  0 1 2 3 4 5 6 7 8 9 ...
 $ GOID : num  4.22e+08 4.09e+08 4.29e+08 4.19e+08 2.07e+09 ...
 $ Title: chr  "Developers Urge Delay of Gnatcatcher Ruling; Birds: Species isn't in danger, they claim. Federal decision this "| __truncated__ "TODAY IN CONGRESS" "Plan Offered to Aid Northwest Salmon and Trout" "SAVING WILDLIFE" ...
 $ Date : chr  "2000-09-26" "1999-07-20" "1994-03-27" "1999-07-05" ...
 $ Text : chr  "arm study developer demand federal official postpone decision due week provide acre critical habitat tiny gnatc"| __truncated__ "senate meet a.m. committee arm service a.m. u.s. policy military operation kosovo defense sec william cohen joi"| __truncated__ "ap clinton administration propose protection zone river stream threaten fish species federal land eastern orego"| __truncated__ "tribune june news item recovery propose removal bald eagle endanger species list mention eagle extinction m

In [4]:
# Create DTM using TF-IDF
tokens <- word_tokenizer(clean_corpus$Text)
it <- itoken(tokens, ids = clean_corpus$GOID, progressbar = FALSE)
vocab <- create_vocabulary(it)
vectorizer <- vocab_vectorizer(vocab)
dtm <- create_dtm(it, vectorizer)

# Use TF-IDF
tfidf <- TfIdf$new()
dtm_tfidf <- fit_transform(dtm, tfidf)


In [5]:
dim(dtm_tfidf)

In [6]:
# Compute cosine similarity
cosine_sim <- sim2(dtm_tfidf, method = "cosine", norm = "l2")

In [7]:
str(cosine_sim)
dim(cosine_sim)

Formal class 'dsCMatrix' [package "Matrix"] with 7 slots
  ..@ i       : int [1:13726180] 0 0 1 0 1 2 0 1 2 3 ...
  ..@ p       : int [1:5240] 0 1 3 6 10 15 21 28 36 45 ...
  ..@ Dim     : int [1:2] 5239 5239
  ..@ Dimnames:List of 2
  .. ..$ : chr [1:5239] "421721236" "408507519" "429461998" "418798522" ...
  .. ..$ : chr [1:5239] "421721236" "408507519" "429461998" "418798522" ...
  ..@ x       : num [1:13726180] 1 0.013 1 0.0361 0.0262 ...
  ..@ uplo    : chr "U"
  ..@ factors : list()


In [8]:
saveRDS(
    cosine_sim,
    here("data", "processed", "whole_cosine_sim.rds")
)

In [9]:
# Identify syndicated articles based on threshold
similarity_threshold <- 0.95

# Identify syndicated articles based on the threshold
pairs <- expand.grid(Article1 = clean_corpus$GOID, Article2 = clean_corpus$GOID) %>%
  filter(Article1 < Article2) %>%  # Ensure unique pairs and exclude self-comparisons
  mutate(
    Similarity = cosine_sim[cbind(
      match(Article1, clean_corpus$GOID),
      match(Article2, clean_corpus$GOID)
    )]
  ) %>%
  filter(Similarity >= similarity_threshold)


In [10]:
str(pairs)
head(pairs, n = 10)

'data.frame':	233 obs. of  3 variables:
 $ Article1  : num  2.10e+09 4.21e+08 4.30e+08 7.59e+08 4.21e+08 ...
 $ Article2  : num  2.10e+09 4.21e+08 4.30e+08 7.64e+08 4.21e+08 ...
 $ Similarity: num  0.972 0.992 1 0.983 0.985 ...
 - attr(*, "out.attrs")=List of 2
  ..$ dim     : Named int [1:2] 5239 5239
  .. ..- attr(*, "names")= chr [1:2] "Article1" "Article2"
  ..$ dimnames:List of 2
  .. ..$ Article1: chr [1:5239] "Article1= 421721236" "Article1= 408507519" "Article1= 429461998" "Article1= 418798522" ...
  .. ..$ Article2: chr [1:5239] "Article2= 421721236" "Article2= 408507519" "Article2= 429461998" "Article2= 418798522" ...


Unnamed: 0_level_0,Article1,Article2,Similarity
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>
1,2100798323,2100798380,0.9720786
2,421314883,421347484,0.9923435
3,430332811,430333805,1.0
4,758820654,763701373,0.9828463
5,421472896,421479391,0.9852781
6,2630547628,2630564268,0.9547288
7,420415753,422099712,0.9509504
8,293195609,293262146,0.9933331
9,421470906,421476020,0.9616118
10,281820535,421131882,0.9770187


In [11]:
# Initialize group tracking
pairs2 <- pairs %>%
  mutate(Group = NA_integer_)

head(pairs2)

Unnamed: 0_level_0,Article1,Article2,Similarity,Group
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<int>
1,2100798323,2100798380,0.9720786,
2,421314883,421347484,0.9923435,
3,430332811,430333805,1.0,
4,758820654,763701373,0.9828463,
5,421472896,421479391,0.9852781,
6,2630547628,2630564268,0.9547288,


In [12]:
group_id <- 1

# Assign groups iteratively
while (any(is.na(pairs2$Group))) {
  # Get the first ungrouped pair
  ungrouped <- pairs2 %>% filter(is.na(Group)) %>% slice(1)
  current_group <- c(ungrouped$Article1, ungrouped$Article2)
  
  # Expand the group to include all connected articles
  repeat {
    new_members <- pairs2 %>%
      filter(is.na(Group) & (Article1 %in% current_group | Article2 %in% current_group)) %>%
      select(Article1, Article2) %>%
      unlist() %>%
      unique()
    
    if (all(new_members %in% current_group)) break  # No new members
    current_group <- unique(c(current_group, new_members))
  }
  
  # Assign group ID to all pairs in this group
  pairs2 <- pairs2 %>%
    mutate(Group = ifelse(Article1 %in% current_group | Article2 %in% current_group, group_id, Group))
  
  group_id <- group_id + 1  # Increment group ID
}

In [13]:
str(pairs2)
head(pairs2)
range(pairs2$Group)

'data.frame':	233 obs. of  4 variables:
 $ Article1  : num  2.10e+09 4.21e+08 4.30e+08 7.59e+08 4.21e+08 ...
 $ Article2  : num  2.10e+09 4.21e+08 4.30e+08 7.64e+08 4.21e+08 ...
 $ Similarity: num  0.972 0.992 1 0.983 0.985 ...
 $ Group     : num  1 2 3 4 5 6 7 8 9 10 ...
 - attr(*, "out.attrs")=List of 2
  ..$ dim     : Named int [1:2] 5239 5239
  .. ..- attr(*, "names")= chr [1:2] "Article1" "Article2"
  ..$ dimnames:List of 2
  .. ..$ Article1: chr [1:5239] "Article1= 421721236" "Article1= 408507519" "Article1= 429461998" "Article1= 418798522" ...
  .. ..$ Article2: chr [1:5239] "Article2= 421721236" "Article2= 408507519" "Article2= 429461998" "Article2= 418798522" ...


Unnamed: 0_level_0,Article1,Article2,Similarity,Group
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>
1,2100798323,2100798380,0.9720786,1
2,421314883,421347484,0.9923435,2
3,430332811,430333805,1.0,3
4,758820654,763701373,0.9828463,4
5,421472896,421479391,0.9852781,5
6,2630547628,2630564268,0.9547288,6


In [14]:
group1 <- pairs2 %>% 
    select("GOID" = Article1, Group) %>% 
    distinct()

str(group1)
head(group1)

'data.frame':	184 obs. of  2 variables:
 $ GOID : num  2.10e+09 4.21e+08 4.30e+08 7.59e+08 4.21e+08 ...
 $ Group: num  1 2 3 4 5 6 7 8 9 10 ...
 - attr(*, "out.attrs")=List of 2
  ..$ dim     : Named int [1:2] 5239 5239
  .. ..- attr(*, "names")= chr [1:2] "Article1" "Article2"
  ..$ dimnames:List of 2
  .. ..$ Article1: chr [1:5239] "Article1= 421721236" "Article1= 408507519" "Article1= 429461998" "Article1= 418798522" ...
  .. ..$ Article2: chr [1:5239] "Article2= 421721236" "Article2= 408507519" "Article2= 429461998" "Article2= 418798522" ...


Unnamed: 0_level_0,GOID,Group
Unnamed: 0_level_1,<dbl>,<dbl>
1,2100798323,1
2,421314883,2
3,430332811,3
4,758820654,4
5,421472896,5
6,2630547628,6


In [15]:
group2 <- pairs2 %>% 
    select("GOID" = Article2, Group) %>% 
    distinct()

str(group2)
head(group2)

'data.frame':	185 obs. of  2 variables:
 $ GOID : num  2.10e+09 4.21e+08 4.30e+08 7.64e+08 4.21e+08 ...
 $ Group: num  1 2 3 4 5 6 7 8 9 10 ...
 - attr(*, "out.attrs")=List of 2
  ..$ dim     : Named int [1:2] 5239 5239
  .. ..- attr(*, "names")= chr [1:2] "Article1" "Article2"
  ..$ dimnames:List of 2
  .. ..$ Article1: chr [1:5239] "Article1= 421721236" "Article1= 408507519" "Article1= 429461998" "Article1= 418798522" ...
  .. ..$ Article2: chr [1:5239] "Article2= 421721236" "Article2= 408507519" "Article2= 429461998" "Article2= 418798522" ...


Unnamed: 0_level_0,GOID,Group
Unnamed: 0_level_1,<dbl>,<dbl>
1,2100798380,1
2,421347484,2
3,430333805,3
4,763701373,4
5,421479391,5
6,2630564268,6


In [16]:
syndicated_groups <- rbind(group2, group1) %>% distinct()

str(syndicated_groups)
head(syndicated_groups)

'data.frame':	351 obs. of  2 variables:
 $ GOID : num  2.10e+09 4.21e+08 4.30e+08 7.64e+08 4.21e+08 ...
 $ Group: num  1 2 3 4 5 6 7 8 9 10 ...
 - attr(*, "out.attrs")=List of 2
  ..$ dim     : Named int [1:2] 5239 5239
  .. ..- attr(*, "names")= chr [1:2] "Article1" "Article2"
  ..$ dimnames:List of 2
  .. ..$ Article1: chr [1:5239] "Article1= 421721236" "Article1= 408507519" "Article1= 429461998" "Article1= 418798522" ...
  .. ..$ Article2: chr [1:5239] "Article2= 421721236" "Article2= 408507519" "Article2= 429461998" "Article2= 418798522" ...


Unnamed: 0_level_0,GOID,Group
Unnamed: 0_level_1,<dbl>,<dbl>
1,2100798380,1
2,421347484,2
3,430333805,3
4,763701373,4
5,421479391,5
6,2630564268,6


In [17]:
count <- syndicated_groups %>% 
    count(Group) %>% 
    arrange(desc(n))

str(count)
head(count)

'data.frame':	166 obs. of  2 variables:
 $ Group: num  12 30 18 19 24 27 74 83 100 1 ...
 $ n    : int  12 4 3 3 3 3 3 3 3 2 ...
 - attr(*, "out.attrs")=List of 2
  ..$ dim     : Named int [1:2] 5239 5239
  .. ..- attr(*, "names")= chr [1:2] "Article1" "Article2"
  ..$ dimnames:List of 2
  .. ..$ Article1: chr [1:5239] "Article1= 421721236" "Article1= 408507519" "Article1= 429461998" "Article1= 418798522" ...
  .. ..$ Article2: chr [1:5239] "Article2= 421721236" "Article2= 408507519" "Article2= 429461998" "Article2= 418798522" ...


Unnamed: 0_level_0,Group,n
Unnamed: 0_level_1,<dbl>,<int>
1,12,12
2,30,4
3,18,3
4,19,3
5,24,3
6,27,3


In [18]:
group_check <- syndicated_groups %>% 
    filter(Group == "12")

group_check

GOID,Group
<dbl>,<dbl>
409464531,12
409520608,12
409870361,12
409874944,12
409345592,12
409301278,12
409592729,12
409559555,12
409298559,12
409323055,12


In [19]:
# Get syndicated and unique articles
syndicated_ids <- syndicated_groups$GOID

# Syndicated articles
syndicated_articles <- clean_corpus %>%
  filter(GOID %in% syndicated_ids) %>% 
  inner_join(syndicated_groups, by = "GOID")

In [20]:
str(syndicated_ids)
str(syndicated_articles)

 num [1:351] 2.10e+09 4.21e+08 4.30e+08 7.64e+08 4.21e+08 ...
'data.frame':	351 obs. of  7 variables:
 $ X.1  : int  8 21 40 42 48 57 60 87 108 115 ...
 $ X    : int  7 20 39 41 47 56 59 86 107 114 ...
 $ GOID : num  2.10e+09 4.21e+08 4.21e+08 1.81e+09 1.17e+09 ...
 $ Title: chr  "Legal Notice 2 -- No Title" "EDUCATION: SMART RESOURCES FOR STUDENTS AND PARENTS; Where Learning and the Internet Meet: LAUNCH POINT; Endangered Species" "VENTURA COUNTY NEWS; Groups Join to Push Steelhead Safeguards" "Island foxes taken off endangered species list" ...
 $ Date : chr  "1998-01-05" "1998-11-02" "2000-02-15" "2016-08-13" ...
 $ Text : chr  "legal notice service bid want request proposal california tern colony monitor study harbor department city los "| __truncated__ "high rate plant animal extinction alarm unite state species endanger threaten due habitat loss pollution introd"| __truncated__ "unusual coalition interest announce monday band press great protection steelhead trout fast disappear 

In [21]:
write.csv(
    syndicated_articles,
    here("data", "processed", "syndicated_articles.csv")
)

In [22]:
# Keep only the earliest article from each group, resolving ties by Article_ID
earliest_syndicated <- syndicated_articles %>%
  group_by(Group) %>%
  slice_min(order_by = Date, with_ties = TRUE) %>% # Keep ties for the same date
  slice_min(order_by = GOID, n = 1) %>%      # Pick one article (arbitrarily by ID)
  ungroup()

In [23]:
str(earliest_syndicated)

tibble [166 × 7] (S3: tbl_df/tbl/data.frame)
 $ X.1  : int [1:166] 659 135 797 5127 1030 3398 4336 251 3401 2113 ...
 $ X    : int [1:166] 658 134 796 5128 1029 3399 4337 250 3402 2113 ...
 $ GOID : num [1:166] 2.10e+09 4.21e+08 4.30e+08 7.59e+08 4.21e+08 ...
 $ Title: chr [1:166] "Legal Notice 1 -- No Title" "EDUCATION / An Exploration of Ideas, Issues and Trends in Education; Where Learning and the Internet Meet: LAUN"| __truncated__ "Moderates Soften G.O.P. Agenda on Environment" "A pack of lawmakers goes hunting; New legislation would keep the resurgent gray wolf off the federal endangered species list." ...
 $ Date : chr [1:166] "1998-01-05" "1998-11-02" "1995-10-24" "2010-10-18" ...
 $ Text : chr [1:166] "legal notice service bid want request proposal california tern colony monitor study harbor department city los "| __truncated__ "high rate plant animal extinction alarm unite state species endanger threaten due habitat loss pollution introd"| __truncated__ "tacit support party l

In [24]:
earliest_syndicated2 <- earliest_syndicated %>% 
    select(-Group)

str(earliest_syndicated2)

tibble [166 × 6] (S3: tbl_df/tbl/data.frame)
 $ X.1  : int [1:166] 659 135 797 5127 1030 3398 4336 251 3401 2113 ...
 $ X    : int [1:166] 658 134 796 5128 1029 3399 4337 250 3402 2113 ...
 $ GOID : num [1:166] 2.10e+09 4.21e+08 4.30e+08 7.59e+08 4.21e+08 ...
 $ Title: chr [1:166] "Legal Notice 1 -- No Title" "EDUCATION / An Exploration of Ideas, Issues and Trends in Education; Where Learning and the Internet Meet: LAUN"| __truncated__ "Moderates Soften G.O.P. Agenda on Environment" "A pack of lawmakers goes hunting; New legislation would keep the resurgent gray wolf off the federal endangered species list." ...
 $ Date : chr [1:166] "1998-01-05" "1998-11-02" "1995-10-24" "2010-10-18" ...
 $ Text : chr [1:166] "legal notice service bid want request proposal california tern colony monitor study harbor department city los "| __truncated__ "high rate plant animal extinction alarm unite state species endanger threaten due habitat loss pollution introd"| __truncated__ "tacit support party l

In [25]:
# Unique articles
unique_articles <- clean_corpus %>%
  filter(!GOID %in% syndicated_ids)

str(unique_articles)

'data.frame':	4888 obs. of  6 variables:
 $ X.1  : int  1 2 3 4 5 6 7 9 10 11 ...
 $ X    : int  0 1 2 3 4 5 6 8 9 10 ...
 $ GOID : num  4.22e+08 4.09e+08 4.29e+08 4.19e+08 2.07e+09 ...
 $ Title: chr  "Developers Urge Delay of Gnatcatcher Ruling; Birds: Species isn't in danger, they claim. Federal decision this "| __truncated__ "TODAY IN CONGRESS" "Plan Offered to Aid Northwest Salmon and Trout" "SAVING WILDLIFE" ...
 $ Date : chr  "2000-09-26" "1999-07-20" "1994-03-27" "1999-07-05" ...
 $ Text : chr  "arm study developer demand federal official postpone decision due week provide acre critical habitat tiny gnatc"| __truncated__ "senate meet a.m. committee arm service a.m. u.s. policy military operation kosovo defense sec william cohen joi"| __truncated__ "ap clinton administration propose protection zone river stream threaten fish species federal land eastern orego"| __truncated__ "tribune june news item recovery propose removal bald eagle endanger species list mention eagle extinction

In [26]:
# Combine unique articles with earliest syndicated articles
final_corpus <- bind_rows(unique_articles, earliest_syndicated2) %>%
  arrange(Date) %>% 
  select(GOID, Title, Date, Text)

str(final_corpus)
head(final_corpus)

'data.frame':	5054 obs. of  4 variables:
 $ GOID : num  4.24e+08 4.24e+08 4.24e+08 4.24e+08 4.24e+08 ...
 $ Title: chr  "BY TOM FERRELL AND MARGOT SLADE; IDEAS AND TRENDS; Pope's Clear Call To Serve Men, Not Their Systems" "In Legal Limbo" "THE CASE FOR THE BITTERING AND BRONX RIVER" "MORE SNAIL DARTERS FOUND IN TENNESSEE" ...
 $ Date : chr  "1980-07-13" "1980-09-07" "1980-10-19" "1980-11-08" ...
 $ Text : chr  "universal bishop roman catholic church pope john paul ii call fulfil preach office bishop end week tour brazil "| __truncated__ "case donald lang murder question legally mr lang unable speak hear read write find guilty chicago kill prostitu"| __truncated__ "bronx river survive ravage time assault civilization year hot dry summer record threaten danger channelization "| __truncated__ "believe population snail darter inch fish demise lead supreme court halt construction tellico dam tennessee yea"| __truncated__ ...


Unnamed: 0_level_0,GOID,Title,Date,Text
Unnamed: 0_level_1,<dbl>,<chr>,<chr>,<chr>
1,423968056,"BY TOM FERRELL AND MARGOT SLADE; IDEAS AND TRENDS; Pope's Clear Call To Serve Men, Not Their Systems",1980-07-13,universal bishop roman catholic church pope john paul ii call fulfil preach office bishop end week tour brazil clear mindful duty skillful contrast impression muddle leave mind auditor pope visit mexico year message brazil clear social justice christian church commend political system long inherently unjust bishop priest seek political power addition think clergy teach catholic faith address brazil bishop eucharistic congress fortaleza thursday pope remind vocation prohibit directly total clarity take part appear politically partisan subject ideology system continue theme strike previous talk prohibit invite close service man disable socially active brazilian church feel strongly endorse brazil activist bishop page account pope find clarity people practice listen thing clerical traffic brazil rome recently consider bishop argue successfully clarification definition social issue confuse puebla pope experience adopt coherent straightforward development view rarely vague theologian order avoid misunderstand member political movement press cool capacity n.r.c. aftermath mile island popular prescription ill nuclear regulatory commission head chairman joseph m hendrie lose demote commissioner december interim chairman name lot people wonder willingly lead agency beleaguer critic internal dissension answer week president carter nominate albert carnesale nuclear engineer professor public policy harvard john f kennedy school government replace outgoing commissioner richard kennedy senate approve nomination expect name chairman commission mr carter professor carnesale provide leadership agency sensitivity overriding safety environmental interest development nuclear power colleague professor bring arbitrator eye controversy skill handy member agency remain divide fast industry expand sensitive accusation favor commercial interest mindful president shake organization plan vest strong executive authority commission chairman legislatively sidetrack month mr carter recently sign law harsh penalty violation federal safety standard plant operation authorization inspector order agency establish strict train program inspector crowd control federal bureau standard produce report write safety code nationwide people public place emergency week city cincinnati call federal agency auditorium operator press issue public place safely cincinnati appeal contain report prepare city council death person rush enter rock concert december city ban festival seating seat general admission stand area general admission seat public event report make effort fix blame death substantial recommendation study manage incoming crowd short term report recommend cincinnati ban general admission relax case seat trouble tennis match high school graduation emphasize necessity study inbound crowd control people crush death week enter soccer stadium brazil pope john paul ii hope diabetic cell transplant daily shot insulin life million nation million diabetic body unable produce hormone help metabolize sugar carbohydrate week researcher washington university medical school announce late success treatment eventually equip diabetic cell need produce insulin key ability transplant islet insulin produce cell cluster pancreas healthy animal diabetic human prompt recipient immune system reject foreign matter human donor impractical small quantity cell extract human pancreas dr paul e lacy research group transfer islet species rat diabetic mouse inject blood lodge function live mouse cell produce insulin maintain normal blood sugar level day experiment dr lacy involve pig mouse transplant important step pig source islet human central success rat mouse transfer researcher week long incubation donor cell room temperature render unrecognizable foreign tissue cell incubate body temperature dr lacy believe deception involve white blood cell accompany transplant islet trigger immune response oil hunt allow beaufort sea legal icejam block sale oil gas drill right beaufort sea alaska north slope break week federal appeal court overturn injunction rule government issue drill lease oil company submit million lease bid january unite state district judge block acceptance basis suit call interior secretary cecil d andrus meet trust responsibility alaskan native comply endanger species act national environmental policy act inupiat eskimo north slope contend drill endanger bowhead whale mainstay hunt life environmental group argue government proper lease procedure assess threat whale population alaska supreme court approve pre drill activity region state own land oil company begin biological environmental study impact drill seismic test map sea floor locate deposit sample drill environmentalist hope stay court order memory lane elephant woolly mammoth ice age look elephant walk elephant expert fact relate present day elephant week scientist announce biochemical test establish mammoth ancestor indian african elephant proof kinship university california researcher isolate protein albumin carcass baby mammoth find freeze remarkably preserve siberia year ago inject solution material rabbit scientist find antibody produce response rabbits immune system reacted strongly similar protein elephant species imply genetic connection similar protein comparison perform live species test yield conclusion gorilla chimpanzee closely relate human scientist success extract genetic information fossil remain extinct creature idea trend page tom ferrell margot slade photo victim rock concert stampede
2,423987935,In Legal Limbo,1980-09-07,case donald lang murder question legally mr lang unable speak hear read write find guilty chicago kill prostitute sentence year prison court throw conviction find trial unfair cooperate defense proof mr lang mentally november mr lang legal limbo cook county jail prosecutor office state attorney note stand trial fit hold sick house mirror turn exit mr lang cook county jail effort teach sign language work henry hauser deputy chief civil division state attorney publicity book dummy television movie base mr lang plight case remain unresolved murder indictment stand week state prepare file legal response illinois supreme court rule mr lang entitle hear fit stand trial position lang unfit trial mr hauser add state commit mental rehabilitation fight forgery promise instrument shape ballpoint pen patent combat forgery instrument produce electrical signal handwriting motion compare signal handwriting file computer patent invent staff member unite state energy department sandia laboratory albuquerque n.m. measure vertical horizontal motion handwriting patent invent researcher international business machine yorktown height n.y. measure vertical horizontal handwriting motion characteristic acceleration pressure invention make market robert j marchick chief energy department patent license branch germantown md indication interest numb country firm country request manufacture license free i.b.m. device field test good dr chao n liu inventor development grizzly affair people meet attack grizzly hand topaw combat survive talk ed wiseman hunt guide crestone colo september female bear rush provocation hunt stab throat arrow kill month leave hospital mr wiseman cloud unite state fish wildlife service doubt story autopsy find bear kill blade enter shoulder grizzly protect endanger species act march mr wiseman face year prison fine convict kill unjustifiably drop special agent john r griest case fish wildlife service office denver hunter lie detector test clean mr wiseman wild week elk hunt limp result grizzly battle damage nerve foot wife vivian mass murder sea allege sea smuggler old ploy record destroy cargo destroy evidence convict cargo human january jeffrey r hastings hypluxo fla james h knowles tarpon bay bahamas await trial degree murder accuse throw child overboard rough sea force adult jump gunpoint boat illegal haitian alien catch police spotlight west palm beach fla august child mother drown defendant prison mr knowles serve sentence mr hastings term manslaughter conviction richard haitch
3,423991776,THE CASE FOR THE BITTERING AND BRONX RIVER,1980-10-19,bronx river survive ravage time assault civilization year hot dry summer record threaten danger channelization face prospect lose identity river transform storm drainage ditch concrete canal summer bronx river quietly make history tiny fish discover bronxville reach river north america rare fish bitterling rhodeus seriaceus inch long easily mistake freshwater minnow bitterling exotic species introduce europe release sawmill river early live sawmill mysteriously disappear turn briefly bronx river spring bitterlings river bronxville midland avenue bridge boy scout field half mile upstream paxton avenue fall fuss tiny fish zoologist story bronx river bitterling way similar saga famous snail darter inch fish live tennessee river area giant tellico dam build snail darter symbol standard environmentalist convince dam million boondoggle political porkbarrel stop construction invoke endanger species act grind dam destroy breed place snail darter extermination endanger species act approve congress protect maintain endanger plant animal preserve environment set court construction tellico dam resume army corp engineer tellico dam dispute bitter battle fight environmentalist tiny snail darter small part issue endanger species act compromise corp engineer back work dam defeat darter lesson learn tellico tragedy defender life bronx river bitterling tiny fish tower tree matter close extinction match powerful army corp engineer flood control proposal plan include channelizing bronx river tuckahoe bronxville east yonkers east street bronx distance half mile estimate cost million double time construction federal regulation require percent construction cost bear state locality channelization proposal carry breed ground bitterling destroy rare endanger fish join grow list extinct species people property endanger flood part river bitterlings difficult understand justify reason channelization point snail darter unknown science discover river construction dam start fact weaken effectiveness deterrent build dam sharp contrast bitterling year existence make month corp engineer schedule complete final report recommendation flood control bronx river valley present public meet scarsdale audubon society forefront campaign preserve maintain bronx river natural fauna flora february society organize symposium april field trip river tuckahoe bronxville inform citizen south central westchester flood control plan contemplate corp corp hold public meet alternative flood control plan bronx river basin tuckahoe bronxville east yonkers present discuss meet scarsdale audubon society propose priority nonstructural procedure include enforcement exist land regulation flood forecast flood insurance purchase land subject flood channelization river tuckahoe street oppose effective costly generate great ecological river bank inhabitant audubon support recommendation fish wildlife service flood wall levee consider build back river natural flood plain river bank maintain minimize ecological disruption disturbance society recommend comprehensive study effect flood control proposal entire river bronxville tuckahoe area midpoint river inevitably create flood drainage problem upstream downstream face endanger fish live endanger river possibility lose flood control channelization convert sparkle stream concrete storm drainage ditch prove effectiveness flood control technique channelization encourage control flood knowingly willfully kill death endanger species contrary spirit intent endanger species act unlawful inhumane estimate end century million species plant animal wipe human pollution air water soil destruction forest waterway deliberately add victim holocaust unnecessary channelization act kindness wisdom finally bronx river pride joy westchester close thousand plant identify live bank equal numb animal call home people thousand run walk cycle drive enjoy beauty real flow live river convert drainage ditch canal irretrievable loss unpardonable sin scarsdale audubon society respectfully request army corp engineer deliberation final recommendation inflationary cost questionable benefit channelization adopt report audubon society edward frankel vice president scarsdale audubon society draw fish
4,424019341,MORE SNAIL DARTERS FOUND IN TENNESSEE,1980-11-08,believe population snail darter inch fish demise lead supreme court halt construction tellico dam tennessee year find alive previously unknown habitat mile dam dr david etnier university tennessee zoology professor discover population south chickamauga creek weekend previously discover identify fish tennessee river tellico dam site dr etnier believe newly find fish species constitute previously undiscovered group live creek year dam site tennessee river branch tennessee think snail darter natural habitat year tennessee valley authority biologist transplant snail darter species darter tennessee river network site elk holston hiwassee river dr etnier specimen find year site appear hiwassee river population successfully reproduce migration group migrate transplant site difficult trip migrate time t.v.a. report dr etnier authority biologist find darter creek federal law protect darter endanger species work tellico dam halt supreme court rule dam gate close threaten darters natural habitat existence congress grant exemption law year million dam complete construction complete dam gate close form lake south chickamauga creek include legal exemption tellico dam newly discover fish protect endanger species act
5,424018521,HUNTING THE ILLEGAL HUNTERS,1980-11-09,man yorktown wood man wait environmental conservation officer ask hunt knife cover blood search hunt pouch man carry find plastic bag deer liver wood discover body fawn man arrest charge take deer shotgun close season judge yorktown fine conservation officer enforcement arm york state department environmental conservation robert van benschotten director department division law enforcement enforce fish wildlife law law regulate forestry clean air water toxic solid waste wetland stream navigation endanger species division conservation officer include woman lieutenant captain officer assign westchester county divide patrol area richard conway rye cover southern westchester include marine district coast water long island sound port chester city island myles schillinger bedford patrol northeast sector county steve cook ossining handle complaint violation northwest section county conservation officer style fish game cop percent time devote hunt fish enforcement rest environmental conservation law lieut harry saglibene thornwood conservation official year supervise officer westchester rockland putnam orange ulster county westchester officer catch variety poacher track skindivers pilfer lobster trap sound clammers take contaminate shellfish shore spring big stripe bass business operate illegally hudson river work law hard enforce legal catch stripe bass high level poisonous polychlorinated biphenyl pcb officer catch fisherman sell fish make arrest find undersized trout hide fisherman hip wader illegally kill songbird hide hubcap car price meat fish escalate poach officer schillinger enforce endanger species act officer seize wallaby yonkers pet store day stakeout officer recently catch man illegally dump content septictank truck wood area north salem protect county stream wetland check truck county major highway monitor illegal transportation toxic solid waste trace beginning game protectors add fish wildlife commission mr van benschotten game protector originally work socalled moiety system receive half fine levy convict poacher court half moiety system disappear job title game protector conservation officer environmental conservation officer officer select list candidate pass civil service examination physical agility test receive law enforcement train york state police academy albany police officer lieutenant saglibene statewide jurisdiction state police mean issue speed ticket hopeful pass test position environmental conservation officer job westchester fill vacancy problem rapid turnover high cost live area low salary position trainee earn year receive year location pay officer train westchester transfer area open problem conservation officer deal year large numb domestic dog run deer illegal dog run free land inhabit deer accompany owner officer cook dogs kill deer year actual count high owner dog biologists investigation time dog leave deer cripple eat hamstring tear stomach intestine leave deer alive dog catch chase deer police officer shoot diminish wildlife habitat westchester big population fish wild animal county build deer animal funnel remain wood area officer cook raccoon distemper greenburgh coyote hit car rye deer greenburgh county airport people talk deer overpopulation problem continue numb deer deer habitat reduce people live habitat deer property eat shrub photo myles schillinger
6,424019099,OIL DRILL STUDY MUDDIES WATERS FOR BOWHEAD WHALES OFF ALASKA,1980-11-13,inch thick million report documents government endanger bowhead whale spokesman eskimo whaling community jon buchholdt north slope borough add reason government bowhead whale refuse listen people eskimo continue century practice hunt whale part subsistence culture mr buchholdt recent interview refer report result month field work september november assess potential impact oil drill whale dispute lease area study make public month commission bureau land management alaska outer continental shelf office conclusion team whale spend significant time lease area scientist add study need eskimo interest bowhead whale hunt vital part life government interest huge marine mammal migrate frigid beaufort sea prime location oil gas exploration drill estimate population scientist estimate bowhead plentiful north pacific number endanger species act forbid government action jeopardize existence endanger threaten species destruction modification habitat connie wassink spokesman outer continental shelf office differ mr buchholdt assessment study native whaling captain consult intuitive knowledge whale find government solid scientific research base decision meld eskimo whaling wisdom scientifically obtain datum dozen study plan add drill noise disturb whale potential oil spill harm food supply bowhead whale grow length foot follow melt ice pack spring southeastern part bering sea north bering strait beaufort sea freeze prompt return trip mile area dispute oil lease dispute lease area stretch mile east west center roughly prudhoe bay site world rich oil field bowhead whale migrate inhabit lease area spring migration scientist base sighting whale fall migration total bowhead whale observe lease area bowhead whale lease area bowheads fee plankton strain seawater hairlike structure mouth baleen fear oil development oil spill damage whales ability fee slight extreme foul crude oil show mark decrease filter efficiency extreme foul study find eskimo environmental group wage state federal court battle offshore oil development succeed delay prohibit drill


In [27]:
write.csv(
    final_corpus,
    here("data", "processed", "clean_original_docs_corpus.csv")
)