In [72]:
import csv
import re

import bs4
import pandas as pd
import requests
import spacy
from spacy import displacy
from textacy import extract

nlp = spacy.load('en_core_web_sm')
import networkx as nx

from pypdf import PdfReader
import os
from spacy.matcher import Matcher
from spacy.tokens import Span

import urllib.request

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [83]:
# assign directory
directory = '../docs2'

parsed_text = ""

# iterate over files in
# that directory
# We will get a list of sentences that we will be processing
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(f):
        # creating a pdf file object
        pdfFileObj = open(f, 'rb')

        # creating a pdf reader object
        pdfReader = PdfReader(pdfFileObj)

        # printing number of pages in pdf file
        print(len(pdfReader.pages))

        # creating a page object
        for page in pdfReader.pages:
            # extracting text from page
            parsed_text = " ".join((parsed_text, str(page.extract_text())))

            # closing the pdf file object
        pdfFileObj.close()

sentences = [[i] for i in nlp(parsed_text).sents]

12


In [84]:
headers = ['sentence']
values = sentences
filename = 'doc_text.csv'
with open(filename, 'w',newline='',encoding="utf-8") as data:
    writer = csv.writer(data)
    writer.writerow(headers)
    writer.writerows(values)

In [104]:
sentences = pd.read_csv("doc_text_clean.csv")

In [87]:
def find_root_of_sentence(doc):
    root_token = None
    for token in doc:
        if token.dep_ == "ROOT":
            root_token = token
    return root_token

In [88]:
#In order to find verb phrases, we will need to compile regular expression-like patterns for the part of speech combinations of the words that make up the verb phrase. If we print out parts of speech of verb phrases of the two preceding sentences, are made of and have, we will see that the part of speech sequences are AUX, VERB, ADP, and AUX.

verb_patterns = [[{"POS": "AUX"}, {"POS": "VERB"},
                  {"POS": "ADP"}],
                 [{"POS": "AUX"}]]

In [89]:
# The contains_root function checks if a verb phrase contains the root of the sentence:

def contains_root(verb_phrase, root):
    vp_start = verb_phrase.start
    vp_end = verb_phrase.end
    if vp_start <= root.i <= vp_end:
        return True
    else:
        return False

In [90]:
# The get_verb_phrases function gets the verb phrases from a spaCy Doc object:

def get_verb_phrases(doc):
    root = find_root_of_sentence(doc)
    verb_phrases = extract.token_matches(doc, verb_patterns)
    new_vps = []
    for verb_phrase in verb_phrases:
        if contains_root(verb_phrase, root):
            new_vps.append(verb_phrase)
    return new_vps

In [91]:
# The find_noun_phrase function will look for noun phrases either on the left- or right-hand side of the main verb phrase:

def find_noun_phrase(verb_phrase, noun_phrases, side):
    for noun_phrase in noun_phrases:
        if side == "left" and noun_phrase.start < verb_phrase.start:
            return noun_phrase
        elif side == "right" and noun_phrase.start > verb_phrase.start:
            return noun_phrase

In [118]:
# The longer_verb_phrase function finds the longest verb phrase:

def longer_verb_phrase(verb_phrases):
    longest_length = 0
    longest_verb_phrase = None
    for verb_phrase in verb_phrases:
        if len(verb_phrase) > longest_length:
            longest_verb_phrase = verb_phrase
    return longest_verb_phrase

In [123]:
#The find_noun_phrase function will look for noun phrases either on the left- or right-hand side of the main verb phrase:

def find_triplet(in_sentence):
    doc = nlp(in_sentence)
    verb_phrases = get_verb_phrases(doc)
    noun_phrases = doc.noun_chunks
    verb_phrase = None
    if len(verb_phrases) > 1:
        verb_phrase = longer_verb_phrase(list(verb_phrases))
    elif len(verb_phrases) == 1:
        verb_phrase = verb_phrases[0]
    if verb_phrase:
        left_noun_phrase = find_noun_phrase(verb_phrase, noun_phrases, "left")
        right_noun_phrase = find_noun_phrase(verb_phrase, noun_phrases, "right")
        return (left_noun_phrase, verb_phrase,
                right_noun_phrase)
    return None, None, None

In [137]:
# We can now loop through our sentence list to find its relation triplets:

for sentence in sentences["sentence"]:
    (left_np, vp, right_np) = find_triplet(sentence)
    if left_np and vp and right_np:
        print(left_np, "\t", vp, "\t", right_np)

Widely dispersed fragmented populations 	 are 	 a challenge
The species 	 is 	 a very large terrestrial decapod
the
sizes 	 are 	 they
One major challenge 	 is 	 imperfect detection
Capture-mark-recapture(CMR) models 	 are 	 the gold standard
data 	 can 	 the challenges
the context 	 may range from 	 CMR data
Animal Conservation /C15/C15(2023
This 	 is 	 commercial and no modiﬁcations
Integrated models 	 can 	 such disparate data
Coconut crabs 	 are 	 the world ’s largest terrestrial arthropod
Coconut
crabs 	 are 	 natural predation
a result 	 been upgraded from 	 ‘Data De ﬁcient
Coconut crab populations 	 are 	 steep decline
Pemba 	 is 	 an oceanic island
The island 	 is 	 fertilesoil
Pemba 	 is dominated by 	 small-scale farming
Unguja 	 is 	 a Landbridge island
Survey periods 	 were 	 the early part
Sampling sites 	 were 	 identi ﬁed
each crab 	 was 	 a
bucket
lastingthe length 	 be 	 identi ﬁed
the Terms 	 are governed by 	 the applicable Creative Commons License
We 	 are 	 con ﬁde

In [138]:
# And this time excluding some verbs

for sentence in sentences["sentence"]:
    (left_np, vp, right_np) = find_triplet(sentence)
    if left_np and vp and right_np and not any(verb in vp.text for verb in ['is', 'are', 'was', 'be', 'were', 'have']):
        print(left_np, "\t", vp, "\t", right_np)

data 	 can 	 the challenges
the context 	 may range from 	 CMR data
Animal Conservation /C15/C15(2023
Integrated models 	 can 	 such disparate data
it 	 would 	 16 pre-
2018 surveys
An all-N-mixturemodel 	 would 	 the information
These numbers 	 might 	 population
Animal Conservation /C15/C15(2023
we 	 could 	 the number
the degree 	 could point to 	 protected subpopulations
Tourist lodges 	 may 	 local crab densities
the same mean parameters 	 may 	 ﬂate differences
thesefragmented populations 	 will 	 conser-
vation plans
