In [1]:
import requests 
from bs4 import BeautifulSoup

In [2]:
url = "https://www.nbr.org/publication/enhancing-clean-energy-cooperation-in-the-indo-pacific/"
response = requests.get(url)

In [3]:
soup=BeautifulSoup(response.content, "html.parser")

In [4]:
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js" lang="en">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <title>
   Enhancing Clean Energy Cooperation in the Indo-Pacific | The National Bureau of Asian Research (NBR)
  </title>
  <link href="https://www.nbr.org/xmlrpc.php" rel="pingback"/>
  <link href="https://www.nbr.org/wp-content/themes/nbr-theme/build/css/main.css" rel="stylesheet" type="text/css"/>
  <link href="https://www.nbr.org/wp-content/themes/nbr-theme/style.css" media="screen" rel="stylesheet" type="text/css"/>
  <script crossorigin="anonymous" defer="" integrity="sha384-3yBLeJ4waqGSAf4A8pjZ13UF7GuhgbdKnBQvIp/TkWoXtQbtwjlIPNjkDRJ46UCn" src="https://pro.fontawesome.com/releases/v5.5.0/js/all.js">
  </script>
  <meta content="max-image-preview:large" name="robots"/>
  <link href="//code.jquery.com" rel="dns-prefetch"/>
  <

In [5]:
main_content = soup.find('main')
for element in main_content(['a', 'em']):
    element.decompose()

In [6]:
text_list = []
stop_text = "James Bowen is a Policy Fellow at the Perth USAsia Centre."
for paragraph in main_content.find_all('p'):
    paragraph_text = paragraph.get_text(separator=' ', strip=False)

    if stop_text in paragraph_text:
        text_list.append(paragraph_text)
        break
    text_list.append(paragraph_text)

In [7]:
article_text = '\n'.join(text_list)

In [8]:
print(article_text)

James Bowen argues that clean energy cooperation would be a win for both the climate and stability of the long-standing Indo-Pacific order and urges the United States and other advanced regional economies to revive the spirit of common cause that followed past energy crises.
Ensuring a rapid global transition to clean energy systems is the overriding priority of international climate action. Cross-border cooperation in this space is critical, yet it has proved exceedingly difficult at the all-inclusive UN-led level. Smaller avenues of parallel activity could ultimately deliver more meaningful progress. Collaborative efforts that simultaneously allow space for the advancement of national economic and strategic positions are a prominent feature of current Indo-Pacific relations. They merit sustained commitment from the United States and its regional allies and partners, particularly at a time of upheaval in global energy markets.
The Intergovernmental Panel on Climate Change’s April 2022

In [9]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [10]:
import nltk
import re
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [11]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/loogyee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
def preprocess_text(text):
    # Lowercase the entire text
    text = text.lower()
    
    # Initialize the lemmatizer and stemmer
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    
    # Lemmatization and Stemming
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    stemmed_words = [stemmer.stem(word) for word in words]
    
    processed_text = ' '.join(lemmatized_words)

    return processed_text

In [13]:
cleaned_text = preprocess_text(article_text)

In [14]:
print(cleaned_text)

james bowen argues that clean energy cooperation would be a win for both the climate and stability of the long-standing indo-pacific order and urge the united state and other advanced regional economy to revive the spirit of common cause that followed past energy crises. ensuring a rapid global transition to clean energy system is the overriding priority of international climate action. cross-border cooperation in this space is critical, yet it ha proved exceedingly difficult at the all-inclusive un-led level. smaller avenue of parallel activity could ultimately deliver more meaningful progress. collaborative effort that simultaneously allow space for the advancement of national economic and strategic position are a prominent feature of current indo-pacific relations. they merit sustained commitment from the united state and it regional ally and partners, particularly at a time of upheaval in global energy markets. the intergovernmental panel on climate change’s april 2022 report noted

In [15]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [16]:
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


In [17]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [18]:
doc = nlp(cleaned_text)
unique_entities = set()

for ent in doc.ents:
    unique_entities.add((ent.text, ent.label_))
for entity, label in unique_entities:
    print(f"{entity} - {label}")

u.s. - GPE
japan - GPE
asia - LOC
one - CARDINAL
december 2021 - DATE
new delhi - GPE
chinese - NORP
southeast asia - LOC
singapore - GPE
gina raimondo - PERSON
james bowen - PERSON
jakarta - GPE
2021 - DATE
national clean energy transition require mass deployment - ORG
seoul - ORG
5,000-kilometer-long - QUANTITY
russia - GPE
korean - NORP
australia - GPE
april 2022 - DATE
seoul - GPE
the perth usasia centre - ORG
july - DATE
1973 - DATE
more than a decade - DATE
recent years - DATE
south korea - GPE
russian - NORP
march 2022 - DATE
the united states - GPE
first - ORDINAL
commerce - ORG
late 2021 - DATE
2022 - DATE
japanese - NORP
non-russian - NORP
the united state - ORG
three - CARDINAL
china - GPE
$500 million - MONEY
second - ORDINAL
northeast asian - NORP
european - NORP
a recent $9 billion - MONEY
coming years - DATE
fourth - ORDINAL
un - ORG
indonesian - NORP
tokyo - GPE
beijing - GPE
washington - GPE
the japan bank - ORG
four - CARDINAL
joe biden - PERSON
third - ORDINAL
india 

In [19]:
important_deps = ["nsubj", "dobj", "ROOT", "xcomp", "ccomp", "conj", "amod"]
print("\nDependency Parsing (Relationships):")
for token in doc:
    if token.dep_ in important_deps:
        print(f"{token.text} --> {token.dep_} --> {token.head.text}")


Dependency Parsing (Relationships):
bowen --> nsubj --> argues
argues --> ROOT --> argues
clean --> amod --> cooperation
cooperation --> nsubj --> be
be --> ccomp --> argues
stability --> conj --> climate
standing --> amod --> order
pacific --> amod --> order
urge --> conj --> be
state --> dobj --> urge
other --> amod --> economy
advanced --> amod --> economy
regional --> amod --> economy
economy --> conj --> state
revive --> xcomp --> urge
spirit --> dobj --> revive
common --> amod --> cause
that --> nsubj --> followed
past --> amod --> crises
crises --> dobj --> followed
rapid --> amod --> transition
global --> amod --> transition
transition --> dobj --> ensuring
system --> dobj --> clean
is --> ROOT --> is
overriding --> amod --> priority
international --> amod --> action
cross --> amod --> cooperation
- --> amod --> cooperation
border --> amod --> cooperation
cooperation --> nsubj --> is
is --> ROOT --> is
it --> nsubj --> proved
proved --> conj --> is
inclusive --> amod --> level

In [20]:
from spacy import displacy

In [21]:
sentence_spans = list(doc.sents)
displacy.render(sentence_spans, style='dep', jupyter=True, options={'compact': True})

In [22]:
sentence_spans = list(doc.sents)
displacy.render(sentence_spans, style="ent", jupyter=True)



In [23]:
relationship_keywords = {'collaboration', 'cooperation', 'partner', 'engage', 
                         'support', 'fund', 'finance', 'invest', 'develop', 
                         'supply', 'promote', 'transition', 'export', 'agreement', 'interest'
                         'share', 'enhance'}

In [24]:
entity1_list = []
relationship_list = []
entity2_list = []
date_list = []
money_list = []

In [25]:
for sent in doc.sents:
    # Extract named entities (countries, organizations)
    entities = []
    money_in_sentence = []
    for ent in sent.ents:
        if ent.label_ == 'FAC' and 'Belt and Road' in ent.text:
            entities.append(ent._replace(label_='POL'))
        elif ent.label_ in ['GPE', 'ORG', 'POL', 'MONEY']:
            entities.append(ent)
    
        # Collect monetary amounts
        if ent.label_ == "MONEY":
            money_in_sentence.append(ent.text)
            
    # Look for verbs or specific relationship nouns in the sentence
    verbs = [token for token in sent if token.pos_ == "VERB"]
    keywords_in_sentence = [token.text for token in sent if token.lemma_ in relationship_keywords]
    
    # Extract dates
    dates = [ent.text for ent in sent.ents if ent.label_ == "DATE"]
    
    
    # If we find two entities and a relationship keyword, we assume a relationship
    if len(entities) >= 2 and keywords_in_sentence:
        entity1_list.append(entities[0].text)
        
        # Concatenate the relationship noun and verb (if available) into one string
        relationship_info = keywords_in_sentence[0]
        if verbs:
            relationship_info += " (verb: " + verbs[0].lemma_ + ")"
        
        relationship_list.append(relationship_info)  # store concatenated info
        entity2_list.append(entities[1].text)
        date_list.append(dates[0] if dates else "n/a")
        money_list.append(money_in_sentence[0] if money_in_sentence else "n/a")

In [26]:
import pandas as pd

In [27]:
# DataFrame
df = pd.DataFrame({
    "Entity1": entity1_list,
    "Relationship": relationship_list,
    "Entity2": entity2_list,
    "Date": date_list,
    "Money": money_list
})

In [28]:
df

Unnamed: 0,Entity1,Relationship,Entity2,Date,Money
0,tokyo,transition (verb: mirror),japan,,
1,$500 million,finance (verb: announce),u.s.,december 2021,$500 million
2,u.s.,export (verb: commit),commerce,march 2022,
3,the united states,developing (verb: coordinate),japan,,


In [29]:
# Export to csv
df.to_csv("international_cooperation_renewable_energy_2.csv", index=False)