In [1]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import spacy
from spacy import displacy

In [2]:
url = "https://www.lowyinstitute.org/the-interpreter/powerhouse-clean-energy-transitions-indo-pacific#:~:text=Indo-Pacific"
response = requests.get(url)

In [3]:
soup=BeautifulSoup(response.content, "html.parser")

In [4]:
print(soup.prettify())

<!DOCTYPE html>
<html dir="ltr" lang="en" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# ">
 <head>
  <meta charset="utf-8"/>
  <script async="" src="https://www.googletagmanager.com/gtag/js?id=UA-3115931-1">
  </script>
  <script>
   window.dataLayer = window.dataLayer || [];function gtag(){dataLayer.push(arguments)};gtag("js", new Date());gtag("set", "developer_id.dMDhkMT", true);gtag("config", "UA-3115931-1", {"groups":"default","anonymize_ip":true,"page_placeholder":"PLACEHOLDER_page_path"});gtag("config", "G-2Q1HP4FKQV", {"groups":"default","page_placeholder":"PLACEHOLDER_page_location"});
  </script>
  <script src="/sites/default/files/hotjar/hotjar.script.js?sj

In [5]:
paragraphs = soup.find_all('p')

In [6]:
main_content = soup.find('div', {'class': 'article-content clearfix'})
for element in main_content(['a']):
    element.decompose()
    
text_list = []
for paragraph in main_content.find_all('p'):
    paragraph_text = paragraph.get_text(separator=' ', strip=True)
    text_list.append(paragraph_text)

In [7]:
df = pd.DataFrame(text_list, columns=["Paragraph Text"])
df.to_csv('04_paragraphs.csv', index=False)

In [8]:
df

Unnamed: 0,Paragraph Text
0,With a global energy crisis underway due to ri...
1,An energy transition is broadly defined as the...
2,"Over the past decades, the classical distincti..."
3,The ongoing transition away from fossil fuels ...
4,Instances of such shifts are already evident i...
5,"Meanwhile, key Indo-Pacific players such as Ja..."
6,"In countries such as , in an attempt to secure..."
7,With the continuous growth in solar and wind e...
8,"In 2010, China imposed an on rare earths to Ja..."
9,As China and Russia continue to weaponise thes...


In [9]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [10]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/loogyee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/loogyee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
nlp = spacy.load("en_core_web_sm")

In [12]:
df = pd.read_csv('04_paragraphs.csv')

In [13]:
# Preprocessing 
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase the entire text
    text = text.lower()
    
    # Tokenize and lemmatize
    lemmatized_words = []
    for word in text.split():  # Split the text into words
        lemmatized_words.append(lemmatizer.lemmatize(word))  # Lemmatize each word
    
    # Join lemmatized words back into a single string
    cleaned_text = ' '.join(lemmatized_words)
    
    return cleaned_text

In [14]:
df['Processed Text'] = df['Paragraph Text'].apply(preprocess_text)

In [15]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [16]:
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz (12.8 MB)
  Preparing metadata (setup.py) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


In [17]:
relationship_keywords = {'collaboration', 'cooperation', 'partner', 'engage', 
                         'support', 'fund', 'finance', 'invest', 'develop', 
                         'supply', 'promote', 'transition', 'export', 'agreement', 
                         'interest', 'share', 'enhance', 'effort', 'import'}

In [18]:
def extract_entities_and_relationships(text):
    doc = nlp(text)  # Process text with spaCy
    entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ['ORG', 'GPE']]

    # Find relationships based on keywords
    relationships = []
    for token in doc:
        if token.text in relationship_keywords:  # Check if the token matches any relationship keyword
            subject = [w for w in token.children if w.dep_ == "nsubj"]  # Find the subject
            object_ = [w for w in token.children if w.dep_ == "dobj"]  # Find the direct object
            if subject and object_:
                relationships.append((subject[0].text, token.text, object_[0].text))  # (subject, relationship keyword, object)

    return entities, relationships

In [19]:
df['Entities'], df['Relationships'] = zip(*df['Processed Text'].apply(extract_entities_and_relationships))

In [20]:
df[['Processed Text', 'Entities', 'Relationships']]

Unnamed: 0,Processed Text,Entities,Relationships
0,with a global energy crisis underway due to ri...,"[(russia, GPE)]",[]
1,an energy transition is broadly defined a the ...,[],[]
2,"over the past decades, the classical distincti...","[(the united state, GPE)]",[]
3,the ongoing transition away from fossil fuel w...,"[(hydrogen, GPE)]",[]
4,instance of such shift are already evident in ...,"[(saudi arabia, GPE), (japan, GPE), (the abu d...",[]
5,"meanwhile, key indo-pacific player such a japa...","[(japan, GPE), (australia, GPE), (asean), ORG)...",[]
6,"in country such a , in an attempt to secure en...","[(russia, GPE), (russia, GPE)]",[]
7,with the continuous growth in solar and wind e...,"[(china, GPE)]",[]
8,"in 2010, china imposed an on rare earth to jap...","[(china, GPE), (japan, GPE), (china, GPE)]",[]
9,a china and russia continue to weaponise these...,"[(china, GPE), (russia, GPE)]",[]


In [21]:
for index, row in df.iterrows():
    print(f"\nParagraph {index + 1}:")
    print(row['Paragraph Text'])

    # Display dependency parsing
    doc = nlp(row['Processed Text'])
    sentence_spans = list(doc.sents)
    displacy.render(sentence_spans, style='dep', jupyter=True, options={'compact': True})
    
    # Display named entities
    displacy.render(sentence_spans, style="ent", jupyter=True)

    # Print extracted entities and relationships
    print("Entities:", row['Entities'])


Paragraph 1:
With a global energy crisis underway due to rising , coupled with a of a harsh northern hemisphere winter and supply volatility brought on by the Russia–Ukraine war, all eyes are on the . The Indo-Pacific will be at the centre of this transition, with rapidly expanding Southeast Asian economies and burgeoning populations forming a large share of the exponentially rising global energy demand.


Entities: [('russia', 'GPE')]

Paragraph 2:
An energy transition is broadly defined as the pathway to transforming the energy mix towards low carbon, sustainable and renewable forms of energy. One of its key components is maintaining energy security. Traditionally, energy security has encapsulated the 4 As – availability, accessibility, affordability and acceptability – but scholars today also focus on defined aspects such as minimal vulnerability, enhanced resilience and equitable access to energy.




Entities: []

Paragraph 3:
Over the past decades, the classical distinction between importers and exporters of energy has blurred. A traditional importer such as the United States is today a crucial energy exporter after its shale gas . Further transformations in the energy landscape due to the global clean energy transition are expected to augment the energy self-sufficiency ratio of nations as their dependency on domestic renewable sources increases, leading to improved energy security. While this might be the case, the belief that such transitions could lead to reduced volatility of energy security is far-fetched.


Entities: [('the united state', 'GPE')]

Paragraph 4:
The ongoing transition away from fossil fuels will usher in a growing reliance on procuring supplies of hydrogen, ammonia, biofuels and other alternatives. Renewables will therefore alter the arenas of energy interaction, transform traditional energy markets and mark a shift in trading partners while reshaping patterns of conflict and cooperation between countries. Since the Indo-Pacific will be a major energy hub that houses critical and strategic energy trade routes, this reshaping will be most pronounced in the region.


Entities: [('hydrogen', 'GPE')]

Paragraph 5:
Instances of such shifts are already evident in the urgency to decarbonise and diversify energy sources. In 2020, traditional oil giant Saudi Arabia demonstrated the world’s first successful supply network by producing and shipping 40 tonnes of high-grade low-carbon fuel to Japan. Similarly, Japanese oil company successfully brought clean ammonia produced by the Abu Dhabi National Oil Company to Japan. Furthermore, Singapore was the major export destination of from India in the 2020–21 financial year.


Entities: [('saudi arabia', 'GPE'), ('japan', 'GPE'), ('the abu dhabi national oil company', 'ORG'), ('japan', 'GPE'), ('singapore', 'GPE'), ('india', 'GPE')]

Paragraph 6:
Meanwhile, key Indo-Pacific players such as Japan and Australia are fuelling their bilateral relationship through hydrogen, with the world’s first carrier ship completing its of the fuel from Victoria to Kobe in February this year. is also preparing to jump on the hydrogen exporter bandwagon. Among the Association of Southeast Asian Nations (ASEAN), Singapore imported hydropower-generated clean electricity from via Thailand and Malaysia through grid interconnections for the first time in June. Countries such as have also started exporting small quantities of hydrogen to Japan. Although new and emerging interdependent energy relations are forming within the Indo-Pacific, fault lines have also emerged, with recently announcing a ban on green energy export, putting a halt to the vision of an integrated ASEAN power grid

Entities: [('japan', 'GPE'), ('australia', 'GPE'), ('asean)', 'ORG'), ('singapore', 'GPE'), ('thailand', 'GPE'), ('malaysia', 'GPE'), ('japan', 'GPE'), ('asean power grid', 'ORG')]

Paragraph 7:
In countries such as , in an attempt to secure energy supplies while meeting sustainability goals, nuclear energy is again being debated as a potential addition to the energy mix. However,  a rise in the use of nuclear power would result in an increased reliance on uranium for many countries. It is interesting to note here that Russia is a key exporter of uranium to the and , and with the fuel not being subject to the ongoing Russian sanctions, it adds to the debate on reliance on Russia and energy security.


Entities: [('russia', 'GPE'), ('russia', 'GPE')]

Paragraph 8:
With the continuous growth in solar and wind energy across the region, the critical minerals essential to manufacturing these technologies will become increasingly important. Since these are concentrated in specific nations, particularly China, diversifying the sources and enabling stable supply becomes imperative to expanding renewable energies and ensuring energy security.


Entities: [('china', 'GPE')]

Paragraph 9:
In 2010, China imposed an on rare earths to Japan due to a territorial dispute. It is anticipated that the energy-intensive undertones of China’s flagship will impact the geopolitics around energy trade routes, especially in the Indo-Pacific. Such a scenario increases the need for regional countries to embark on a strategy to secure energy supply routes and affordable energy supplies. Recently, and agreed to boost cooperation on global supply chains, especially concerning rare earths.


Entities: [('china', 'GPE'), ('japan', 'GPE'), ('china', 'GPE')]

Paragraph 10:
As China and Russia continue to weaponise these supply capacities to exert geopolitical leverage to coerce other countries for their political purposes, it highlights the prominence of geopolitics of energy. The increasing risk of geopolitical turbulence, as seen recently with growing tensions in , makes it imperative to promote shared interests and principles of energy security in the region.


Entities: [('china', 'GPE'), ('russia', 'GPE')]

Paragraph 11:
The future of smooth energy transitions and the development of a peaceful and stable Indo-Pacific rely on a deeper understanding of evolving energy security and fostering a rules-based order safeguarding the same. As the Indo-Pacific comes under the spotlight with major players attempting to expand their influence, energy can act as a means to further relations and interests. Dialogues such as the are setting the stage. But is the Indo-Pacific prepared for the new contours of energy security?


Entities: []


In [22]:
entity1_list = []
relationship_list = []
entity2_list = []
date_list = []
money_list = []

In [23]:
# Iterate through each paragraph
for index, row in df.iterrows():
    doc = nlp(row['Paragraph Text'])  # Process the paragraph with spaCy

    for sent in doc.sents:
        # Extract named entities (countries, organizations)
        entities = []
        money_in_sentence = []

        for ent in sent.ents:
            if ent.label_ == 'Initiative' in ent.text:
                entities.append(ent._replace(label_='POL'))  # Change label for specific case
            elif ent.label_ in ['GPE', 'ORG', 'POL', 'MONEY']:
                entities.append(ent)

            # Collect monetary amounts
            if ent.label_ == "MONEY":
                money_in_sentence.append(ent.text)
        
        # Look for verbs or specific relationship nouns in the sentence
        verbs = [token for token in sent if token.pos_ == "VERB"]
        keywords_in_sentence = [token.text for token in sent if token.lemma_ in relationship_keywords]
        
        # Extract dates
        dates = [ent.text for ent in sent.ents if ent.label_ == "DATE"]

        # If we find two entities and a relationship keyword, we assume a relationship
        if len(entities) >= 2 and keywords_in_sentence:
            entity1_list.append(entities[0].text)

            # Concatenate the relationship noun and verb (if available) into one string
            relationship_info = keywords_in_sentence[0]
            if verbs:
                relationship_info += " (verb: " + verbs[0].lemma_ + ")"

            relationship_list.append(relationship_info)  # Store concatenated info
            entity2_list.append(entities[1].text)
            date_list.append(dates[0] if dates else "n/a")
            money_list.append(money_in_sentence[0] if money_in_sentence else "n/a")

In [24]:
# DataFrame
extracted_data = pd.DataFrame({
    "Entity1": entity1_list,
    "Relationship": relationship_list,
    "Entity2": entity2_list,
    "Date": date_list,
    "Money": money_list
})

In [25]:
extracted_data

Unnamed: 0,Entity1,Relationship,Entity2,Date,Money
0,Saudi Arabia,supply (verb: demonstrate),Japan,2020,
1,Singapore,export,India,the 2020–21 financial year,
2,the Association of Southeast Asian Nations,imported (verb: import),ASEAN,June,
3,the Indo-Pacific,export (verb: emerge),ASEAN,,
4,China,supply (verb: continue),Russia,,


In [26]:
# Export to csv
extracted_data.to_csv('04_relationships.csv', index=False)