In [1]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
import spacy
from spacy import displacy

In [2]:
# Source article to scrape. A browser-like User-Agent is sent with the request.
url = "https://www.climatecouncil.org.au/resources/australia-key-role-to-play-clean-energy-reshapes-indo-pacific-relations/"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were the article.
response.raise_for_status()

In [3]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(response.content, features="html.parser")

In [4]:
# Preview only the start of the document: printing the whole page floods the
# notebook output. Raise the character limit if more context is needed.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en-GB"> <![endif]-->
<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8" lang="en-GB"> <![endif]-->
<!--[if IE 8]>         <html class="no-js lt-ie9" lang="en-GB"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en-GB">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=0" name="viewport"/>
  <link href="http://gmpg.org/xfn/11" rel="profile"/>
  <link href="https://www.climatecouncil.org.au/xmlrpc.php" rel="pingback"/>
  <link href="https://www.climatecouncil.org.au/wp-content/themes/climate-council/assets/dist/img/favicon.png" rel="shortcut icon" type="image/png"/>
  <link href="//cdn.jsdelivr.net/jquery.slick/1.6.0/slick.css" rel="stylesheet" type="text/css">
   <meta content="index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1" name="robots">
    <!-- Google Tag Mana

In [5]:
# Narrow the document down to the <article> element.
main_content = soup.find('article')
if main_content is None:
    # find() returns None when no match exists; stop here with a clear message
    # rather than failing later with an opaque NoneType error.
    raise ValueError("No <article> element found in the downloaded page")

# Drop header/footer/aside/script elements nested inside the article so they
# do not contribute to the extracted text.
for element in main_content.find_all(['header', 'footer', 'aside', 'script']):
    element.decompose()

In [6]:
# Collect the text of each <p> in the article, in document order, up to and
# including the paragraph that contains the closing sentence below.
text_list = []
stop_text = "The time is right for Australia to accelerate clean energy and associated climate action and secure its economic and strategic advantage in the Indo-Pacific’s clean energy future."
for paragraph in main_content.find_all('p'):
    paragraph_text = paragraph.get_text(separator=' ', strip=False)
    # Every visited paragraph is kept, including the one holding the stop sentence.
    text_list.append(paragraph_text)
    if stop_text in paragraph_text:
        break

In [7]:
# One row per scraped paragraph, saved to disk without the index column.
df = pd.DataFrame({"Paragraph Text": text_list})
df.to_csv('01_paragraphs.csv', index=False)

In [8]:
# Display the scraped paragraphs (one row per paragraph) as a sanity check.
df

Unnamed: 0,Paragraph Text
0,With the incoming federal government declaring...
1,"A new report by the Perth USAsia Centre, in c..."
2,Published ahead of a major international forum...
3,Global warming is a significant security threa...
4,“The necessary transition from fossil fuel to ...
5,China’s current dominance in global clean ener...
6,Australia’s role in the regional energy landsc...
7,“Australia has emerging clean energy strengths...
8,Climate Council Senior Researcher Dr Wesley Mo...
9,“There can be no doubt that Australia’s econom...


In [9]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [10]:
# Fetch the NLTK data packages used by this notebook.
# NOTE(review): the visible cells only use WordNetLemmatizer and plain
# str.split(); 'punkt' may be unnecessary here, confirm before removing.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/loogyee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/loogyee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# Load the small English spaCy pipeline used for NER and dependency parsing.
# NOTE: the cells that install spaCy and this model appear further down the
# notebook; on a fresh environment they must be run before this cell.
nlp = spacy.load("en_core_web_sm")

In [12]:
# Reload the paragraphs saved earlier so the NLP steps start from the file on disk.
df = pd.read_csv(filepath_or_buffer='01_paragraphs.csv')

In [13]:
# Preprocessing
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Lowercase ``text`` and lemmatize each whitespace-separated word.

    Parameters
    ----------
    text : str
        Raw paragraph text. Non-string values are treated as empty text; this
        covers the NaN that ``pd.read_csv`` yields for an empty paragraph.

    Returns
    -------
    str
        The lemmatized words joined by single spaces.

    Notes
    -----
    Words are produced by ``str.split()``, so punctuation stays attached to
    neighbouring words. ``lemmatize`` is called without a part-of-speech
    argument, and the notebook output shows the side effect: "has" becomes
    "ha" and "strengths" becomes "strength".
    """
    # Empty paragraphs come back from CSV as NaN (a float), which has no
    # .lower(); return an empty string instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    # Lowercase, split on whitespace, lemmatize each word, and re-join.
    return ' '.join(lemmatizer.lemmatize(word) for word in text.lower().split())

In [14]:
# Add a lowercased, lemmatized version of every paragraph as a new column.
df['Processed Text'] = df['Paragraph Text'].map(preprocess_text)

In [15]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [16]:
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


In [17]:
# Base-form words that signal a relationship between two entities.
# Stored as a set for membership tests; listed alphabetically.
relationship_keywords = {
    'agreement',
    'collaboration',
    'cooperation',
    'develop',
    'effort',
    'engage',
    'enhance',
    'export',
    'finance',
    'fund',
    'import',
    'interest',
    'invest',
    'partner',
    'promote',
    'share',
    'supply',
    'support',
    'transition',
}

In [18]:
def extract_entities_and_relationships(text):
    """Extract ORG/GPE entities and keyword-based relationships from ``text``.

    Parameters
    ----------
    text : str
        Text to run through the module-level spaCy pipeline ``nlp``.

    Returns
    -------
    tuple[list, list]
        ``(entities, relationships)`` where ``entities`` is a list of
        ``(entity_text, label)`` pairs restricted to the ORG and GPE labels,
        and ``relationships`` is a list of ``(subject, keyword, object)``
        text triples. A triple is emitted only for a keyword token that has
        both an ``nsubj`` child and a ``dobj`` child in the dependency parse.
    """
    doc = nlp(text)  # Process text with spaCy

    # Keep only organisation and geopolitical entities.
    entities = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in ['ORG', 'GPE']]

    relationships = []
    for token in doc:
        # Match on the lemma, not the surface form: the keyword set holds base
        # forms, so inflected tokens ("supports", "investing") would otherwise
        # never match. This mirrors the lemma-based check used later in the
        # notebook.
        if token.lemma_.lower() not in relationship_keywords:
            continue

        subject = [w for w in token.children if w.dep_ == "nsubj"]  # Grammatical subject(s)
        object_ = [w for w in token.children if w.dep_ == "dobj"]  # Direct object(s)
        if subject and object_:
            # Report the first subject and first direct object only.
            relationships.append((subject[0].text, token.text, object_[0].text))

    return entities, relationships

In [19]:
# Run the extractor once per paragraph, then split the resulting
# (entities, relationships) pairs into two columns. Building the columns with
# comprehensions also works for an empty frame, where unpacking zip(*...)
# would raise ValueError.
# NOTE(review): the extractor is fed the lowercased 'Processed Text'; the
# later pass over the raw 'Paragraph Text' finds entities this one misses
# (e.g. "the Climate Council"), so consider whether raw text is the better input.
extraction_results = df['Processed Text'].apply(extract_entities_and_relationships)
df['Entities'] = [entities for entities, _ in extraction_results]
df['Relationships'] = [relationships for _, relationships in extraction_results]

In [20]:
# Show the processed text alongside what was extracted from it.
df.loc[:, ['Processed Text', 'Entities', 'Relationships']]

Unnamed: 0,Processed Text,Entities,Relationships
0,with the incoming federal government declaring...,"[(china, GPE), (ukraine, GPE), (australia, GPE)]",[]
1,"a new report by the perth usasia centre, in co...","[(the perth usasia centre, ORG)]",[]
2,published ahead of a major international forum...,"[(australia, GPE), (australia, GPE)]",[]
3,global warming is a significant security threa...,"[(china, GPE)]",[]
4,“the necessary transition from fossil fuel to ...,[],[]
5,china’s current dominance in global clean ener...,"[(china, GPE), (australia, GPE)]",[]
6,australia’s role in the regional energy landsc...,"[(australia, GPE), (australia, GPE), (japan, G...",[]
7,“australia ha emerging clean energy strength i...,"[(australia, GPE)]",[]
8,climate council senior researcher dr wesley mo...,"[(climate council, ORG)]",[]
9,“there can be no doubt that australia’s econom...,"[(australia, GPE)]",[]


In [21]:
# For every paragraph: print the raw text, render the dependency parse and the
# entity highlights, then print what the extractor stored for that row.
for index, row in df.iterrows():
    print(f"\nParagraph {index + 1}:")
    print(row['Paragraph Text'])

    # Parse the processed text once and reuse the sentences for both renderings.
    doc = nlp(row['Processed Text'])
    sentence_spans = list(doc.sents)

    # Display dependency parsing
    displacy.render(sentence_spans, style='dep', jupyter=True, options={'compact': True})

    # Display named entities
    displacy.render(sentence_spans, style="ent", jupyter=True)

    # Print extracted entities and relationships. The relationships were
    # previously promised by this comment but never printed.
    print("Entities:", row['Entities'])
    print("Relationships:", row['Relationships'])


Paragraph 1:
With the incoming federal government declaring it will make Australia a “renewable superpower”, China dominating clean energy supply chains, and the Russian war in Ukraine disrupting the global energy market, now is a pivotal moment for Australia to shape the future of clean energy within the Indo-Pacific.


Entities: [('china', 'GPE'), ('ukraine', 'GPE'), ('australia', 'GPE')]

Paragraph 2:
A  new report by the Perth USAsia Centre, in collaboration with the Climate Council , recommends five practical actions for Australia to secure its economic and strategic clean energy advantages in the Indo-Pacific :


Entities: [('the perth usasia centre', 'ORG')]

Paragraph 3:
Published ahead of a major international forum on energy supply chains – the  Sydney Energy Forum  (12-13 July) – the report,  Reenergising Indo-Pacific Relations: Australia’s Clean Energy Opportunity ,  explains that the Indo-Pacific sits at the heart of the global shift from fossil fuels to clean energy systems, the ramifications for Australia’s economic and strategic interests, and the leading role that our country could play.


Entities: [('australia', 'GPE'), ('australia', 'GPE')]

Paragraph 4:
Global warming is a significant security threat for all countries, but those within the Indo-Pacific region are among  the most vulnerable. More affordable and available renewables can improve energy security and erode troublesome interdependencies in the region. However, significant obstacles must be overcome, including China’s dominance in the global energy sector.    




Entities: [('china', 'GPE')]

Paragraph 5:
“The necessary transition from fossil fuel to clean energy systems is reordering Indo-Pacific relations. Australian government and industry should work to ensure emerging supply chains and interdependencies advance our national interest,”  said report author James Bowen.


Entities: []

Paragraph 6:
China’s current dominance in global clean energy sectors has created vulnerabilities for both the energy transition, and broader system of Indo-Pacific relations. Diversifying supply chains and relationships is pivotal to the clean energy transition, and countries such as Australia have untapped potential to improve supply chain resilience and create a fairer and well-governed Indo-Pacific transition.


Entities: [('china', 'GPE'), ('australia', 'GPE')]

Paragraph 7:
Australia’s role in the regional energy landscape is currently as a major supplier of coal and gas to economies in the Indo-Pacific. However, the shift toward net-zero emissions has profoundly changed Australia’s economic prospects. Economies such as Japan and Korea will continue to want Australian exports, but they now expect clean energy alternatives.


Entities: [('australia', 'GPE'), ('australia', 'GPE'), ('japan', 'GPE'), ('korea', 'GPE')]

Paragraph 8:
“Australia has emerging clean energy strengths in the critical minerals, technology, resources and industrial goods sectors. Enhancing cooperation with partners and allies could unlock the country’s vast economic and strategic potential in a decarbonising Indo-Pacific,”  continued Bowen.


Entities: [('australia', 'GPE')]

Paragraph 9:
Climate Council Senior Researcher Dr Wesley Morgan added:  “The growing economic advantages of renewable energy are driving an unprecedented global energy transition and the Indo-Pacific is at the heart of it. The upcoming Sydney Energy Forum presents a huge opportunity for Australia to shape the future of clean energy cooperation in the Indo-Pacific. 


Entities: [('climate council', 'ORG')]

Paragraph 10:
“There can be no doubt that Australia’s economic and strategic interests are now tied to leading a rapid clean energy transition.”


Entities: [('australia', 'GPE')]

Paragraph 11:
With both its natural resources and geopolitical alignments, Australia has the potential to become a clean energy superpower. The time is right for Australia to accelerate clean energy and associated climate action and secure its economic and strategic advantage in the Indo-Pacific’s clean energy future.


Entities: [('australia', 'GPE')]


In [22]:
# Parallel accumulators: position i across the five lists describes one
# extracted relationship. Each name gets its own independent empty list.
entity1_list, relationship_list, entity2_list, date_list, money_list = (
    [] for _ in range(5)
)

In [23]:
# Iterate through each paragraph and record one relationship row per sentence
# that contains at least two accepted entities and a relationship keyword.
# NOTE: rows are appended to the lists created in the previous cell, so
# re-running this cell without resetting those lists duplicates rows.
for index, row in df.iterrows():
    paragraph_text = row['Paragraph Text']
    # Empty paragraphs are read back from CSV as NaN; spaCy needs a string.
    if not isinstance(paragraph_text, str):
        continue

    doc = nlp(paragraph_text)  # Process the raw paragraph with spaCy

    for sent in doc.sents:
        # Extract named entities (countries, organizations)
        entities = []
        money_in_sentence = []

        for ent in sent.ents:
            if 'Initiative' in ent.text:
                # Relabel named initiatives as 'POL'. The original condition
                # `ent.label_ == 'Initiative' in ent.text` was a chained
                # comparison that could never be true, and Span has no
                # `_replace`, so a relabelled copy of the span is built instead.
                entities.append(spacy.tokens.Span(doc, ent.start, ent.end, label='POL'))
            elif ent.label_ in ['GPE', 'ORG', 'POL', 'MONEY']:
                # NOTE(review): MONEY spans can end up as Entity1/Entity2
                # because they are in this list; confirm that is intended.
                entities.append(ent)

            # Collect monetary amounts
            if ent.label_ == "MONEY":
                money_in_sentence.append(ent.text)

        # Look for verbs or specific relationship nouns in the sentence
        verbs = [token for token in sent if token.pos_ == "VERB"]
        keywords_in_sentence = [token.text for token in sent if token.lemma_ in relationship_keywords]

        # Extract dates
        dates = [ent.text for ent in sent.ents if ent.label_ == "DATE"]

        # If we find two entities and a relationship keyword, we assume a relationship
        if len(entities) >= 2 and keywords_in_sentence:
            # Only the first two entities of the sentence are paired.
            entity1_list.append(entities[0].text)

            # Concatenate the relationship keyword and the first verb (if any) into one string
            relationship_info = keywords_in_sentence[0]
            if verbs:
                relationship_info += " (verb: " + verbs[0].lemma_ + ")"

            relationship_list.append(relationship_info)  # Store concatenated info
            entity2_list.append(entities[1].text)
            # First date / amount in the sentence, or the "n/a" placeholder.
            date_list.append(dates[0] if dates else "n/a")
            money_list.append(money_in_sentence[0] if money_in_sentence else "n/a")

In [24]:
# Assemble the parallel lists into one table, one row per extracted relationship.
extracted_data = pd.DataFrame(dict(zip(
    ["Entity1", "Relationship", "Entity2", "Date", "Money"],
    [entity1_list, relationship_list, entity2_list, date_list, money_list],
)))

In [25]:
# Display the extracted relationship table for inspection.
extracted_data

Unnamed: 0,Entity1,Relationship,Entity2,Date,Money
0,Australia,supply (verb: declare),China,,
1,the Perth USAsia Centre,collaboration (verb: recommend),the Climate Council,,
2,Sydney Energy Forum,supply (verb: publish),Reenergising Indo-Pacific Relations,12-13 July,
3,China,transition (verb: create),Indo-Pacific,,
4,Australia,supply (verb: diversify),Indo-Pacific,,
5,Japan,exports (verb: continue),Korea,,
6,Climate Council Senior Researcher,transition (verb: add),the Indo-Pacific,,
7,Sydney Energy Forum,cooperation (verb: present),Australia,,


In [26]:
# Save the relationship table to disk without the index column.
extracted_data.to_csv(path_or_buf='01_relationships.csv', index=False)