In [1]:
import requests 
from bs4 import BeautifulSoup

In [2]:
url = "https://www.climatecouncil.org.au/resources/australia-key-role-to-play-clean-energy-reshapes-indo-pacific-relations/"
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
response = requests.get(url, headers=headers)

In [3]:
soup=BeautifulSoup(response.content, "html.parser")

In [4]:
print(soup.prettify())

<!DOCTYPE html>
<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en-GB"> <![endif]-->
<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8" lang="en-GB"> <![endif]-->
<!--[if IE 8]>         <html class="no-js lt-ie9" lang="en-GB"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en-GB">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=0" name="viewport"/>
  <link href="http://gmpg.org/xfn/11" rel="profile"/>
  <link href="https://www.climatecouncil.org.au/xmlrpc.php" rel="pingback"/>
  <link href="https://www.climatecouncil.org.au/wp-content/themes/climate-council/assets/dist/img/favicon.png" rel="shortcut icon" type="image/png"/>
  <link href="//cdn.jsdelivr.net/jquery.slick/1.6.0/slick.css" rel="stylesheet" type="text/css">
   <meta content="index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1" name="robots">
    <!-- Google Tag Mana

In [5]:
main_content = soup.find('article')
for element in main_content(['header', 'footer', 'aside', 'script']):
    element.decompose()

In [6]:
text_list = []
stop_text = "The time is right for Australia to accelerate clean energy and associated climate action and secure its economic and strategic advantage in the Indo-Pacific’s clean energy future."
for paragraph in main_content.find_all('p'):
    paragraph_text = paragraph.get_text(separator=' ', strip=False)

    if stop_text in paragraph_text:
        text_list.append(paragraph_text)
        break
    text_list.append(paragraph_text)

In [7]:
article_text = '\n'.join(text_list)

In [8]:
print(article_text)

With the incoming federal government declaring it will make Australia a “renewable superpower”, China dominating clean energy supply chains, and the Russian war in Ukraine disrupting the global energy market, now is a pivotal moment for Australia to shape the future of clean energy within the Indo-Pacific.
A  new report by the Perth USAsia Centre, in collaboration with the Climate Council , recommends five practical actions for Australia to secure its economic and strategic clean energy advantages in the Indo-Pacific :
Published ahead of a major international forum on energy supply chains – the  Sydney Energy Forum  (12-13 July) – the report,  Reenergising Indo-Pacific Relations: Australia’s Clean Energy Opportunity ,  explains that the Indo-Pacific sits at the heart of the global shift from fossil fuels to clean energy systems, the ramifications for Australia’s economic and strategic interests, and the leading role that our country could play.
Global warming is a significant security 

In [9]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [10]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [11]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/loogyee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/loogyee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/loogyee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
def preprocess_text(text):
    # Sentence Tokenization
    sentences = sent_tokenize(text)
    
    # Word Tokenization
    words = word_tokenize(text.lower())  # Lowercase all words
    
    # Removing Stop Words
    stop_words = set(stopwords.words('english'))
    words_filtered = [word for word in words if word.isalnum() and word not in stop_words]
    
    # Lemmatization and Stemming
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words_filtered]
    stemmed_words = [stemmer.stem(word) for word in words_filtered]
    
    # Regular Expression for Additional Cleaning (Removing Punctuation)
    words_cleaned = [re.sub(r'\W+', '', word) for word in lemmatized_words]
    
    return words_cleaned

In [13]:
processed_words = preprocess_text(article_text)

In [14]:
pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [15]:
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0.tar.gz (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


In [16]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [17]:
doc = nlp(article_text)
unique_entities = set()

for ent in doc.ents:
    unique_entities.add((ent.text, ent.label_))
for entity, label in unique_entities:
    print(f"{entity} - {label}")

Korea - GPE
China - GPE
12-13 July - DATE
the Indo-Pacific’s - ORG
five - CARDINAL
Indo-Pacific - ORG
Clean Energy Opportunity - ORG
the Indo-Pacific - ORG
the Climate Council - ORG
Climate Council Senior Researcher - ORG
Reenergising Indo-Pacific Relations - ORG
Russian - NORP
Ukraine - GPE
Japan - GPE
Sydney Energy Forum - ORG
Australian - NORP
Bowen - PERSON
Wesley Morgan - PERSON
Australia - GPE
the Perth USAsia Centre - ORG
James Bowen - PERSON


In [18]:
important_deps = ["nsubj", "dobj", "ROOT", "xcomp", "ccomp", "conj", "amod"]
print("\nDependency Parsing (Relationships):")
for token in doc:
    if token.dep_ in important_deps:
        print(f"{token.text} --> {token.dep_} --> {token.head.text}")


Dependency Parsing (Relationships):
incoming --> amod --> government
federal --> amod --> government
government --> nsubj --> declaring
it --> nsubj --> make
make --> ccomp --> declaring
Australia --> nsubj --> superpower
renewable --> amod --> superpower
superpower --> ccomp --> make
China --> nsubj --> is
clean --> amod --> chains
chains --> dobj --> dominating
Russian --> amod --> war
war --> conj --> China
global --> amod --> market
market --> dobj --> disrupting
is --> ROOT --> is
pivotal --> amod --> moment
Australia --> nsubj --> shape
future --> dobj --> shape
clean --> amod --> energy
new --> amod --> report
report --> nsubj --> recommends
recommends --> ccomp --> explains
practical --> amod --> actions
actions --> dobj --> recommends
Australia --> nsubj --> secure
economic --> amod --> clean
strategic --> conj --> economic
clean --> amod --> advantages
advantages --> dobj --> secure
major --> amod --> forum
international --> amod --> forum
explains --> ROOT --> explains
Paci

In [19]:
from spacy import displacy

In [20]:
displacy.render(doc, style='dep', jupyter=True, options={'compact': True})

In [55]:
sentence_spans = list(doc.sents)
displacy.render(sentence_spans, style="ent", jupyter=True)



In [56]:
relationship_keywords = ['collaboration', 'cooperation', 'agreement', 'partnership', 'support', 'investment', 'effort', 'help']

In [57]:
entity1_list = []
relationship_list = []
entity2_list = []
date_list = []

In [58]:
for sent in doc.sents:
    # Extract named entities (countries, organizations)
    entities = [ent for ent in sent.ents if ent.label_ in ['GPE', 'ORG']]
    
    # Look for verbs or specific relationship nouns in the sentence
    verbs = [token for token in sent if token.pos_ == "VERB"]
    keywords_in_sentence = [token.text for token in sent if token.lemma_ in relationship_keywords]
    
    # Extract dates
    dates = [ent.text for ent in sent.ents if ent.label_ == "DATE"]
    
    # If we find two entities and a relationship keyword, we assume a relationship
    if len(entities) >= 2 and keywords_in_sentence:
        entity1_list.append(entities[0].text)
        
        # Concatenate the relationship noun and verb (if available) into one string
        relationship_info = keywords_in_sentence[0]
        if verbs:
            relationship_info += " (verb: " + verbs[0].lemma_ + ")"
        
        relationship_list.append(relationship_info)  # store concatenated info
        entity2_list.append(entities[1].text)
        date_list.append(dates[0] if dates else "Unknown")

In [59]:
import pandas as pd

In [60]:
# DataFrame
df = pd.DataFrame({
    "Entity1": entity1_list,
    "Relationship": relationship_list,
    "Entity2": entity2_list,
    "Date": date_list
})

In [61]:
df

Unnamed: 0,Entity1,Relationship,Entity2,Date
0,the Perth USAsia Centre,collaboration (verb: recommend),the Climate Council,12-13 July
1,Sydney Energy Forum,cooperation (verb: present),Australia,Unknown


In [None]:
# Export to csv
df.to_csv("international_cooperation_renewable_energy_1.csv", index=False)