# **Step 1: Installing libraries for Web Scraping.**

In [1]:
pip install langchain

Collecting langchain
  Downloading langchain-0.3.3-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.10 (from langchain)
  Downloading langchain_core-0.3.10-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.134-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.4.0,>=0.3.10->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting httpx<1,>=0.23.0 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading orjson-3.10.7-cp310-cp310

In [2]:
pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.2-py3-none-any.whl.metadata (2.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.5.2-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.22.0-py3-none-any.whl.metadata (7.2 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloa

# **Step 2: Web Scraping using "WebBaseLoader" (Data related to Public Opinion and Stakeholder info).**

In [3]:
# Loading libraries.
from langchain_community.document_loaders import WebBaseLoader

# URL lists categorized by region (Australia, France, Singapore) and by topic
# (public opinion vs. stakeholder information). Each list is later passed to
# save_web_content together with a label matching the variable name.
# Australia - public opinion sources.
aus_public_opinion = [
    "https://world-nuclear-news.org/Articles/National-poll-shows-shift-in-Australian-nuclear-op",
    "https://poll.lowyinstitute.org/charts/nuclear-power-in-australia/",
    "https://poll.lowyinstitute.org/charts/australia-using-nuclear-power-to-generate-energy/",
    "https://australiainstitute.org.au/report/polling-willingness-to-pay-for-nuclear/",
    "https://www.tenmenelectrical.com/solar-power-is-australians-most-preferred-energy-source/",
    "https://www.theguardian.com/environment/2014/dec/08/solar-wind-energy-sources-huge-majority-australians-poll-shows"
]
# Australia - stakeholder information sources.
aus_stakeholder_info = [
    "https://www.helixos.co/post/stakeholder-engagement-on-nuclear-energy-in-australia",
    "https://createdigital.org.au/30-billion-australia-asia-powerlink-project/",
    "https://www.corrs.com.au/insights/legal-and-social-licence-considerations-for-nuclear-energy-in-australia"
]
# France - public opinion sources.
france_public_opinion = [
    "https://www.euractiv.com/section/electricity/news/most-french-want-government-to-speed-up-renewable-nuclear-development/",
    "https://www.solarplaza.com/resource/12197/way-forward-solar-france/",
    "https://www.orano.group/en/unpacking-nuclear/according-to-a-bva-study-for-orano-most-french-people-think-nuclear-energy-is-an-asset-for-france-s-energy-independence"
]
# France - stakeholder information sources.
france_stakeholder_info = [
    "https://www.agenda-2030.fr/en/agenda-2030/france/article/mobilized-stakeholders"
]
# Singapore - public opinion sources.
singapore_public_opinion = [
    "https://www.weforum.org/agenda/2021/04/singapore-solar-floatting-farms-environment-energy-cities/",
    "https://sunproenergies.com/why-solar-is-ideal-for-singapore/",
    "https://www.nuclearbusiness-platform.com/media/insights/insights/is-nuclear-power-singapores-best-bet-for-energy-independence"
]
# Singapore - stakeholder information sources.
singapore_stakeholder_info = [
    "https://www.businesstimes.com.sg/singapore/economy-policy/vital-singapore-build-knowledge-nuclear-power-safety-dpm-wong",
    "https://www.edb.gov.sg/en/our-industries/sustainability/renewable-energy.html"
]

# Function to scrape web data, load and save it as a text file format.
def save_web_content(url_list, category_name):
    """Scrape each URL with WebBaseLoader and save its text to a numbered file.

    For the URL at 1-based position N, the page content of the first loaded
    document is written (UTF-8) to ``{category_name}_{N}.txt`` in the current
    working directory. If the loader returns no documents, the placeholder
    text "No content loaded" is written instead. A confirmation line is
    printed after each file is saved.

    Args:
        url_list: Iterable of URL strings to scrape, in output order.
        category_name: Prefix used for the generated file names.
    """
    for position, page_url in enumerate(url_list, start=1):
        documents = WebBaseLoader(page_url).load()

        # Only the first document is kept; an empty result yields a placeholder.
        if documents:
            text = documents[0].page_content
        else:
            text = "No content loaded"

        # Files are numbered from 1 in the order the URLs were given.
        target = f"{category_name}_{position}.txt"
        with open(target, "w", encoding="utf-8") as handle:
            handle.write(text)
        print(f"Saved content to {target}")

# Scraping every category in turn; each label doubles as the file-name prefix
# of the text files written by save_web_content.
_scrape_jobs = (
    ("aus_public_opinion", aus_public_opinion),
    ("aus_stakeholder_info", aus_stakeholder_info),
    ("france_public_opinion", france_public_opinion),
    ("france_stakeholder_info", france_stakeholder_info),
    ("singapore_public_opinion", singapore_public_opinion),
    ("singapore_stakeholder_info", singapore_stakeholder_info),
)
for _label, _urls in _scrape_jobs:
    save_web_content(_urls, _label)




Saved content to aus_public_opinion_1.txt
Saved content to aus_public_opinion_2.txt
Saved content to aus_public_opinion_3.txt
Saved content to aus_public_opinion_4.txt
Saved content to aus_public_opinion_5.txt
Saved content to aus_public_opinion_6.txt
Saved content to aus_stakeholder_info_1.txt
Saved content to aus_stakeholder_info_2.txt
Saved content to aus_stakeholder_info_3.txt
Saved content to france_public_opinion_1.txt
Saved content to france_public_opinion_2.txt
Saved content to france_public_opinion_3.txt
Saved content to france_stakeholder_info_1.txt
Saved content to singapore_public_opinion_1.txt
Saved content to singapore_public_opinion_2.txt
Saved content to singapore_public_opinion_3.txt
Saved content to singapore_stakeholder_info_1.txt
Saved content to singapore_stakeholder_info_2.txt


# **Step 3: Converting downloaded PDF files to Text Files using "pdfplumber" (Data related to Policies).**

In [4]:
# Installing necessary libraries.
pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [9]:
# Converting PDF to Text files:
import pdfplumber

# Function to convert PDF file to Text file.
def pdf_to_text(pdf_file, output_txt_file):
    """Extract the text of every page of a PDF into a UTF-8 text file.

    Pages are processed in order and numbered from 1. For a page whose
    ``extract_text()`` result is non-empty, a ``Page N`` header line, the
    extracted text and a blank separator line are written. For any other
    page a single ``Page N has no text.`` line is written. A confirmation
    message is printed once the file has been written.

    Args:
        pdf_file: PDF to read; passed unchanged to ``pdfplumber.open``.
        output_txt_file: Text file to create (or overwrite).

    Raises:
        ValueError: If ``pdf_file`` and ``output_txt_file`` are paths that
            resolve to the same location. Opening the output in write mode
            would truncate the source PDF.
    """
    import os  # Local import so the function does not rely on another cell.

    # Guard against destroying the input: open(..., "w") truncates its target,
    # so writing "over" the PDF would wipe it. Only path-like arguments are
    # compared; other argument kinds are passed through untouched.
    path_types = (str, os.PathLike)
    if isinstance(pdf_file, path_types) and isinstance(output_txt_file, path_types):
        source_path = os.path.normcase(os.path.abspath(pdf_file))
        target_path = os.path.normcase(os.path.abspath(output_txt_file))
        if source_path == target_path:
            raise ValueError(
                f"Output file must differ from the input PDF: {output_txt_file}"
            )

    # Open the PDF first, then the output file, and close both on exit.
    with pdfplumber.open(pdf_file) as pdf, \
            open(output_txt_file, "w", encoding="utf-8") as txt_file:
        for page_num, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()

            if text:
                # Header, page text, then a blank line between pages.
                txt_file.write(f"Page {page_num}\n")
                txt_file.write(text)
                txt_file.write("\n\n")
            else:
                # Record the page even when nothing could be extracted.
                txt_file.write(f"Page {page_num} has no text.\n")

    print(f"Text extracted and saved to {output_txt_file}")

# Function to convert multiple PDF files instead of converting one by one.
def convert_multiple_pdfs(pdf_list):
    """Convert each PDF in ``pdf_list`` to a text file next to it.

    The output name is the input name with its final extension replaced by
    ``.txt`` (for example ``report.pdf`` -> ``report.txt``). Each pair is
    handed to ``pdf_to_text`` in the order given.

    Args:
        pdf_list: Iterable of PDF file paths (strings).
    """
    import os  # Local import so the function does not rely on another cell.

    for pdf_file in pdf_list:
        # Swap only the trailing extension. A plain str.replace(".pdf", ".txt")
        # would also rewrite ".pdf" inside directory or file names, and would
        # leave names such as "x.PDF" unchanged, making the output path equal
        # to the input so the source PDF gets overwritten.
        stem, _extension = os.path.splitext(pdf_file)
        output_txt_file = stem + ".txt"

        # Converting the PDF to text.
        pdf_to_text(pdf_file, output_txt_file)

# PDF documents to convert, one file name per line.
pdf_list = [
    "Australia2023EnergyPolicyReview_extracted_paragraphs.pdf",
    "Australia’s network of nuclear cooperation agreements.pdf",
    "AustraliaPublicPolicy.pdf",
    "Energy_Policy_France_2016_Review_extracted_paragraphs.pdf",
    "singapore_nr_extracted_paragraphs.pdf",
    "SingaporePublicPolicy.pdf",
]

# Running the batch conversion over the whole list.
convert_multiple_pdfs(pdf_list)


Text extracted and saved to Australia2023EnergyPolicyReview_extracted_paragraphs.txt
Text extracted and saved to Australia’s network of nuclear cooperation agreements.txt
Text extracted and saved to AustraliaPublicPolicy.txt
Text extracted and saved to Energy_Policy_France_2016_Review_extracted_paragraphs.txt
Text extracted and saved to singapore_nr_extracted_paragraphs.txt
Text extracted and saved to SingaporePublicPolicy.txt
