In [None]:
!pip install selenium
!pip install groq

# AI Agent

Initialize the Groq client (the LLM agent) that is responsible for extracting the authors' names.

In [2]:
from groq import Groq
from google.colab import userdata

In [3]:
# Identifier of the Groq-hosted model that run_groq() sends its requests to.
model_id = "llama-3.1-8b-instant"
# Groq API key, looked up by name through google.colab's `userdata` helper
# instead of being hard-coded in the notebook.
api_key = userdata.get("GROQ_API_KEY")

In [4]:
# Shared Groq API client; run_groq() issues its chat-completion requests through it.
client = Groq(api_key=api_key)

# Setting Up the Selenium WebDriver

In [99]:
from selenium.webdriver.common.by import By
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep

In [100]:
# Chrome command-line flags for running the browser without a display.
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--headless")
options.add_argument("--disable-gpu")
# The flag was previously misspelled ("--diable-dve-shm-usage"), so Chrome
# ignored it; the correct spelling is required for it to take effect.
options.add_argument("--disable-dev-shm-usage")

In [101]:
# Single Chrome WebDriver instance, used as a global by every page scraper
# and by get_links_to_scrappe().
wd = webdriver.Chrome(options=options)

In [102]:
def run_groq(content: str) -> str:
    """Send one user message to the Groq chat API and return the reply text.

    Uses the module-level ``client`` and ``model_id``.

    Args:
        content: Prompt text, sent as the only (user-role) message.

    Returns:
        The text of the first completion choice, or an empty string when the
        response carries no text for that choice.
    """
    completion = client.chat.completions.create(
        messages=[{"role": "user", "content": content}],
        model=model_id,
    )

    # The message content may be None; coerce it to "" so callers always
    # receive the ``str`` promised by the signature.
    return completion.choices[0].message.content or ""

# Page Scrapers

In [48]:
from abc import ABC, abstractmethod

In [66]:
class SEPAContent:
    """Record describing one scraped SEPA resource page.

    Only field annotations are declared here; no attribute has a default.
    The scrapers assign the fields one by one and then export the instance
    through ``__dict__``, so a field that was never assigned is simply
    absent from the exported record.
    """

    # Page title with the "| SEPA" marker removed.
    resource_name: str
    # Value of the page's description <meta> tag. The misspelling is kept
    # on purpose: the attribute name becomes a key of the exported record.
    discription: str
    # URL the record was scraped from.
    source: str
    # Resource type label read from the page.
    resource_type: str
    # Author names as returned by the LLM extraction step.
    authors: str

In [67]:
class PageScrapper(ABC):
    """Common interface of the per-resource-type page scrapers.

    Deriving from ``ABC`` is what makes ``@abstractmethod`` effective: a
    subclass that does not override ``run`` cannot be instantiated.
    """

    @abstractmethod
    def run(self, links: list[str]) -> list[dict]:
        """Scrape every URL in ``links``.

        Args:
            links: URLs of the resource pages to scrape.

        Returns:
            One record (a plain ``dict``) per scraped page.
        """
        raise NotImplementedError

In [68]:
class SEPACaseStudiePageScrapper(PageScrapper):
    """Scrapes SEPA case-study pages using only their ``<meta>`` tags.

    Relies on the module-level Selenium driver ``wd``.
    """

    @staticmethod
    def _meta_content(soup, attrs: dict) -> str:
        """Return the ``content`` of the first ``<meta>`` tag matching ``attrs``.

        Returns "" when the tag, or its ``content`` attribute, is missing, so
        one incomplete page cannot abort the whole run.
        """
        tag = soup.find("meta", attrs)
        if tag is None:
            return ""
        return tag.get("content") or ""

    def run(self, links: list[str]) -> list[dict]:
        """Scrape each case-study page in ``links``.

        Args:
            links: URLs of the case-study pages.

        Returns:
            One dict per page with the keys ``resource_name``,
            ``discription``, ``resource_type`` and ``source``.
        """
        contents = []
        for link in links:
            print(f"SCRAPPING {link}")
            content = SEPAContent()

            wd.get(link)
            soup = BeautifulSoup(wd.page_source, "html.parser")

            title = self._meta_content(soup, {"property": "og:title"})
            content.resource_name = title.replace("| SEPA", "").strip()
            content.discription = self._meta_content(soup, {"name": "description"})
            content.resource_type = self._meta_content(soup, {"property": "og:type"})
            content.source = link

            contents.append(content.__dict__)

            # Fixed pause before the next page is requested.
            sleep(10)

        return contents

In [69]:
class SEPAReportPageScrapper(PageScrapper):
    """Scrapes SEPA report pages and asks the LLM for the author names.

    Relies on the module-level Selenium driver ``wd`` and on ``run_groq``.
    """

    @staticmethod
    def _meta_content(soup, attrs: dict) -> str:
        """Return the ``content`` of the first ``<meta>`` tag matching ``attrs``.

        Returns "" when the tag, or its ``content`` attribute, is missing.
        """
        tag = soup.find("meta", attrs)
        if tag is None:
            return ""
        return tag.get("content") or ""

    @staticmethod
    def _extract_authors(soup) -> str:
        """Return the author names the LLM finds in the page's ``div.entry`` block.

        Returns "" without calling the LLM when the block is missing or
        contains no text.
        """
        authors_container = soup.find("div", {"class": "entry"})
        if authors_container is None:
            return ""

        # One line of text per direct child of the container.
        cleaned_authors = "".join(child.get_text() + "\n" for child in authors_container)
        if not cleaned_authors.strip():
            return ""

        return run_groq(f"""
Extract for me only the authors names, they are humans not companies, from the text. Put the names in a string separated by a ', '.
If none author is found return an empty string. Return only the asked task:
{cleaned_authors}
""")

    def run(self, links: list[str]) -> list[dict]:
        """Scrape each report page in ``links``.

        Args:
            links: URLs of the report pages.

        Returns:
            One dict per page with the keys ``resource_name``,
            ``discription``, ``source`` and ``authors``, plus
            ``resource_type`` when the type label is present on the page.
        """
        contents = []
        for link in links:
            print(f"SCRAPPING {link}")
            content = SEPAContent()

            wd.get(link)
            soup = BeautifulSoup(wd.page_source, "html.parser")

            # Best effort: the type label is optional. find() returns None for
            # a missing element, so the chained lookup raises AttributeError;
            # in that case the field is deliberately left unset.
            try:
                title_type = soup.find("div", {"class": "column v-align-middle column-2"}).find("div", {"class": "content"})
                content.resource_type = title_type.find("span").text
            except AttributeError:
                pass

            page_title = soup.title.text if soup.title is not None else ""
            content.resource_name = page_title.replace("| SEPA", "").strip()
            content.discription = self._meta_content(soup, {"name": "description"})
            content.source = link
            content.authors = self._extract_authors(soup)

            contents.append(content.__dict__)

            # Fixed pause before the next page is requested.
            sleep(10)

        return contents

In [70]:
class SEPAWhitePaperPageScrapper(PageScrapper):
    """Scrapes SEPA white-paper pages and asks the LLM for the author names.

    Relies on the module-level Selenium driver ``wd`` and on ``run_groq``.

    NOTE(review): the extraction steps are the same as in
    ``SEPAReportPageScrapper``; consider sharing one implementation.
    """

    @staticmethod
    def _meta_content(soup, attrs: dict) -> str:
        """Return the ``content`` of the first ``<meta>`` tag matching ``attrs``.

        Returns "" when the tag, or its ``content`` attribute, is missing.
        """
        tag = soup.find("meta", attrs)
        if tag is None:
            return ""
        return tag.get("content") or ""

    @staticmethod
    def _extract_authors(soup) -> str:
        """Return the author names the LLM finds in the page's ``div.entry`` block.

        Returns "" without calling the LLM when the block is missing or
        contains no text.
        """
        authors_container = soup.find("div", {"class": "entry"})
        if authors_container is None:
            return ""

        # One line of text per direct child of the container.
        cleaned_authors = "".join(child.get_text() + "\n" for child in authors_container)
        if not cleaned_authors.strip():
            return ""

        return run_groq(f"""
Extract for me only the authors names, they are humans not companies, from the text. Put the names in a string separated by a ', '.
If none author is found return an empty string. Return only the asked task:
{cleaned_authors}
""")

    def run(self, links: list[str]) -> list[dict]:
        """Scrape each white-paper page in ``links``.

        Args:
            links: URLs of the white-paper pages.

        Returns:
            One dict per page with the keys ``resource_name``,
            ``discription``, ``source`` and ``authors``, plus
            ``resource_type`` when the type label is present on the page.
        """
        contents = []
        for link in links:
            print(f"SCRAPPING {link}")
            content = SEPAContent()

            wd.get(link)
            soup = BeautifulSoup(wd.page_source, "html.parser")

            # Best effort: the type label is optional. find() returns None for
            # a missing element, so the chained lookup raises AttributeError;
            # in that case the field is deliberately left unset.
            try:
                title_type = soup.find("div", {"class": "column v-align-middle column-2"}).find("div", {"class": "content"})
                content.resource_type = title_type.find("span").text
            except AttributeError:
                pass

            page_title = soup.title.text if soup.title is not None else ""
            content.resource_name = page_title.replace("| SEPA", "").strip()
            content.discription = self._meta_content(soup, {"name": "description"})
            content.source = link
            content.authors = self._extract_authors(soup)

            contents.append(content.__dict__)

            # Fixed pause before the next page is requested.
            sleep(10)

        return contents

# Getting the Content

In [103]:
def get_links_to_scrappe(url: str, limit: int | None = None) -> list[str]:
    wd.get(url)

    try:
        last_page = wd.find_element(By.CSS_SELECTOR, "a[class='facetwp-page last']")
        last_page = int(last_page.text)
    except:
        last_page = 1

    page_links = []
    for page in range(0, last_page):
        wd.get(url + f"&_paged={page + 1}")

        html = wd.page_source
        soup = BeautifulSoup(html, "html.parser")

        links = soup.find_all("a", {"class": "button button-small cta"})
        for link in links:
            page_links.append(link["href"])

        sleep(4)

    print(f"\nIn the page {url}, {len(page_links) if not limit else limit} links were found")

    if limit and len(page_links) >= limit:
        return page_links[:limit]
    else:
        return page_links

In [104]:
class PageToScrappe:
    """Pairs a listing-page URL with the scraper for the resources it links to."""

    def __init__(self, page_scrapper: PageScrapper, url: str):
        """Store the scraper and the listing URL.

        Args:
            page_scrapper: Scraper whose ``run`` handles the resource pages.
            url: URL of the listing page the resource links are read from.
        """
        self.url = url
        self.page_scrapper = page_scrapper

    def run(self, links: list[str]) -> list[dict]:
        """Scrape ``links`` with the wrapped scraper and return its records."""
        return self.page_scrapper.run(links)


In [105]:
# One entry per SEPA knowledge-base listing: the scraper to use for the
# resources of that type, paired with the listing URL (the query string
# selects the resource type and the "last-5-years" publication period).
pages_to_scrappe = [
    PageToScrappe(SEPACaseStudiePageScrapper(), "https://sepapower.org/knowledge/?_type=case-study&_publication_period=last-5-years"),
    PageToScrappe(SEPAReportPageScrapper(), "https://sepapower.org/knowledge/?_type=report&_publication_period=last-5-years"),
    PageToScrappe(SEPAWhitePaperPageScrapper(), "https://sepapower.org/knowledge/?_type=white-paper&_publication_period=last-5-years"),
]

In [106]:
# Accumulates the record dicts returned by every scraper run.
scrapped_content = []
# Maximum number of resource links taken from each listing
# (passed as `limit` to get_links_to_scrappe).
LIMIT = 3

In [107]:
# Start from an empty list so that re-running this cell rebuilds the results
# instead of appending a second copy of every record.
scrapped_content = []
for page in pages_to_scrappe:
    # Collect at most LIMIT resource links from the listing, then scrape them.
    links = get_links_to_scrappe(page.url, LIMIT)
    scrapped_content.extend(page.run(links))

In the page https://sepapower.org/knowledge/?_type=report&_publication_period=last-5-years ,  3 links were found
SCRAPPING https://sepapower.org/resource/application-guide-sepa-interoperability-profile-for-electric-vehicle-fleet-managed-charging-utilizing-ieee-2030-5-2018/
SCRAPPING https://sepapower.org/resource/50-states-of-virtual-power-plant-and-supporting-distributed-energy-resources-2024-state-policy-snapshot/
SCRAPPING https://sepapower.org/resource/advancing-building-electrification-utility-case-studies/
In the page https://sepapower.org/knowledge/?_type=white-paper&_publication_period=last-5-years ,  3 links were found
SCRAPPING https://sepapower.org/resource/decoding-derms-options-for-the-future-of-der-management/
SCRAPPING https://sepapower.org/resource/conformity-assessment-for-smart-grid-electromagnetic-compatibility-emc/
SCRAPPING https://sepapower.org/resource/benchmarking-equitable-transportation-electrification/


In [108]:
import pandas as pd

In [109]:
# One row per scraped record; a key missing from a record becomes NaN in that row.
df = pd.DataFrame(scrapped_content)

In [110]:
# Write the scraped records to SEPA_data.csv in the working directory,
# without the DataFrame index column.
df.to_csv("SEPA_data.csv", index=False)