In [267]:
!pip install selenium
!pip install groq



# LLM

Initialize the LLM client that is responsible for extracting the authors' names.

In [268]:
from groq import Groq
from google.colab import userdata

In [269]:
# Identifier of the Groq-hosted model passed to every chat-completion request.
model_id = "llama-3.1-8b-instant"
# The API key is fetched through google.colab's `userdata` helper instead of
# being hardcoded in the notebook.
api_key = userdata.get("GROQ_API_KEY")

In [270]:
# Shared Groq client used by `run_groq` for its chat-completion requests.
client = Groq(api_key=api_key)

# Setting up the Selenium WebDriver

In [271]:
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.chrome.webdriver import WebDriver
from bs4 import BeautifulSoup
from time import sleep

In [272]:
# Chrome start-up flags shared by every driver created in this notebook.
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--headless")
options.add_argument("--disable-gpu")
# Spelling fixed: the previous "--diable-dve-shm-usage" is not a switch Chrome
# recognises, so the intended /dev/shm workaround was never applied.
options.add_argument("--disable-dev-shm-usage")

In [273]:
def webdriver_factory():
    """Yield Chrome drivers one at a time, shutting each down when done.

    Every ``next()`` call on the generator shuts down the driver handed out
    previously (if any) and yields a freshly started one. Closing the
    generator shuts down the driver that is currently handed out.

    Yields:
        A ``webdriver.Chrome`` instance configured with the module-level
        ``options``.
    """
    while True:
        # Start the browser *before* entering the try block: if start-up
        # fails there is nothing to clean up, and the real error is raised
        # instead of a secondary failure inside the cleanup code.
        driver = webdriver.Chrome(options=options)
        try:
            yield driver
        finally:
            # quit() ends the whole session, whereas close() only closes
            # the current window.
            driver.quit()

In [274]:
def run_groq(content: str) -> str:
    """Ask the LLM to extract the author names contained in ``content``.

    Args:
        content: Plain text in which author names may appear.

    Returns:
        The model's answer with surrounding whitespace removed. Per the
        prompt this is the names joined by ``', '`` or the literal string
        ``'None'`` when no name is found. ``'None'`` is also returned when
        the API yields an empty message.
    """
    prompt = (f"""
Extract for me only the authors names.

IMPORTANT:
- You must extract only the human names, nothing else from the text.
- The returned content should be a string with the names separated by a ', '.
- If no name is found return 'None'.
- Pay attention to return only what was asked, nothing more.
- Just stick to the content.

TEXT:

{content}
""")

    completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
        model=model_id,
    )

    answer = completion.choices[0].message.content
    # The message content may be empty/None; fall back to the same sentinel
    # the prompt asks the model to use so callers always receive a string.
    return answer.strip() if answer else "None"

# Page Scraper

In [275]:
from abc import abstractmethod, ABC

In [276]:
def clean_authors(authors: "Tag | None") -> str:
    """Flatten the children of a parsed HTML container into plain text.

    Args:
        authors: An iterable of nodes exposing ``get_text()`` (the callers
            pass the result of ``soup.find(...)``), or ``None`` when the
            container was not found on the page.

    Returns:
        The text of every child, each followed by a newline, concatenated
        in document order. An empty string when ``authors`` is ``None`` or
        has no children.
    """
    # soup.find() returns None for a missing element; iterating it would
    # raise TypeError, so treat it as "no text".
    if authors is None:
        return ""

    return "".join(child.get_text() + "\n" for child in authors)

In [277]:
class SEPAContent():
    """Plain record holding the fields scraped from one SEPA resource page.

    Only annotations are declared here; no defaults are set, so an attribute
    exists on an instance only after the scraper has assigned it.
    """
    resource_name: str  # page title with the "| SEPA" suffix removed
    discription: str  # (sic) content of the page's description meta tag
    source: str  # URL the record was scraped from
    resource_type: str  # resource category read from the page
    authors: str  # author names as returned by the LLM extraction

In [278]:
class PageScrapper(ABC):
    """Interface implemented by every page scraper in this notebook."""

    @abstractmethod
    def run(self, links: list[str], wd: WebDriver, waitTime: int = 5) -> list[dict]:
        """Scrape every URL in ``links`` and return one record per page.

        Args:
            links: URLs of the resource pages to visit.
            wd: Selenium driver used to load the pages.
            waitTime: Seconds to pause after each page.

        Returns:
            One dict of scraped fields per link (the concrete scrapers
            return the attribute dict of a ``SEPAContent``).
        """
        raise NotImplementedError

In [279]:
class SEPAPageScrapper(PageScrapper):
    """Generic scraper for SEPA resource pages, driven by their meta tags."""

    @staticmethod
    def _meta_content(soup, attrs: dict) -> str | None:
        """Return the ``content`` of the first matching ``<meta>`` tag, or None."""
        tag = soup.find("meta", attrs)
        return tag.get("content") if tag is not None else None

    def run(self, links: list[str], wd: WebDriver, waitTime: int = 5) -> list[dict]:
        """Scrape every URL in ``links`` and return one record per page.

        Args:
            links: URLs of the resource pages to visit.
            wd: Selenium driver used to load the pages.
            waitTime: Seconds to pause after each page.

        Returns:
            One dict per link with the keys ``resource_name``,
            ``discription``, ``resource_type``, ``source`` and ``authors``.
            A field whose tag is missing on the page is ``None``;
            ``authors`` is the string ``'None'`` when the page has no
            ``div.entry`` container.
        """
        contents = []
        for link in links:
            print(f"SCRAPPING {link}")
            content = SEPAContent()

            wd.get(link)
            soup = BeautifulSoup(wd.page_source, "html.parser")

            title = self._meta_content(soup, {"property": "og:title"})
            content.resource_name = title.replace("| SEPA", "").strip() if title else None
            content.discription = self._meta_content(soup, {"name": "description"})
            content.resource_type = self._meta_content(soup, {"property": "og:type"})
            content.source = link

            authors_container = soup.find("div", {"class": "entry"})
            if authors_container is None:
                # Nothing to analyse: use the sentinel the LLM prompt itself
                # defines for "no name found" and skip the API call.
                content.authors = "None"
            else:
                # Pass the text itself; wrapping it in `{...}` built a set
                # whose repr (braces, quotes, escaped newlines) ended up in
                # the prompt.
                content.authors = run_groq(clean_authors(authors_container))

            contents.append(vars(content))

            sleep(waitTime)

        return contents

## Separated by page

In [280]:
class SEPACaseStudiePageScrapper(PageScrapper):
    """Scraper for SEPA case-study pages, driven by their meta tags."""

    @staticmethod
    def _meta_content(soup, attrs: dict) -> str | None:
        """Return the ``content`` of the first matching ``<meta>`` tag, or None."""
        tag = soup.find("meta", attrs)
        return tag.get("content") if tag is not None else None

    def run(self, links: list[str], wd: WebDriver, waitTime: int = 5) -> list[dict]:
        """Scrape every URL in ``links`` and return one record per page.

        Args:
            links: URLs of the case-study pages to visit.
            wd: Selenium driver used to load the pages.
            waitTime: Seconds to pause after each page.

        Returns:
            One dict per link with the keys ``resource_name``,
            ``discription``, ``resource_type``, ``source`` and ``authors``.
            A field whose tag is missing on the page is ``None``;
            ``authors`` is the string ``'None'`` when the page has no
            ``div.entry`` container.
        """
        contents = []
        for link in links:
            print(f"SCRAPPING {link}")
            content = SEPAContent()

            wd.get(link)
            soup = BeautifulSoup(wd.page_source, "html.parser")

            title = self._meta_content(soup, {"property": "og:title"})
            content.resource_name = title.replace("| SEPA", "").strip() if title else None
            content.discription = self._meta_content(soup, {"name": "description"})
            content.resource_type = self._meta_content(soup, {"property": "og:type"})
            content.source = link

            authors_container = soup.find("div", {"class": "entry"})
            if authors_container is None:
                # Nothing to analyse: use the sentinel the LLM prompt itself
                # defines for "no name found" and skip the API call.
                content.authors = "None"
            else:
                # Pass the text itself; wrapping it in `{...}` built a set
                # whose repr (braces, quotes, escaped newlines) ended up in
                # the prompt.
                content.authors = run_groq(clean_authors(authors_container))

            contents.append(vars(content))

            sleep(waitTime)

        return contents

In [281]:
class SEPAReportPageScrapper(PageScrapper):
    """Scraper for SEPA report pages, driven by their meta tags."""

    @staticmethod
    def _meta_content(soup, attrs: dict) -> str | None:
        """Return the ``content`` of the first matching ``<meta>`` tag, or None."""
        tag = soup.find("meta", attrs)
        return tag.get("content") if tag is not None else None

    def run(self, links: list[str], wd: WebDriver, waitTime: int = 5) -> list[dict]:
        """Scrape every URL in ``links`` and return one record per page.

        Args:
            links: URLs of the report pages to visit.
            wd: Selenium driver used to load the pages.
            waitTime: Seconds to pause after each page.

        Returns:
            One dict per link with the keys ``resource_type``,
            ``resource_name``, ``discription``, ``source`` and ``authors``.
            A field whose tag is missing on the page is ``None``;
            ``authors`` is the string ``'None'`` when the page has no
            ``div.entry`` container.
        """
        contents = []
        for link in links:
            print(f"SCRAPPING {link}")
            content = SEPAContent()

            wd.get(link)
            soup = BeautifulSoup(wd.page_source, "html.parser")

            content.resource_type = self._meta_content(soup, {"property": "og:type"})
            title = self._meta_content(soup, {"property": "og:title"})
            content.resource_name = title.replace("| SEPA", "").strip() if title else None
            content.discription = self._meta_content(soup, {"name": "description"})
            content.source = link

            authors_container = soup.find("div", {"class": "entry"})
            if authors_container is None:
                # Nothing to analyse: use the sentinel the LLM prompt itself
                # defines for "no name found" and skip the API call.
                content.authors = "None"
            else:
                # Pass the text itself; wrapping it in `{...}` built a set
                # whose repr (braces, quotes, escaped newlines) ended up in
                # the prompt.
                content.authors = run_groq(clean_authors(authors_container))

            contents.append(vars(content))

            sleep(waitTime)

        return contents

In [282]:
class SEPAWhitePaperPageScrapper(PageScrapper):
    """Scraper for SEPA white-paper pages.

    Unlike the meta-tag based scrapers, the resource type is read from a
    ``<span>`` in the page header and the name from the ``<title>`` tag.
    """

    @staticmethod
    def _header_resource_type(soup) -> str | None:
        """Return the text of the header ``<span>``, or None if any level is missing."""
        column = soup.find("div", {"class": "column v-align-middle column-2"})
        holder = column.find("div", {"class": "content"}) if column is not None else None
        label = holder.find("span") if holder is not None else None
        return label.text if label is not None else None

    def run(self, links: list[str], wd: WebDriver, waitTime: int = 5) -> list[dict]:
        """Scrape every URL in ``links`` and return one record per page.

        Args:
            links: URLs of the white-paper pages to visit.
            wd: Selenium driver used to load the pages.
            waitTime: Seconds to pause after each page.

        Returns:
            One dict per link with the keys ``resource_type``,
            ``resource_name``, ``discription``, ``source`` and ``authors``.
            A field whose element is missing on the page is ``None``;
            ``authors`` is the string ``'None'`` when the page has no
            ``div.entry`` container.
        """
        contents = []
        for link in links:
            print(f"SCRAPPING {link}")
            content = SEPAContent()

            wd.get(link)
            soup = BeautifulSoup(wd.page_source, "html.parser")

            # Explicit None checks replace the former bare `except: pass`,
            # which also swallowed KeyboardInterrupt and left the attribute
            # unset, so records could have differing keys.
            content.resource_type = self._header_resource_type(soup)

            page_title = soup.title
            content.resource_name = (
                page_title.text.replace("| SEPA", "").strip() if page_title is not None else None
            )
            description = soup.find("meta", {"name": "description"})
            content.discription = description.get("content") if description is not None else None
            content.source = link

            authors_container = soup.find("div", {"class": "entry"})
            if authors_container is None:
                # Nothing to analyse: use the sentinel the LLM prompt itself
                # defines for "no name found" and skip the API call.
                content.authors = "None"
            else:
                # Pass the text itself; wrapping it in `{...}` built a set
                # whose repr (braces, quotes, escaped newlines) ended up in
                # the prompt.
                content.authors = run_groq(clean_authors(authors_container))

            contents.append(vars(content))

            sleep(waitTime)

        return contents

# Getting the Content

In [283]:
def get_links_to_scrappe(url: str, wd: WebDriver, limit: int | None = None) -> list[str]:
    """Collect the resource links shown on a paginated listing page.

    Args:
        url: Listing URL. It must already contain a query string, because
            the page number is appended as ``&_paged=N``.
        wd: Selenium driver used to load the pages.
        limit: Maximum number of links to return. ``None`` (or 0) means
            no limit.

    Returns:
        The ``href`` values of the call-to-action buttons in page order,
        truncated to ``limit`` when one is given.
    """
    # Imported locally: selenium is already a dependency of this notebook,
    # but this exception class is not part of the top-level import cell.
    from selenium.common.exceptions import NoSuchElementException

    wd.get(url)

    try:
        last_page = int(wd.find_element(By.CSS_SELECTOR, "a[class='facetwp-page last']").text)
    except (NoSuchElementException, ValueError):
        # No "last page" pager link (or a non-numeric label): treat the
        # listing as a single page.
        last_page = 1

    page_links = []
    for page in range(1, last_page + 1):
        wd.get(url + f"&_paged={page}")

        soup = BeautifulSoup(wd.page_source, "html.parser")
        for anchor in soup.find_all("a", {"class": "button button-small cta"}):
            href = anchor.get("href")
            if href:  # skip buttons without a target instead of raising KeyError
                page_links.append(href)

        sleep(4)

        # Enough links collected: do not load (and wait for) further pages.
        if limit and len(page_links) >= limit:
            break

    if limit:
        page_links = page_links[:limit]

    # Report the number of links actually returned; the previous message
    # printed `limit` even when fewer links were available.
    print(f"\nIn the page {url}, {len(page_links)} links were found")

    return page_links

In [284]:
class PageToScrappe():
    """Pairs a listing URL with the scraper that processes its resource pages."""

    def __init__(self, page_scrapper: PageScrapper, url: str):
        """Store the listing ``url`` and the ``page_scrapper`` to run on it."""
        self.url, self.page_scrapper = url, page_scrapper


In [285]:
# Listing pages to crawl, each paired with its own scraper instance.
# NOTE(review): every entry uses the generic SEPAPageScrapper; the
# type-specific scrapers defined earlier are not referenced here. Confirm
# this is intended.
pages_to_scrappe = [
    PageToScrappe(SEPAPageScrapper(), listing_url)
    for listing_url in (
        "https://sepapower.org/knowledge/?_type=case-study&_publication_period=last-5-years",
        "https://sepapower.org/knowledge/?_type=report&_publication_period=last-5-years",
        "https://sepapower.org/knowledge/?_type=white-paper&_publication_period=last-5-years",
    )
]

In [286]:
# Accumulates the records returned by the scrapers across all listing pages.
scrapped_content = []
# Maximum number of links taken from each listing page (passed to
# get_links_to_scrappe as its `limit`).
LIMIT = 3

In [287]:
# Generator of Chrome drivers: nothing is started here; each next(wd) call
# yields a newly created driver.
wd = webdriver_factory()

In [288]:
# Scrape every configured listing page and pool the resulting records.
try:
    for page in pages_to_scrappe:
        # Each next() hands out a fresh browser for this listing page.
        driver = next(wd)
        links = get_links_to_scrappe(page.url, driver, LIMIT)
        contents = page.page_scrapper.run(links, driver)
        scrapped_content.extend(contents)
finally:
    # Closing the generator runs its cleanup, so the last browser is shut
    # down even when the cell fails or is interrupted. The generator is
    # exhausted afterwards: re-run the `wd = webdriver_factory()` cell
    # before running this cell again.
    wd.close()

KeyboardInterrupt: 

In [None]:
import pandas as pd

In [None]:
# One row per scraped record; the columns come from the record keys.
df = pd.DataFrame(scrapped_content)

In [None]:
# Write the scraped records to a CSV file; index=False leaves the
# DataFrame's row index out of the file.
df.to_csv("SEPA_data.csv", index=False)