In [112]:
import os
import textwrap
import asyncio
import nest_asyncio
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from dotenv import load_dotenv
import litellm
from playwright.async_api import async_playwright

# Patch nested loops for Jupyter compatibility
nest_asyncio.apply()

# Load API key
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    raise ValueError("Missing GROQ_API_KEY")

litellm.api_key = GROQ_API_KEY
litellm.provider = "groq"

# Async scraper
async def scrape_markdown_from_url_async(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url, wait_until="networkidle")
        html = await page.content()
        await browser.close()

    soup = BeautifulSoup(html, "html.parser")
    return md(str(soup.body))

# Run async function in any environment
def scrape_markdown_from_url(url):
    return asyncio.get_event_loop().run_until_complete(scrape_markdown_from_url_async(url))

def chunk_markdown(markdown, chunk_size=3000):
    return textwrap.wrap(markdown, width=chunk_size)

In [113]:
import tiktoken

def count_tokens(text, model="groq/deepseek-r1-distill-llama-70b"):
    try:
        enc = tiktoken.encoding_for_model(model)
    except KeyError:
        enc = tiktoken.get_encoding("cl100k_base")  # fallback

    return len(enc.encode(text))

In [114]:
def extract_with_groq(chunk):
    try:
        res = litellm.completion(
            model="groq/meta-llama/llama-4-scout-17b-16e-instruct",
            messages=[
                {
                    "role": "system",
                    "content": (
                        "Extract a list of professor names and emails. If there are no professors listed, don't respond with anything."
                        "Respond ONLY with lines in the format: Full Name, Email Address. If there are no professors listed, don't respond with anything."
                        "Do not explain or add any extra text. If there are no professors listed, don't respond with anything."
                    )
                },
                {
                    "role": "user",
                    "content": (
                        "From the following text, extract names and emails for all Professors, Associate Professors, or Assistant Professors.If there are no professors listed, don't respond with anything.\n\n"
                        f"{chunk}"
                    )
                }
            ],
            max_tokens=512,
            temperature=0
        )
        return res["choices"][0]["message"]["content"].strip()
    except Exception as e:
        return f"[ERROR]: {e}"


In [115]:
import csv

def append_string_to_csv(data_string, filename="professors.csv"):
    with open(filename, mode="a", newline='') as csvfile:
        writer = csv.writer(csvfile)
        lines = data_string.strip().split("\n")
        for line in lines:
            if ',' in line:
                name, email = map(str.strip, line.split(",", 1))
                if email and "edu" in email:
                    writer.writerow([name, email])


In [116]:
def extract_emails_from_markdown(url, filename):
        markdown = scrape_markdown_from_url(url)
        print(markdown)
        chunks = chunk_markdown(markdown)

        results = ""
        for i, chunk in enumerate(chunks):
            print(f"→ Processing chunk {i + 1}/{len(chunks)}")
            result = extract_with_groq(chunk)
            print(result)
            results +=result
            results+="\n"
        result = results
        print(result)
        append_string_to_csv(result, filename)

In [None]:
# MAKE SURE TO HAVE HTTPS WWW section in URL
url="https://www.simmons.edu/academics/faculty?school_department=All&college=All&full_time=1"
file = "simmons_professors.csv"
extract_emails_from_markdown(url, file)

[Skip to main content](#main-content)


[![Simmons University Logo](/themes/simmons_2023/assets/img/logo.svg)](/ "Simmons University Home")



* [Directory](/directory)
* [Athletics](http://athletics.simmons.edu)
* [News](/news)
* [Events](/events)
* [Library](/library)
* [Give](https://engage.simmons.edu/register/giving)

* Request Info
* Visit
* Apply

* Information For
  undefined
  + [Admitted Undergraduates](/undergraduate/admission-and-financial-aid/admitted-undergraduates)
  + [Admitted Graduate Students](/graduate/admission/admitted-graduate-students)
  + [Current Students](https://internal.simmons.edu/students)
  + [Current Graduate Students](/graduate/graduate-student-resources)
  + [International Students](/go/international-services)
  + [Veterans](/go/veterans-info)
  + [Alumnae/i and Friends](/alumni)
  + [Parents and Families](/undergraduate/admission-and-financial-aid/parents-and-families)
  + [Faculty and Staff](https://internal.simmons.edu/faculty-staff)
  + [College C