In [44]:
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_category_links(base_url="https://volunteerinfo.org", timeout=30):
    """Collect the category links listed on the site's landing page.

    Fetches ``base_url`` and reads every ``<a>`` found inside the
    ``<li id="categories-3">`` element (presumably the categories widget;
    the id is site-specific).

    Args:
        base_url: Page to fetch. Also used as the base for resolving
            relative ``href`` values.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without it a stalled
            connection would block the notebook indefinitely.

    Returns:
        list[tuple[str, str]]: ``(category name, absolute URL)`` pairs in
        document order. Empty when the page does not answer with HTTP 200
        or the category element is missing.
    """
    response = requests.get(base_url, timeout=timeout)
    if response.status_code != 200:
        # Do not parse an error page as if it were the real listing.
        print(f"Could not load {base_url} (HTTP {response.status_code})")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    category_li = soup.find("li", id="categories-3")
    if category_li is None:
        return []

    category_links = []
    for a_tag in category_li.find_all("a"):
        href = a_tag.get("href")
        if not href:
            continue
        # urljoin leaves absolute URLs untouched and resolves relative ones,
        # so every returned link can be passed straight to requests.get.
        category_links.append((a_tag.text.strip(), urljoin(base_url, href)))
    return category_links

def scrape_category(category_name, base_url, timeout=30, delay=0.2):
    """Scrape every listing page of one category.

    Requests ``base_url`` first, then ``<base_url>/page/2/``,
    ``<base_url>/page/3/`` and so on, until a page does not answer with
    HTTP 200 or contains no ``<h2 class="entry-title">`` elements.

    Args:
        category_name: Label stored in each record's ``"Category"`` list.
        base_url: URL of the category's first page, with or without a
            trailing slash.
        timeout: Seconds to wait for each response (requests has no default
            timeout, so a stalled connection would otherwise hang forever).
        delay: Seconds to sleep between page requests.

    Returns:
        dict[str, dict]: Maps each nonprofit's link to a record with the
        keys ``"Name"``, ``"URL"``, ``"Summary"`` and ``"Category"``
        (a one-element list holding ``category_name``).
    """
    nonprofits = {}
    # Normalise once so pagination URLs are well-formed even when the
    # category URL lacks a trailing slash ("…/cat" -> "…/cat/page/2/").
    paged_root = base_url.rstrip("/") + "/"

    page = 1
    while True:
        url = base_url if page == 1 else f"{paged_root}page/{page}/"

        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            if response.status_code == 404:
                print(f"Reached end of pages for {category_name}")
            else:
                # Anything other than 404 is a failure, not the natural end
                # of pagination; say so instead of hiding it.
                print(
                    f"Stopping {category_name} at page {page}: "
                    f"HTTP {response.status_code} for {url}"
                )
            break

        soup = BeautifulSoup(response.content, "html.parser")
        titles = soup.find_all("h2", class_="entry-title")
        summaries = soup.find_all("div", class_="entry-summary")

        if not titles:
            break

        # NOTE(review): titles and summaries are paired by position, which
        # assumes every entry has exactly one summary block — confirm
        # against the site's markup. Titles beyond the last summary are
        # kept with an empty summary rather than silently dropped.
        for index, title_tag in enumerate(titles):
            a_tag = title_tag.find("a")
            link = a_tag.get("href") if a_tag is not None else None
            if not link:
                # A title without a link cannot be keyed; skip it.
                continue
            if link in nonprofits:
                # Already recorded for this category on an earlier page.
                continue

            summary_tag = summaries[index] if index < len(summaries) else None
            p_tag = summary_tag.find("p") if summary_tag is not None else None

            nonprofits[link] = {
                "Name": a_tag.text.strip(),
                "URL": link,
                "Summary": p_tag.text.strip() if p_tag is not None else "",
                "Category": [category_name],
            }

        page += 1
        time.sleep(delay)  # brief pause between requests to the same site

    return nonprofits

# Run Full Scrape
# Scrape every category and merge the results into one dict keyed by the
# nonprofit's link, so an organisation listed under several categories
# ends up as a single record carrying all of its category names.
all_nonprofits = {}

categories = get_category_links()
for cat_name, cat_url in categories:
    scraped = scrape_category(cat_name, cat_url)
    for link, info in scraped.items():
        existing = all_nonprofits.setdefault(link, info)
        if existing is info:
            # First sighting of this link; the record was just stored.
            continue
        for cat in info["Category"]:
            if cat not in existing["Category"]:
                existing["Category"].append(cat)

# Convert to DataFrame
# Categories are flattened to one comma-separated string per row.
data = [
    {
        "Name": org["Name"],
        "URL": org["URL"],
        "Summary": org["Summary"],
        "Category": ", ".join(org["Category"]),
    }
    for org in all_nonprofits.values()
]

# Pin the column set explicitly: with an empty scrape, pd.DataFrame([])
# would have no columns at all, the CSV would lack a header, and later
# cells indexing df["Summary"] would fail with a KeyError.
df = pd.DataFrame(data, columns=["Name", "URL", "Summary", "Category"])
df.to_csv("all_nonprofits_volunteerinfo.csv", index=False)
df.head()


Reached end of pages for Abuse and Neglect
Reached end of pages for Adoption and Foster Care
Reached end of pages for Advocacy and Human Rights
Reached end of pages for Agriculture
Reached end of pages for Alameda
Reached end of pages for Albany
Reached end of pages for Animals
Reached end of pages for Arts and Culture
Reached end of pages for Assisted Living
Reached end of pages for Bakersfield
Reached end of pages for Berkeley
Reached end of pages for Board Development
Reached end of pages for Burlingame
Reached end of pages for Castro Valley
Reached end of pages for Central Valley
Reached end of pages for Children and Families
Reached end of pages for Civic Engagement
Reached end of pages for Communications Access
Reached end of pages for Community Development
Reached end of pages for Computers and Technology
Reached end of pages for Concord
Reached end of pages for Conflict Resolution
Reached end of pages for Consumer Protection
Reached end of pages for Contra Costa County
Reached 

In [4]:
import pandas as pd

# Reload the scrape results from disk. The path must match the file the
# scrape cell writes ("all_nonprofits_volunteerinfo.csv", saved with
# index=False); reading a differently named file makes this cell depend on
# a leftover artefact instead of this notebook's own output.
df = pd.read_csv("all_nonprofits_volunteerinfo.csv")
df.head(10)

Unnamed: 0,Name,URL,Summary,Category
0,Sexual Assault & Domestic Violence Response an...,https://volunteerinfo.org/sexual-assault-domes...,The Sexual Assault & Domestic Violence Respons...,"Abuse and Neglect, Advocacy and Human Rights, ..."
1,There With Care of the Bay Area,https://volunteerinfo.org/there-with-care-of-t...,Description of Organization and/or Volunteer O...,"Abuse and Neglect, Advocacy and Human Rights, ..."
2,Little Brothers -Friends of the Elderly San Fr...,https://volunteerinfo.org/little-brothers-frie...,Little Brothers – Friends of the Elderly (LBFE...,"Abuse and Neglect, Civic Engagement, Housing a..."
3,The Living Room,https://volunteerinfo.org/the-living-room/,"We offer food services, transitional housing, ...","Abuse and Neglect, Children and Families, Civi..."
4,Care Through Touch Institute,https://volunteerinfo.org/care-through-touch-i...,Description of Organization and/or Volunteer O...,"Abuse and Neglect, Community Development, Coun..."
5,Project Delta View Cats,https://volunteerinfo.org/project-delta-view-c...,Project Delta View Cats is a 100 % volunteer-l...,"Abuse and Neglect, Animals, Central Valley, Ea..."
6,Shelter Tech,https://volunteerinfo.org/shelter-tech/,"Shelter Tech aims to build the most reliable, ...","Abuse and Neglect, Advocacy and Human Rights, ..."
7,Circle of Care,https://volunteerinfo.org/circle-of-care/,Playful and compassionate volunteers needed to...,"Abuse and Neglect, Children and Families, Coun..."
8,Safe & Sound,https://volunteerinfo.org/safe-sound/,OUR MISSION To prevent child abuse and reduce ...,"Abuse and Neglect, Adoption and Foster Care, A..."
9,Bay Area Crisis Nursery,https://volunteerinfo.org/bay-area-crisis-nurs...,Bay Area Crisis Nursery is a gateway organizat...,"Abuse and Neglect, Children and Families, Conc..."


In [1]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used for the summaries
# ('paraphrase-MiniLM-L6-v2' is another checkpoint worth trying).
model = SentenceTransformer("all-MiniLM-L6-v2")

# Requires `df` from the CSV-loading cell. Missing summaries are replaced
# with "" so that every row is encoded and the embeddings line up with
# df's rows one-to-one.
summary_texts = df["Summary"].fillna("").tolist()
summary_embeddings = model.encode(summary_texts, convert_to_tensor=True)

  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'df' is not defined

In [None]:
import numpy as np

# Requires `summary_embeddings` from the encoding cell (requested there
# with convert_to_tensor=True; presumably a torch tensor — confirm).
# It is moved to the CPU and converted to a NumPy array before saving.
embeddings_np = summary_embeddings.cpu().numpy()

# Persist the array in NumPy's .npy format.
with open("summary_embeddings.npy", "wb") as npy_file:
    np.save(npy_file, embeddings_np)