In [None]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
from langchain.schema import Document

base_url = "https://www.y4d.ngo"
projects_url = f"{base_url}/projects"

# Upper bound (seconds) for every HTTP request so a stalled connection cannot
# hang the notebook indefinitely.
REQUEST_TIMEOUT = 30


def fetch_html(url):
    """Download `url` and return the response body as text.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            4xx/5xx status (via `raise_for_status`), so error pages are never
            parsed as if they were real content.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.text


# Get the projects page (a failure here is fatal: nothing else can proceed)
html = fetch_html(projects_url)
soup = BeautifulSoup(html, "html.parser")

# Collect unique project links. The set removes duplicates; sorting gives a
# reproducible order across runs (set iteration order is not stable).
project_links = sorted(
    {
        urljoin(base_url, a["href"])
        for a in soup.find_all("a", href=True)
        if "project_details" in a["href"]
    }
)
print("Unique project links:", project_links)

# Convert projects directly into Document objects
project_docs = []

for link in project_links:
    # A single broken project page should not abort the whole crawl.
    try:
        detail_html = fetch_html(link)
    except requests.RequestException as exc:
        print(f"Skipping {link}: {exc}")
        continue
    detail_soup = BeautifulSoup(detail_html, "html.parser")

    # Prefer the first <h3>; fall back to the first <h1>.
    title_tag = detail_soup.find("h3") or detail_soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else "No title"

    # Prefer the dedicated description container; otherwise use the first <p>.
    desc_tag = detail_soup.find("div", class_="project-description")
    if desc_tag:
        description = desc_tag.get_text(separator="\n", strip=True)
    else:
        first_p = detail_soup.find("p")
        description = first_p.get_text(strip=True) if first_p else "No description"

    # Convert to Document with metadata
    doc = Document(
        page_content=description,
        metadata={"url": link, "title": title, "type": "project"}
    )
    project_docs.append(doc)

print(f"Total project Documents: {len(project_docs)}")


In [None]:
!pip install langchain-community



Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.1.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.3.27-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dataclasses_json-0.6.7-py3-none-any.whl (

In [None]:
from bs4 import SoupStrainer
from langchain_community.document_loaders import WebBaseLoader
from langchain.schema import Document

about_url = f"{base_url}/who_are_we"

# Only parse <h2> and <p> elements (filter by tag, not class).
bs4_strainer = SoupStrainer(name=("h2", "p"))
loader = WebBaseLoader(
    web_paths=(about_url,),
    bs_kwargs={"parse_only": bs4_strainer},
    # Without a separator, the text of adjacent tags is concatenated directly
    # ("About UsOverviewY4D ..."), fusing headings into the following words.
    # Joining with a space keeps word boundaries intact for chunking/embedding.
    bs_get_text_kwargs={"separator": " ", "strip": True},
)

# Re-wrap each loaded page with the metadata schema used for the RAG corpus
# (the loader's own metadata is intentionally replaced).
web_docs = [
    Document(
        page_content=loaded.page_content,
        metadata={
            "source": about_url,
            "type": "webpage"
        }
    )
    for loaded in loader.load()
]

for doc in web_docs:
    print(doc.page_content)




About UsOverviewY4D Foundation is a youth led organization working on empowering the underprivileged section of our society. Y4D  has a pan India presence through its wide network of Volunteer Chapters across the country. Y4D Foundation focused its interventions on issues concerning youth and children which brought about significant changes in their lives in terms of education, health, skill, career and sustainable livelihood. Y4D also works on Environment conservation, women empowerment, Food safety and security, . Being an organisation who cares for society, Y4D gets engaged in projects as the situation demands under natural or manmade disasters, like COVID-19 Pandemic, Flood, Drought Relief etc.VisionY4D envisions fostering the development of a happy, healthy, and sustainable society in which every individual has an equal opportunity for growth and a life of dignity.MissionY4D is a youth-led futuristic organization committed to empowering the economically underprivileged by empoweri

In [None]:
!pip install pypdf


Collecting pypdf
  Downloading pypdf-6.0.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.0.0-py3-none-any.whl (310 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/310.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/310.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-6.0.0


In [None]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document

# Folder containing the newsletter PDFs. Set the Y4D_PDF_DIR environment
# variable to point somewhere else (e.g. on Colab); the default is the
# original hard-coded location.
pdf_folder = os.environ.get("Y4D_PDF_DIR", "D:/Users/Desktop/Saket/ML/Hackathon/PDFs")

# Get all PDF files in the folder. The extension check is case-insensitive
# (".PDF" counts) and the list is sorted because os.listdir returns entries in
# arbitrary order.
pdf_files = sorted(
    os.path.join(pdf_folder, f)
    for f in os.listdir(pdf_folder)
    if f.lower().endswith(".pdf")
)

all_docs = []  # master list to store all PDF pages

for pdf in pdf_files:
    loader = PyPDFLoader(pdf)
    pdf_pages = loader.load()  # load pages of this PDF

    # Add each page as a Document with metadata
    for page in pdf_pages:
        page_metadata = {
            # Public page the newsletters come from (not the local file path)
            "source": "https://www.y4d.ngo/newsletters",
            "file_name": os.path.basename(pdf),  # just the file name
            "type": "pdf",
        }
        # Keep the page number reported by the loader, when there is one, so
        # retrieved chunks can be traced back to a page.
        page_number = page.metadata.get("page")
        if page_number is not None:
            page_metadata["page"] = page_number

        all_docs.append(Document(
            page_content=page.page_content,
            metadata=page_metadata,
        ))

print(f"✅ Total pages loaded from PDFs: {len(all_docs)}")


Total pages: 128


In [None]:
# Merge the three sources into one corpus: about page, PDF pages, projects.
all_rag_docs = [*web_docs, *all_docs, *project_docs]
corpus_size = len(all_rag_docs)
print(f"Total documents for RAG: {corpus_size}")


Total documents for RAG: 151


In [None]:
# Sanity check: character count of the first document in the corpus only.
first_doc = all_rag_docs[0]
print(f"Total characters: {len(first_doc.page_content)}")

Total characters: 1916


SPLITTING INTO CHUNKS AND EMBEDDINGS

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings, all measured in characters.
splitter_options = {
    "chunk_size": 1000,       # maximum length of a chunk
    "chunk_overlap": 200,     # characters shared between neighbouring chunks
    "add_start_index": True,  # record each chunk's offset in its source document
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
all_splits = text_splitter.split_documents(all_rag_docs)

split_count = len(all_splits)
print(f"Split blog post into {split_count} sub-documents.")

Split blog post into 322 sub-documents.


CHROMADB

In [None]:
!pip install -qU langchain-huggingface

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Sentence-transformers checkpoint used to embed every chunk.
embedding_model_id = "sentence-transformers/all-mpnet-base-v2"
embeddings_model = HuggingFaceEmbeddings(model_name=embedding_model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
!pip install -qU "langchain-chroma>=0.1.2"

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m73.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m85.5 MB/s[0m eta [36m0:00:

In [None]:
import shutil

# Destructive: wipes any previously persisted Chroma directory so the index is
# rebuilt from scratch. A missing directory is not an error (ignore_errors).
shutil.rmtree(path="./chroma_langchain_db", ignore_errors=True)


In [None]:
from langchain_chroma import Chroma


# Settings for the on-disk Chroma collection that backs retrieval.
chroma_options = {
    "collection_name": "example_collection",
    "embedding_function": embeddings_model,
    "persist_directory": "./chroma_langchain_db",
}
vectorstore = Chroma(**chroma_options)


In [None]:
import hashlib

# Give every chunk a deterministic id (position + content hash). Without
# explicit ids each call gets fresh random ones, so re-running this cell would
# store every chunk a second time. With stable ids a re-run is expected to
# overwrite the existing entries instead -- NOTE(review): relies on the Chroma
# integration upserting on existing ids; confirm for the installed version.
# The position prefix keeps ids unique even when two chunks have identical text.
chunk_ids = [
    hashlib.sha256(f"{position}:{chunk.page_content}".encode("utf-8")).hexdigest()
    for position, chunk in enumerate(all_splits)
]
vectorstore.add_documents(documents=all_splits, ids=chunk_ids)

print("✅ Documents added to vectorstore!")

✅ Documents added to vectorstore!


In [None]:
# List the persist directory (human-readable sizes) to confirm files were written to disk.
!ls -lh ./chroma_langchain_db


total 4.0M
drwxr-xr-x 2 root root 4.0K Aug 21 08:50 7321bdba-25dc-4ef4-9a24-0717e5f1eb2b
-rw-r--r-- 1 root root 4.1M Aug 21 08:50 chroma.sqlite3


In [None]:

# Reads the count through the store's private `_collection` attribute.
stored_vectors = vectorstore._collection.count()
print(f"Vectors stored: {stored_vectors}")

Vectors stored: 322
