In [1]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
from langchain.schema import Document

base_url = "https://www.y4d.ngo"
projects_url = f"{base_url}/projects"

# Get the projects page
html = requests.get(projects_url).text
soup = BeautifulSoup(html, "html.parser")

# Collect unique project links
project_links = set()
for a in soup.find_all("a", href=True):
    if "project_details" in a['href']:
        full_url = urljoin(base_url, a['href'])
        project_links.add(full_url)

project_links = list(project_links)
print("Unique project links:", project_links)

# Convert projects directly into Document objects
project_docs = []

for link in project_links:
    detail_html = requests.get(link).text
    detail_soup = BeautifulSoup(detail_html, "html.parser")

    title_tag = detail_soup.find("h3") or detail_soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else "No title"

    desc_tag = detail_soup.find("div", class_="project-description")
    if desc_tag:
        description = desc_tag.get_text(separator="\n", strip=True)
    else:
        first_p = detail_soup.find("p")
        description = first_p.get_text(strip=True) if first_p else "No description"

    # Convert to Document with metadata
    doc = Document(
        page_content=description,
        metadata={"url": link, "title": title, "type": "project"}
    )
    project_docs.append(doc)

print(f"Total project Documents: {len(project_docs)}")


Unique project links: ['https://www.y4d.ngo/project_details/2', 'https://www.y4d.ngo/project_details/18', 'https://www.y4d.ngo/project_details/1', 'https://www.y4d.ngo/project_details/7', 'https://www.y4d.ngo/project_details/5', 'https://www.y4d.ngo/project_details/16', 'https://www.y4d.ngo/project_details/20', 'https://www.y4d.ngo/project_details/12', 'https://www.y4d.ngo/project_details/22', 'https://www.y4d.ngo/project_details/14', 'https://www.y4d.ngo/project_details/9', 'https://www.y4d.ngo/project_details/19', 'https://www.y4d.ngo/project_details/10', 'https://www.y4d.ngo/project_details/23', 'https://www.y4d.ngo/project_details/21', 'https://www.y4d.ngo/project_details/17', 'https://www.y4d.ngo/project_details/13', 'https://www.y4d.ngo/project_details/24', 'https://www.y4d.ngo/project_details/8', 'https://www.y4d.ngo/project_details/15', 'https://www.y4d.ngo/project_details/3', 'https://www.y4d.ngo/project_details/6']
Total project Documents: 22


In [2]:
!pip install langchain-community



Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading mypy_extensions-1.1.0-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_community-0.3.27-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dataclasses_json-0.6.7-py3-none-any.whl (

In [3]:
from bs4 import SoupStrainer
from langchain_community.document_loaders import WebBaseLoader
from langchain.schema import Document

about_url = f"{base_url}/who_are_we"
bs4_strainer = SoupStrainer(name=("h2", "p"))  # filter by tag, not class
loader = WebBaseLoader(
    web_paths=(about_url,),
    bs_kwargs={"parse_only": bs4_strainer},
)

web_docs = []
for doc in loader.load():
    web_docs.append(Document(
        page_content=doc.page_content,
        metadata={
            "source": about_url,
            "type": "webpage"
        }
    ))

for doc in web_docs:
    print(doc.page_content)




About UsOverviewY4D Foundation is a youth led organization working on empowering the underprivileged section of our society. Y4D  has a pan India presence through its wide network of Volunteer Chapters across the country. Y4D Foundation focused its interventions on issues concerning youth and children which brought about significant changes in their lives in terms of education, health, skill, career and sustainable livelihood. Y4D also works on Environment conservation, women empowerment, Food safety and security, . Being an organisation who cares for society, Y4D gets engaged in projects as the situation demands under natural or manmade disasters, like COVID-19 Pandemic, Flood, Drought Relief etc.VisionY4D envisions fostering the development of a happy, healthy, and sustainable society in which every individual has an equal opportunity for growth and a life of dignity.MissionY4D is a youth-led futuristic organization committed to empowering the economically underprivileged by empoweri

In [4]:
!pip install pypdf


Collecting pypdf
  Downloading pypdf-6.0.0-py3-none-any.whl.metadata (7.1 kB)
Downloading pypdf-6.0.0-py3-none-any.whl (310 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/310.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m307.2/310.5 kB[0m [31m11.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.5/310.5 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-6.0.0


In [6]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document

pdf_folder = "PDFs"
pdf_files = [os.path.join(pdf_folder, f) for f in os.listdir(pdf_folder) if f.endswith(".pdf")]
all_docs = []  # master list to store everything

for pdf in pdf_files:
    loader = PyPDFLoader(pdf)
    pdf_pages = loader.load()      # docs = pages of this PDF only
    all_docs.extend(pdf_pages)     # add these pages to the master list
    for page in pdf_pages:
        all_docs.append(Document(
            page_content=page.page_content,
            metadata={
                "source": "https://www.y4d.ngo/newsletters",
                "file_name": pdf,
                "type": "pdf"
            }
        ))
print(f"Total pages: {len(all_docs)}")

Total pages: 128


In [7]:
all_rag_docs = web_docs + all_docs + project_docs
print(f"Total documents for RAG: {len(all_rag_docs)}")


Total documents for RAG: 151


In [8]:
print(f"Total characters: {len(all_rag_docs[0].page_content)}")

Total characters: 1916


SPLITTING INTO CHUNKS AND EMBEDDINGS

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)
all_splits = text_splitter.split_documents(all_rag_docs)

print(f"Split blog post into {len(all_splits)} sub-documents.")

Split blog post into 322 sub-documents.


CHROMA AND RETRIVAL


In [10]:
!pip install -qU langchain-huggingface

In [11]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
!pip install -qU "langchain-chroma>=0.1.2"

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.8/19.8 MB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.2/284.2 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m61.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m49.6 MB/s[0m eta [36m0:00:

In [13]:
import shutil

shutil.rmtree("./chroma_langchain_db", ignore_errors=True)


In [14]:
from langchain_chroma import Chroma


vectorstore = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings_model,
    persist_directory="./chroma_langchain_db",
)


In [15]:
vectorstore.add_documents(all_splits)

print("✅ Documents added to vectorstore!")

✅ Documents added to vectorstore!


In [16]:
!ls -lh ./chroma_langchain_db


total 4.0M
drwxr-xr-x 2 root root 4.0K Aug 21 13:25 99f16150-bea0-452b-a65d-4027fb5df1e5
-rw-r--r-- 1 root root 4.1M Aug 21 13:25 chroma.sqlite3


In [17]:

print(f"Vectors stored: {vectorstore._collection.count()}")

Vectors stored: 322


In [18]:
import shutil
shutil.make_archive("chroma_db_backup", 'zip', "./chroma_langchain_db")


'/content/chroma_db_backup.zip'

In [19]:
retriever = vectorstore.as_retriever(search_type = "similarity",search_kwargs = {"k": 3})
retriever

VectorStoreRetriever(tags=['Chroma', 'HuggingFaceEmbeddings'], vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x7b8b5c2121b0>, search_kwargs={'k': 3})

In [20]:
retriever.invoke('What does Y4D do?')

[Document(id='1e6ed9a2-9bc8-481c-bf62-9bc6230dd05a', metadata={'source': 'https://www.y4d.ngo/who_are_we', 'type': 'webpage', 'start_index': 798}, page_content='in which every individual has an equal opportunity for growth and a life of dignity.MissionY4D is a youth-led futuristic organization committed to empowering the economically underprivileged by empowering them through encouragement, education, and employment. Y4D strives to encourage individuals to reach their full potential, lead happy, healthy lives with dignity and make them capable of making meaningful contributions to society. Y4D is shaping up a sustainable society for future generations by instilling futuristic ideals into our practices.ValuesGoalsMilestones & AwardsOur TeamY4D is Committed to Empower the Underprivileged sections of society and ultimately build a healthy, happy and progressive society where every citizen’s lives with dignity and gets equal opportunities, we had started the journey towards our goal by tak

AUGMENTATION


In [21]:
!pip install huggingface_hub



In [22]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,
    temperature=0.2
)
llm = HuggingFacePipeline(pipeline=pipe)


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=pipe)


In [82]:
from langchain.prompts import PromptTemplate

prompt = PromptTemplate(
    template = """
    You are a helpful assistant for Y4D Foundation.
    Y4D is an NGO which does Child Upliftment, Youth Empowerment and CSR Partnerships
    Answer ONLY from the provided transcript context.
    Answer ONLY in 3 lines
    If the context is insufficient, just say you don't know.

    Context: {context}
    Question: {question}
    Answer:""",
    input_variables = ['context', 'question']
)

In [35]:
question          = "What are the main initiatives Y4D Foundation is running to empower youth?"
retrieved_docs    = retriever.invoke(question)

In [36]:
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
context_text

'in which every individual has an equal opportunity for growth and a life of dignity.MissionY4D is a youth-led futuristic organization committed to empowering the economically underprivileged by empowering them through encouragement, education, and employment. Y4D strives to encourage individuals to reach their full potential, lead happy, healthy lives with dignity and make them capable of making meaningful contributions to society. Y4D is shaping up a sustainable society for future generations by instilling futuristic ideals into our practices.ValuesGoalsMilestones & AwardsOur TeamY4D is Committed to Empower the Underprivileged sections of society and ultimately build a healthy, happy and progressive society where every citizen’s lives with dignity and gets equal opportunities, we had started the journey towards our goal by taking smaller steps towards it, I would request everyone to join us in empowerment journey towards better future.AdvisorsLegal StatusReportsY4D Foundation, 402,\n

In [37]:
retrieved_docs

[Document(id='1e6ed9a2-9bc8-481c-bf62-9bc6230dd05a', metadata={'type': 'webpage', 'source': 'https://www.y4d.ngo/who_are_we', 'start_index': 798}, page_content='in which every individual has an equal opportunity for growth and a life of dignity.MissionY4D is a youth-led futuristic organization committed to empowering the economically underprivileged by empowering them through encouragement, education, and employment. Y4D strives to encourage individuals to reach their full potential, lead happy, healthy lives with dignity and make them capable of making meaningful contributions to society. Y4D is shaping up a sustainable society for future generations by instilling futuristic ideals into our practices.ValuesGoalsMilestones & AwardsOur TeamY4D is Committed to Empower the Underprivileged sections of society and ultimately build a healthy, happy and progressive society where every citizen’s lives with dignity and gets equal opportunities, we had started the journey towards our goal by tak

In [38]:
final_prompt = prompt.invoke({"context": context_text, "question": question})

In [39]:
final_prompt

StringPromptValue(text="\n    You are a helpful assistant for Y4D Foundation.\n    Y4D is an NGO which does Child Upliftment, Youth Empowerment and CSR Partnerships\n    Answer ONLY from the provided transcript context.\n    If the context is insufficient, just say you don't know.\n\n    in which every individual has an equal opportunity for growth and a life of dignity.MissionY4D is a youth-led futuristic organization committed to empowering the economically underprivileged by empowering them through encouragement, education, and employment. Y4D strives to encourage individuals to reach their full potential, lead happy, healthy lives with dignity and make them capable of making meaningful contributions to society. Y4D is shaping up a sustainable society for future generations by instilling futuristic ideals into our practices.ValuesGoalsMilestones & AwardsOur TeamY4D is Committed to Empower the Underprivileged sections of society and ultimately build a healthy, happy and progressive s

GENERATION

In [57]:
final_prompt_str = str(final_prompt)
answer = llm.invoke(final_prompt_str,stop=["\nQuestion:", "\nContext:"])
print(answer)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


text="\n    You are a helpful assistant for Y4D Foundation.\n    Y4D is an NGO which does Child Upliftment, Youth Empowerment and CSR Partnerships\n    Answer ONLY from the provided transcript context.\n    If the context is insufficient, just say you don't know.\n\n    in which every individual has an equal opportunity for growth and a life of dignity.MissionY4D is a youth-led futuristic organization committed to empowering the economically underprivileged by empowering them through encouragement, education, and employment. Y4D strives to encourage individuals to reach their full potential, lead happy, healthy lives with dignity and make them capable of making meaningful contributions to society. Y4D is shaping up a sustainable society for future generations by instilling futuristic ideals into our practices.ValuesGoalsMilestones & AwardsOur TeamY4D is Committed to Empower the Underprivileged sections of society and ultimately build a healthy, happy and progressive society where every

In [60]:
answer_only = answer.split("Answer:")[-1].strip()
clean_answer = answer_only.replace("\n", " ").strip()
print(clean_answer)


"\n    Y4D Foundation is running various initiatives to empower youth. Some of the initiatives are:\n    1. Y4D Foundation is running a skill development program for youth to help them get a job. This program is called Y4D Foundation Skill Development Program.\n    2. Y4D Foundation is running a health awareness program to help youth understand the importance of good health and nutrition. This program is called Y4D Foundation Health Awareness Program.\n    3. Y4D Foundation is running a mental health awareness program to help youth understand the importance of mental well-being and how to cope with stress. This program is called Y4D Foundation Mental Health Awareness Program.\n    4. Y4D Foundation is running a financial literacy program to help youth understand the importance of financial planning and how to manage their finances. This program is called Y4D Foundation Financial Literacy Program.\n    5. Y4D Foundation is running a career guidance


BUILDING A CHAIN

In [42]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [43]:
def format_docs(retrieved_docs):
  context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
  return context_text

In [44]:
parallel_chain = RunnableParallel({
    'context': retriever | RunnableLambda(format_docs),
    'question': RunnablePassthrough()
})

In [63]:
def clean_output(answer_text):
    lines = [line.strip() for line in answer_text.split("\n") if line.strip()]
    return "\n".join(lines)

In [80]:
def AnswerOnlyParser(text):
      if "Setting `pad_token_id`" in text:
        raw_output = raw_output.split("\n", 1)[1]
      if "Answer:" in text:
          return text.split("Answer:")[-1].strip()
      return text.strip()


In [71]:
parser = StrOutputParser()

In [76]:
main_chain = parallel_chain | prompt | llm

In [83]:
raw = main_chain.invoke('What are the main initiatives Y4D Foundation is running to empower youth?')
clean = AnswerOnlyParser(raw)
print(clean)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Y4D Foundation is running various initiatives to empower youth. Some of the initiatives are as follows:
    1. Y4D Foundation is running a skill development program for youth. This program aims to provide youth with the necessary skills and knowledge to pursue their career goals. The program is designed to be flexible and adaptable to the needs of the youth, ensuring that they can learn at their own pace and in a way that suits their learning style.
    2. Y4D Foundation is also running a health awareness program for youth. This program aims to raise awareness about various health issues, such as nutrition, lifestyle diseases, and mental well-being. The program is designed to be interactive and engaging, with activities that are fun and easy to understand.
    3. Y4D Foundation is also running a financial literacy program for youth. This program aims to teach youth about financial management, budgeting, and saving. The program is designed to be practical and hands-on, with activities t