In [5]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
#Extract data from the PDF Document
def load_pdf_file(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    
    documents = loader.load()

    return documents

In [7]:
extracted_data = load_pdf_file(data="../Data")

In [8]:
extracted_data

[Document(metadata={'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-05-19T05:14:11+08:00', 'author': 'RAOHA BIN MEZBA', 'moddate': '2025-05-19T05:14:11+08:00', 'source': '../Data/EcoHusk_Report.pdf', 'total_pages': 9, 'page': 0, 'page_label': '1'}, page_content='EcoHusk Project Report \n \n \n1. Project Background \nThe idea of our project EcoHusk started during my bachelor’s study when I participated in a \ncompetition at my university. It was organized by a Chinese electricity production company, \nand we were asked to write a report about the electricity production scenario of our home \ncountry. While researching, I found something very surprising about Bangladesh: even \nthough more than 95% of people have access to electricity, over 90% of our electricity still \ncomes from fossil fuels like coal and gas. These energy sources are harmful to the \nenvironment because they release a lot of pollution and carbon dioxide. Also they are not 

In [None]:
#Spliting data into text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 30)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [10]:
text_chunks = text_split(extracted_data)
print(f"Total number of chunks: {len(text_chunks)}")

Total number of chunks: 42


In [11]:
text_chunks

[Document(metadata={'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-05-19T05:14:11+08:00', 'author': 'RAOHA BIN MEZBA', 'moddate': '2025-05-19T05:14:11+08:00', 'source': '../Data/EcoHusk_Report.pdf', 'total_pages': 9, 'page': 0, 'page_label': '1'}, page_content='EcoHusk Project Report \n \n \n1. Project Background \nThe idea of our project EcoHusk started during my bachelor’s study when I participated in a \ncompetition at my university. It was organized by a Chinese electricity production company, \nand we were asked to write a report about the electricity production scenario of our home \ncountry. While researching, I found something very surprising about Bangladesh: even'),
 Document(metadata={'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-05-19T05:14:11+08:00', 'author': 'RAOHA BIN MEZBA', 'moddate': '2025-05-19T05:14:11+08:00', 'source': '../Data/EcoHusk_Report.pdf', 'total_pages': 9, 'page'

In [None]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

# Load API key from environment
load_dotenv()
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

# Create Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create index if it doesn't exist
index_name = "ecohusk"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

# Connect to the index
index = pc.Index(index_name)


In [12]:
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
embedded_texts = embeddings_model.embed_documents([chunk.page_content for chunk in text_chunks])

In [None]:
# Prepare items as (id, vector, metadata) tuples
items = [
    {
        "id": f"chunk-{i}",
        "values": embedded_texts[i],
        "metadata": {"text": text_chunks[i].page_content}
    }
    for i in range(len(text_chunks))
]

# Upsert in batches
index.upsert(vectors=items)

{'upserted_count': 42}

In [14]:
# #downloading embedding model (Method-2)
# from langchain_community.embeddings import HuggingFaceEmbeddings

# def download_hugging_face_embeddings():
#     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
#     return embeddings

In [13]:
# (Method-2)
# embeddings= download_hugging_face_embeddings()

In [54]:
# Load Existing Index
from langchain_pinecone import PineconeVectorStore

docsearch= PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings_model,
)

In [52]:
retriver= docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [34]:
retrieved_docs= retriver.invoke("What is EcoHusk?")

In [35]:
retrieved_docs

[Document(id='chunk-28', metadata={}, page_content='has huge local and national potential. \nIn summary, EcoHusk stands out because it’s not just about producing energy — it’s about doing it \ncleanly, efficiently, and sustainably, while also creating useful by-products that support other \nindustries. It’s an all-in-one solution for a greener future. \n \n5. Technical Achievements \nSo far, our team has successfully completed the initial research and development phase of the'),
 Document(id='chunk-7', metadata={}, page_content='market, implementing this project here is very much possible and cost-effective. \nIn short, EcoHusk is a powerful and practical solution that turns agricultural waste into \nvaluable resources, supports sustainability, reduces pollution, and promotes a cleaner future \nfor both Bangladesh, China, and many other rice-producing countries. \n \n2. Research & Development Process \nOnce the idea of EcoHusk took shape, I shared it with my team members, and we began 

In [36]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [37]:
from langchain_openai import OpenAI
llm= OpenAI(temperature=0.4, max_tokens=500)

In [38]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an expert assistant for answering questions about the EcoHusk Project. "
    "Use the provided context from EcoHusk documents and your own knowledge to answer the question. "
    "If you are asked about any data or information you can use the reference of the EcoHusk documents to answer. But avoid answering something completely new or which does not belongs to EcoHusk documents. "
    "If the answer is not in the context, say 'I'm sorry, I'm not aware of this matter. Please contact with the Project Leader. \nContact Details:\nRAOHA BIN MEJBA (李一含)\nEmail: raohamejba@gmail.com' "
    "If the question is not related to EcoHusk project, say 'Sorry! I think this question is not related to EcoHusk. Please ask something else. Thank you. "
    "Be clear, factual, and limit your answer to less than 10 concise sentences.\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [53]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriver, question_answer_chain)

In [43]:
response = rag_chain.invoke({"input": "What is Football?"})
print(response['answer'])


Sorry! I think this question is not related to EcoHusk. Please ask something else. Thank you.


In [44]:
response = rag_chain.invoke({"input": "What is the implementation cost of EcoHusk?"})
print(response['answer'])



The initial cost analysis shows that the full implementation of the EcoHusk system would require a total investment of around $350,000 to $500,000. This includes gasifiers, turbines, carbon activation units, water treatment facilities, drying chambers, and brick-making tools. However, compared to the long-term value it provides, this is a cost-effective investment.


In [45]:
response = rag_chain.invoke({"input": "How much water can EcoHusk purify?"})
print(response['answer'])



The EcoHusk system has the potential to purify a significant amount of water, especially in coastal regions where seawater salinity is high. However, the exact amount of water that can be purified will depend on the specific implementation and scale of the project. Further research and pilot implementation will be needed to determine the exact water purification capabilities of EcoHusk.


In [46]:
response = rag_chain.invoke({"input": "Does EcoHusk have any negative impact on the environment?"})
print(response['answer'])



EcoHusk does not have any negative impact on the environment. In fact, it is designed to be environmentally sustainable and reduce waste management costs. It also helps with water purification, which is especially important in coastal regions with high seawater salinity. EcoHusk is an all-in-one solution for a greener future and has huge local and national potential.


In [51]:
response = rag_chain.invoke({"input": "Who are the team members of EcoHusk?"})
print(response['answer'])



System: The team members of EcoHusk are not specified in the provided context. Please contact the Project Leader, RAOHA BIN MEJBA, for more information.


In [48]:
response = rag_chain.invoke({"input": "What is the age of RAOHA BIN MEJBA?"})
print(response['answer'])



I'm sorry, I'm not aware of this matter. Please contact with the Project Leader. 
Contact Details:
RAOHA BIN MEJBA (李一含)
Email: raohamejba@gmail.com
