In [1]:
pip install langchain langchain-community langchain-openai faiss-cpu pypdf python-dotenv


Note: you may need to restart the kernel to use updated packages.


In [2]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


In [3]:
# Load the knowledge-base text file. The encoding is pinned to UTF-8 so the
# decoded text does not depend on the platform's default locale encoding.
loader = TextLoader("data/info.txt", encoding="utf-8")
documents = loader.load()

# Quick sanity check: report how many documents were loaded, then display the first.
print("Documents loaded:", len(documents))
documents[0]


Documents loaded: 1


Document(metadata={'source': 'data/info.txt'}, page_content='DevelopersHub Internship Program\n\nThe internship focuses on AI, Machine Learning, and Data Science.\nInterns work on real-world projects including NLP, chatbots, and data analysis.\nThe duration of the internship is 8 weeks.\nCertificates are provided after successful completion.\n')

In [4]:
# Splitter settings are gathered in one place so they are easy to tune.
splitter_options = {"chunk_size": 500, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the loaded documents into chunks for embedding.
docs = text_splitter.split_documents(documents)

# Report the chunk count, then display the first chunk for inspection.
print(f"Chunks created: {len(docs)}")
docs[0]


Chunks created: 1


Document(metadata={'source': 'data/info.txt'}, page_content='DevelopersHub Internship Program\n\nThe internship focuses on AI, Machine Learning, and Data Science.\nInterns work on real-world projects including NLP, chatbots, and data analysis.\nThe duration of the internship is 8 weeks.\nCertificates are provided after successful completion.')

In [5]:
# Embedding model identifier passed to HuggingFaceEmbeddings.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)


  embeddings = HuggingFaceEmbeddings(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Embed every chunk and build a FAISS index over the resulting vectors.
vectorstore = FAISS.from_documents(documents=docs, embedding=embeddings)


In [7]:
# Expose the index as a retriever that is asked for up to 3 chunks per query.
retriever_search_options = dict(k=3)
retriever = vectorstore.as_retriever(search_kwargs=retriever_search_options)


In [10]:
def chatbot(query, doc_retriever=None):
    """Print the document chunks retrieved for ``query``.

    Args:
        query: Question text passed to the retriever's ``invoke`` method.
        doc_retriever: Optional object exposing ``invoke(query)`` that returns
            a sequence of documents, each with a ``page_content`` attribute.
            When omitted, the notebook-level ``retriever`` is used.

    Returns:
        None. All output is written to stdout.
    """
    # Fall back to the notebook-level retriever only when none is injected,
    # so the function can also be run against a different index.
    active_retriever = retriever if doc_retriever is None else doc_retriever
    results = active_retriever.invoke(query)

    # Without this guard an empty result set would print the header with
    # nothing underneath it.
    if not results:
        print("No relevant context found for this query.")
        return

    print("Answer based on context:\n")
    for doc in results:
        print(doc.page_content)
        print("-" * 60)


In [11]:
# Demo query: ask about the internship duration.
chatbot(query="How long is the internship?")


Answer based on context:

DevelopersHub Internship Program

The internship focuses on AI, Machine Learning, and Data Science.
Interns work on real-world projects including NLP, chatbots, and data analysis.
The duration of the internship is 8 weeks.
Certificates are provided after successful completion.
------------------------------------------------------------


In [12]:
# Demo query: ask what kind of work interns do.
chatbot(query="What do interns work on?")


Answer based on context:

DevelopersHub Internship Program

The internship focuses on AI, Machine Learning, and Data Science.
Interns work on real-world projects including NLP, chatbots, and data analysis.
The duration of the internship is 8 weeks.
Certificates are provided after successful completion.
------------------------------------------------------------


In [None]:
# Task 4: Context-Aware Chatbot

# A document-based chatbot was developed using LangChain and FAISS.
# Text data was embedded using a local sentence-transformer model and stored in a vector database.
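# User queries retrieve the most relevant document chunks using semantic similarity; the chatbot prints those chunks verbatim as its answer, so responses stay grounded in the document's context without relying on external APIs or a text-generation step.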