In [1]:
import os
import PyPDF2
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
# from langchain.llms import Ollama
# from langchain.chains import RetrievalQA
# from langchain.vectorstores import FAISS as LangFAISS  # Langchain wrapper for FAISS
# from langchain.embeddings import HuggingFaceEmbeddings
import streamlit as st  # For later UI
from langchain_huggingface import HuggingFaceEndpoint


from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def extract_text_from_documents(doc_directory):
    documents = []
    for filename in os.listdir(doc_directory):
        if filename.endswith((".pdf", ".txt")):
            doc_path = os.path.join(doc_directory, filename)
            text = ""
            if filename.endswith(".pdf"):
                with open(doc_path, "rb") as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    if pdf_reader.is_encrypted:
                        pdf_reader.decrypt("")  # Empty password
                    text = "\n".join([page.extract_text() or "" for page in pdf_reader.pages])
            else:  # TXT
                with open(doc_path, "r", encoding="utf-8") as file:
                    text = file.read()
            documents.append({"filename": filename, "text": text})
    return documents

# Run it
doc_directory = "C:/Users/Hp/OneDrive/Desktop/MySoft Heaven Assesments/A2/venv"  # Current folder, put your docs here
documents = extract_text_from_documents(doc_directory)
print(documents)  # Check output

[{'filename': 'Mysoft Heaven COMPANY PROFILE.pdf', 'text': "Mysoft\n \nHeaven\n \nCOMPANY\n \nPROFILE\n \n \n \nMysoft\n \nHeaven\n \n(BD)\n \nLtd.\n \noffers\n \nthe\n \ncomplete\n \nsoftware\n \nproduct\n \ndevelopment,\n \nlifecycle\n \nand\n \nsupport\n \nservices.\n \nWe\n \ncarry\n \n15+\n \nyears\n \nexperience\n \nin\n \ndesign,\n \ndevelopment\n \nand\n \nmaintaining\n \nsoftware\n \nproducts\n \nand\n \nservices.\n \nWe\n \nunderstand\n \nin\n \ntoday‚Äôs\n \ndynamic\n \nmarket,\n \nproduct\n \ndevelopment\n \nrequires\n \ninnovation,\n \ncreativity\n \nas\n \nwell\n \nas\n \nspeed\n \nto\n \ndeliver.\n \nWe\n \nconvert\n \nclients‚Äô\n \n‚ÄòProduct\n \nVision‚Äô\n \ninto\n \nproduct\n \ndevelopment\n \nThe\n \ndevelopment\n \nlife\n \ncycle\n \nis\n \ncontrolled\n \nby\n \nclient\n \ninputs\n \nand\n \ndirection.\n \nThere\n \nare\n \nno\n \nhidden\n \ncharges\n \nor\n \nany\n \nother\n \nadditional\n \ncosts\n \ninvolved.\n \nThis\n \nprovides\n \nyou\n \ncomplete\n \nindep

In [4]:
def chunk_text(text, chunk_size=500):
    chunks = []
    sentences = text.split(". ")
    current_chunk = []
    current_length = 0
    for sentence in sentences:
        sentence = sentence.strip() + ". "
        if current_length + len(sentence) > chunk_size and current_chunk:
            chunks.append("".join(current_chunk))
            current_chunk = [sentence]
            current_length = len(sentence)
        else:
            current_chunk.append(sentence)
            current_length += len(sentence)
    if current_chunk:
        chunks.append("".join(current_chunk))
    return chunks

# Apply to documents
all_chunks = []
for doc in documents:
    all_chunks.extend(chunk_text(doc["text"]))
print(all_chunks[:2])  # Check first 2 chunks

["Mysoft\n \nHeaven\n \nCOMPANY\n \nPROFILE\n \n \n \nMysoft\n \nHeaven\n \n(BD)\n \nLtd.\n \noffers\n \nthe\n \ncomplete\n \nsoftware\n \nproduct\n \ndevelopment,\n \nlifecycle\n \nand\n \nsupport\n \nservices.\n \nWe\n \ncarry\n \n15+\n \nyears\n \nexperience\n \nin\n \ndesign,\n \ndevelopment\n \nand\n \nmaintaining\n \nsoftware\n \nproducts\n \nand\n \nservices.\n \nWe\n \nunderstand\n \nin\n \ntoday‚Äôs\n \ndynamic\n \nmarket,\n \nproduct\n \ndevelopment\n \nrequires\n \ninnovation,\n \ncreativity\n \nas\n \nwell\n \nas\n \nspeed\n \nto\n \ndeliver.\n \nWe\n \nconvert\n \nclients‚Äô\n \n‚ÄòProduct\n \nVision‚Äô\n \ninto\n \nproduct\n \ndevelopment\n \nThe\n \ndevelopment\n \nlife\n \ncycle\n \nis\n \ncontrolled\n \nby\n \nclient\n \ninputs\n \nand\n \ndirection.\n \nThere\n \nare\n \nno\n \nhidden\n \ncharges\n \nor\n \nany\n \nother\n \nadditional\n \ncosts\n \ninvolved.\n \nThis\n \nprovides\n \nyou\n \ncomplete\n \nindependence\n \nand\n \nflexibility\n \nto\n \nget\n \nthe\n \

In [5]:
from langchain.vectorstores import FAISS as LangFAISS

embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # Free from HuggingFace

# Create FAISS index using Langchain
vector_store = LangFAISS.from_texts(all_chunks, embeddings_model)
vector_store.save_local("mysoft_faiss_index")  # Save to disk
print("Vector DB created!")

  embeddings_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # Free from HuggingFace


Vector DB created!


In [None]:
llm = Ollama(model="mistral")  # Your local Ollama model

# Load vector store if saved
vector_store = LangFAISS.load_local("mysoft_faiss_index", embeddings_model, allow_dangerous_deserialization=True)

# Create RAG chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # Stuff chunks into prompt
    retriever=vector_store.as_retriever(search_kwargs={"k": 5}),  # Top 5 chunks
    return_source_documents=True  # For debugging
)

# Prompt to prevent out-of-scope answers
from langchain.prompts import PromptTemplate
prompt_template = """
You are a chatbot for Mysoft Heaven (BD) Ltd. Answer ONLY using the provided context about the company. If the query is unrelated or not in context, say "Sorry, I can only answer questions about Mysoft Heaven (BD) Ltd."
Query: {question}
Context: {context}
Answer:
"""
qa_chain.combine_documents_chain.llm_chain.prompt = PromptTemplate.from_template(prompt_template)

# Test
query = "What services does Mysoft Heaven offer?"
result = qa_chain({"query": query})
print(result["result"])  # Answer
print(result["source_documents"])  # Chunks used

 Company Name: MySoftHeaven (BD) Ltd.

Location: Bangladesh

Website: www.mysoftheaven.com

Services Offered:
1. Web Design & Development
   - Web Application Development
   - E-Commerce Solution
   - Content Management System (CMS) Development

2. Custom Software Development
   - Enterprise Resource Planning (ERP)
   - Integrated Office Management System (IOMS)
   - Customer Relationship Management (CRM)

3. Mobile Apps Development
   - Android Apps Development
   - iOS Apps Development
   - Windows Apps Development
   - Hybrid Mobile Apps Development

4. Mobile Games Development
   - Android Games Development
   - IOS Games Development
   - Windows Games Development
   - Hybrid Mobile Games Development

5. Digital Marketing
   - Social Media Marketing (SMM)
   - Email Marketing
   - SMS Marketing
   - Search Engine Optimization (SEO)

6. Advanced Technology
   - Natural Language Processing (NLP)
   - Machine Learning
   - Blockchain Technology
   - Big Data
   - Geographic Informatio

In [17]:
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=vector_store.as_retriever(),
    memory=memory  # Adds history
)

In [18]:
import streamlit as st

from langchain_community.llms import Ollama
from langchain_community.vectorstores import FAISS as LangFAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
