In [1]:
import os

import faiss
import numpy as np
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
from langchain.storage import InMemoryStore
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_core.documents import Document
from sentence_transformers import SentenceTransformer

# Read the Hugging Face API token from the environment so it is never stored in the notebook.
# An unset, empty, or whitespace-only value is treated as missing; a blank token would
# otherwise pass the check and only fail later with a far less helpful error.
huggingface_token = (os.getenv("HUGGINGFACEHUB_API_TOKEN") or "").strip()
if not huggingface_token:
    raise ValueError("HUGGINGFACEHUB_API_TOKEN is not set. Please set it in your environment variables.")

# Open-source LLM served through the Hugging Face Hub.
llm = HuggingFaceHub(
    repo_id="google/flan-t5-large",
    huggingfacehub_api_token=huggingface_token,  # pass the token explicitly
    model_kwargs={"temperature": 0.7, "max_length": 512},  # generation settings; adjust as needed
)




# Personal PDF files that form the chatbot's knowledge base.
pdf_files = [
    "EngTranscript.pdf",
    "myCollected_Certificate.pdf",
    "Ponkrit_CV(Eng).pdf",
    "myAIT_Application.pdf",
]

# Load each PDF in order and flatten the per-file results into a single list.
documents = [
    loaded_doc
    for pdf_path in pdf_files
    for loaded_doc in PyPDFLoader(pdf_path).load()
]

# Break the loaded documents into chunks (chunk_size=1000, chunk_overlap=100).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
text_chunks = text_splitter.split_documents(documents)

# Pull the raw text out of every chunk.
texts = [chunk.page_content for chunk in text_chunks]

# Fail early with a clear message: with no chunks the embedding matrix has no second
# dimension and `embedding_matrix.shape[1]` below would raise a bare IndexError.
if not texts:
    raise ValueError("No text chunks were produced from the documents; cannot build the FAISS index.")

# Embed every chunk with a SentenceTransformer model.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(texts, convert_to_tensor=False)

# Convert the embeddings to a float32 numpy array for FAISS.
embedding_matrix = np.array(embeddings).astype("float32")

# Flat L2 index sized to the embedding dimension, populated with all chunk vectors.
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)

# Assemble what the LangChain FAISS wrapper needs: the chunk documents held in a
# key-value store, plus a mapping from FAISS row position to the store key.
document_objects = [
    Document(page_content=chunk.page_content, metadata=chunk.metadata)
    for chunk in text_chunks
]
index_to_docstore_id = {position: str(position) for position in range(len(document_objects))}

# Store each document under the string form of its position.
docstore = InMemoryStore()
docstore.mset(
    [(index_to_docstore_id[position], document) for position, document in enumerate(document_objects)]
)

# Wrap the pre-built index; queries are embedded with the same model's `encode`.
vector_store = FAISS(
    embedding_function=embedding_model.encode,
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
)

# Lookup helper used to replace `docstore.search` with an `mget()`-based fetch.
def docstore_get(doc_id):
    """Fetch a single entry from the module-level `docstore` by its id.

    Args:
        doc_id: Key to look up; passed to `docstore.mget` as a one-element list.

    Returns:
        The first item returned by `mget`, or None when `mget` returns nothing.
    """
    matches = docstore.mget([doc_id])
    if not matches:
        return None
    return matches[0]

# Route the vector store's document lookups through `docstore_get`.
setattr(vector_store.docstore, "search", docstore_get)

# Similarity retriever configured with k=5.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=5),
)

# Reuse the `llm` configured earlier in this cell. Re-creating it here with only a
# repo_id would silently discard the model_kwargs (temperature, max_length) set there.

# Prompt for the "stuff" chain: `{context}` is filled with the retrieved chunks and
# `{question}` with the user's query. Without a `{context}` slot the instruction to
# answer "based only on the provided documents" has nothing to refer to.
prompt_template = """
You are an AI assistant specializing in answering questions about Ponkrit Kaewsawee.
Your responses should be precise, informative, and based only on the provided documents.
If the requested information is unavailable, politely state that you don’t have enough data.

Context:
{context}

Question: {question}
Answer:
"""
qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# RetrievalQA chain that stuffs the retrieved chunks into the prompt above and also
# returns the source documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": qa_prompt},  # actually use the structured prompt
)

# Entry point for asking the chatbot a question.
def ask_chatbot(question):
    """Answer `question` using the module-level `retriever` and `qa_chain`.

    The retriever is queried first; the QA chain is only invoked when it
    returns at least one document.

    Args:
        question: The query text, passed to the retriever and to the chain
            under the "query" key.

    Returns:
        A `(answer, source_documents)` tuple. When the retriever finds
        nothing, the answer is a fixed "no information" message and the
        source list is empty.
    """
    hits = retriever.get_relevant_documents(question)
    if hits:
        outcome = qa_chain.invoke({"query": question})
        return outcome["result"], outcome["source_documents"]
    return "No relevant information found.", []

# Smoke-test the chatbot with one sample question.
question = "What is Ponkrit Kaewsawee's highest level of education?"
answer, sources = ask_chatbot(question)

# Show the answer, then the "source" metadata entry of each supporting document
# ("Unknown" when a document has no such entry).
print("Answer:", answer)
for src in sources:
    origin = src.metadata.get("source", "Unknown")
    print("Source:", origin)


ValueError: HUGGINGFACEHUB_API_TOKEN is not set. Please set it in your environment variables.

In [None]:
import torch
import transformers

# Report the installed PyTorch and Transformers versions.
for label, module in (("PyTorch Version:", torch), ("Transformers Version:", transformers)):
    print(label, module.__version__)


PyTorch Version: 2.5.1+cu121
Transformers Version: 4.49.0
