In [66]:
!pip install pypdf
!pip install -U langchain-community
!pip install faiss-cpu
!pip install sentence_transformers



In [67]:
import os
import faiss
import numpy as np
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from sentence_transformers import SentenceTransformer
from langchain.storage import InMemoryStore
from langchain.llms import HuggingFaceHub
from langchain_core.documents import Document


# Set Hugging Face API Token
# Credentials must never be hardcoded in a notebook. Use the value already
# present in the environment, and only prompt (without echoing) when it is missing.
# NOTE(review): the token that was previously committed on this line must be
# treated as leaked and revoked.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    from getpass import getpass

    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")


In [68]:
# Load Hugging Face API Token
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
# An empty string is as unusable as a missing variable, so reject both.
if not hf_token:
    raise ValueError("HUGGINGFACEHUB_API_TOKEN is not set. Please set it in your environment variables.")


In [69]:


# Build the LLM wrapper for google/flan-t5-large, authenticated with `hf_token`.
# The task is named explicitly instead of being left to the wrapper to work out.
llm = HuggingFaceHub(
    task="text2text-generation",
    repo_id="google/flan-t5-large",
    model_kwargs=dict(temperature=0.7, max_length=512),
    huggingfacehub_api_token=hf_token,
)

In [70]:
# Location of the serialized FAISS index
INDEX_PATH = "faiss_index.bin"

# Reuse the on-disk index when one is present; otherwise only report that it
# is missing (this cell itself does not rebuild anything).
if not os.path.exists(INDEX_PATH):
    print("FAISS index not found. Rebuilding...")
else:
    index = faiss.read_index(INDEX_PATH)
    print("FAISS index loaded from disk.")


FAISS index loaded from disk.


In [71]:
# PDFs that make up the chatbot's knowledge base
pdf_files = ["/content/RIDA FATMA Resume.pdf"]

# Read every PDF and pool whatever the loader returns into one list
documents = []
for pdf_file in pdf_files:
    loader = PyPDFLoader(pdf_file)
    documents += loader.load()

# Split the pooled documents into chunks (chunk_size=1000, chunk_overlap=100)
text_splitter = CharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
text_chunks = text_splitter.split_documents(documents)

# Plain-text content of each chunk
texts = [chunk.page_content for chunk in text_chunks]


In [72]:
# Embed every chunk with a SentenceTransformer model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
if not texts:
    # Fail early with a clear message instead of a less helpful error further
    # down when the embedding matrix has no second dimension to read.
    raise ValueError("No text chunks to embed; check that the PDF files contain extractable text.")
embeddings = embedding_model.encode(texts, convert_to_tensor=False)
# The index is fed a float32 matrix with one row per chunk
embedding_matrix = np.array(embeddings).astype("float32")

# Initialize FAISS index sized to the embedding dimension
index = faiss.IndexFlatL2(embedding_matrix.shape[1])
index.add(embedding_matrix)

# Save FAISS index to disk, at the same path the load check reads from
faiss.write_index(index, INDEX_PATH)


In [73]:
# Create FAISS vector store
docstore = InMemoryStore()

# Copy each chunk into a fresh Document; a chunk's position in `text_chunks`
# is used (as a string) for its docstore key.
document_objects = [
    Document(page_content=chunk.page_content, metadata=chunk.metadata)
    for chunk in text_chunks
]

# Index row number -> docstore key
index_to_docstore_id = {position: str(position) for position in range(len(document_objects))}

docstore.mset(
    [(index_to_docstore_id[position], document) for position, document in enumerate(document_objects)]
)

vector_store = FAISS(
    embedding_function=embedding_model.encode,
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
)


# Single-id lookup built on `docstore.mget()`, meant to stand in for `docstore.search`
def docstore_get(doc_id):
    """Fetch one entry from the module-level ``docstore`` by its id.

    Args:
        doc_id: Key to look up.

    Returns:
        The first element of ``docstore.mget([doc_id])``, or ``None`` when
        that call returns an empty result.
    """
    matches = docstore.mget([doc_id])
    if not matches:
        return None
    return matches[0]

# Monkeypatch: attach `docstore_get` as the `search` attribute of the vector
# store's docstore. NOTE(review): presumably needed because the FAISS wrapper
# looks documents up via `docstore.search` — confirm against the library.
vector_store.docstore.search = docstore_get # Overriding the search method of the docstore



In [74]:
# Setup Retriever: similarity search with k=5
retriever = vector_store.as_retriever(
    search_kwargs=dict(k=5),
    search_type="similarity",
)

# Define the structured prompt
# The knowledge base is Rida Fatma's resume, so the prompt names her. It also
# exposes a `{context}` slot next to `{question}` so retrieved text can be
# placed in front of the model.
prompt_template = """
You are an intelligent assistant with expertise in providing information about Rida Fatma.
Your answers should be accurate, concise, and strictly based on the provided documents.
If the information is not available, kindly respond that you lack sufficient data.

Context:
{context}

Question: {question}
Answer:
"""


In [75]:
# "stuff" RetrievalQA chain over the retriever. No custom prompt is passed
# here, and the retrieved source documents are returned with each answer.
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    return_source_documents=True,
    chain_type="stuff",
)


In [76]:
# Function to ask chatbot questions
def ask_chatbot(question):
    """Answer ``question`` using the module-level ``retriever`` and ``qa_chain``.

    Args:
        question: The user's question.

    Returns:
        A ``(answer, source_documents)`` tuple. When the retriever returns
        nothing, the answer is the fixed string
        ``"No relevant information found."`` and the source list is empty.
    """
    # Skip the chain call entirely when there is nothing to ground an answer in.
    retrieved_docs = retriever.get_relevant_documents(question)
    if not retrieved_docs:
        return "No relevant information found.", []

    response = qa_chain.invoke({"query": question})
    return response["result"], response["source_documents"]

In [77]:
import os

In [78]:
import faiss
import numpy as np
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from sentence_transformers import SentenceTransformer
from langchain.storage import InMemoryStore
from langchain_core.documents import Document
from langchain.llms import HuggingFaceHub


# Set the Hugging Face API Token as an environment variable
# Credentials must never be hardcoded in a notebook. Keep the value already in
# the environment, and only prompt (without echoing) when it is missing.
# NOTE(review): the token that was previously committed on this line must be
# treated as leaked and revoked.
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    from getpass import getpass

    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass("Hugging Face API token: ")

In [79]:
# Primary LLM: google/flan-t5-large
hf_llm = HuggingFaceHub(
    huggingfacehub_api_token=hf_token,
    repo_id="google/flan-t5-large",
    model_kwargs=dict(temperature=0.7, max_length=512),
)

# Alternate LLM: gpt2, configured with the same generation settings
hf_llm_alternate = HuggingFaceHub(
    huggingfacehub_api_token=hf_token,
    repo_id="gpt2",
    model_kwargs=dict(temperature=0.7, max_length=512),
)


In [80]:
# Initialize Groq Cloud Llama LLM
# Replace 'your_groq_api_key' and 'your_groq_endpoint' with actual values
# groq_llm = GroqLlama(api_key="your_groq_api_key", endpoint="your_groq_endpoint")

# Define FAISS index file path
INDEX_PATH = "faiss_index.bin"

# Load Personal Documents
# The PDFs are loaded on every run, not only when the index is missing: later
# cells read `documents` and `pdf_files` regardless of which branch below runs.
pdf_files = [
    "/content/RIDA FATMA Resume.pdf"
]

documents = []
for pdf_file in pdf_files:
    loader = PyPDFLoader(pdf_file)
    documents.extend(loader.load())

# Check if FAISS index exists and load it if available
if os.path.exists(INDEX_PATH):
    index = faiss.read_index(INDEX_PATH)
    print("FAISS index loaded from disk.")
else:
    print("FAISS index not found. Rebuilding...")


FAISS index loaded from disk.


In [81]:

    # NOTE(review): every line of this cell is indented as though it continued
    # the `else:` branch of the previous cell, yet the recorded outputs show it
    # ran right after the index had been loaded from disk. In practice the index
    # is therefore rebuilt and re-saved on every run — confirm that is intended.
    # The cell reads `documents`, which must already exist from an earlier cell.

    # Split documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    text_chunks = text_splitter.split_documents(documents)

    # Extract text content from chunks
    texts = [doc.page_content for doc in text_chunks]

    # Convert text to embeddings using SentenceTransformer
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = embedding_model.encode(texts, convert_to_tensor=False)

    # Convert embeddings to numpy array for FAISS
    embedding_matrix = np.array(embeddings).astype("float32")

    # Initialize FAISS index
    # (sized from the second dimension of the embedding matrix)
    index = faiss.IndexFlatL2(embedding_matrix.shape[1])
    index.add(embedding_matrix)

    # Save FAISS index to disk
    # (overwrites whatever is currently stored at INDEX_PATH)
    faiss.write_index(index, INDEX_PATH)
    print("FAISS index saved to disk.")


FAISS index saved to disk.


In [82]:
# Create FAISS vector store
docstore = InMemoryStore()

# Re-wrap every chunk as a Document and key it by its position (as a string)
document_objects = [
    Document(page_content=chunk.page_content, metadata=chunk.metadata)
    for chunk in text_chunks
]
keyed_documents = [(str(row), document) for row, document in enumerate(document_objects)]
docstore.mset(keyed_documents)

# Index row number -> docstore key
index_to_docstore_id = {row: key for row, (key, _) in enumerate(keyed_documents)}

vector_store = FAISS(
    embedding_function=embedding_model.encode,
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
)



In [83]:
# Single-id lookup built on `docstore.mget()`, meant to stand in for `docstore.search`
def docstore_get(doc_id):
    """Return the entry stored under ``doc_id`` in the module-level ``docstore``.

    The lookup goes through ``docstore.mget`` with a one-element key list. The
    first element of the result is returned, or ``None`` if the result is empty.
    """
    found = docstore.mget([doc_id])
    if found:
        return found[0]
    return None

# Route the docstore's `search` attribute to the mget-based lookup
vector_store.docstore.search = docstore_get

# Setup Retriever: similarity search with k=5
retriever = vector_store.as_retriever(
    search_kwargs=dict(k=5),
    search_type="similarity",
)


In [84]:
# Prompt text for the resume chatbot. It has two placeholders: `{context}` for
# the document content and `{question}` for the user's question.
# NOTE(review): a plain string only takes effect once it is handed to a chain
# (e.g. wrapped in a PromptTemplate) — verify that it is actually wired in.
prompt_template = """
You are an AI assistant designed to provide information about Rida Fatma, based exclusively on the details provided in the document below. If you are unable to find the answer to a question within the document, please respond with "The information you're seeking is not available in the provided document."

Document Content:
{context}

User's Question:
{question}

Your Response:
"""

In [85]:
# Set up LangChain RetrievalQA chain for each model
from langchain_core.prompts import PromptTemplate

# Wrap the notebook's prompt text so the chains use it instead of their default
# prompt. The "stuff" chain fills `{context}` with the retrieved documents and
# `{question}` with the user's query.
qa_prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

# Chain backed by the primary model
qa_chain_hf = RetrievalQA.from_chain_type(
    llm=hf_llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": qa_prompt},
)

# Chain backed by the alternate model, sharing the same retriever and prompt
qa_chain_hf_alternate = RetrievalQA.from_chain_type(
    llm=hf_llm_alternate,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": qa_prompt},
)


In [86]:
# Function to ask chatbot questions for different models
def ask_chatbot(question, model="hf"):
    """Answer ``question`` with the RetrievalQA chain selected by ``model``.

    Args:
        question: The user's question.
        model: Which chain to use: ``"hf"`` (default) for ``qa_chain_hf`` or
            ``"hf_alternate"`` for ``qa_chain_hf_alternate``. ``"groq"`` is
            recognised but has no chain configured yet.

    Returns:
        A ``(answer, source_documents)`` tuple. For an unknown model, an
        unconfigured model, or an empty retrieval result, the answer is an
        explanatory message and the source list is empty.
    """
    if model == "hf":
        qa_chain = qa_chain_hf
    elif model == "hf_alternate":
        qa_chain = qa_chain_hf_alternate
    elif model == "groq":
        # No Groq chain exists yet. Return a message rather than falling
        # through with `qa_chain` unbound, which raised UnboundLocalError.
        return "Groq model is not configured.", []
    else:
        return "Invalid model specified.", []

    # Skip the chain call entirely when there is nothing to ground an answer in.
    retrieved_docs = retriever.get_relevant_documents(question)
    if not retrieved_docs:
        return "No relevant information found.", []

    response = qa_chain.invoke({"query": question})
    return response["result"], response["source_documents"]


In [87]:
# The reference documents are exactly the PDFs that were loaded
reference_documents = pdf_files
print(f"Reference Documents: {reference_documents}")

Reference Documents: ['/content/RIDA FATMA Resume.pdf']


In [88]:
# Analysis and Problem Solving

# Models Used
retriever_model = "SentenceTransformer - all-MiniLM-L6-v2"
generator_model = "Hugging Face - google/flan-t5-large"

# Report which model handles retrieval and which handles generation
for label, model_name in (
    ("Document Retrieval Model:", retriever_model),
    ("Response Generation Model:", generator_model),
):
    print(label, model_name)

# Challenges in Providing Relevant Information
# Potential issues:
# 1. Limitations in Semantic Understanding: The retriever model might struggle to capture the nuances of the query and retrieve documents with only superficial similarity.
# 2. Prompt Engineering: The generator model might produce irrelevant responses if the prompt lacks sufficient guidance or structure to connect the query with the retrieved context.

# Strategies for Improvement
# 1. Fine-tuning for Specificity: Consider fine-tuning the retriever model on a dataset of questions and relevant documents closely aligned with your personal content.
# 2. Prompt Optimization: Experiment with different prompt structures and instructions to provide the generator model with clearer cues and expectations.
# 3. Incorporating Feedback: Introduce a feedback loop to iteratively refine the retrieval and generation process based on user interactions and performance evaluations.

Document Retrieval Model: SentenceTransformer - all-MiniLM-L6-v2
Response Generation Model: Hugging Face - google/flan-t5-large


In [89]:
!pip install streamlit



In [90]:
import os
import faiss
import numpy as np
import streamlit as st
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from sentence_transformers import SentenceTransformer
from langchain.storage import InMemoryStore
from langchain_core.documents import Document
from langchain.llms import HuggingFaceHub
from langchain import PromptTemplate

# Page title and icon for the Streamlit app
st.set_page_config(
    page_icon=":robot_face:",
    page_title="Rida Fatma's Resume Chatbot",
)


# Load Hugging Face API Token
# The token must come from the environment; it is deliberately not hardcoded.
# NOTE(review): the token that was previously committed in this cell must be
# treated as leaked and revoked.
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
# An empty string is as unusable as a missing variable, so reject both.
if not hf_token:
    raise ValueError("HUGGINGFACEHUB_API_TOKEN is not set. Please set it in your environment variables.")

# LLM wrapper for google/flan-t5-large, authenticated with `hf_token`
hf_llm = HuggingFaceHub(
    huggingfacehub_api_token=hf_token,
    repo_id="google/flan-t5-large",
    model_kwargs=dict(temperature=0.7, max_length=512),
)

# ... (Rest of the code for document loading, embedding,
# and RetrievalQA setup remains the same as before) ...

# Streamlit UI: heading, short intro, and a single question box
st.title("Rida Fatma's Resume Chatbot :robot_face:")
st.write("Ask me anything about Rida Fatma's resume!")

user_question = st.text_input("Enter your question:")

# Nothing is asked while the text box value is falsy
if user_question:
    with st.spinner("Thinking..."):
        chatbot_reply = ask_chatbot(user_question)
        answer, source_documents = chatbot_reply
        st.write("**Answer:**", answer)

        # `source_documents` is available here; writing each item's
        # `page_content` would show the sources behind the answer.


