# Setting Up

In [1]:
!pip install langchain gradio sentence-transformers faiss-cpu pymilvus transformers torch datasets python-multipart

Collecting gradio
  Downloading gradio-5.7.1-py3-none-any.whl.metadata (16 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting pymilvus
  Downloading pymilvus-2.5.0-py3-none-any.whl.metadata (5.7 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting python-multipart
  Downloading python_multipart-0.0.18-py3-none-any.whl.metadata (1.8 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.5.0 (from gradio)
  Downloading gradio_client-1.5.0-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manyli

In [4]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.8-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain-community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.8 (from langchain-community)
  Downloading langchain-0.3.9-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.21 (from langchain-community)
  Downloading langchain_core-0.3.21-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from datac

In [7]:
import os
from datasets import load_dataset
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from transformers import pipeline, AutoTokenizer
import gradio as gr

# Building the Pipeline (Step by Step)

In [26]:
# ---------- Step 1: Set Hugging Face Token (Securely) ----------
# SECURITY: never hardcode an access token in the notebook -- it leaks through version
# control, shared copies and exported HTML. The token that used to be written on this
# line must be revoked at https://huggingface.co/settings/tokens.
# Supply the token from outside the notebook instead (Colab secrets, or
# `export HUGGINGFACE_TOKEN=...` before starting the kernel).
hf_token = os.environ.get("HUGGINGFACE_TOKEN", "").strip()
if hf_token:
    # huggingface_hub reads HF_TOKEN; mirror the value without clobbering an existing one.
    os.environ.setdefault("HF_TOKEN", hf_token)
else:
    # The dataset and models used below are expected to be public, so a token is optional.
    print("No HUGGINGFACE_TOKEN found in the environment; continuing with anonymous access.")

# ---------- Step 2: Load Dataset ----------
# Pull the CV dataset from the Hugging Face Hub (it must be public, or access must be granted)
try:
    dataset = load_dataset("MichaelAI23/English_CVs")
    print("Dataset loaded successfully!")
except Exception as load_error:
    # Report the cause, then re-raise so later cells never run without data
    print(f"Failed to load dataset: {load_error}")
    raise

# Wrap the "overall" text of every training row in a LangChain Document (empty metadata)
train_split = dataset["train"]
docs = [Document(page_content=cv_row["overall"], metadata={}) for cv_row in train_split]

Dataset loaded successfully!


In [27]:
# Peek at the first training record to see which fields each row carries
dataset["train"][0]

{'output': "{'pe': {'s': 'Benjamin', 'e': '-4567.'}, 'ed': {'s': '- Master', 'e': ', 2009'}, 'wo': {'s': 'Data Scientist, 2014', 'e': ' techniques.'}, 'sk': {'s': 'Proficient', 'e': ' growth.'}}",
 'hobbies': 'I enjoy swimming as a hobby, as it allows me to relax and stay active outside of my work as a Data Scientist. In addition, I also love hiking in the mountains and playing the guitar in my free time.',
 'personal': 'Benjamin Costa is a guest at The Taj Exotica Resort & Spa in the Maldives. He can be reached via email at bencosta@example.com. His address is Villa 12, Oceanfront Drive, South Male Atoll, Maldives. For any inquiries or reservations, you can contact him at +960-123-4567.',
 'work_experience': 'Data Scientist, 2014 - 2016 at Google\n- Developed machine learning models to optimize search algorithms.\n- Conducted data analysis and visualization to extract actionable insights from large datasets.\n- Collaborated with cross-functional teams to implement data-driven solution

In [28]:
# Show only a small sample: displaying the whole list dumps every CV into the cell
# output, which bloats the notebook and buries the narrative.
docs[:2]

[Document(metadata={}, page_content='Benjamin Costa is a guest at The Taj Exotica Resort & Spa in the Maldives. He can be reached via email at bencosta@example.com. His address is Villa 12, Oceanfront Drive, South Male Atoll, Maldives. For any inquiries or reservations, you can contact him at +960-123-4567.\nI enjoy swimming as a hobby, as it allows me to relax and stay active outside of my work as a Data Scientist. In addition, I also love hiking in the mountains and playing the guitar in my free time.\nData Scientist, 2014 - 2016 at Google\n- Developed machine learning models to optimize search algorithms.\n- Conducted data analysis and visualization to extract actionable insights from large datasets.\n- Collaborated with cross-functional teams to implement data-driven solutions for product enhancements.\n- Presented findings to stakeholders and provided recommendations for business strategies.\n\nSenior Data Scientist, 2012 - 2014 at Amazon\n- Led a team of data scientists in develo

In [29]:
# ---------- Step 3: Document Chunking ----------
# Cut every CV into overlapping pieces so each piece can be embedded on its own
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 150}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
split_docs = text_splitter.split_documents(docs)

chunk_count = len(split_docs)
print(f"Total chunks created: {chunk_count}")

Total chunks created: 2579


In [30]:
# ---------- Step 4: Embedding Generation ----------
# Sentence-transformers model served through LangChain's HuggingFaceEmbeddings wrapper;
# it runs on the CPU and returns the vectors without normalization.
embedding_model_id = "sentence-transformers/all-MiniLM-l6-v2"
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_id,
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=False),
)

In [31]:
# ---------- Step 5: FAISS Vector Store ----------
# Embed all chunks and index them in FAISS for similarity search
db = FAISS.from_documents(split_docs, embeddings)

# Expose the index as a retriever that is asked for 4 chunks per query
retrieval_options = {"k": 4}
retriever = db.as_retriever(search_kwargs=retrieval_options)

print("FAISS Vector Store setup completed.")

FAISS Vector Store setup completed.


In [32]:
# ---------- Step 6: Load Question-Answering Model ----------
# Load the tokenizer and the model from the SAME checkpoint. Previously the tokenizer
# came from a different (BERT-large SQuAD) checkpoint than the model actually used by the
# pipeline, which only works if both vocabularies happen to match and downloads an extra,
# unused tokenizer.
model_name = "Intel/dynamic_tinybert"

# No padding/truncation/max_length here: those are call-time options, and the
# question-answering pipeline supplies its own when it tokenizes question + context.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# `return_tensors` was dropped: the pipeline returns plain dicts ({"answer": ..., "score": ...}),
# and the option is not part of the question-answering pipeline's parameters.
question_answerer = pipeline(
    "question-answering",
    model=model_name,
    tokenizer=tokenizer,
)

Invalid model-index. Not loading eval results into CardData.


In [33]:
# ---------- Step 7: Generate Answers ----------
def generate(question):
    """Answer ``question`` from the best-matching CV chunk.

    The module-level ``retriever`` is queried for relevant chunks and the
    module-level ``question_answerer`` extracts an answer span from the first
    (top-ranked) chunk.

    Args:
        question: The user's question as text.

    Returns:
        str: The extracted answer, or a human-readable message when the
        question is blank, nothing was retrieved, or processing failed.
        This function never raises; failures are reported in the returned text
        so the chat UI can display them.
    """
    # Guard blank input up front: there is nothing to search for.
    if not isinstance(question, str) or not question.strip():
        return "Please enter a question."

    try:
        # `invoke` is the current retriever entry point; `get_relevant_documents`
        # is deprecated in recent LangChain releases.
        # A distinct local name avoids shadowing the notebook-level `docs` list.
        retrieved_docs = retriever.invoke(question)
        if not retrieved_docs:
            return "No relevant context found."

        # Only the top-ranked chunk is used as context, even if more were retrieved.
        context = retrieved_docs[0].page_content
        result = question_answerer(question=question, context=context)
        return result["answer"]
    except Exception as e:
        return f"An error occurred while processing your query: {e}"

In [34]:
# ---------- Step 8: Chatbot Response ----------
def respond(message, chat_history):
    """Gradio event handler: answer ``message`` and extend the conversation.

    Args:
        message: Text the user typed into the textbox.
        chat_history: Conversation so far as a list of (user, bot) pairs.
            ``None`` is treated as an empty conversation.

    Returns:
        tuple: ``("", chat_history)`` -- the empty string clears the textbox and
        the history (with the new exchange appended) refreshes the chatbot.
    """
    # NOTE(review): the chatbot value may arrive as None (e.g. after "Clear Chat"
    # writes None into it) -- confirm against the Gradio version in use.
    if chat_history is None:
        chat_history = []

    bot_message = generate(message)
    chat_history.append((message, bot_message))
    return "", chat_history

In [35]:
# ---------- Step 9: Gradio Interface ----------
# Assemble the chat UI: conversation pane, question box and two buttons
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(height=300, label="Talent Acquisition Chatbot")
    msg = gr.Textbox(placeholder="Type your query here...", label="Ask a Question")
    btn = gr.Button(value="Submit")
    clear = gr.Button(value="Clear Chat")

    # Clicking "Submit" and pressing Enter in the textbox run the same handler
    btn.click(fn=respond, inputs=[msg, chatbot], outputs=[msg, chatbot])
    msg.submit(fn=respond, inputs=[msg, chatbot], outputs=[msg, chatbot])

    # "Clear Chat" writes None into the chatbot component (intended to reset the conversation)
    clear.click(fn=lambda: None, inputs=None, outputs=chatbot)



# Launching

In [36]:
# Start the Gradio server; share=True requests a public link (needed on Colab)
if __name__ == "__main__":
    launch_options = {"share": True}
    print("Launching Gradio interface...")
    demo.launch(**launch_options)

Launching Gradio interface...
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://dc4481e877db805367.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
