In [None]:
!pip install python-docx
!pip install PyPDF2

In [None]:
# Upload the dataset file(s) from the local machine into the Colab runtime.
# Later cells read "interview_data.json" and "interview_dataset_enhanced.csv"
# from the working directory, so upload whichever of those you intend to ingest.
from google.colab import files
uploaded = files.upload()  # presumably returns the uploaded files keyed by name — TODO confirm

In [None]:
import gradio as gr
import PyPDF2, docx, json, random
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from pymilvus import connections, Collection


# -----------------------------
# 1️⃣ Connect to Zilliz Cloud
# -----------------------------
# The API token is read from the environment instead of being stored in the
# notebook. Set ZILLIZ_TOKEN (and optionally ZILLIZ_URI) before running; if the
# token is missing you are prompted once and it is cached for this kernel.
# NOTE(review): the token that used to be hard-coded here has been exposed and
# should be rotated.
import os
from getpass import getpass

if not os.environ.get("ZILLIZ_TOKEN"):
    os.environ["ZILLIZ_TOKEN"] = getpass("Zilliz Cloud API token: ")

connections.connect(
    alias="default",
    uri=os.environ.get(
        "ZILLIZ_URI",
        "https://in03-feb569ec82b1b76.serverless.aws-eu-central-1.cloud.zilliz.com",
    ),
    token=os.environ["ZILLIZ_TOKEN"],
)


# Handle to the existing collection. The spelling "interveiw_Knowledge" is the
# name every cell of this notebook uses, so it is kept exactly as-is.
collection = Collection("interveiw_Knowledge")
# Embedding model for search queries; the ingestion cells encode the stored
# texts with the same model name.
embedder = SentenceTransformer("all-MiniLM-L6-v2")


In [None]:
# -----------------------------
# 2️⃣ Load LLM
# -----------------------------


# Text-generation pipeline that give_feedback() calls to grade answers.
# NOTE(review): this was labelled a "small one for demo", but the checkpoint
# name indicates a 7B model — confirm the runtime has enough memory for it.
chatbot_model = pipeline(
    "text-generation",
    model="HuggingFaceH4/zephyr-7b-beta",
    device_map="auto",
    torch_dtype="auto"
)


In [None]:
import gradio as gr
import PyPDF2, docx, json, random
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from pymilvus import connections, Collection
import random

# Store sessions for mock interview
# NOTE(review): this dict is not referenced anywhere else in the visible
# notebook — the interview state is carried through gr.State instead.
interview_sessions = {}

# -----------------------------
# 3️⃣ File Extractors
# -----------------------------
def extract_text_from_pdf(file_path):
    """Return the concatenated text of all pages of the PDF at ``file_path``.

    Pages for which the reader yields no text (``None`` or an empty string)
    are skipped. Every kept page is followed by a newline, and the combined
    result has leading/trailing whitespace stripped.
    """
    page_chunks = []
    with open(file_path, "rb") as pdf_handle:
        for pdf_page in PyPDF2.PdfReader(pdf_handle).pages:
            extracted = pdf_page.extract_text()
            if not extracted:
                continue
            page_chunks.append(extracted + "\n")
    return "".join(page_chunks).strip()


def extract_text_from_docx(file_path):
    """Return the non-blank paragraphs of a .docx document joined by newlines.

    A paragraph is dropped when its text is empty or whitespace-only; kept
    paragraphs are emitted unmodified (their own whitespace is preserved).
    """
    kept_paragraphs = []
    for paragraph in docx.Document(file_path).paragraphs:
        if paragraph.text.strip():
            kept_paragraphs.append(paragraph.text)
    return "\n".join(kept_paragraphs)


def extract_text(file_path):
    """Extract plain text from an uploaded CV / job-description file.

    The extractor is chosen from the file extension, compared
    case-insensitively so that names such as ``Resume.PDF`` or ``JD.Docx``
    are handled as well.

    Args:
        file_path: Path of the file to read. ``None`` or an empty value is
            accepted and treated as "no file".

    Returns:
        The extracted text, or ``""`` when no path was given or the extension
        is neither ``.pdf`` nor ``.docx``.
    """
    if not file_path:
        return ""

    # Only the comparison is lower-cased; the original path is what gets opened.
    lowered = str(file_path).lower()
    if lowered.endswith(".pdf"):
        return extract_text_from_pdf(file_path)
    if lowered.endswith(".docx"):
        return extract_text_from_docx(file_path)
    return ""


# -----------------------------
# 4️⃣ RAG Retrieval Function
# -----------------------------
def retrieve_questions_from_zilliz(cv_text, jd_text, top_k=5):
    """Pick up to ``top_k`` interview questions for a CV / job-description pair.

    When either text is blank, random questions are returned instead of
    searching. Otherwise the first 500 characters of each text are combined
    into one query, embedded, and searched against the ``embedding`` field
    (up to 50 hits). Hits with a positive score are de-duplicated in rank
    order; if none remain, random questions are used. The final list is a
    random sample of at most ``top_k`` of those candidates.
    """
    # Nothing usable to search with: serve random questions instead.
    if not (cv_text.strip() and jd_text.strip()):
        return get_random_questions_from_collection(top_k)

    search_text = f"{jd_text[:500]} related interview questions for candidate with {cv_text[:500]}"
    encoded_query = embedder.encode([search_text]).tolist()

    matches = collection.search(
        data=encoded_query,
        anns_field="embedding",
        param={"metric_type": "COSINE", "params": {"nprobe": 10}},
        limit=50,
        output_fields=["text"],
    )

    # Keep the first occurrence of each text among hits with a positive score.
    candidates = []
    already_seen = set()
    for match in matches[0]:
        if not match.distance > 0:
            continue
        question_text = match.entity.get("text")
        if question_text in already_seen:
            continue
        already_seen.add(question_text)
        candidates.append(question_text)

    # The search produced nothing usable: fall back to random questions.
    if not candidates:
        candidates = get_random_questions_from_collection(top_k)

    sample_size = min(top_k, len(candidates))
    return random.sample(candidates, sample_size)


def get_random_questions_from_collection(n=5):
    """Return up to ``n`` question texts chosen at random.

    The sample is drawn from the rows returned by a single unfiltered query
    capped at 100 rows — not from every entity in the collection.
    """
    rows = collection.query(expr="", output_fields=["text"], limit=100)
    question_pool = []
    for row in rows:
        question_pool.append(row["text"])
    how_many = min(n, len(question_pool))
    return random.sample(question_pool, how_many)

# -----------------------------
# 5️⃣ Generate Summary + Questions
# -----------------------------
def generate_summary_and_questions(cv_file, jd_file):
    """Turn an uploaded CV and job description into a question list.

    Returns a ``(markdown, questions)`` pair: the markdown is either a
    warning / "nothing found" message (paired with an empty list) or a
    numbered listing of the retrieved questions (paired with those questions).
    """
    if not (cv_file and jd_file):
        return "⚠️ Please upload both a CV and Job Description.", []

    cv_text = extract_text(cv_file)
    jd_text = extract_text(jd_file)
    questions = retrieve_questions_from_zilliz(cv_text, jd_text, top_k=5)
    if not questions:
        return "No relevant questions found. Try a different job description.", []

    numbered_lines = []
    for position, question in enumerate(questions, start=1):
        numbered_lines.append(f"{position}. {question}")

    heading = f"🎯 **Top {len(questions)} relevant questions retrieved from knowledge base:**\n\n"
    return heading + "\n".join(numbered_lines), questions


# -----------------------------
# 6️⃣ Feedback Generator
# -----------------------------
def give_feedback(answer, question):
    """Ask the LLM to grade a candidate's answer to one interview question.

    Args:
        answer: The candidate's free-text answer.
        question: The interview question that was asked.

    Returns:
        The model's generated evaluation with surrounding whitespace removed.
    """
    prompt = (
        f"Evaluate the following candidate answer to a technical question.\n\n"
        f"Question: {question}\n"
        f"Candidate's Answer: {answer}\n\n"
        f"Your response must include only:\n"
        f"1️⃣ Evaluation: State 'Correct', 'Partially Correct', or 'Incorrect'.\n"
        f"2️⃣ Correct Answer: Write the ideal answer in 1–2 short sentences.\n"
        f"3️⃣ Review:\n"
        f"   - ✅ Strengths (1 short sentence)\n"
        f"   - ⚠️ Areas for improvement (1 short sentence)\n\n"
        f"Keep your total response under 100 words. Do NOT show any examples or repeat the question."
    )

    # Greedy decoding (do_sample=False). The sampling knobs temperature/top_p
    # only apply when sampling is enabled, so they are not passed alongside it.
    # return_full_text=False makes the pipeline return only the continuation,
    # which is more reliable than removing the prompt with str.replace (that
    # needs the echoed prompt to match byte-for-byte and would also delete any
    # later occurrence of it).
    generated = chatbot_model(
        prompt,
        max_new_tokens=150,
        do_sample=False,
        return_full_text=False,
    )[0]["generated_text"]

    # Defensive: if the backend still echoed the prompt, drop that prefix.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]

    return generated.strip()



## -----------------------------
# 7️⃣ Mock Interview Logic (Final Fixed Version)
# -----------------------------
def start_interview(questions):
    """Open a mock interview over ``questions``.

    Returns a ``(status_markdown, session)`` pair. With no questions the
    status is a warning and the session is an empty dict; otherwise the
    status announces the first question and the session records the question
    list together with the index of the current question (0).
    """
    if questions:
        opening_question = questions[0]
        banner = f"🎤 **Starting personalized mock interview!**\n\n🧠 **First question:**\n{opening_question}"
        return banner, {"questions": questions, "current": 0}

    return "⚠️ No questions available. Please upload CV + JD first.", {}


def chat_with_bot(message, history, session_data):
    """Grade the user's answer to the current question and advance the interview.

    Args:
        message: The user's answer to the current question.
        history: Chat history supplied by the chat UI (not used here).
        session_data: Dict holding ``"questions"`` (list) and ``"current"``
            (0-based index of the question being answered); empty or missing
            keys mean no interview has been started.

    Returns:
        ``(reply, session_data)``: the feedback plus either the next question
        or a closing message, and the updated session (reset to ``{}`` once the
        last question has been answered).
    """
    if not session_data or not session_data.get("questions"):
        return "⚠️ Please start the mock interview first.", session_data

    questions = session_data["questions"]
    q_index = session_data.get("current", 0)

    feedback = give_feedback(message, questions[q_index])

    next_index = q_index + 1
    if next_index < len(questions):
        session_data["current"] = next_index
        # next_index is 0-based; the label shows the 1-based number of the
        # upcoming question (previously it showed one less, e.g. "(1/5)" for
        # the second question).
        progress = f"({next_index + 1}/{len(questions)})"
        reply = f"💬 **Feedback:** {feedback}\n\n➡️ **Next question {progress}:** {questions[next_index]}"
    else:
        reply = f"💬 **Final Feedback:** {feedback}\n\n✅ Interview finished. Great job!"
        session_data = {}  # reset after interview

    return reply, session_data


# -----------------------------
# 8️⃣ Gradio Interface
# -----------------------------
# Two-tab UI: tab 1 turns the uploaded CV + JD into a question list (kept in
# `role_questions`), tab 2 runs the mock interview over that list.
with gr.Blocks(theme=gr.themes.Soft(), title="RAG Mock Interview Coach") as demo:
    gr.Markdown("## 🤖 CV + JD Powered Mock Interview Coach (Now with RAG & Zilliz Cloud)")

    with gr.Tabs():
        # Tab 1: Upload & Analyze
        with gr.Tab("Upload & Analyze"):
            gr.Markdown("### 📄 Upload your CV and Job Description to generate relevant questions.")
            with gr.Row():
                # type="filepath": the callbacks receive a path, which
                # extract_text() dispatches on by extension.
                cv_upload = gr.File(label="Upload CV (PDF or DOCX)", type="filepath")
                jd_upload = gr.File(label="Upload Job Description (PDF or DOCX)", type="filepath")

            output = gr.Markdown(label="Extracted Info & Questions")
            # Holds the retrieved questions so the second tab can start from them.
            role_questions = gr.State([])

            generate_btn = gr.Button("🔍 Analyze CV + JD & Retrieve Questions")
            # generate_summary_and_questions returns (markdown, questions),
            # matching the two outputs below in order.
            generate_btn.click(
                fn=generate_summary_and_questions,
                inputs=[cv_upload, jd_upload],
                outputs=[output, role_questions]
            )

        # Tab 2: Mock Interview
        with gr.Tab("Mock Interview"):
            gr.Markdown("### 🎤 Start Your Personalized Mock Interview")

            # Session state to persist across chat
            session_state = gr.State({})

            start_btn = gr.Button("🚀 Start Mock Interview")
            start_output = gr.Markdown(label="Interview Status")

            # Start interview properly — split text + state
            def start_and_show(questions):
                """Forward to start_interview() and return its (message, session) pair."""
                message, session = start_interview(questions)
                return message, session

            start_btn.click(
                fn=start_and_show,
                inputs=[role_questions],
                outputs=[start_output, session_state]
            )

            # chat_with_bot receives (message, history, session_state) and returns
            # (reply, session_state); the state is wired as both an additional
            # input and an additional output so it is carried between turns.
            chatbot = gr.ChatInterface(
                fn=chat_with_bot,
                title="Interview Coach",
                description="💬 Answer the question, and get instant feedback!",
                additional_inputs=[session_state],
                additional_outputs=[session_state]
            )

# Start the Gradio app.
demo.launch()

Building RAG

In [None]:
!pip install pymilvus


In [None]:
# Zilliz Cloud endpoint host and API token. The token is read from the
# ZILLIZ_TOKEN environment variable (prompted for once per kernel if unset)
# instead of being stored in the notebook.
# NOTE(review): the token that used to be hard-coded here has been exposed and
# should be rotated.
import os
from getpass import getpass

if not os.environ.get("ZILLIZ_TOKEN"):
    os.environ["ZILLIZ_TOKEN"] = getpass("Zilliz Cloud API token: ")

url = "in03-feb569ec82b1b76.serverless.aws-eu-central-1.cloud.zilliz.com"
token = os.environ["ZILLIZ_TOKEN"]

In [None]:
from pymilvus import connections

# Connect to Zilliz Cloud. The API token comes from the ZILLIZ_TOKEN environment
# variable (prompted for once per kernel if unset) rather than being stored in
# the notebook; ZILLIZ_URI optionally overrides the endpoint.
# NOTE(review): the token that used to be hard-coded here has been exposed and
# should be rotated.
import os
from getpass import getpass

if not os.environ.get("ZILLIZ_TOKEN"):
    os.environ["ZILLIZ_TOKEN"] = getpass("Zilliz Cloud API token: ")

connections.connect(
    alias="default",
    uri=os.environ.get(
        "ZILLIZ_URI",
        "https://in03-feb569ec82b1b76.serverless.aws-eu-central-1.cloud.zilliz.com",
    ),
    token=os.environ["ZILLIZ_TOKEN"],
)

In [None]:
from pymilvus import Collection, connections
from sentence_transformers import SentenceTransformer
import json


# -----------------------------
# 2️⃣ Load Collection
# -----------------------------
collection = Collection("interveiw_Knowledge")

# -----------------------------
# 3️⃣ Load JSON Data
# -----------------------------
# Read as UTF-8 explicitly so the platform's default encoding is never used.
with open("interview_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Fail with a clear message instead of an IndexError at embeddings[0] below.
if not data:
    raise ValueError("interview_data.json contains no records; nothing to insert.")

# -----------------------------
# 4️⃣ Create Texts + Embeddings
# -----------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")

# One searchable string per record: "<role> - <skill> - <question>".
texts = [f"{item['role']} - {item['skill']} - {item['question']}" for item in data]
embeddings = model.encode(texts).tolist()

print(f"Embedding dimension: {len(embeddings[0])}")

# -----------------------------
# 5️⃣ Insert (ORDER MATTERS!)
# -----------------------------
# NOTE(review): every run of this cell calls insert() again, so re-running it
# presumably appends the same records a second time — confirm before re-running.
entities = [texts, embeddings]  # text first, then embedding
insert_result = collection.insert(entities)
collection.flush()

print(f"✅ Inserted {len(texts)} records successfully!")
print("Inserted IDs:", insert_result.primary_keys)

# -----------------------------
# 6️⃣ Verify entity count
# -----------------------------
print("Total entities in collection:", collection.num_entities)


In [None]:
# Smoke test for the freshly ingested data: embed one sample question with the
# same `model` used for ingestion and print the 3 closest stored texts.
# Relies on `model` and `collection` defined by the preceding ingestion cell.
query = "Explain the difference between supervised and unsupervised learning."
query_vector = model.encode([query]).tolist()

# NOTE(review): metric type / nprobe presumably have to be compatible with the
# index built on the "embedding" field — confirm against the collection's index.
search_params = {"metric_type": "COSINE", "params": {"nprobe": 10}}
results = collection.search(
    data=query_vector,
    anns_field="embedding",
    param=search_params,
    limit=3,
    output_fields=["text"]
)

print("\n🔍 Top matches:")
# results[0] holds the hits for the single query vector submitted above.
for hit in results[0]:
    print(f"Score: {hit.distance:.4f} | Text: {hit.entity.get('text')}")


In [None]:
import pandas as pd
from pymilvus import Collection, connections
from sentence_transformers import SentenceTransformer

# -----------------------------
# 1️⃣ Connect to Zilliz Cloud
# -----------------------------
# The API token comes from the ZILLIZ_TOKEN environment variable (prompted for
# once per kernel if unset) rather than being stored in the notebook;
# ZILLIZ_URI optionally overrides the endpoint.
# NOTE(review): the token that used to be hard-coded here has been exposed and
# should be rotated.
import os
from getpass import getpass

if not os.environ.get("ZILLIZ_TOKEN"):
    os.environ["ZILLIZ_TOKEN"] = getpass("Zilliz Cloud API token: ")

connections.connect(
    alias="default",
    uri=os.environ.get(
        "ZILLIZ_URI",
        "https://in03-feb569ec82b1b76.serverless.aws-eu-central-1.cloud.zilliz.com",
    ),
    token=os.environ["ZILLIZ_TOKEN"],
)

# -----------------------------
# 2️⃣ Load Existing Collection
# -----------------------------
collection = Collection("interveiw_Knowledge")

# -----------------------------
# 3️⃣ Load CSV Data
# -----------------------------
df = pd.read_csv("interview_dataset_enhanced.csv")

print(f"Loaded {len(df)} rows from CSV")

# Fail with a clear message instead of an IndexError at embeddings[0] below.
if df.empty:
    raise ValueError("interview_dataset_enhanced.csv contains no rows; nothing to insert.")

# -----------------------------
# 4️⃣ Create Combined Texts
# -----------------------------
# One searchable string per row: "<role> - <skill> - <question>". Zipping the
# three columns avoids building a Series per row as iterrows() does.
texts = [
    f"{role} - {skill} - {question}"
    for role, skill, question in zip(df["role"], df["skill"], df["question"])
]

# -----------------------------
# 5️⃣ Create Embeddings
# -----------------------------
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(texts).tolist()

print(f"✅ Embedding dimension: {len(embeddings[0])}")

# -----------------------------
# 6️⃣ Insert into Collection
# -----------------------------
# NOTE(review): every run of this cell calls insert() again, so re-running it
# presumably appends the same records a second time — confirm before re-running.
entities = [texts, embeddings]  # Must match your schema order
insert_result = collection.insert(entities)
collection.flush()

print(f"✅ Inserted {len(texts)} records successfully!")
print("Inserted IDs:", insert_result.primary_keys)

# -----------------------------
# 7️⃣ Verify Entity Count
# -----------------------------
print("Total entities in collection:", collection.num_entities)


In [None]:
from pymilvus import Collection, connections

# Connect to Zilliz Cloud. The API token comes from the ZILLIZ_TOKEN environment
# variable (prompted for once per kernel if unset) rather than being stored in
# the notebook; ZILLIZ_URI optionally overrides the endpoint.
# NOTE(review): the token that used to be hard-coded here has been exposed and
# should be rotated.
import os
from getpass import getpass

if not os.environ.get("ZILLIZ_TOKEN"):
    os.environ["ZILLIZ_TOKEN"] = getpass("Zilliz Cloud API token: ")

connections.connect(
    alias="default",
    uri=os.environ.get(
        "ZILLIZ_URI",
        "https://in03-feb569ec82b1b76.serverless.aws-eu-central-1.cloud.zilliz.com",
    ),
    token=os.environ["ZILLIZ_TOKEN"],
)

collection = Collection("interveiw_Knowledge")
collection.load()

print(collection.schema)  # 👈 check field names + dimensions

# Preview 5 entries
results = collection.query(expr="", output_fields=["text"], limit=5)
for r in results:
    print(r["text"])
