<a href="https://colab.research.google.com/github/SathwikaAkkala/BreakoutAI.tech-/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# %% [markdown]
# # AI-Powered Resume Chatbot (Colab)
# Upload your resume (PDF or TXT) and chat with it.
# Uses: sentence-transformers (embeddings), faiss (vector store), and Flan-T5 for generation.
# Optional: an OpenAI-backed generator (enabled via the OPENAI_API_KEY environment variable) is not implemented in this notebook yet; only the Flan-T5 path is provided below.


In [6]:
# Install required libraries (run once)
!pip install -q sentence-transformers faiss-cpu transformers pyPDF2 gradio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
# %% [markdown]
# ## 1) Imports


In [7]:
# %%
import os
import tempfile
from pathlib import Path
from typing import List

import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import PyPDF2
import gradio as gr


In [8]:
# %% [markdown]
# ## 2) Config: choose model names
# - Embedding model: `all-MiniLM-L6-v2` (small, fast)
# - Generator: `google/flan-t5-base` (small-medium text2text)
# If you prefer OpenAI GPT, note that this notebook does not currently include an OpenAI branch; the generator below is Flan-T5 only.


In [9]:
# %%
# Hugging Face model id passed to SentenceTransformer to embed resume chunks and queries.
EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Hugging Face model id of the seq2seq model (and its tokenizer) used to generate answers.
GEN_MODEL_NAME = "google/flan-t5-base"   # small/fast; change if you'd like another HF model


In [10]:
# %%
def read_pdf_text(path: str) -> str:
    """Extract the text of every page of a PDF and join the pages with newlines.

    Args:
        path: Location of the PDF file on disk.

    Returns:
        One string containing each page's text separated by "\\n". A page whose
        ``extract_text()`` result is falsy contributes an empty string.
    """
    with open(path, "rb") as pdf_handle:
        pdf = PyPDF2.PdfReader(pdf_handle)
        # Extraction happens while the file is still open.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_texts)

def read_txt_text(path: str) -> str:
    """Return the full contents of a text file decoded as UTF-8.

    Bytes that are not valid UTF-8 are dropped (``errors="ignore"``) rather
    than raising a decode error.
    """
    handle = open(path, "r", encoding="utf-8", errors="ignore")
    try:
        contents = handle.read()
    finally:
        handle.close()
    return contents

def load_resume(file_path: str) -> str:
    """Read a resume from disk, choosing the reader from the file extension.

    A path ending in ".pdf" (case-insensitive) goes to ``read_pdf_text``;
    every other path is read as plain text by ``read_txt_text``.
    """
    is_pdf = file_path.lower().endswith(".pdf")
    reader = read_pdf_text if is_pdf else read_txt_text
    return reader(file_path)

def chunk_text(text: str, chunk_size:int=400, overlap:int=50) -> List[str]:
    """Split text into overlapping, whitespace-delimited word windows.

    Args:
        text: Raw text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size`` so the window always moves forward.

    Returns:
        The chunks, each a space-joined run of words. Chunks of 20 characters
        or fewer are dropped as too short to be useful.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. A
            non-positive step would otherwise loop forever, and a negative
            overlap would silently skip words.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # This window already reaches the last word; a further window would
        # only repeat a tail that is fully contained in this chunk.
        if start + chunk_size >= len(words):
            break
    return [c for c in chunks if len(c.strip()) > 20]


In [11]:
# %%
# Load the sentence-embedding model named by EMBED_MODEL_NAME once at cell execution;
# build_faiss_index() and retrieve() both encode text through this shared instance.
embed_model = SentenceTransformer(EMBED_MODEL_NAME)

def build_faiss_index(chunks: List[str]):
    """Embed the chunks and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Text chunks to embed with the module-level ``embed_model``.

    Returns:
        A ``(index, embeddings)`` tuple: the populated ``faiss.IndexFlatL2``
        and the array of embeddings that was added to it.
    """
    vectors = embed_model.encode(
        chunks,
        show_progress_bar=True,
        convert_to_numpy=True,
    )
    # The index dimensionality is taken from the embedding width.
    flat_index = faiss.IndexFlatL2(vectors.shape[1])
    flat_index.add(vectors)
    return flat_index, vectors


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
# %%
# Load the tokenizer and seq2seq weights named by GEN_MODEL_NAME, then wrap both in a
# text2text-generation pipeline; `generator` is the callable answer_with_flan() invokes.
tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL_NAME)
gen_model = AutoModelForSeq2SeqLM.from_pretrained(GEN_MODEL_NAME)
generator = pipeline("text2text-generation", model=gen_model, tokenizer=tokenizer, device=-1)  # device=-1 runs on CPU (the recorded output reports "Device set to use cpu")


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


In [13]:
# %%
def retrieve(query: str, index, chunks: List[str], topk:int=4):
    """Return the chunks nearest to the query, best match first.

    Args:
        query: The user's question; embedded with the module-level ``embed_model``.
        index: Index exposing ``search(vectors, k)`` and returning
            ``(distances, ids)``, where ``ids`` refer to positions in ``chunks``.
        chunks: The text chunks that were indexed, in insertion order.
        topk: Maximum number of chunks to return.

    Returns:
        Up to ``topk`` chunks. Empty when there are no chunks or ``topk <= 0``.
    """
    if not chunks or topk <= 0:
        return []
    q_emb = embed_model.encode([query], convert_to_numpy=True)
    # Never request more neighbours than there are chunks.
    k = min(topk, len(chunks))
    _, neighbor_ids = index.search(q_emb, k)
    # FAISS pads missing neighbours with -1. Without the lower bound, -1 would
    # index chunks[-1] and return the last chunk as bogus context.
    return [chunks[i] for i in neighbor_ids[0] if 0 <= i < len(chunks)]

def build_prompt(query: str, context_chunks: List[str]) -> str:
    """Assemble the generation prompt: instructions, resume context, then the question.

    Args:
        query: The user's question.
        context_chunks: Retrieved resume chunks; joined with a "---" separator line.

    Returns:
        The complete prompt, ending in "ANSWER:" for the model to continue.
    """
    rules = "".join([
        "You are an assistant that answers from the candidate's resume only. ",
        "Use the context below to answer the question succinctly. ",
        "If the information isn't present in the resume, say 'I don't have that information in the resume.'\n\n",
    ])
    separator = "\n\n---\n\n"
    joined_context = separator.join(context_chunks)
    return f"{rules}CONTEXT:\n{joined_context}\n\nQUESTION: {query}\n\nANSWER:"

def answer_with_flan(query: str, index, chunks: List[str], topk:int=4):
    """Answer a question from the resume via retrieval followed by generation.

    Retrieves up to ``topk`` chunks for the query, builds the prompt from them,
    and runs the module-level ``generator`` deterministically (``do_sample=False``)
    with ``max_length=256``.

    Returns:
        The generated text of the first result, stripped of surrounding whitespace.
    """
    context_chunks = retrieve(query, index, chunks, topk=topk)
    generations = generator(
        build_prompt(query, context_chunks),
        max_length=256,
        do_sample=False,
    )
    first = generations[0]
    return first['generated_text'].strip()


In [14]:
# %%
# Global placeholders (will be set when user uploads).
# setup_from_resume() assigns all three; chat_fn() reads GLOBAL_INDEX and GLOBAL_CHUNKS.
GLOBAL_CHUNKS = []        # text chunks of the currently loaded resume
GLOBAL_INDEX = None       # FAISS index over those chunks; None until a resume is indexed
GLOBAL_EMBEDDINGS = None  # embeddings array returned alongside the index

def setup_from_resume(file):
    """Parse, chunk and index an uploaded resume, then publish it to the globals.

    Args:
        file: The upload as delivered by the file component: either a
            filesystem path (``str`` / ``os.PathLike``) or an object whose
            ``.name`` attribute holds the path of the uploaded file.

    Returns:
        A status message for the UI: success with the number of indexed
        chunks, or an explanation of why nothing was indexed.
    """
    global GLOBAL_CHUNKS, GLOBAL_INDEX, GLOBAL_EMBEDDINGS
    # Read the upload in place. Copying to Path(tmp) / file.name does not work:
    # file.name is already an absolute path, so the join resolves back to the
    # upload itself and opening it with "wb" truncates it. Path-style uploads
    # also have no .read().
    if isinstance(file, (str, os.PathLike)):
        file_path = os.fspath(file)
    else:
        file_path = getattr(file, "name", None)
    if not file_path:
        return "Could not locate the uploaded file. Please upload it again."

    text = load_resume(str(file_path))
    chunks = chunk_text(text, chunk_size=350, overlap=50)
    if not chunks:
        return "Could not parse resume text. Make sure the file has selectable text (not scanned image)."
    idx, embs = build_faiss_index(chunks)
    GLOBAL_CHUNKS = chunks
    GLOBAL_INDEX = idx
    GLOBAL_EMBEDDINGS = embs
    return f"Resume loaded: {len(chunks)} chunks indexed. Ask questions now!"

def chat_fn(message):
    """Answer one chat message against the currently indexed resume.

    Returns a reminder to upload a resume while no index has been built;
    otherwise returns the answer produced from the top 4 retrieved chunks.
    """
    index_ready = GLOBAL_INDEX is not None
    if index_ready:
        return answer_with_flan(message, GLOBAL_INDEX, GLOBAL_CHUNKS, topk=4)
    return "Upload and set up your resume first (use the Upload button)."


In [16]:
# Gradio interface: upload and index a resume, then chat with it.
with gr.Blocks() as demo:
    gr.Markdown("# 🎯 AI Resume Chatbot\nUpload your resume (PDF or TXT), then ask questions about it.")

    # Upload & setup
    with gr.Row():
        file_in = gr.File(label="Upload resume (PDF or TXT)")
        setup_btn = gr.Button("Load & Index Resume")
    status = gr.Textbox(label="Status", interactive=False)

    # Chat UI
    # NOTE(review): the recorded cell output shows a warning pointing at the Chatbot
    # line below — confirm the history format expected by the installed Gradio version.
    chatbot = gr.Chatbot(label="Chat with your Resume 🤖")
    msg = gr.Textbox(placeholder="Ask about projects, education, or skills...")
    clear = gr.Button("Clear Chat")

    # Functions
    def on_setup(file):
        """Index the uploaded resume and return the status text to display."""
        if file is None:
            return "⚠️ Please upload a file first."
        return setup_from_resume(file)

    def on_submit(message, chat_history):
        """Handle one submitted message.

        Returns ``("", history)``: the empty string clears the input box and
        ``history`` is a new list of ``(user, bot)`` pairs for the chatbot.
        """
        # Work on a copy so the component's current value is never mutated in
        # place, and tolerate a missing history.
        history = list(chat_history or [])
        # Blank submissions should not trigger retrieval and generation.
        if not message or not message.strip():
            return "", history
        if GLOBAL_INDEX is None:
            return "", history + [("System", "⚠️ Please upload and index your resume first.")]
        history.append((message, chat_fn(message)))
        return "", history

    # Button bindings
    setup_btn.click(fn=on_setup, inputs=file_in, outputs=status)
    msg.submit(fn=on_submit, inputs=[msg, chatbot], outputs=[msg, chatbot])
    clear.click(lambda: [], None, chatbot)

# share=True also serves the app on a temporary public URL (see the recorded output).
demo.launch(share=True)


  chatbot = gr.Chatbot(label="Chat with your Resume 🤖")


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://24e2ae2a62b84bfce5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


