In [30]:
%pip install PyPDF2



In [31]:
import json
import re
from PyPDF2 import PdfReader

# ========= CONFIG =========
pdf_path = "Sparsha_DataEngineer.pdf"
output_jsonl = "training_data.jsonl"

# ========= STEP 1: Extract PDF Text =========
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + "\n"
    return text

# ========= STEP 2: Generate Q&A Pairs with Synonyms =========
def generate_qa_pairs(text):
    qa_pairs = []

    # Helper to create multiple Qs with same answer
    def add_synonyms(base_questions, answer):
        for q in base_questions:
            qa_pairs.append({
                "instruction": q,
                "input": text,
                "output": answer
            })

    # ===== Name =====
    name_match = re.search(r"^(Sparsha Koti)", text, re.MULTILINE)
    if name_match:
        add_synonyms(
            [
                "Extract the candidate's full name from the resume.",
                "What is the applicant's name?",
                "Provide the candidate name.",
                "Who is the candidate?"
            ],
            name_match.group(1)
        )

    # ===== Phone =====
    phone_match = re.search(r"\(?\d{3}\)?[ -]?\d{3}[ -]?\d{4}", text)
    if phone_match:
        add_synonyms(
            [
                "What is the candidate's phone number?",
                "Provide the contact number.",
                "Give the applicant's mobile number.",
                "What is the telephone number of the candidate?"
            ],
            phone_match.group(0)
        )

    # ===== Email =====
    email_match = re.search(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
    if email_match:
        add_synonyms(
            [
                "What is the candidate's email address?",
                "Provide the email ID.",
                "Give the applicant's email.",
                "What is the email contact of the candidate?"
            ],
            email_match.group(0)
        )

    # ===== Summary =====
    summary_match = re.search(r"Summary\s+(.*?)\s+Skills", text, re.DOTALL)
    if summary_match:
        add_synonyms(
            [
                "Summarize the candidate's professional profile from the resume.",
                "Provide a short bio of the candidate.",
                "Give a brief career summary.",
                "What is the professional overview of the candidate?"
            ],
            summary_match.group(1).strip()
        )

    # ===== Skills =====
    skills_match = re.search(r"Skills\s+(.*?)\s+Experience", text, re.DOTALL)
    if skills_match:
        add_synonyms(
            [
                "List the candidate's skills mentioned in the resume.",
                "What technical skills does the applicant have?",
                "Provide all skills listed.",
                "Mention the tools and technologies known by the candidate."
            ],
            skills_match.group(1).strip()
        )

    # ===== Education =====
    edu_match = re.search(r"Education\s+(.*?)\s+\d", text, re.DOTALL)
    if edu_match:
        add_synonyms(
            [
                "What is the candidate's highest qualification and details?",
                "List the education details.",
                "Provide the academic background.",
                "What is the educational qualification of the applicant?"
            ],
            edu_match.group(1).strip()
        )

    return qa_pairs

# ========= STEP 3: Save JSONL =========
def save_jsonl(data, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        for entry in data:
            json.dump(entry, f, ensure_ascii=False)
            f.write("\n")

# ========= RUN SCRIPT =========
resume_text = extract_text_from_pdf(pdf_path)
qa_pairs = generate_qa_pairs(resume_text)
save_jsonl(qa_pairs, output_jsonl)

print(f"✅ Generated {len(qa_pairs)} Q&A pairs (with synonyms) and saved to {output_jsonl}")


✅ Generated 24 Q&A pairs (with synonyms) and saved to training_data.jsonl


In [32]:
# =========================================
# STEP 1: Install Dependencies
# =========================================
!pip install -q transformers datasets peft bitsandbytes trl accelerate


In [33]:
# =========================================
# STEP 2: Load Dataset
# =========================================
from datasets import load_dataset

dataset = load_dataset("json", data_files="training_data.jsonl")

# Split into train & validation
dataset = dataset["train"].train_test_split(test_size=0.1)

print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 21
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 3
    })
})


In [34]:
!pip install -q huggingface_hub

from huggingface_hub import login

# Paste your token here when prompted
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [35]:
# =========================================
# STEP 3: Load Model & Tokenizer
# =========================================
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "mistralai/Mistral-7B-Instruct-v0.2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,              # 4-bit quantization for T4 GPU
    device_map="auto"
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [36]:
# =========================================
# STEP 4: Load Model + Tokenizer
# =========================================
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto",
    use_auth_token=True
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [37]:
# =========================================
# STEP 5: Prepare Data for SFT
# =========================================
def format_for_sft(example):
    text = f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n{example['output']}"
    return {"text": text}

tokenized_dataset = dataset.map(format_for_sft)


Map:   0%|          | 0/21 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [38]:
!pip install -U transformers accelerate peft datasets huggingface_hub




In [39]:
# =========================================
# STEP 6: LoRA Config
# =========================================
from peft import LoraConfig, get_peft_model
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)


In [40]:
!pip install -U trl transformers accelerate datasets peft



In [41]:
# =========================================
# STEP 7: TrainingArguments
# =========================================
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="./mistral_resume_qna_lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=10,
    learning_rate=2e-4,
    fp16=True,
    logging_dir="./logs",
    logging_steps=10,
    eval_steps=500,
    save_steps=500
)

In [42]:
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    args=training_args
)


Adding EOS to train dataset:   0%|          | 0/21 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/21 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/21 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/3 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/3 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/3 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [43]:
model.save_pretrained("./mistral_resume_qna_lora/final_adapter")
tokenizer.save_pretrained("./mistral_resume_qna_lora/final_adapter")

('./mistral_resume_qna_lora/final_adapter/tokenizer_config.json',
 './mistral_resume_qna_lora/final_adapter/special_tokens_map.json',
 './mistral_resume_qna_lora/final_adapter/chat_template.jinja',
 './mistral_resume_qna_lora/final_adapter/tokenizer.model',
 './mistral_resume_qna_lora/final_adapter/added_tokens.json',
 './mistral_resume_qna_lora/final_adapter/tokenizer.json')

In [44]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Quantization config for 4-bit loading
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16"
)

# Load base model (auto offloads to CPU if needed)
base_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",
    device_map="auto",           # Automatically split between GPU/CPU
    quantization_config=bnb_config,
    use_auth_token=True
)

# Load LoRA adapter
ft_model = PeftModel.from_pretrained(base_model, "./mistral_resume_qna_lora/final_adapter")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", use_auth_token=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



### Instruction:
What is the candidate's email address?

### Input:
Sparsha Koti
(940) 843-9324 |sparshakoti2023@gmail.com |LinkedIn |GitHub
Summary
Data Engineer with 2+ years of experience designing scalable pipelines using SQL, Python, Snowflake, and ETL, optimiz-
ing workflows on 10M+ records and improving performance by 55%. Passionate about building robust data architectures
that drive analytics and strategic growth.
Skills
Languages: Python, SQL (PostgreSQL, MySQL, SQL Server)
Analytics: Exploratory Data Analysis (EDA), Statistical Modeling, Hypothesis Testing, Feature Engineering
Visualization: Power BI, Tableau, QuickSight, Matplotlib, Seaborn, Excel dashboards
Big Data & ETL: PySpark, Hadoop, Kafka, Airflow, dbt, ETL Pipelines
Cloud: AWS (Lambda, S3, Glue, Athena), Snowflake, BigQuery
Software & Tools: Excel (PivotTables, VLOOKUP), Jupyter, VSCode, Google Colab, SPSS, Git, GitHub
Methodologies: Agile Scrum, Cross-Functional Collaboration, Data Governance, Stakeholder Reportin

In [45]:
from PyPDF2 import PdfReader

# Read resume text
pdf_path = "Sparsha_DataEngineer.pdf"
reader = PdfReader(pdf_path)
resume_text = "\n".join([page.extract_text() for page in reader.pages])

# Build prompt in training format
prompt = f"""### Instruction:
What is the candidate's email address?

### Input:
{resume_text}

### Response:
"""

# Tokenize and generate
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = ft_model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


### Instruction:
What is the candidate's email address?

### Input:
Sparsha Koti
(940) 843-9324 |sparshakoti2023@gmail.com |LinkedIn |GitHub
Summary
Data Engineer with 2+ years of experience designing scalable pipelines using SQL, Python, Snowflake, and ETL, optimiz-
ing workflows on 10M+ records and improving performance by 55%. Passionate about building robust data architectures
that drive analytics and strategic growth.
Skills
Languages: Python, SQL (PostgreSQL, MySQL, SQL Server)
Analytics: Exploratory Data Analysis (EDA), Statistical Modeling, Hypothesis Testing, Feature Engineering
Visualization: Power BI, Tableau, QuickSight, Matplotlib, Seaborn, Excel dashboards
Big Data & ETL: PySpark, Hadoop, Kafka, Airflow, dbt, ETL Pipelines
Cloud: AWS (Lambda, S3, Glue, Athena), Snowflake, BigQuery
Software & Tools: Excel (PivotTables, VLOOKUP), Jupyter, VSCode, Google Colab, SPSS, Git, GitHub
Methodologies: Agile Scrum, Cross-Functional Collaboration, Data Governance, Stakeholder Reportin

In [48]:
from PyPDF2 import PdfReader

# ===== Step 1: Load Resume Text =====
pdf_path = "Sparsha_DataEngineer.pdf"
reader = PdfReader(pdf_path)
resume_text = "\n".join([page.extract_text() for page in reader.pages])

# ===== Step 2: Ask Question =====
question = input("\n❓ Enter your question: ").strip()

# ===== Step 3: Build Prompt =====
prompt = f"""### Instruction:
{question}

### Input:
{resume_text}

### Response:
"""

# ===== Step 4: Run Model =====
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = ft_model.generate(
    **inputs,
    max_new_tokens=50,
    temperature=0.1,
    do_sample=False
)

answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

# ===== Step 5: Extract Clean Answer =====
if "### Response:" in answer:
    answer = answer.split("### Response:")[-1].strip()

# Keep only first sentence or first line
answer = answer.split("\n")[0].split(".")[0].strip()

print(f"\n💡 Answer: {answer}")



❓ Enter your question: What's the name ?


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



💡 Answer: Sparsha Koti


In [55]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.3.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.3.0-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.3.0


In [60]:
import gradio as gr
from PyPDF2 import PdfReader

def ask_question(file_path, question):
    reader = PdfReader(file_path)
    resume_text = "\n".join([page.extract_text() for page in reader.pages])

    prompt = f"""### Instruction:
{question}

### Input:
{resume_text}

### Response:
"""
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=100)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if "### Response:" in answer:
        answer = answer.split("### Response:")[-1].strip()

    return answer

# ✅ Updated to use type="filepath"
gr.Interface(
    fn=ask_question,
    inputs=[gr.File(type="filepath", label="Upload Resume PDF"), gr.Textbox(label="Your Question")],
    outputs="text"
).launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://89bc50f92710c2f609.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [63]:
!mkdir resume-qna

In [64]:
!mv mistral_resume_qna_lora sample_data Sparsha_DataEngineer.pdf training_data.jsonl resume-qna/

In [67]:
!git --version

git version 2.34.1


In [68]:
!git clone https://github.com/SparshaAbinethri/Resume_QnA_Generator.git

Cloning into 'Resume_QnA_Generator'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


In [70]:
# Go to your repo folder
%cd /content/Resume_QnA_Generator

# Create the .gitignore file
with open(".gitignore", "w") as f:
    f.write("""# Personal / Sensitive files
Sparsha_DataEngineer.pdf
*.pdf

# Model weights and large files
*.bin
*.pt
*.safetensors
*.h5
mistral_resume_qna_lora/

# Python cache
__pycache__/
*.pyc
*.pyo
*.pyd

# Virtual environment
venv/
.env

# Jupyter/Colab checkpoints
.ipynb_checkpoints/
""")

!git config --global user.name "SparshaAbinethri"
!git config --global user.email "https://github.com/SparshaAbinethri/Resume_QnA_Generator"


# Stage and commit
!git add .gitignore
!git commit -m "Added .gitignore to protect resume and large files"


/content/Resume_QnA_Generator
[main 499d23f] Added .gitignore to protect resume and large files
 1 file changed, 23 insertions(+)
 create mode 100644 .gitignore


In [72]:
!ls /content

resume-qna  Resume_QnA_Generator


In [73]:
!cp -r /content/resume-qna/* /content/Resume_QnA_Generator/

In [97]:
!git remote -v

origin	https://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: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@github.com/SparshaAbinethri/Resume_QnA_Generator.git (fetch)
origin	https://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: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@github.com/SparshaAbinethri/Resume_QnA_Generator.git (push)


In [100]:
token = "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"
!git remote set-url origin https://x-access-token:{token}@github.com/SparshaAbinethri/Resume_QnA_Generator.git
!git push origin main


Enumerating objects: 14, done.
Counting objects:   7% (1/14)Counting objects:  14% (2/14)Counting objects:  21% (3/14)Counting objects:  28% (4/14)Counting objects:  35% (5/14)Counting objects:  42% (6/14)Counting objects:  50% (7/14)Counting objects:  57% (8/14)Counting objects:  64% (9/14)Counting objects:  71% (10/14)Counting objects:  78% (11/14)Counting objects:  85% (12/14)Counting objects:  92% (13/14)Counting objects: 100% (14/14)Counting objects: 100% (14/14), done.
Delta compression using up to 12 threads
Compressing objects: 100% (13/13), done.
Writing objects: 100% (13/13), 8.41 MiB | 2.01 MiB/s, done.
Total 13 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), done.[K
To https://github.com/SparshaAbinethri/Resume_QnA_Generator.git
   8fd64f0..ad4cda1  main -> main


In [101]:
!ls


mistral_resume_qna_lora  sample_data		   training_data.jsonl
README.md		 Sparsha_DataEngineer.pdf


In [103]:
from peft import PeftModel

# If 'model' is already your fine-tuned LoRA model
model.save_pretrained("/content/mistral_resume_qna_lora_saved")
print("✅ LoRA model saved at /content/mistral_resume_qna_lora_saved")


✅ LoRA model saved at /content/mistral_resume_qna_lora_saved


In [104]:
!zip -r mistral_resume_qna_lora_saved.zip /content/mistral_resume_qna_lora_saved
from google.colab import files
files.download("mistral_resume_qna_lora_saved.zip")


  adding: content/mistral_resume_qna_lora_saved/ (stored 0%)
  adding: content/mistral_resume_qna_lora_saved/adapter_config.json (deflated 56%)
  adding: content/mistral_resume_qna_lora_saved/adapter_model.safetensors (deflated 44%)
  adding: content/mistral_resume_qna_lora_saved/README.md (deflated 66%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>