**Step 1 : DB setup**


In [None]:
import sqlite3
conn = sqlite3.connect("recruitment_ai.db")
cursor = conn.cursor()

print("✅ New database 'recruitment_ai.db' created and connected.")


✅ New database 'recruitment_ai.db' created and connected.


In [None]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.5


In [None]:
cursor.executescript("""
CREATE TABLE IF NOT EXISTS jobs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    job_title TEXT,
    company TEXT,
    location TEXT,
    summary TEXT,
    skills TEXT,
    experience TEXT,
    qualifications TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS candidates (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT,
    email TEXT,
    phone TEXT,
    skills TEXT,
    experience TEXT,
    education TEXT,
    certifications TEXT,
    summary TEXT,
    uploaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS matches (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    job_id INTEGER,
    candidate_id INTEGER,
    match_score REAL,
    shortlisted BOOLEAN DEFAULT 0,
    interview_email TEXT,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    FOREIGN KEY(job_id) REFERENCES jobs(id),
    FOREIGN KEY(candidate_id) REFERENCES candidates(id)
);
""")

conn.commit()
print("✅ Tables created successfully.")


✅ Tables created successfully.


In [None]:
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()
print("📂 Tables in DB:", tables)


📂 Tables in DB: [('jobs',), ('sqlite_sequence',), ('candidates',), ('matches',)]


In [None]:
!pip install --quiet pymupdf

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
[?25h



Step 2: Upload and Parse the Job Description (JD) CSV


In [None]:
import pandas as pd

jd_df = pd.read_csv("/content/job_description.csv", encoding='cp1252')
jd_df.head(10)


Unnamed: 0,Job Title,Job Description,Unnamed: 2
0,Software Engineer,Description:\nWe are seeking a skilled Softwa...,
1,Data Scientist,Job Description:\nWe are looking for a skilled...,
2,Product Manager,Description:\nWe are seeking an innovative and...,
3,Cloud Engineer,Description:\nWe are looking for a skilled Clo...,
4,Cybersecurity Analyst,Description:\nWe are looking for a skilled Cyb...,
5,Machine Learning Engineer,Description:\nWe are looking for a skilled Mac...,
6,DevOps Engineer,Description:\nWe are seeking a skilled DevOps ...,
7,Full Stack Developer,Description:\nWe are looking for a skilled Ful...,
8,Big Data Engineer,Description:\nWe are seeking a skilled Big Dat...,
9,AI Researcher,Description:\nWe are seeking an innovative AI ...,


In [None]:

jd_df = jd_df[['Job Title', 'Job Description']].copy()


jd_df.rename(columns={
    'Job Title': 'job_title',
    'Job Description': 'summary'
}, inplace=True)

jd_df.head()

Unnamed: 0,job_title,summary
0,Software Engineer,Description:\nWe are seeking a skilled Softwa...
1,Data Scientist,Job Description:\nWe are looking for a skilled...
2,Product Manager,Description:\nWe are seeking an innovative and...
3,Cloud Engineer,Description:\nWe are looking for a skilled Clo...
4,Cybersecurity Analyst,Description:\nWe are looking for a skilled Cyb...


In [None]:
!pip install huggingface_hub --quiet


In [None]:
from huggingface_hub import login

login("xxxxxx")


In [10]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_id = "google/gemma-3-4b-it"


tokenizer = AutoTokenizer.from_pretrained(model_id)


model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto"
).to("cuda")


pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]



model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Device set to use cuda:0


In [5]:

custom_tech_skills = {
    "web_dev": ["html", "css", "javascript", "react", "node.js", "django", "flask"],
    "data_science": ["python", "pandas", "numpy", "scikit-learn", "tensorflow", "sql"],
    "cloud": ["aws", "azure", "gcp", "terraform", "docker", "kubernetes"],
    "database": ["mysql", "postgresql", "mongodb", "sqlite"],
    "ml": ["xgboost", "lightgbm", "pytorch"],

}


flat_skill_list = [skill.lower() for sublist in custom_tech_skills.values() for skill in sublist]
def clean_skills(raw_skills):
    return [skill for skill in raw_skills if skill.lower() in flat_skill_list]


In [None]:
import re
import json

class LLMJDAgentPhi2:
    def __init__(self, pipe, skill_list=None):
        self.pipe = pipe
        self.skill_list = [s.lower() for s in skill_list] if skill_list else None

    def build_prompt(self, jd_text):
        return f"""
You are an expert HR assistant trained to extract structured data from job descriptions.

Your task is to read a job description and extract the following fields:
- "skills": A list of technical or soft skills (e.g., Python, java)
- "qualifications": Degrees,
- "experience":The number of years of experience in the mentioned role or domain
- "Certification": any Certifications
- "job_level": One of "Entry", "Mid", or "Senior" based on responsibilities/phrasing
- "Acheivements": Acheivements or awards


Return the result in **valid JSON only** and nothing else.

###

Job Description:
\"\"\"{jd_text}\"\"\"

###

JSON Output:
"""

    def clean_skills(self, raw_skills):
        if self.skill_list:
            return [s for s in raw_skills if s.lower() in self.skill_list]
        return raw_skills

    def run(self, jd_text):
        prompt = self.build_prompt(jd_text)

        result = self.pipe(
            prompt,
            max_new_tokens=512,
            do_sample=False
        )[0]["generated_text"]

        match = re.search(r"\{[\s\S]*?\}", result)
        if match:
            try:
                data = json.loads(match.group())


                data['skills'] = self.clean_skills(data.get('skills', []))
                return data

            except json.JSONDecodeError as e:
                print("⚠️ JSON parsing failed:", e)
                return {"raw_output": match.group()}

        return {"raw_output": result}


In [None]:
jd_agent = LLMJDAgentPhi2(pipe)


jd_text = jd_df['summary'].iloc[0]
output = jd_agent.run(jd_text)
print("🧠 JD Output:\n", output)




🧠 JD Output:
 {'skills': ['Python', 'Java', 'C++', 'Databases', 'Web Development', 'Software Frameworks'], 'qualifications': ["Bachelor's degree in Computer Science or a related field"], 'experience': 'Not specified', 'Certification': 'Not specified', 'job_level': 'Mid', 'Acheivements': 'Not specified'}


In [None]:
from tqdm import tqdm

skills_list = []
quals_list = []
exp_list = []
level_list = []

for jd in tqdm(jd_df['summary']):
    out = jd_agent.run(jd)
    skills_list.append(out.get('skills', []))
    quals_list.append(out.get('qualifications', []))
    exp_list.append(out.get('experience', ''))
    level_list.append(out.get('job_level', ''))

jd_df['skills_required'] = skills_list
jd_df['qualifications'] = quals_list
jd_df['experience_required'] = exp_list
jd_df['job_level'] = level_list

print("✅ JD extraction complete!")


 45%|████▌     | 9/20 [08:09<10:00, 54.63s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 20/20 [18:01<00:00, 54.09s/it]

✅ JD extraction complete!





In [None]:
jd_df[['job_title', 'skills_required', 'qualifications', 'experience_required', 'job_level']].head(3)


Unnamed: 0,job_title,skills_required,qualifications,experience_required,job_level
0,Software Engineer,"[Python, Java, C++, Databases, Web Development...",[Bachelor's degree in Computer Science or a re...,Not specified,Mid
1,Data Scientist,"[Python, R, SQL, Machine Learning]","[Bachelor’s degree in Data Science, Master’s d...",Not specified,Mid
2,Product Manager,"[Product Management, Agile Methodologies, Mark...","[Bachelor's degree in Business, Bachelor's deg...",Experience in product management,Mid


In [None]:
import json
import re

def _close_json(text):
    open_curly = text.count("{")
    close_curly = text.count("}")
    open_square = text.count("[")
    close_square = text.count("]")
    return text + ("]" * (open_square - close_square)) + ("}" * (open_curly - close_curly))

def extract_first_json(text):
    open_braces = 0
    json_start = -1
    for i, char in enumerate(text):
        if char == '{':
            if open_braces == 0:
                json_start = i
            open_braces += 1
        elif char == '}':
            open_braces -= 1
            if open_braces == 0 and json_start != -1:
                return text[json_start:i+1]
    return None


class LLMCVAgentLocal:
    def __init__(self, pipe, skill_list=None):
        self.pipe = pipe
        self.skill_list = [s.lower() for s in skill_list] if skill_list else None

    def build_prompt(self, cv_text):
        return f"""
Extract the following fields from the resume and return valid JSON (no explanation):

- name
- email
- phone
- skills: Include tech stack, programming languages, tools, libraries, certifications, and security/software/dev tools
- experience: List roles with title, company, and duration
- education: Degrees, institutions, and years
- certifications: All relevant certifications
- achievements: Awards or standout accomplishments

Resume:
\"\"\"{cv_text}\"\"\"

Return only JSON:
"""

    def _clean_skills(self, raw_skills):
        if not self.skill_list or not raw_skills:
            return raw_skills
        return [skill for skill in raw_skills if skill.lower() in self.skill_list]
    def run(self, cv_text):
       prompt = self.build_prompt(cv_text)

       try:
        response = self.pipe(
            prompt,
            max_new_tokens=1024,
            do_sample=False,
            temperature=0.0,
            pad_token_id=self.pipe.tokenizer.eos_token_id
         )[0]["generated_text"]
       except Exception as e:
          print("❌ LLM pipeline failed:", e)
          return {"error": str(e)}

       raw_json = extract_first_json(response)
       if raw_json:
         raw_json = _close_json(raw_json)
         try:
            data = json.loads(raw_json)

            return data
         except Exception as e:
            print("⚠️ JSON parse error:", e)
            return {"raw_output": raw_json}
       else:
            print("⚠️ No JSON found. Full output:\n", response)
            return {"raw_output": response}


In [None]:
import fitz
import os

cv_data = []

for filename in os.listdir("/content"):
    if filename.endswith(".pdf"):
        filepath = os.path.join("/content", filename)
        with fitz.open(filepath) as doc:
            text = ""
            for page in doc:
                text += page.get_text()
        cv_data.append({
            'filename': filename,
            'text': text
        })
        print(f"✅ Extracted text from: {filename} ({len(text)} characters)")
def clean_cv_output(cv_data):
    """
    Cleans up overlapping fields and ensures consistency in skills and certifications.
    """
    if not isinstance(cv_data, dict):
        return cv_data


    skills = set([s.strip().lower() for s in cv_data.get("skills", [])])
    certs = set([c.strip().lower() for c in cv_data.get("certifications", [])])


    cleaned_skills = [s for s in skills if s not in certs]
    cleaned_certs = [c for c in certs]


    def capitalize_list(items):
        return [item.title() for item in items]


    cv_data["skills"] = capitalize_list(cleaned_skills)
    cv_data["certifications"] = capitalize_list(cleaned_certs)


    for field in ["achievements", "education", "experience"]:
        if field not in cv_data:
            cv_data[field] = []

    return cv_data


ModuleNotFoundError: No module named 'fitz'

In [None]:
cv_agent = LLMCVAgentLocal(pipe, skill_list=flat_skill_list)


cv_text = cv_data[0]['text']
raw_output = cv_agent.run(cv_text)

cv_output = clean_cv_output(raw_output)

print("📄 Final Cleaned CV Output:\n", json.dumps(cv_output, indent=2))


In [None]:
from google.colab import files
uploaded = files.upload()

Saving CVs.zip to CVs.zip


In [None]:
import zipfile
import os

zip_path = "/content/CVs.zip"
extract_path = "/content/cv_folder"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("✅ Files extracted to:", extract_path)


✅ Files extracted to: /content/cv_folder


In [None]:
import fitz
import os
pdf_path="/content/cv_folder/CVs1"
cv_data = []

for filename in os.listdir(pdf_path):
    if filename.endswith(".pdf"):
        filepath = os.path.join(pdf_path, filename)
        with fitz.open(filepath) as doc:
            text = ""
            for page in doc:
                text += page.get_text()
        cv_data.append({'filename': filename, 'text': text})
        print(f"📄 Extracted: {filename}")

📄 Extracted: C3539.pdf
📄 Extracted: C7315.pdf
📄 Extracted: C5756.pdf
📄 Extracted: C8564.pdf
📄 Extracted: C9534.pdf
📄 Extracted: C9369.pdf
📄 Extracted: C8203.pdf
📄 Extracted: C9628.pdf
📄 Extracted: C6507.pdf
📄 Extracted: C5649.pdf
📄 Extracted: C3142.pdf
📄 Extracted: C6951.pdf
📄 Extracted: C9282.pdf
📄 Extracted: C7733.pdf
📄 Extracted: C5661.pdf
📄 Extracted: C1212.pdf
📄 Extracted: C3017.pdf
📄 Extracted: C9207.pdf
📄 Extracted: C2775.pdf
📄 Extracted: C9342.pdf
📄 Extracted: C8886.pdf
📄 Extracted: C9897.pdf
📄 Extracted: C9919.pdf
📄 Extracted: C2430.pdf
📄 Extracted: C3627.pdf
📄 Extracted: C7093.pdf
📄 Extracted: C2872.pdf
📄 Extracted: C2652.pdf
📄 Extracted: C3315.pdf
📄 Extracted: C7535.pdf
📄 Extracted: C5767.pdf
📄 Extracted: C7543.pdf
📄 Extracted: C7230.pdf
📄 Extracted: C6384.pdf
📄 Extracted: C1320.pdf
📄 Extracted: C2808.pdf
📄 Extracted: C2720.pdf
📄 Extracted: C9142.pdf
📄 Extracted: C1547.pdf
📄 Extracted: C3912.pdf
📄 Extracted: C5786.pdf
📄 Extracted: C2603.pdf
📄 Extracted: C1070.pdf
📄 Extracted

In [None]:
import json
import time
import gc
from tqdm import tqdm

cv_agent = LLMCVAgentLocal(pipe, skill_list=flat_skill_list)


batch_size = 10
cv_outputs = []


def save_progress(outputs, path="cv_outputs_backup.json"):
    with open(path, "w") as f:
        json.dump(outputs, f, indent=2)
    print(f"💾 Progress saved to {path}")

for i in tqdm(range(0, len(cv_data), batch_size), desc="📄 Processing CVs in Batches"):
    batch = cv_data[i:i + batch_size]

    for cv in batch:
        try:
            result = cv_agent.run(cv['text'])
            result['filename'] = cv['filename']
            cv_outputs.append(result)
            print(f"✅ {cv['filename']} processed")
        except Exception as e:
            print(f"❌ Error with {cv['filename']}: {e}")


    gc.collect()
    time.sleep(2)


    save_progress(cv_outputs, path=f"cv_outputs_batch_{i//batch_size}.json")

print("\n🎉 All batches processed!")


📄 Processing CVs in Batches:   0%|          | 0/20 [00:00<?, ?it/s]

✅ C3539.pdf processed
✅ C7315.pdf processed
✅ C5756.pdf processed
✅ C8564.pdf processed
✅ C9534.pdf processed
✅ C9369.pdf processed
✅ C8203.pdf processed
✅ C9628.pdf processed
✅ C6507.pdf processed
✅ C5649.pdf processed


📄 Processing CVs in Batches:   5%|▌         | 1/20 [06:30<2:03:42, 390.63s/it]

💾 Progress saved to cv_outputs_batch_0.json
✅ C3142.pdf processed
✅ C6951.pdf processed
✅ C9282.pdf processed
✅ C7733.pdf processed
✅ C5661.pdf processed
✅ C1212.pdf processed
✅ C3017.pdf processed
✅ C9207.pdf processed
✅ C2775.pdf processed
✅ C9342.pdf processed


📄 Processing CVs in Batches:  10%|█         | 2/20 [12:50<1:55:18, 384.34s/it]

💾 Progress saved to cv_outputs_batch_1.json
✅ C8886.pdf processed
✅ C9897.pdf processed
✅ C9919.pdf processed
✅ C2430.pdf processed
✅ C3627.pdf processed
✅ C7093.pdf processed
✅ C2872.pdf processed
✅ C2652.pdf processed
✅ C3315.pdf processed
✅ C7535.pdf processed


📄 Processing CVs in Batches:  15%|█▌        | 3/20 [18:53<1:46:09, 374.70s/it]

💾 Progress saved to cv_outputs_batch_2.json
✅ C5767.pdf processed
✅ C7543.pdf processed
✅ C7230.pdf processed
✅ C6384.pdf processed
✅ C1320.pdf processed
✅ C2808.pdf processed
✅ C2720.pdf processed
✅ C9142.pdf processed
✅ C1547.pdf processed
✅ C3912.pdf processed


📄 Processing CVs in Batches:  20%|██        | 4/20 [24:49<1:37:55, 367.22s/it]

💾 Progress saved to cv_outputs_batch_3.json
✅ C5786.pdf processed
✅ C2603.pdf processed
✅ C1070.pdf processed
✅ C8518.pdf processed
✅ C6357.pdf processed
✅ C9677.pdf processed
✅ C7228.pdf processed
✅ C3863.pdf processed
✅ C1236.pdf processed
✅ C5339.pdf processed


📄 Processing CVs in Batches:  25%|██▌       | 5/20 [30:42<1:30:32, 362.18s/it]

💾 Progress saved to cv_outputs_batch_4.json
✅ C9590.pdf processed
✅ C1701.pdf processed
✅ C8782.pdf processed
✅ C3363.pdf processed
✅ C3169.pdf processed
✅ C2235.pdf processed
✅ C2546.pdf processed
✅ C6059.pdf processed
✅ C5230.pdf processed
✅ C4256.pdf processed


📄 Processing CVs in Batches:  30%|███       | 6/20 [36:47<1:24:40, 362.89s/it]

💾 Progress saved to cv_outputs_batch_5.json
✅ C4444.pdf processed
✅ C5125.pdf processed
✅ C8849.pdf processed
✅ C3761.pdf processed
✅ C1080.pdf processed
✅ C4507.pdf processed
✅ C9228.pdf processed
✅ C7842.pdf processed
✅ C7482.pdf processed
✅ C3922.pdf processed


📄 Processing CVs in Batches:  35%|███▌      | 7/20 [43:05<1:19:42, 367.90s/it]

💾 Progress saved to cv_outputs_batch_6.json
✅ C2836.pdf processed
✅ C1191.pdf processed
✅ C5804.pdf processed
✅ C7979.pdf processed
✅ C1627.pdf processed
✅ C1796.pdf processed
✅ C7996.pdf processed
✅ C5007.pdf processed
✅ C3127.pdf processed
✅ C9117.pdf processed


📄 Processing CVs in Batches:  40%|████      | 8/20 [48:56<1:12:32, 362.68s/it]

💾 Progress saved to cv_outputs_batch_7.json
✅ C2287.pdf processed
✅ C3464.pdf processed
✅ C1677.pdf processed
✅ C2607.pdf processed
✅ C7775.pdf processed
✅ C3557.pdf processed
✅ C2098.pdf processed
✅ C6239.pdf processed
✅ C6776.pdf processed
✅ C6746.pdf processed


📄 Processing CVs in Batches:  45%|████▌     | 9/20 [55:06<1:06:55, 365.03s/it]

💾 Progress saved to cv_outputs_batch_8.json
✅ C4439.pdf processed
✅ C4021.pdf processed
✅ C5121.pdf processed
✅ C1789.pdf processed
✅ C2139.pdf processed
✅ C5507.pdf processed
✅ C4999.pdf processed
✅ C7242.pdf processed
✅ C9779.pdf processed
✅ C6592.pdf processed


📄 Processing CVs in Batches:  50%|█████     | 10/20 [1:01:47<1:02:40, 376.02s/it]

💾 Progress saved to cv_outputs_batch_9.json
✅ C7492.pdf processed
✅ C6658.pdf processed
✅ C4460.pdf processed
✅ C5951.pdf processed
✅ C2669.pdf processed
✅ C8730.pdf processed
✅ C3717.pdf processed
✅ C6302.pdf processed
✅ C6129.pdf processed
✅ C1487.pdf processed


📄 Processing CVs in Batches:  55%|█████▌    | 11/20 [1:07:56<56:05, 373.92s/it]  

💾 Progress saved to cv_outputs_batch_10.json
✅ C4552.pdf processed
✅ C9567.pdf processed
✅ C1161.pdf processed
✅ C7128.pdf processed
✅ C6979.pdf processed
✅ C4331.pdf processed
✅ C8810.pdf processed
✅ C3014.pdf processed
✅ C4742.pdf processed
✅ C5390.pdf processed


📄 Processing CVs in Batches:  60%|██████    | 12/20 [1:13:44<48:47, 365.89s/it]

💾 Progress saved to cv_outputs_batch_11.json
✅ C4573.pdf processed
✅ C8760.pdf processed
✅ C7032.pdf processed
✅ C3045.pdf processed
✅ C4307.pdf processed
✅ C4523.pdf processed
✅ C5638.pdf processed
✅ C7554.pdf processed
✅ C7231.pdf processed
✅ C2838.pdf processed


📄 Processing CVs in Batches:  65%|██████▌   | 13/20 [1:19:23<41:44, 357.83s/it]

💾 Progress saved to cv_outputs_batch_12.json
✅ C8928.pdf processed
✅ C1061.pdf processed
✅ C7318.pdf processed
✅ C7550.pdf processed
✅ C2250.pdf processed
✅ C1228.pdf processed
✅ C8063.pdf processed
✅ C5725.pdf processed
✅ C3620.pdf processed
✅ C9884.pdf processed


📄 Processing CVs in Batches:  70%|███████   | 14/20 [1:25:03<35:13, 352.29s/it]

💾 Progress saved to cv_outputs_batch_13.json
✅ C5451.pdf processed
✅ C3416.pdf processed
✅ C9533.pdf processed
✅ C3830.pdf processed
✅ C3821.pdf processed
✅ C7784.pdf processed
✅ C4602.pdf processed
✅ C2256.pdf processed
✅ C6277.pdf processed
✅ C4194.pdf processed


📄 Processing CVs in Batches:  75%|███████▌  | 15/20 [1:31:04<29:34, 354.98s/it]

💾 Progress saved to cv_outputs_batch_14.json
✅ C3445.pdf processed
✅ C2144.pdf processed
✅ C4760.pdf processed
✅ C6631.pdf processed
✅ C4919.pdf processed
✅ C7497.pdf processed
✅ C7094.pdf processed
✅ C8398.pdf processed
✅ C3570.pdf processed
✅ C4482.pdf processed


📄 Processing CVs in Batches:  80%|████████  | 16/20 [1:36:34<23:09, 347.40s/it]

💾 Progress saved to cv_outputs_batch_15.json
✅ C4686.pdf processed
✅ C4024.pdf processed
✅ C1446.pdf processed
✅ C9742.pdf processed
✅ C6140.pdf processed
✅ C3019.pdf processed
✅ C7869.pdf processed
✅ C9200.pdf processed
✅ C9104.pdf processed
✅ C9945.pdf processed


📄 Processing CVs in Batches:  85%|████████▌ | 17/20 [1:42:29<17:29, 349.96s/it]

💾 Progress saved to cv_outputs_batch_16.json
✅ C6315.pdf processed
✅ C3899.pdf processed
✅ C4627.pdf processed
✅ C6583.pdf processed
✅ C6768.pdf processed
✅ C3226.pdf processed
✅ C8676.pdf processed
✅ C4336.pdf processed
✅ C9146.pdf processed
✅ C4277.pdf processed


📄 Processing CVs in Batches:  90%|█████████ | 18/20 [1:48:32<11:47, 353.60s/it]

💾 Progress saved to cv_outputs_batch_17.json
✅ C8420.pdf processed
✅ C1791.pdf processed
✅ C3771.pdf processed
✅ C5591.pdf processed
✅ C8631.pdf processed
✅ C7254.pdf processed
✅ C1499.pdf processed
✅ C8692.pdf processed
✅ C1164.pdf processed
✅ C1781.pdf processed


📄 Processing CVs in Batches:  95%|█████████▌| 19/20 [1:54:31<05:55, 355.24s/it]

💾 Progress saved to cv_outputs_batch_18.json
✅ C9165.pdf processed
✅ C7527.pdf processed
✅ C7831.pdf processed
✅ C8239.pdf processed
✅ C7934.pdf processed
✅ C7226.pdf processed
✅ C6942.pdf processed
✅ C9262.pdf processed
✅ C8035.pdf processed
✅ C9777.pdf processed


📄 Processing CVs in Batches: 100%|██████████| 20/20 [2:00:45<00:00, 362.29s/it]

💾 Progress saved to cv_outputs_batch_19.json

🎉 All batches processed!





In [None]:
import pandas as pd

cv_df = pd.DataFrame(cv_outputs)
cv_df



Unnamed: 0,name,email,phone,skills,experience,education,certifications,achievements,filename
0,Kimberly White,kimberlywhite65@gmail.com,+1-722-6171,"[Cybersecurity, Penetration testing, Risk asse...","[{'title': 'Product Manager', 'company': 'DEF ...",[{'degree': 'Master of Business Administration...,[AWS Certified Solutions Architect],[Developed an AI chatbot that reduced customer...,C3539.pdf
1,Mary Conley,maryconley69@gmail.com,+1-968-5651,"[AWS, Azure, GCP, Metasploit, Wireshark, Burp ...","[{'title': 'Software Engineer', 'company': 'XY...",[{'degree': 'Bachelor of Engineering in Inform...,[AWS Certified Solutions Architect],[Developed an AI chatbot that reduced customer...,C7315.pdf
2,James Guerra,jamesguerra91@gmail.com,+1-520-3392,"[Python, Machine Learning, TensorFlow, PyTorch...","[{'title': 'Software Engineer', 'company': 'XY...",[{'degree': 'Master of Science in Data Science...,[Certified Ethical Hacker (CEH)],[Developed an AI chatbot that reduced customer...,C5756.pdf
3,Brad Walker,bradwalker69@gmail.com,+1-704-3766,"[Python, Machine Learning, TensorFlow, PyTorch...","[{'title': 'Software Engineer', 'company': 'XY...",[{'degree': 'Bachelor of Engineering in Inform...,[Certified Ethical Hacker (CEH)],[Published a research paper on AI ethics - Con...,C8564.pdf
4,Terry Jacobson,terryjacobson44@gmail.com,+1-692-7597,"[Python, Machine Learning, TensorFlow, PyTorch...","[{'title': 'Software Engineer', 'company': 'XY...",[{'degree': 'Bachelor of Science in Computer S...,[Certified Ethical Hacker (CEH)],[Published a research paper on AI ethics - Con...,C9534.pdf
...,...,...,...,...,...,...,...,...,...
195,Sara Ferrell,saraferrell50@gmail.com,+1-620-9853,"[Java, Spring Boot, MySQL, Kafka, Azure DevOps...","[{'title': 'Product Manager', 'company': 'DEF ...","[{'degree': 'Diploma in Software Engineering',...",[Certified Ethical Hacker (CEH)],[Developed an AI chatbot that reduced customer...,C7226.pdf
196,Michael Taylor,michaeltaylor29@gmail.com,+1-930-8995,"[Cybersecurity, Penetration Testing, Risk Asse...","[{'title': 'Software Engineer', 'company': 'XY...",[{'degree': 'Bachelor of Science in Computer S...,[AWS Certified Solutions Architect],[Developed an AI chatbot that reduced customer...,C6942.pdf
197,Joseph Smith,josephsmith80@gmail.com,+1-524-2222,"[Python, Machine Learning, TensorFlow, PyTorch...","[{'title': 'Product Manager', 'company': 'DEF ...",[{'degree': 'Master of Business Administration...,[AWS Certified Solutions Architect],[Published a research paper on AI ethics - Con...,C9262.pdf
198,Amanda Howell,amandahowell76@gmail.com,+1-422-7240,"[Cybersecurity, Penetration testing, Risk asse...","[{'title': 'Product Manager', 'company': 'DEF ...",[{'degree': 'Bachelor of Science in Computer S...,[AWS Certified Solutions Architect],[Published a research paper on AI ethics - Con...,C8035.pdf


In [7]:
def extract_first_json_block(text):

    cleaned = re.sub(r"```(?:json)?|```", "", text).strip()
k
    matches = list(re.finditer(r'\{[\s\S]*?\}', cleaned))

    for match in reversed(matches):
        try:
            return json.loads(match.group())
        except json.JSONDecodeError:
            continue

    print("⚠️ JSON parse failed for all blocks.")
    print("🧾 Raw output:\n", text)
    return {"match_score": None, "explanation": None}


In [4]:
import json
import re

class LLMMatchingAgent:
    def __init__(self, pipe):
        self.pipe = pipe

    def build_prompt(self, jd_data, cv_data):
        return f"""
You are a highly experienced senior HR recruiter with over 10 years of experience in technical hiring.

Your task is to evaluate how well a candidate's resume matches a given job description.

Return ONLY the following JSON structure:
{{
  "match_score": <float>,
  "explanation": "<string>"
}}

❗Do NOT include markdown, triple backticks, or any other text.

---

Job Description:
- Skills Required: {jd_data['skills_required']}
- Qualifications: {jd_data['qualifications']}
- Experience Required: {jd_data['experience_required']}
- Job Level: {jd_data['job_level']}

Candidate Resume:
- Skills: {cv_data['skills']}
- Education: {cv_data['education']}
- Experience: {cv_data['experience']}
- Certifications: {cv_data['certifications']}
- Achievements: {cv_data['achievements']}

Return the result in JSON format only:
"""

    def run(self, jd_data, cv_data):
        prompt = self.build_prompt(jd_data, cv_data)

        try:
            response = self.pipe(
                prompt,
                max_new_tokens=512,
                do_sample=False,
                temperature=0.0,
                top_k=1,
                top_p=1.0,
                pad_token_id=self.pipe.tokenizer.eos_token_id
            )[0]["generated_text"]

            return extract_first_json_block(response)

        except Exception as e:
            print("❌ Matching failed:", e)
            return {"match_score": None, "explanation": str(e)}


In [18]:
jd_sample = jd_df.iloc[0].to_dict()
cv_sample = cv_df.iloc[0].to_dict()

match_agent = LLMMatchingAgent(pipe)
match_result = match_agent.run(jd_sample, cv_sample)

print("🎯 Match Score:", match_result.get("match_score"))
print("📝 Explanation:\n", match_result.get("explanation"))




🎯 Match Score: 0.75
📝 Explanation:
 The candidate possesses several key skills listed in the job description, including Python, Web Development, and AWS. They have a Ph.D. in AI, which aligns with the required degree. However, the candidate's experience is primarily in product management and software engineering, which is less directly relevant than the job description suggests. The candidate also has a strong background in machine learning, but this is not explicitly required. Overall, a good match but with some gaps in experience.


In [19]:
import json


jd_0 = jd_df.iloc[0].to_dict()
cv_0 = cv_df.iloc[0].to_dict()


print("🧾 Job Description #0:")
print(json.dumps(jd_0, indent=2))

print("\n" + "="*100 + "\n")

print("📄 Candidate Resume #0:")
print(json.dumps(cv_0, indent=2))


🧾 Job Description #0:
{
  "job_title": "Software Engineer",
  "summary": " Description:\nWe are seeking a skilled Software Engineer to design, develop, and maintain software applications. The ideal candidate will write efficient code, troubleshoot issues, and collaborate with teams to deliver high-quality solutions.\n\nResponsibilities:\n\nDevelop, test, and deploy software applications.\nWrite clean, maintainable, and scalable code.\nCollaborate with cross-functional teams to define and implement features.\nTroubleshoot and debug issues for optimal performance.\nStay updated with emerging technologies and best practices.\nQualifications:\n\nBachelor's degree in Computer Science or a related field.\nProficiency in programming languages like Python, Java, or C++.\nExperience with databases, web development, and software frameworks.\nStrong problem-solving skills and attention to detail.\nAbility to work both independently and in a team environment.",
  "skills_required": "['Python', 'Ja

In [11]:
from tqdm import tqdm
import json
import os
import time
import pandas as pd
import json
import re

match_agent = LLMMatchingAgent(pipe)

jd_df = pd.read_csv("/content/jd_df.csv")
cv_df = pd.read_csv("/content/cv_df.csv")


BLOCK_NUM = 1


if BLOCK_NUM == 1:
    cv_df = cv_df.iloc[:100]
elif BLOCK_NUM == 2:
    cv_df = cv_df.iloc[100:]


cv_batch_size = 10
output_dir = f"match_outputs_block_{BLOCK_NUM}"
os.makedirs(output_dir, exist_ok=True)

all_matches = []

for jd_idx, jd_row in tqdm(jd_df.iterrows(), total=len(jd_df), desc=f"📄 Matching JD to CVs [Block {BLOCK_NUM}]"):
    jd_data = jd_row.to_dict()


    for batch_start in range(0, len(cv_df), cv_batch_size):
        batch_end = batch_start + cv_batch_size
        cv_batch = cv_df.iloc[batch_start:batch_end]

        batch_results = []
        for _, cv_row in cv_batch.iterrows():
            try:
                match_result = match_agent.run(jd_data, cv_row.to_dict())
                match_result["jd_index"] = jd_idx
                match_result["cv_filename"] = cv_row["filename"]
                batch_results.append(match_result)
            except Exception as e:
                print(f"❌ Error with JD#{jd_idx} & CV#{cv_row['filename']}: {e}")
                continue

        filename = f"jd_{jd_idx}_batch_{batch_start}.json"
        filepath = os.path.join(output_dir, filename)
        with open(filepath, "w") as f:
            json.dump(batch_results, f, indent=2)

        all_matches.extend(batch_results)
        print(f"💾 Saved: {filename}")


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


💾 Saved: jd_0_batch_0.json
💾 Saved: jd_0_batch_10.json
💾 Saved: jd_0_batch_20.json
💾 Saved: jd_0_batch_30.json
💾 Saved: jd_0_batch_40.json
💾 Saved: jd_0_batch_50.json
💾 Saved: jd_0_batch_60.json
💾 Saved: jd_0_batch_70.json
💾 Saved: jd_0_batch_80.json


📄 Matching JD to CVs [Block 1]:   5%|▌         | 1/20 [24:51<7:52:19, 1491.57s/it]

💾 Saved: jd_0_batch_90.json


📄 Matching JD to CVs [Block 1]:   5%|▌         | 1/20 [26:08<8:16:49, 1568.92s/it]


KeyboardInterrupt: 

In [None]:
from tqdm import tqdm
import json
import os
import time
import pandas as pd


jd_df = pd.read_csv("/content/jd_df.csv")
cv_df = pd.read_csv("/content/cv_df.csv")

BLOCK_NUM = 2


if BLOCK_NUM == 1:
    cv_df = cv_df.iloc[:100]
elif BLOCK_NUM == 2:
    cv_df = cv_df.iloc[100:]


cv_batch_size = 10
output_dir = f"match_outputs_block_{BLOCK_NUM}"
os.makedirs(output_dir, exist_ok=True)

all_matches = []


for jd_idx, jd_row in tqdm(jd_df.iterrows(), total=len(jd_df), desc=f"📄 Matching JD to CVs [Block {BLOCK_NUM}]"):
    jd_data = jd_row.to_dict()
    for batch_start in range(0, len(cv_df), cv_batch_size):
        batch_end = batch_start + cv_batch_size
        cv_batch = cv_df.iloc[batch_start:batch_end]

        batch_results = []
        for _, cv_row in cv_batch.iterrows():
            try:
                match_result = match_agent.run(jd_data, cv_row.to_dict())
                match_result["jd_index"] = jd_idx
                match_result["cv_filename"] = cv_row["filename"]
                batch_results.append(match_result)
            except Exception as e:
                print(f"❌ Error with JD#{jd_idx} & CV#{cv_row['filename']}: {e}")
                continue


        filename = f"jd_{jd_idx}_batch_{batch_start}.json"
        filepath = os.path.join(output_dir, filename)
        with open(filepath, "w") as f:
            json.dump(batch_results, f, indent=2)

        all_matches.extend(batch_results)
        print(f"💾 Saved: {filename}")
