# Method 1 - Using a transformer model (Google Gemini 1.5 Flash)

In [12]:
# Updating the previous version
!pip install --upgrade google-generativeai




In [1]:
#Installing all the required libraries
!pip install google-generativeai PyPDF2 pandas python-docx -q


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m9.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Importing all the libraries for working
import os # for os related like accessing credentials, filepath
import google.generativeai as genai # Google's generative model(gemini pro)
from PyPDF2 import PdfReader # for reading the pdf file

# Load your Gemini API key
# SECURITY: never paste a real key into the notebook -- anything committed or
# shared here is exposed. Any key that was previously hard-coded in this cell
# must be revoked. The key is taken from the GEMINI_API_KEY environment
# variable; if it is not set, prompt for it without echoing the input.
if not os.environ.get("GEMINI_API_KEY"):
    from getpass import getpass  # local import: only needed for the interactive fallback

    os.environ["GEMINI_API_KEY"] = getpass("Enter your Gemini API key: ")
genai.configure(api_key=os.environ["GEMINI_API_KEY"]) # setting up the environment


In [4]:
from docx import Document  # for docx file reader

# Function for extracting the raw text from sample resume files(Pdf, docx)
def extract_text_from_file(file_path):
    """Return the stripped plain text of a resume stored as a PDF or DOCX file.

    Args:
        file_path: Path to the file. The extension (compared case-insensitively)
            selects which reader is used.

    Returns:
        The extracted text with leading and trailing whitespace removed.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.docx``.
    """
    _, extension = os.path.splitext(file_path)
    extension = extension.lower()

    if extension not in (".pdf", ".docx"):
        raise ValueError("Unsupported file type. Only .pdf and .docx are supported.")

    if extension == ".pdf":
        # Page texts are concatenated back to back, with no separator between pages.
        page_texts = [page.extract_text() for page in PdfReader(file_path).pages]
        return "".join(page_texts).strip()

    # DOCX: one output line per paragraph.
    paragraphs = Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs).strip()

# Calling function for preview
# NOTE(review): "/content/..." looks like a Colab upload path -- point it at your
# own .pdf or .docx file when running elsewhere.
resume_text = extract_text_from_file("/content/sample_resume.pdf")  # or .docx
# Only the first 800 characters are printed to keep the cell output short.
print("Preview:\n", resume_text[:800])


📄 Preview:
 Vijay Pagare
(+91)889XXXXX2 8 | xyz@gmail.com | Miraroad, Thane, MH, IND https://
www.linkedin.com/in/xyz/
Afrontend-leaningsoftwareengineerwhohas4.5+yearsofexperienceinbuildingandmaintaininghigh-quality(B2B)
saasproductsandwebapplications.Provenabilitytoworkindependentlyandaspartofateaminfast-moving,
resource-constraintenvironmentswhereshortturnaroundtimesareanorm.Exceptionalatleveraginginterpersonalskills
tofacilitateacollaborativerelationshipamongcross-functionalteamstogettheworkdone.Excellentproblem-solverwith
anaptitudefortroubleshootingandtheabilitytoquicklymasternewskills,technology,orarole.
PROFESSIONAL EXPERIENCE
PROPELLOR.AI
Software Engineer - FrontendPune - Remote
August2021–Present
●Architected, built and maintained business critical modules for a data uni  cation and visualis


In [5]:
# Build the prompt that is sent to the Gemini model
# The prompt lists exactly which fields to extract from the resume text

def build_resume_prompt(resume_text):
    """Build the resume-parsing prompt around the given resume text.

    The prompt tells the model to act as a resume parser, embeds the resume
    between triple-quote delimiters, lists the JSON fields to return, and
    asks for pure JSON without Markdown or explanations.

    Args:
        resume_text: Raw text of the resume, inserted verbatim.

    Returns:
        The complete prompt string (it starts and ends with a newline).
    """
    instructions = (
        "You are a professional resume parser. Extract the following information "
        "and return only a valid JSON object with no explanations or extra formatting:"
    )
    requested_fields = [
        "full_name",
        "email",
        "phone",
        "linkedin",
        "github",
        "skills (list)",
        "education (list of {degree, institution, year})",
        "work_experience (list of {title, company, start_year, end_year, description})",
        "certifications (list)",
        "languages (list)",
        "summary (string)",
    ]
    field_lines = "\n".join(f"- {field}" for field in requested_fields)
    delimiter = '"""'

    return (
        f"\n{instructions}\n"
        "\n"
        "Resume Text:\n"
        f"{delimiter}\n"
        f"{resume_text}\n"
        f"{delimiter}\n"
        "\n"
        "Return JSON with the following fields:\n"
        f"{field_lines}\n"
        "\n"
        "Only return pure valid JSON. No markdown. No explanations.\n"
    )


In [8]:
# Model loading and initializing; gemini-1.5-flash is the default because it is a small model
def extract_resume_json(prompt, model_name="models/gemini-1.5-flash"):
    """Send the prompt to a Gemini model and return the raw text of its reply.

    Args:
        prompt: The full prompt string (see ``build_resume_prompt``).
        model_name: Identifier of the model to use. Defaults to the model this
            notebook was written against, so existing one-argument calls are
            unaffected; pass another id to switch models without editing the
            function.

    Returns:
        The ``text`` attribute of the response, unmodified. The reply may still
        be wrapped in a Markdown code fence; callers are expected to clean it.
    """
    model = genai.GenerativeModel(model_name=model_name)
    response = model.generate_content([prompt])
    return response.text


In [11]:
#Final calling of all functions defined above
import json
import re
import pandas as pd

# Run the full chain
prompt = build_resume_prompt(resume_text)
response = extract_resume_json(prompt)

#For reference on how the model is working
print("Raw Gemini Output:\n", response)

# Parsing JSON
# The model tends to wrap its answer in a Markdown code fence (```json ... ```)
# even when told not to. Remove only the fence at the very start and very end:
# a blanket .replace("```", "") would also delete backticks that legitimately
# appear inside JSON string values.
clean_output = re.sub(r"\A```[A-Za-z0-9_-]*\s*|\s*```\Z", "", response.strip())

try:
    parsed = json.loads(clean_output)
except json.JSONDecodeError as e:
    # Only malformed JSON is reported here; any other error is a real bug and
    # should surface with its traceback instead of being swallowed.
    print("Failed to parse JSON:", e)
else:
    # Pretty print
    print("\nFinal Parsed JSON:")
    print(json.dumps(parsed, indent=4))


Raw Gemini Output:
 ```json
{
  "full_name": "Vijay Pagare",
  "email": "xyz@gmail.com",
  "phone": "+91889XXXXX28",
  "linkedin": "https://www.linkedin.com/in/xyz/",
  "github": null,
  "skills": [
    "Javascript",
    "Typescript",
    "React",
    "NextJS",
    "Angular",
    "Tailwind CSS",
    "HTML",
    "CSS/SCSS",
    "Git",
    "REST APIs",
    "Nodejs",
    "Linux",
    "Material Design",
    "Ant Design",
    "ES6",
    "Redux",
    "RxJS",
    "Apache Echarts",
    "D3.js",
    "Three.js",
    "Sockets",
    "PWA"
  ],
  "education": [
    {
      "degree": "Bachelor of Engineering - Computers",
      "institution": "Rajiv Gandhi Institute of Technology, Mumbai University",
      "year": 2019
    }
  ],
  "work_experience": [
    {
      "title": "Software Engineer - Frontend",
      "company": "PROPELLOR.AI",
      "start_year": 2021,
      "end_year": null,
      "description": "Architected, built and maintained business critical modules for a data unification and visual

# Method 2 - Using a transformer NER model (Hugging Face `dslim/bert-base-NER`) + Python regex

In [None]:
# Importing all the required libraries
import pdfplumber
import spacy # transformer model
import re # for regex using pattern
import json

In [None]:
import os
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline # hugging face transformer library

# Load BERT-based NER model
# The same checkpoint name is used for both the tokenizer and the model so the
# two stay in sync.
# NOTE(review): from_pretrained presumably downloads the checkpoint on first
# use -- confirm network access is available where this runs.
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# extract_ner_entities reads the "entity_group" and "word" keys of each result
# produced by this pipeline.
# NOTE(review): presumably aggregation_strategy="simple" is what yields those
# grouped keys -- confirm against the transformers pipeline documentation.
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


In [37]:
# For extracting the text from file
def extract_text_from_pdf(file_path):
    """Read a PDF with pdfplumber and return its text, one newline-terminated chunk per page.

    Pages whose extraction yields nothing (``None`` or an empty string) are
    skipped entirely, so they contribute no blank line.

    Args:
        file_path: Path of the PDF to open.

    Returns:
        The concatenated page texts, each followed by ``"\\n"``; an empty
        string when no page produced any text.
    """
    with pdfplumber.open(file_path) as pdf:
        extracted = [page.extract_text() for page in pdf.pages]
    return "".join(f"{chunk}\n" for chunk in extracted if chunk)


In [38]:
#For extracting ner entities which are supported by NER model
def extract_ner_entities(text):
    """Run the NER pipeline on the text and bucket the recognised words by entity group.

    Only the groups PER, ORG, LOC, DATE and MISC are kept; entities of any
    other group are ignored. Within each bucket duplicates are removed while
    keeping first-seen order.

    Args:
        text: The text to analyse.

    Returns:
        A dict mapping each of the five group labels to a list of unique words
        (empty when nothing was found for that group).
    """
    buckets = {label: [] for label in ("PER", "ORG", "LOC", "DATE", "MISC")}
    for entity in ner_pipeline(text):
        bucket = buckets.get(entity["entity_group"])
        if bucket is not None:
            bucket.append(entity["word"])
    # dict.fromkeys drops repeats without disturbing insertion order.
    return {label: list(dict.fromkeys(words)) for label, words in buckets.items()}


In [39]:
# For extracting the sections like experience, skills etc
def split_sections(text, max_header_words=5):
    """Split resume text into summary, work_experience, education and skills sections.

    The text is scanned line by line. A line is treated as a section heading
    when it is short (at most ``max_header_words`` words) AND contains one of
    the heading keywords. Heading lines switch the current section and are not
    copied to the output; every other line is stripped and appended to the
    current section followed by a newline. Lines before the first heading go
    to "summary".

    The length limit is what stops ordinary sentences from being mistaken for
    headings: without it, a summary line such as "4.5+ years of experience
    building SaaS products" would switch the section and be dropped.

    Args:
        text: Full resume text.
        max_header_words: Longest line, in whitespace-separated words, that may
            still count as a heading.

    Returns:
        A dict with the keys "summary", "work_experience", "education" and
        "skills", each holding the newline-terminated lines of that section
        ("" when the section is absent).
    """
    collected = {
        "summary": [],
        "work_experience": [],
        "education": [],
        "skills": []
    }
    current_section = "summary"
    for line in text.splitlines():
        stripped = line.strip()
        line_lower = stripped.lower()

        heading_for = None
        if 0 < len(line_lower.split()) <= max_header_words:
            # "project experience" style headings are deliberately not treated
            # as the start of the work-experience section.
            if "experience" in line_lower and "project" not in line_lower:
                heading_for = "work_experience"
            elif "education" in line_lower:
                heading_for = "education"
            elif "skills" in line_lower or "tech stack" in line_lower:
                heading_for = "skills"

        if heading_for is not None:
            current_section = heading_for
        else:
            collected[current_section].append(stripped + "\n")

    return {section: "".join(lines) for section, lines in collected.items()}


In [40]:
# For extracting the personal info
def extract_contact_info_and_summary(text):
    """Extract the email, phone number, address and a one-line summary from resume text.

    Args:
        text: Full resume text.

    Returns:
        A tuple ``(email, phone, address, summary)``:
        - email: first email address found, or "".
        - phone: first Indian-style mobile number found (optional "+91"
          prefix, optional leading 0, ten digits starting with 6-9), or "".
        - address: dict with "city", "state" and "country" keys.
        - summary: the first line, from the sixth line onwards, longer than
          40 characters after stripping, or "".
    """
    # The domain must contain at least one dot-separated label, so trailing
    # sentence punctuation ("a@b.com.") is not captured; "+" is allowed in the
    # local part so "jane+jobs@x.com" is not cut down to "jobs@x.com".
    email_match = re.search(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+', text)
    # Mobile numbers may start with 6 as well as 7, 8 or 9.
    phone_match = re.search(r'(\+91[\s\-]?)?[0]?[6-9]\d{9}', text)
    email = email_match.group() if email_match else ""
    phone = phone_match.group() if phone_match else ""

    # The first five lines are skipped (presumably the name/contact header);
    # the first sufficiently long line after them is taken as the summary.
    summary = ""
    for candidate in text.strip().split("\n")[5:]:
        candidate = candidate.strip()
        if len(candidate) > 40:
            summary = candidate
            break

    # NOTE(review): the address is NOT parsed from the resume. City is "thane"
    # only if that word occurs anywhere in the text and otherwise defaults to
    # "mumbai"; state and country are constants. Kept as-is for compatibility,
    # but this fabricates data for any resume from elsewhere.
    address = {
        "city": "thane" if "thane" in text.lower() else "mumbai",
        "state": "mh",
        "country": "ind"
    }

    return email, phone, address, summary


In [42]:
# For extracting work experience
def extract_work_experience(text):
    """Turn the work-experience section into a list of job entries.

    The text is cut before every line that starts with a capital letter and
    contains a dash, with at least one character on each side of the dash.
    In each resulting block the first non-empty line becomes the title, the
    second the company, and the rest (joined with spaces) the description.
    Blocks with fewer than two non-empty lines are discarded. The first
    "<month> <year> - <month> <year>|Present" range found in the block
    supplies the dates.

    Note that a date line such as "Jan 2020-Dec 2021" itself starts with a
    capital and contains a dash, so it begins a new block of its own.

    Args:
        text: Text of the work-experience section.

    Returns:
        A list of dicts with the keys "company" and "title" (lower-cased),
        "from_date", "to_date" (a single space each when no range was found)
        and "description".
    """
    date_range = re.compile(r'([A-Za-z]+\s?\d{4})\s?[–-]\s?(Present|\w+\s?\d{4})')
    entries = []
    for chunk in re.split(r"\n(?=[A-Z].+?[-–].+?)", text):
        rows = [row.strip() for row in chunk.strip().split("\n") if row.strip()]
        if len(rows) < 2:
            continue
        title, company, *details = rows
        ranges = date_range.findall(chunk)
        start, end = ranges[0] if ranges else (" ", " ")
        entries.append({
            "company": company.lower(),
            "title": title.lower(),
            "from_date": start,
            "to_date": end,
            "description": " ".join(details)
        })
    return entries


In [43]:
# For extracting the skills
def extract_skills(text):
    """Pull the skills list out of the resume text.

    Looks (case-insensitively) for the first "TECH STACK" or "Skills" marker
    and takes everything after it up to the next "Education", "Projects" or
    "Certifications" marker, or the end of the text. That span is split on
    commas and newlines.

    Args:
        text: Full resume text.

    Returns:
        A list of ``{"skill": <lower-cased name>}`` dicts in order of
        appearance; an empty list when no skills marker is found.
    """
    section = re.search(
        r'(TECH\s?STACK|Skills)[\s:\-]*([\s\S]+?)(Education|Projects|Certifications|$)',
        text,
        re.IGNORECASE,
    )
    if section is None:
        return []
    tokens = (token.strip() for token in re.split(r"[,\n]", section.group(2)))
    return [{"skill": token.lower()} for token in tokens if token]


In [44]:
#Final call for all the defined functions above
def extract_education(text):
    """Heuristically turn the education section into a list of entries.

    parse_resume calls this function, but no definition of it exists in the
    notebook, so a fresh-kernel run fails with NameError. This implementation
    fills that gap.

    Each non-empty line is classified as an institution line (it mentions
    university / institute / college / school / academy) or otherwise as a
    degree line. A new entry is started whenever the field a line would fill
    is already taken in the current entry. The last four-digit year (19xx or
    20xx) seen in an entry's lines is stored as its year.

    Args:
        text: Text of the education section.

    Returns:
        A list of dicts with the keys "institution", "degree" (both
        lower-cased, "" when missing) and "year" (string, "" when missing).
    """
    institution_hint = re.compile(r"university|institute|college|school|academy", re.IGNORECASE)
    year_pattern = re.compile(r"\b(?:19|20)\d{2}\b")

    entries = []
    current = None
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        field = "institution" if institution_hint.search(line) else "degree"
        if current is None or current[field]:
            current = {"institution": "", "degree": "", "year": ""}
            entries.append(current)
        current[field] = line.lower()
        years = year_pattern.findall(line)
        if years:
            # For a range such as "2015 - 2019" keep the end year.
            current["year"] = years[-1]
    return entries


def parse_resume(text):
    """Run every extractor on the resume text and assemble the final record.

    Args:
        text: Full resume text.

    Returns:
        A dict with the keys "first_name", "last_name", "email", "phone",
        "address" (city/state/country, lower-cased), "summary",
        "education_history", "work_history" and "skills".
    """
    ner = extract_ner_entities(text)
    sections = split_sections(text)

    # The first PER entity is taken as the candidate's name. It can be missing
    # or blank, in which case both name parts fall back to "" instead of
    # raising IndexError.
    name_parts = ner["PER"][0].split() if ner["PER"] else []
    first_name = name_parts[0].lower() if name_parts else ""
    last_name = name_parts[-1].lower() if len(name_parts) > 1 else ""

    email, phone, address, summary = extract_contact_info_and_summary(text)
    education = extract_education(sections["education"])
    work = extract_work_experience(sections["work_experience"])
    # extract_skills locates the skills span itself, so it gets the full text.
    skills = extract_skills(text)

    return {
        "first_name": first_name,
        "last_name": last_name,
        "email": email,
        "phone": phone,
        "address": {
            "city": address["city"].lower(),
            "state": address["state"].lower(),
            "country": address["country"].lower()
        },
        "summary": summary,
        "education_history": education,
        "work_history": work,
        "skills": skills
    }


In [None]:
# Testing using the given sample resume
# End-to-end run of Method 2: read the sample PDF, parse it, and write the
# result to a JSON file in the current working directory.
if __name__ == "__main__":
    # NOTE(review): "/content/..." looks like a Colab upload path -- change it
    # when running elsewhere.
    file_path = "/content/sample_resume.pdf"
    resume_text = extract_text_from_pdf(file_path)
    parsed_output = parse_resume(resume_text)

    # Overwrites parsed_output.json if it already exists.
    with open("parsed_output.json", "w") as f:
        json.dump(parsed_output, f, indent=4)

    print("✅ Resume parsed successfully. Output saved to parsed_output.json")
