In [1]:
!pip install transformers sentence-transformers nltk pymupdf
!python -m nltk.downloader stopwords

Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.

In [2]:
from google.colab import files
uploaded = files.upload()
file_name = list(uploaded.keys())[0]

Saving resume.txt to resume.txt


In [3]:
import fitz  # PyMuPDF

def extract_text(file_path):
    if file_path.endswith(".pdf"):
        doc = fitz.open(file_path)
        text = ""
        for page in doc:
            text += page.get_text()
        return text
    elif file_path.endswith(".txt"):
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    else:
        return ""

In [4]:
import re

def clean_resume_text(text):
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'[•\-\*]+', ' ', text)  # remove bullets
    text = re.sub(r'http\S+|www\S+', '', text)  # remove URLs
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()

In [5]:
from transformers import pipeline

ner_pipeline = pipeline("ner", model="Jean-Baptiste/roberta-large-ner-english", grouped_entities=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/849 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/255 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cpu


In [12]:
from collections import defaultdict
from sentence_transformers import SentenceTransformer, util

# Load sentence embedding model for fuzzy matching
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Skills database
SKILLS_DB = ['python', 'java', 'sql', 'nlp', 'machine learning', 'deep learning', 'pandas', 'numpy', 'tensorflow', 'flask']

# Degrees and keywords
degree_keywords = ['b.tech', 'bachelor', 'm.tech', 'mba', 'ph.d', 'msc', 'b.sc', 'computer science', 'engineering']
exp_keywords = ['years', 'months', 'experience', 'worked', 'employment', 'intern', 'role']

def extract_structured_info(text):
    clean_text = clean_resume_text(text)
    ner_results = ner_pipeline(clean_text)
    structured = {
        "Name": None,
        "Institutions": set(),
        "Degree": set(),
        "Work_Experience": set(),
        "Skills": set()
    }

    # Name from top lines
    top_lines = '\n'.join(text.split('\n')[:5])
    top_ner = ner_pipeline(top_lines)
    for ent in top_ner:
        if ent['entity_group'] == 'PER':
            structured["Name"] = ent['word']
            break

    # Institutions via NER
    for item in ner_results:
        if item['entity_group'] == "ORG":
            structured["Institutions"].add(item['word'])

    # Degree keywords
    for keyword in degree_keywords:
        if keyword in clean_text.lower():
            structured["Degree"].add(keyword.title())

    # Experience (regex-based)
    exp_matches = re.findall(r'\d+\+?\s*(years?|months?)', clean_text.lower())
    for match in exp_matches:
        structured["Work_Experience"].add(match)

    # Experience context (lines with experience keywords)
    for line in text.split('\n'):
        if any(k in line.lower() for k in exp_keywords):
            structured["Work_Experience"].add(line.strip())

    # Fuzzy Skill Matching using embeddings
    resume_embedding = embedding_model.encode(clean_text, convert_to_tensor=True)

    for skill in SKILLS_DB:
        score = util.cos_sim(
            embedding_model.encode(skill, convert_to_tensor=True),
            resume_embedding
        )[0][0].item()

        if score > 0.3 or skill.lower() in clean_text.lower():
            structured["Skills"].add(skill)

    # Convert sets to lists
    for key in structured:
        if isinstance(structured[key], set):
            structured[key] = list(structured[key])

    return structured

In [13]:
resume_text = extract_text(file_name)
info = extract_structured_info(resume_text)

import json
print(json.dumps(info, indent=4))

{
    "Name": " Amit Verma",
    "Institutions": [
        " Google",
        " IIT Bombay",
        " TCS Research"
    ],
    "Degree": [
        "Engineering",
        "Computer Science",
        "Bachelor",
        "Mba"
    ],
    "Work_Experience": [
        "Internship:",
        "- Worked on natural language processing (NLP) projects using spaCy and BERT",
        "Experience:",
        "- Worked on data cleaning and preprocessing using Pandas and Numpy",
        "Machine Learning Intern at TCS Research, Pune \u2014 Jan 2020 to June 2020"
    ],
    "Skills": [
        "sql",
        "numpy",
        "python",
        "tensorflow",
        "pandas",
        "machine learning",
        "nlp",
        "java",
        "deep learning"
    ]
}
