# INSTALL PACKAGES

In [1]:
!pip install pdfplumber
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

# READ PDF FILE

In [12]:
# Path of the sample CV PDF that the loading cell below reads.
PDF_PATH = "/content/FPT_CV_ReactNative_Nguyen-Cuong-Phat.pdf"

In [13]:
import pdfplumber

def load_lines(path):
    """Return every non-blank text line of the PDF at ``path``, stripped.

    Pages are visited in order using pdfplumber. A page for which no text
    can be extracted contributes no lines.
    """
    collected = []
    with pdfplumber.open(path) as document:
        for pdf_page in document.pages:
            page_text = pdf_page.extract_text() or ""
            trimmed = (raw.strip() for raw in page_text.split("\n"))
            collected.extend(piece for piece in trimmed if piece)
    return collected

# Read the sample CV configured in PDF_PATH into a list of cleaned text lines.
lines = load_lines(PDF_PATH)


In [48]:
import re
import json
from typing import List, Dict

def filter_lines(lines: List[str]) -> List[str]:
    """Drop page-number footer lines of the form 'Page X of Y' (case-insensitive)."""
    footer = re.compile(r'^Page\s*\d+\s*of\s*\d+', re.IGNORECASE)
    kept = []
    for candidate in lines:
        if footer.match(candidate) is None:
            kept.append(candidate)
    return kept

def group_sections(lines: List[str]) -> Dict[str, List[str]]:
    """Bucket CV lines under the most recent section heading seen above them.

    A line counts as a heading only when it equals one of the known titles
    exactly. Page-footer lines are removed first. Lines that appear before
    the first heading are dropped, and a heading that shows up again starts
    its bucket over.
    """
    known_headings = frozenset((
        'PROFESSIONAL SUMMARY',
        'EDUCATION BACKGROUND',
        'WORKING EXPERIENCE',
        'WORK EXPERIENCE',
        'CERTIFICATIONS',
        'SKILLS',
        'MY PROJECTS',
        'PROJECTS',
    ))
    grouped: Dict[str, List[str]] = {}
    active = None
    for text in filter_lines(lines):
        if text in known_headings:
            active = text
            grouped[active] = []
            continue
        if active:
            grouped[active].append(text)
    return grouped

def clean_bullet(ln: str) -> str:
    """Remove leading bullet markers (§, •, -, *) and whitespace, then trim the rest."""
    leading_marks = re.compile(r'^[\s§•\-\*]+')
    without_marks = leading_marks.sub('', ln)
    return without_marks.strip()

def parse_summary(lines: List[str]) -> str:
    """Join the summary lines into one space-separated string with bullets removed."""
    return " ".join(map(clean_bullet, lines))

def parse_multi_line(lines: List[str], start_pattern: str) -> List[str]:
    """Merge wrapped lines into entries.

    A line matching ``start_pattern`` (tested at the start of the line)
    opens a new entry. Any other line is appended, after a single space, to
    the entry currently being built. Lines seen before the first entry are
    discarded.
    """
    starts_entry = re.compile(start_pattern).match
    entries: List[str] = []
    for text in lines:
        if starts_entry(text):
            entries.append(text)
        elif entries:
            entries[-1] = f"{entries[-1]} {text}"
    return entries

def parse_education(lines: List[str]) -> List[str]:
    """Group education lines into entries, each opened by a line starting with four digits."""
    entry_start = r'^\d{4}'
    return parse_multi_line(lines, entry_start)

def parse_experience(lines: List[str]) -> List[str]:
    """Group experience lines into entries.

    An entry opens on a line starting with either three letters, whitespace
    and four digits (e.g. 'Jan 2020'), or four digits alone.
    """
    entry_start = r'^(?:[A-Za-z]{3}\s+\d{4}|\d{4})'
    return parse_multi_line(lines, entry_start)

def parse_certifications(lines: List[str]) -> List[str]:
    """Group certification lines into entries, each opened by a line starting with four digits."""
    entry_start = r'^\d{4}'
    return parse_multi_line(lines, entry_start)

def parse_skills(lines: List[str]) -> Dict[str, List[str]]:
    """Map each skill category to its list of comma-separated items.

    The category is whatever precedes the first ':' on the line, or, when
    the line has no colon, whatever precedes the first space. Lines with
    neither separator are skipped. A category that appears again replaces
    the earlier entry.
    """
    by_category: Dict[str, List[str]] = {}
    for text in lines:
        # Skip page footers in case one slipped through the earlier filter.
        if text.lower().startswith('page '):
            continue
        # Prefer ':' as the separator; otherwise fall back to the first space.
        separator = ':' if ':' in text else (' ' if ' ' in text else None)
        if separator is None:
            continue
        label, _, remainder = text.partition(separator)
        pieces = [piece.strip() for piece in remainder.split(',')]
        by_category[label.strip()] = [piece for piece in pieces if piece]
    return by_category

def parse_projects(lines: List[str]) -> List[Dict]:
    """Turn the lines of the projects section into a list of project dicts.

    A project starts at a line that is directly followed by a line made up
    solely of a parenthesised span; those two lines become 'title' and
    'period'. Later lines fill in optional keys on the current project:
    'team_size' and 'customer', 'summary' (which may wrap over several
    lines), 'position', 'technologies' (the '§' bullet lines that follow),
    and 'programming_languages' (split on ',' or '/').
    """
    period_line = re.compile(r'^\([^)]*\)$')
    summary_end = re.compile(r'^(My Position|Technologies|Programming Languages)')

    collected: List[Dict] = []
    project: Dict = {}
    total = len(lines)
    idx = 0
    while idx < total:
        text = lines[idx].strip()
        upcoming = lines[idx + 1].strip() if idx + 1 < total else None

        # A line directly followed by a "(...)" line opens a new project.
        if upcoming is not None and period_line.match(upcoming):
            if project:
                collected.append(project)
            project = {'title': text, 'period': upcoming.strip('()')}
            idx += 2
            continue

        # "Team Size N ... Customer X" carries two fields on one line.
        if 'Team Size' in text and 'Customer' in text:
            found = re.search(r'Team Size\s*(\d+).*Customer\s*(.+)', text)
            if found:
                size, customer = found.groups()
                project['team_size'] = int(size)
                project['customer'] = customer.strip()
            idx += 1
            continue

        # The summary runs until the next recognised field label.
        if text.startswith('Summary'):
            pieces = [text[len('Summary'):].strip()]
            idx += 1
            while idx < total and not summary_end.match(lines[idx]):
                pieces.append(lines[idx].strip())
                idx += 1
            project['summary'] = ' '.join(pieces).strip()
            continue

        if text.startswith('My Position'):
            project['position'] = text[len('My Position'):].strip()
        elif text.startswith('Technologies'):
            # Consume the run of '§' bullet lines right after the label.
            idx += 1
            stack = []
            while idx < total and lines[idx].lstrip().startswith('§'):
                stack.append(clean_bullet(lines[idx]))
                idx += 1
            project['technologies'] = stack
            continue
        elif text.startswith('Programming Languages'):
            tail = text[len('Programming Languages'):].strip()
            project['programming_languages'] = [
                token.strip() for token in re.split(r'[,/]', tail) if token.strip()
            ]
        idx += 1

    # Flush the project still being built when the input ends.
    if project:
        collected.append(project)
    return collected

def parse_cv(lines: List[str]) -> Dict:
    """Build the structured CV dict from the raw text lines of a CV.

    Args:
        lines: Text lines of the CV in reading order. The first line is
            used as the name (presumably the candidate's name — verify
            against the CV layout).

    Returns:
        Dict with keys 'name', 'summary', 'education', 'experience',
        'certifications', 'skills' and 'projects'. A section that is missing
        from the CV yields the empty result of its parser.
    """
    sec = group_sections(lines)
    # A PDF with no extractable text (e.g. a scanned image) yields no lines;
    # fall back to an empty name instead of raising IndexError on lines[0].
    name = lines[0].strip() if lines else ''
    info = {
        'name'          : name,
        'summary'       : parse_summary(sec.get('PROFESSIONAL SUMMARY', [])),
        'education'     : parse_education(sec.get('EDUCATION BACKGROUND', [])),
        # Either heading spelling is accepted; the first non-empty one wins.
        'experience'    : parse_experience(sec.get('WORKING EXPERIENCE', [])
                                            or sec.get('WORK EXPERIENCE', [])),
        'certifications': parse_certifications(sec.get('CERTIFICATIONS', [])),
        'skills'        : parse_skills(sec.get('SKILLS', [])),
        'projects'      : parse_projects(sec.get('MY PROJECTS', [])
                                         or sec.get('PROJECTS', []))
    }
    return info

def clean_value(val):
    """
    Strip disallowed characters from every string inside ``val``.

    Strings keep only digits, ASCII letters, characters in the range À-ỹ,
    spaces and the punctuation . , : - ( ), and are then trimmed. Lists and
    dicts are processed recursively (dict keys are left untouched); any
    other value is returned unchanged.
    """
    if isinstance(val, dict):
        return {key: clean_value(item) for key, item in val.items()}
    if isinstance(val, list):
        return list(map(clean_value, val))
    if isinstance(val, str):
        allowed_only = re.sub(r"[^0-9A-Za-zÀ-ỹ .,:\-\(\)]", "", val)
        return allowed_only.strip()
    return val



In [79]:
def extract_CV_info(path):
    """Load the CV PDF at ``path``, parse it and return the cleaned result dict.

    Prints a short confirmation message once extraction has finished.
    """
    cv_fields = clean_value(parse_cv(load_lines(path)))
    print("Extracted CV info")
    return cv_fields

In [80]:
# Parse the first sample CV into a cleaned dict; it is scored against the job description later.
PHAT_INFO = extract_CV_info("/content/FPT_CV_ReactNative_Nguyen-Cuong-Phat.pdf")

Extracted CV info


In [81]:
# Parse the second sample CV into a cleaned dict; it is scored against the job description later.
TRI_INFO = extract_CV_info("/content/FPT_CV_ReactNative_NGUYEN-MINH-TRI.pdf")

Extracted CV info


# Extract Key Features from a Job Description PDF

In [53]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [59]:
import re
import json

def parse_jd(lines):
    """
    Parse job-description text lines into one flat dict.

    Args:
        lines: list of strings, one per text line of the job description.

    Returns:
        dict combining
          - metadata, as string values: job_title, location, department,
            reports_to, employment_type (only those present in ``lines``);
          - sections, as lists of strings: about_us, job_description,
            key_responsibilities, required_qualifications, desired_skills,
            benefits, how_to_apply (only those whose header appears).
        Keys are the label/header text lower-cased with spaces replaced by
        underscores. Lines seen before the first section header that are
        not metadata are ignored.
    """
    # Labels recognised as "Label: value" metadata lines.
    meta_labels = {"Job Title", "Location", "Department", "Reports To", "Employment Type"}
    meta_pattern = re.compile(r"^([^:]+):\s*(.+)$")

    # Leading bullet markers (and whitespace) to strip from section items.
    bullet_pattern = re.compile(r"^[·•\-\.\s]+(.+)$")

    # Exact line texts that open a new section.
    headers = {
        "About Us", "Job Description", "Key Responsibilities",
        "Required Qualifications", "Desired Skills", "Benefits", "How to Apply"
    }

    meta = {}
    sections = {}
    current_section = None

    for raw in lines:
        line = raw.strip()
        if not line:
            continue

        # 1. Metadata. The label is stripped so "Job Title : X" is accepted
        #    as well as "Job Title: X".
        m = meta_pattern.match(line)
        if m:
            label = m.group(1).strip()
            if label in meta_labels:
                meta[label.lower().replace(" ", "_")] = m.group(2)
                continue

        # 2. Section header
        if line in headers:
            current_section = line.lower().replace(" ", "_")
            sections[current_section] = []
            continue

        # 3. Content belonging to the current section
        if current_section:
            b = bullet_pattern.match(line)
            # Bullet items are stored without their marker; other lines as-is.
            sections[current_section].append(b.group(1).strip() if b else line)

    # Merge metadata and sections into a single dict.
    return {**meta, **sections}


In [82]:
JD_PATH = "/content/Job Description Senior React Native Developer.pdf"
# Reuse the CV line loader to pull the job description's text lines.
JD = load_lines(JD_PATH)

# Parse the lines into metadata and sections, then pretty-print the result as JSON.
jd_info = parse_jd(JD)
print(json.dumps(jd_info, ensure_ascii=False, indent=2))

{
  "job_title": "Senior/Lead React Native Developer",
  "location": "[Insert Location]",
  "department": "[Insert Department]",
  "reports_to": "[Insert Reporting Structure]",
  "employment_type": "Full-time",
  "about_us": [
    "[Company Name] is a leading technology company committed to building high-quality, scalable",
    "mobile applications that empower our users. As we continue to grow, we are looking for a",
    "Senior/Lead React Native Developer to join our talented team. You will have the opportunity",
    "to lead mobile development projects, mentor junior developers, and contribute to cutting-edge",
    "mobile applications for both iOS and Android platforms."
  ],
  "job_description": [
    "As a Senior/Lead React Native Developer, you will be responsible for designing, building, and",
    "maintaining highly scalable mobile applications. You will work closely with cross-functional",
    "teams, including product managers, designers, and backend developers, to deliver e

# Matching Score

In [67]:
import os
import time
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import pdfplumber
import numpy as np

In [68]:
# Load pre-trained BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [83]:
# Quick check of the container type returned by extract_CV_info.
print(type(TRI_INFO))

<class 'dict'>


In [88]:
def dict_to_text(data):
    """Flatten a (possibly nested) dict into plain text, one entry per line.

    Rendering rules:
      - scalar value  -> "key: value\n"
      - list value    -> "key:\n" followed by one "- item\n" line per item;
                         dict items are rendered recursively after "- "
      - dict value    -> "key:\n" followed by the nested rendering and "\n"

    Non-string scalars (numbers, booleans) are rendered with ``str()`` so
    they are not silently lost. ``None`` values and lists nested directly
    inside lists are skipped.

    Args:
        data: dict to render.

    Returns:
        The rendered text ('' for an empty dict).
    """
    parts = []
    for key, value in data.items():
        if isinstance(value, dict):
            parts.append(f"{key}:\n{dict_to_text(value)}\n")
        elif isinstance(value, list):
            parts.append(f"{key}:\n")
            for item in value:
                if isinstance(item, dict):
                    parts.append(f"- {dict_to_text(item)}\n")
                elif item is not None and not isinstance(item, list):
                    parts.append(f"- {item}\n")
        elif value is not None:
            parts.append(f"{key}: {value}\n")
    # Join once at the end instead of growing a string inside the loop.
    return "".join(parts)

In [91]:
def _embed_text(text):
    """Return the mean of BERT's last hidden states for ``text`` as a NumPy array.

    Uses the module-level ``tokenizer`` and ``model``. Input longer than
    512 tokens is truncated by the tokenizer, so only the beginning of a
    long text contributes to the embedding.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():  # inference only; no gradients needed
        pooled = model(**encoded).last_hidden_state.mean(dim=1)
    return pooled.numpy()


def matching_with_BERT(cv_text, jd_text):
    """Score how closely a CV matches a job description using BERT embeddings.

    Both texts are embedded with ``_embed_text`` and compared by cosine
    similarity. The score and the elapsed time are printed, and the score
    is also returned so callers can rank or store it.

    Args:
        cv_text: Plain-text rendering of the CV.
        jd_text: Plain-text rendering of the job description.

    Returns:
        The cosine similarity expressed as a percentage (float).
    """
    # perf_counter is a monotonic clock intended for measuring durations.
    start_time = time.perf_counter()

    similarity = cosine_similarity(_embed_text(cv_text), _embed_text(jd_text))
    matching_percent = float(similarity[0][0] * 100)

    processing_time = time.perf_counter() - start_time

    print(f"Matching Percentage: {matching_percent:.2f}%")
    print(f"Processing Time: {processing_time:.2f} seconds")
    return matching_percent

In [92]:
# Extract relevant text from CV and job description
cv_text = dict_to_text(TRI_INFO)
jd_text = dict_to_text(jd_info)

matching_with_BERT(cv_text, jd_text)

  return forward_call(*args, **kwargs)


Matching Percentage: 90.52%
Processing Time: 3.19 seconds


In [93]:
# Extract relevant text from CV and job description
cv_text = dict_to_text(PHAT_INFO)
jd_text = dict_to_text(jd_info)

matching_with_BERT(cv_text, jd_text)

  return forward_call(*args, **kwargs)


Matching Percentage: 92.41%
Processing Time: 3.92 seconds


# DEFINE MATCHING CAL