In [None]:
# Setting Notebook vars

from pathlib import Path

# Directory scanned for input PDFs by the optional PDF-extraction step.
INPUT_PDF_DIR = Path("./data/pdfs")
INPUT_PDF_DIR.mkdir(parents=True, exist_ok=True)

# JSONL file (one record per resume) written by the PDF-extraction step.
OUTPUT_JSONL_PATH = Path("./data/labelstudio/resumes.jsonl")
OUTPUT_JSONL_PATH.parent.mkdir(parents=True, exist_ok=True)

# spaCy training paths
CONFIG_PATH = Path("./config.cfg")
TRAIN_JSON = Path("./train.json")
DEV_JSON = Path("./dev.json")
TRAIN_SPACY = Path("./train.spacy")  # was `Pat(...)`, which raised NameError and aborted this cell
DEV_SPACY = Path("./dev.spacy")
OUTPUT_DIR = Path("./output")
GPU_ID = 0  # set to -1 to run on CPU

## Step 1: Upload data (JSON datasets with parsed resumes)

In [None]:
from pathlib import Path
from IPython.display import display
import ipywidgets as widgets

# Destinations for the uploaded parsed-resume datasets.
# NOTE: these names are module-level globals; on_save reads them when the button is
# clicked, so a later cell that rebinds them changes where this cell's button saves.
TRAIN_JSON_PATH = Path("./parsed/train.json")
DEV_JSON_PATH = Path("./parsed/dev.json")

# One single-file upload widget per dataset, restricted to .json files.
train_uploader = widgets.FileUpload(accept=".json", multiple=False)
dev_uploader = widgets.FileUpload(accept=".json", multiple=False)
# Output area that on_save prints its status messages into.
out = widgets.Output()

def on_save(_):
    """Button callback: write the uploaded train/dev JSON files to disk.

    Reads the current values of ``train_uploader`` / ``dev_uploader`` and saves the
    first uploaded file of each to ``TRAIN_JSON_PATH`` / ``DEV_JSON_PATH``. Status
    messages are printed into the ``out`` widget. Nothing is written unless both
    files have been uploaded.

    Args:
        _: The clicked button (unused).
    """

    def _first_content(uploads):
        # ipywidgets 7.x: value is {filename: {"content": bytes, ...}}
        # ipywidgets 8.x: value is a tuple of dict-like entries with a "content" item
        first = next(iter(uploads.values())) if isinstance(uploads, dict) else uploads[0]
        # bytes() also normalises the memoryview that ipywidgets 8.x hands back.
        return bytes(first["content"])

    with out:
        out.clear_output()
        if not train_uploader.value or not dev_uploader.value:
            print("Please upload both train.json and dev.json")
            return
        for target, uploader in ((TRAIN_JSON_PATH, train_uploader), (DEV_JSON_PATH, dev_uploader)):
            # The target directory is not created anywhere else in the notebook.
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_bytes(_first_content(uploader.value))
        print(f"Saved train.json → {TRAIN_JSON_PATH}")
        print(f"Saved dev.json   → {DEV_JSON_PATH}")

# Button that triggers on_save when clicked.
btn = widgets.Button(description="Save files", button_style="success")
btn.on_click(on_save)

# Render a caption + uploader for each dataset, then the save button and status area.
display(widgets.HTML("<b>Upload train.json</b>"), train_uploader)
display(widgets.HTML("<b>Upload dev.json</b>"), dev_uploader)
display(btn, out)

## Extract text from PDFs (optional)


In [None]:
import json, re, sys
from pathlib import Path
from pdfminer.layout import LAParams

# Probe for PyMuPDF (imported as `fitz`). Any failure while importing it is treated
# as "not available" so the rest of the cell can still run.
try:
    import fitz
except Exception:
    USE_PYMUPDF = False
else:
    USE_PYMUPDF = True

def extract_text_pdfplumber(pdf_path):
    """Extract the text of every page of a PDF using pdfplumber.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        The page texts joined by a blank line; a page for which
        ``extract_text()`` returns a falsy value contributes an empty string.
    """
    import pdfplumber

    with pdfplumber.open(pdf_path) as document:
        page_texts = [page.extract_text() or "" for page in document.pages]
    return "\n\n".join(page_texts)

def extract_text_pymupdf(pdf_path):
    """Extract the text of every page of a PDF using PyMuPDF (``fitz``).

    Relies on the module-level ``import fitz`` having succeeded.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The page texts joined by a blank line; a page whose ``get_text("text")``
        is falsy contributes an empty string.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text("text") or "" for page in document]
    return "\n\n".join(page_texts)

def pdf_to_text(pdf_path: Path) -> str:
    """Return the extracted text of ``pdf_path``.

    Always delegates to the pdfplumber extractor.
    NOTE(review): USE_PYMUPDF / extract_text_pymupdf are not consulted here —
    confirm whether PyMuPDF was meant to be preferred when it is installed.
    """
    path_as_str = str(pdf_path)
    return extract_text_pdfplumber(path_as_str)

def convert_pdfs_to_text(in_dir: str, out_jsonl: str):
    """Convert every ``*.pdf`` in ``in_dir`` into one JSONL record per resume.

    Each record has the shape ``{"text": ..., "meta": {"source": <file name>}}``.
    PDFs whose extracted text has fewer than 30 non-whitespace-trimmed characters
    are skipped with a warning; extraction errors are reported and the file is
    skipped.

    Args:
        in_dir: Directory that is scanned (non-recursively) for ``*.pdf`` files.
        out_jsonl: Path of the JSONL file to write. Its parent directory is
            created if missing. The file is not touched when no PDFs are found.

    Returns:
        None.
    """
    pdf_files = sorted(Path(in_dir).glob("*.pdf"))
    if not pdf_files:
        # Return instead of sys.exit(1): this step is optional, and raising
        # SystemExit inside a notebook cell interrupts a "Run All".
        print(f"No PDFs found in: {in_dir}")
        return

    out_path = Path(out_jsonl)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    count_ok, count_empty = 0, 0

    with out_path.open("w", encoding="utf-8") as fw:
        for pdf in pdf_files:
            try:
                txt = pdf_to_text(pdf)
                # Strip before measuring: pages without text still contribute the
                # "\n\n" page separators, which must not count as content.
                if not txt or len(txt.strip()) < 30:
                    count_empty += 1
                    print(f"[WARN] Very little/empty text in {pdf.name} (skipping)")
                    continue

                # One LS task per resume
                rec = {"text": txt, "meta": {"source": pdf.name}}
                fw.write(json.dumps(rec, ensure_ascii=False) + "\n")
                count_ok += 1
            except Exception as e:
                # Best effort: report the failing file and continue with the rest.
                print(f"[ERROR] {pdf.name}: {e}")

    print(f"Done. Wrote {count_ok} items to {out_jsonl}. Skipped empty: {count_empty}")

# Run the PDF → JSONL conversion with the input directory and output file defined
# in the notebook-vars cell.
convert_pdfs_to_text(str(INPUT_PDF_DIR), str(OUTPUT_JSONL_PATH))

## Step 2: Normalize and clean the data:



In [None]:
import json
import sys

import re

# "(cid:123)" markers left behind by PDF text extraction.
CID = re.compile(r"\(cid:\d+\)")
# Hyphen followed by a line break (with optional surrounding whitespace); defined for
# replacing "-\n" with a space instead of removing it.
# NOTE(review): clean_text below never applies this pattern — confirm whether it should.
DEHYPH_LINE = re.compile(r"-\s*\n\s*")


def clean_text(s: str) -> str:
    """Normalize raw resume text.

    Steps, in order: unify line endings to "\\n", replace CID artifacts with a
    space, turn a single line break between two letters into a space
    (e.g. "Machine\\nLearning"), collapse runs of spaces/tabs to one space,
    collapse 3+ consecutive newlines to exactly two, and strip the ends.

    Args:
        s: Raw text.

    Returns:
        The cleaned text.
    """
    text = s.replace("\r\n", "\n").replace("\r", "\n")

    # (pattern, replacement) pairs applied one after another.
    rewrite_steps = (
        (CID, " "),
        (re.compile(r"([a-z])\n([a-z])", re.I), r"\1 \2"),
        (re.compile(r"[ \t]+"), " "),
        (re.compile(r"\n{3,}"), "\n\n"),
    )
    for pattern, replacement in rewrite_steps:
        text = pattern.sub(replacement, text)

    return text.strip()

def apply_clean_text(in_path, out_path):
    """Clean the ``"text"`` field of every record in a JSON array file.

    Args:
        in_path: Path of a JSON file containing a list of objects.
        out_path: Path of the JSON file to write (pretty-printed, UTF-8). Its
            parent directory is created if it does not exist.

    Each object is updated in place: ``obj["text"]`` becomes the cleaned text.
    A missing or null ``"text"`` is treated as an empty string.
    """
    # Local import: this cell does not import pathlib itself.
    from pathlib import Path

    with open(in_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    cleaned = []
    for obj in data:
        # `or ""` also covers an explicit null, which .get's default would not.
        obj["text"] = clean_text(obj.get("text") or "")
        cleaned.append(obj)

    # The output directory is not created anywhere else in the notebook.
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(cleaned, f, ensure_ascii=False, indent=2)

    print(f"Cleaned {len(cleaned)} resumes → {out_path}")

# Clean the uploaded parsed datasets (./parsed) and write the results under clean/.
apply_clean_text('./parsed/train.json', 'clean/train.json')
apply_clean_text('./parsed/dev.json', 'clean/dev.json')


## Step 3: Make prelabels for resumes to speed up the annotation process:

In [None]:
import json, re, sys
from pathlib import Path

# --- Regex dictionaries -----------------------------------------
# Skill vocabulary used for regex prelabeling. Every entry is a regular-expression
# fragment (not a plain string), so regex metacharacters must be escaped (e.g. C\+\+).
# find_spans adds word-boundary checks around each fragment and matches it
# case-insensitively. Several entries appear more than once (Keras, Seaborn,
# OpenShift, Git, Docker, Jupyter Notebook, REST API); the resulting duplicate
# matches are removed by the overlap dedupe in find_spans.
SKILL_TERMS = [
    # Programming languages & core libs
    r"Python", r"Java", r"JavaScript", r"TypeScript", r"C\+\+", r"C#", r"\bC\b", r"Rust", r"Go", r"R",
    r"NumPy", r"Pandas", r"scikit-?learn", r"PyTorch", r"TensorFlow", r"Keras", r"LightGBM",
    r"BeautifulSoup", r"Keras", r"CatBoost", r"Seaborn", r"OpenCV",

    # Visualization & analysis
    r"Matplotlib", r"Seaborn", r"Plotly", r"Tableau", r"Power BI", r"D3\.js",
    r"EDA", r"Exploratory Data Analysis", r"Statistical Analysis", r"Statistics", r"Statistical Modeling",
    r"Data Visualization", r"Data Cleaning", r"Data Preprocessing", r"Data Analysis", r"Business analytics",
    r"Regression Analysis", r"Classification", r"Clustering", r"AB-?Testing", r"A/B Testing",

    # ML & AI concepts
    r"Supervised learning", r"Unsupervised learning", r"Machine Learning",
    r"Deep Learning", r"CNNs?", r"RNNs?", r"Transformers?",
    r"Reinforcement Learning", r"Computer Vision", r"Named Entity Recognition",
    r"Feature Engineering", r"Hyperparameter Tuning", r"Algorithms", r"data structures",
    r"NLP", r"Robotics", r"Ray", r"MLflow", r"JAX", r"Hugging Face", r"spaCy",

    # Model evaluation metrics
    r"Model Evaluation", r"Accuracy", r"Precision", r"Recall", r"F1-?score", r"ROC-?AUC",

    # Backend / web frameworks & patterns
    r"Spring Boot", r"Spring", r"Django", r"Django REST Framework", r"DRF", r"Celery",
    r"Flask", r"REST API", r"GraphQL", r"Apollo",
    r"Node\.js", r"Express(\.js)?", r"NestJS", r"gRPC", r"socket\.io",

    # Frontend ecosystem
    r"React(\.js)?", r"Next\.js", r"Redux", r"React hooks", r"React-?router", r"Angular",
    r"redux-?saga", r"redux-?thunk", r"Effector", r"VueJS?",
    r"HTML5?", r"CSS3?", r"SCSS", r"PostCSS", r"JSS",
    r"CSS Modules", r"BEM", r"CSS-?in-?JS", r"Styled components",
    r"Material UI", r"DOM API", r"Canvas API", r"SVG",
    r"PWA", r"Web Workers", r"Push Notifications", r"IndexedDB",
    r"WebSockets?", r"HTTP", r"SSR",
    r"RxJS", r"UI/UX design principles", r"UX",

    # Mobile Development
    r"Kotlin", r"Swift", r"SwiftUI", r"Firebase",

    # Build & dev tools
    r"Webpack", r"Babel", r"npm", r"yarn", r"Bazel",
    r"ESLint", r"Prettier", r"Storybook", r"Chrome Devtools?", r"Figma", r"PyCharm", r"Jupyter Notebook",
    r"Maven", r"Gradle", r"Jenkins", r"TeamCity", r"Splunk", r"Prometheus", r"Grafana",
    r"GitHub Actions", r"Selenium", r"JMeter", r"Postman",

    # DevOps / CI/CD & Infrastructure
    r"CI/CD", r"Git", r"GitHub", r"Bitbucket", r"OpenShift",
    r"Docker", r"Kubernetes", r"Terraform", r"OpenShift",

    # Cloud & big data stack
    r"AWS", r"GCP", r"Azure",
    r"EMR", r"EC2", r"S3", r"DynamoDB", r"SQS", r"SNS", r"Lambda", r"AWS CDK",
    r"AWS Step Functions", r"AWS Batch", r"Athena",
    r"Elasticsearch", r"Elastic ?Search", r"Kafka", r"Spark", r"Hadoop", r"Hive", r"Presto", r"Druid", r"Zookeeper", r"Qubole",
    r"Airflow", r"BigQuery",

    # Monitoring & Logging
    r"ELK",

    # Security
    r"Kali Linux", r"Snort", r"Wireshark",

    # Robotics & Embedded Systems
    r"ROS", r"Embedded Systems", r"Gazebo",

    # Databases
    r"MySQL", r"DB2", r"MongoDB", r"Databases?", r"NoSQL", r"PostgreSQL", r"Oracle", r"ClickHouse", r"Hazelcast", r"\bSQL\b",

    # General tools & collaboration
    r"Git", r"Linux", r"Jupyter Notebook", r"APIs?", r"Excel",
    r"Jira", r"Confluence", r"Cloud platforms?", r"Docker", r"Bash", r"vim", r"LATEX",

    # Methodologies & practices
    r"Agile", r"Scrum", r"SDLC", r"Microservices",
    r"Microservice architecture", r"Micro-?frontend architecture",
    r"Performance Optimization", r"Web Security", r"SEO", r"Web Accessibility", r"a11y",
    r"OOP", r"SOLID", r"Design patterns", r"Clean Code", r"REST API", r"API development",
    r"Unit tests?", r"Integration tests?", r"e2e tests?", r"Screenshot tests?",
    r"Jest", r"React-?testing-?library", r"Cypress", r"Hermione", r"RAII",
    r"Product Roadmaps", r"API Design",

    # Soft/role-adjacent technical skills (keep for recall)
    r"Client Requirement Scoping",
    r"Cross-?functional Collaboration",
    r"Mentoring",
    r"Problem solving",
    r"Debugging",
]

# Regex fragments for spoken-language mentions, optionally with a proficiency level.
# Entries 0-1 cover "<language> <level>" / "<level> [of] <language>"; the remaining
# entries cover one language each with a CEFR code in parentheses or a "- <level>"
# suffix. The capturing groups are kept as-is; only whole-match offsets are needed.
LANGUAGE_TERMS = [
    r"(?:German|French|Spanish|Russian|English)(?:(?:\s+-\s*|\s+)(?:native|fluent|advanced|intermediate|beginner|basic|proficient|working\s+knowledge))?",
    r"(?:native|fluent|advanced|intermediate|beginner|basic|proficient|working\s+knowledge)(?:\s+of)?(?:\s+(?:German|French|Spanish|Russian|English))",
    # The bare CEFR alternative is grouped so the optional whitespace applies to both
    # B1/B2 and C1/C2 (previously "German C1" only matched "German").
    r"German(?:\s*\([A-C][12]\)|\s*-\s*(Beginner|Intermediate|Advanced|Fluent)|\s*(?:B[12]|C[12]))?",
    r"English(?:\s*\([A-C][12]\)|\s*-\s*(Beginner|Intermediate|Advanced|Fluent))?",
    r"French(?:\s*\([A-C][12]\)|\s*-\s*(Beginner|Intermediate|Advanced|Fluent))?",
    r"Russian(?:\s*\([A-C][12]\)|\s*-\s*(Beginner|Intermediate|Advanced|Fluent))?",
    r"Spanish(?:\s*\([A-C][12]\)|\s*-\s*(Beginner|Intermediate|Advanced|Fluent))?"
]

# Cleanup helpers
# "(cid:123)" markers left behind by PDF text extraction.
CID = re.compile(r"\(cid:\d+\)")
# Hyphen followed by a line break, including any whitespace around the break.
DEHYPH = re.compile(r"-\s*\n\s*")


def clean_text(s: str) -> str:
    """Normalize resume text before regex prelabeling.

    Falsy input yields "". Otherwise, in order: CID artifacts become a space,
    line endings are unified to "\\n", hyphenated line breaks are joined
    (hyphen and break removed), runs of spaces/tabs collapse to one space,
    3+ newlines collapse to two, and the ends are stripped.

    NOTE: this definition replaces the earlier ``clean_text`` from the
    normalization step once this cell has run.
    """
    if not s:
        return ""
    without_cid = CID.sub(" ", s)
    unix_newlines = without_cid.replace("\r\n", "\n").replace("\r", "\n")
    rejoined = DEHYPH.sub("", unix_newlines)  # join hyphenated line breaks
    single_spaced = re.sub(r"[ \t]+", " ", rejoined)
    return re.sub(r"\n{3,}", "\n\n", single_spaced).strip()

def find_spans(text: str, patterns, label):
    """Find non-overlapping matches of ``patterns`` in ``text`` as Label Studio results.

    Args:
        text: The text to search.
        patterns: Iterable of regex fragments; each is matched case-insensitively
            and must not be directly preceded or followed by a word character.
        label: Label assigned to every returned span.

    Returns:
        A list of Label Studio ``labels`` result dicts sorted by start offset.
        When candidates overlap, the earliest one wins, and among candidates
        starting at the same offset the longest one wins.
    """
    results = []
    for pat in patterns:
        # Lookarounds instead of \b...\b: \b needs a word character on one side, so
        # terms ending in a non-word character ("C++", "C#", "(B2)") could never
        # match when followed by a space or punctuation.
        regex = re.compile(rf"(?<!\w)(?:{pat})(?!\w)", flags=re.IGNORECASE)
        for m in regex.finditer(text):
            start, end = m.span()
            if start == end:
                # An all-optional pattern can match the empty string; not a span.
                continue
            results.append({
                "from_name": "label",
                "to_name": "text",
                "type": "labels",
                "value": {
                    "start": start,
                    "end": end,
                    "text": text[start:end],
                    "labels": [label]
                }
            })
    # Dedupe overlaps. Sorting by (start, -end) puts the longest candidate first for
    # each start offset, so "Spring Boot" is kept in preference to "Spring".
    results.sort(key=lambda r: (r["value"]["start"], -r["value"]["end"]))
    dedup, last_end = [], -1
    for r in results:
        s, e = r["value"]["start"], r["value"]["end"]
        if s >= last_end:
            dedup.append(r)
            last_end = e
    return dedup

def make_prediction_for(text: str):
    """Build one Label Studio prediction for ``text`` from the regex vocabularies.

    Skill spans come first, followed by Language spans; the two groups are
    deduplicated independently by ``find_spans``.

    Returns:
        A dict with the spans under "result", a fixed "score" of 0.3 and
        "model_version" set to "regex_v1".
    """
    labeled_vocabularies = ((SKILL_TERMS, "Skill"), (LANGUAGE_TERMS, "Language"))
    result = []
    for terms, label in labeled_vocabularies:
        result.extend(find_spans(text, terms, label))
    return {"result": result, "score": 0.3, "model_version": "regex_v1"}

def make_prelabels(in_json: str, out_json: str, with_predictions: bool = True):
    """Turn a JSON array of resumes into Label Studio tasks with regex prelabels.

    Args:
        in_json: Path of a JSON file containing a list of objects. The text is read
            from ``obj["text"]`` or, failing that, ``obj["data"]["text"]``.
        out_json: Path of the JSON file to write (a single array of tasks). Its
            parent directory is created if it does not exist.
        with_predictions: When true, each task carries one regex-based prediction.

    Raises:
        ValueError: If the input file does not contain a JSON array.
    """
    # Read input as JSON array ("utf-8-sig" tolerates a leading BOM).
    data = json.loads(Path(in_json).read_text(encoding="utf-8-sig"))
    if not isinstance(data, list):
        raise ValueError("Input must be a JSON array of objects")

    tasks = []
    for obj in data:
        # `or {}` also covers an explicit `"data": null`.
        raw = obj.get("text") or (obj.get("data") or {}).get("text") or ""
        text = clean_text(raw)
        data_field = {"text": text}
        if "meta" in obj:
            data_field["meta"] = obj["meta"]

        task = {"data": data_field}
        if with_predictions:
            # Offsets in the prediction refer to the cleaned text stored in the task.
            task["predictions"] = [make_prediction_for(text)]
        tasks.append(task)

    # Write a single valid JSON array; the output directory is not created elsewhere.
    out_path = Path(out_json)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(tasks, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"Wrote {len(tasks)} tasks to {out_json}")

# Forward-slash paths: in the previous '.\clean\train.json' literals the "\t" was
# interpreted as a TAB character, and backslash separators do not address the
# clean/ directory written by the normalization step on non-Windows systems.
Path("./prelabeled").mkdir(parents=True, exist_ok=True)
make_prelabels("./clean/train.json", "./prelabeled/train.json")
make_prelabels("./clean/dev.json", "./prelabeled/dev.json")

## Step 4: Annotate the resumes in Label Studio

## Load annotated resumes:

In [None]:
from pathlib import Path
from IPython.display import display
import ipywidgets as widgets

# Destinations for the annotated Label Studio exports.
# NOTE: this rebinds the same global names used by the earlier upload cell; on_save
# reads them at click time, so after this cell runs every save goes to ./annotated.
TRAIN_JSON_PATH = Path("./annotated/train.json")
DEV_JSON_PATH = Path("./annotated/dev.json")

# One single-file upload widget per dataset, restricted to .json files.
train_uploader = widgets.FileUpload(accept=".json", multiple=False)
dev_uploader = widgets.FileUpload(accept=".json", multiple=False)
# Output area that on_save prints its status messages into.
out = widgets.Output()

def on_save(_):
    """Button callback: write the uploaded annotated train/dev JSON files to disk.

    Reads the current values of ``train_uploader`` / ``dev_uploader`` and saves the
    first uploaded file of each to ``TRAIN_JSON_PATH`` / ``DEV_JSON_PATH``. Status
    messages are printed into the ``out`` widget. Nothing is written unless both
    files have been uploaded.

    Args:
        _: The clicked button (unused).
    """

    def _first_content(uploads):
        # ipywidgets 7.x: value is {filename: {"content": bytes, ...}}
        # ipywidgets 8.x: value is a tuple of dict-like entries with a "content" item
        first = next(iter(uploads.values())) if isinstance(uploads, dict) else uploads[0]
        # bytes() also normalises the memoryview that ipywidgets 8.x hands back.
        return bytes(first["content"])

    with out:
        out.clear_output()
        if not train_uploader.value or not dev_uploader.value:
            print("Please upload both train.json and dev.json")
            return
        for target, uploader in ((TRAIN_JSON_PATH, train_uploader), (DEV_JSON_PATH, dev_uploader)):
            # The target directory is not created anywhere else in the notebook.
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_bytes(_first_content(uploader.value))
        print(f"Saved train.json → {TRAIN_JSON_PATH}")
        print(f"Saved dev.json   → {DEV_JSON_PATH}")

# Button that triggers on_save when clicked.
btn = widgets.Button(description="Save files", button_style="success")
btn.on_click(on_save)

# Render a caption + uploader for each dataset, then the save button and status area.
display(widgets.HTML("<b>Upload train.json</b>"), train_uploader)
display(widgets.HTML("<b>Upload dev.json</b>"), dev_uploader)
display(btn, out)

## Install packages:


In [None]:
!pip install spacy==3.7.4 spacy-transformers==1.3.4 transformers==4.36.2

In [None]:
# Download the pretrained en_core_web_trf pipeline.
# NOTE(review): the code visible in this notebook builds docs with spacy.blank(...) and
# the training config names "distilbert-base-uncased" — confirm this download is needed.
!python -m spacy download en_core_web_trf

## Step 5: Convert data from JSON to spaCy format

In [None]:
import sys, json, srsly, pathlib
from spacy.tokens import DocBin
import spacy

# Entity labels kept when converting Label Studio results; extract_spans drops
# every region whose first label is not in this set.
TARGET_LABELS = {"Skill", "Work_Experience", "Education", "Language"}

def iter_tasks(obj):
    """Yield Label Studio task objects from a JSON-array file or a JSONL file.

    Args:
        obj: Path of the file to read (decoded as UTF-8, a leading BOM is ignored).

    Yields:
        Each element of the array when the content starts with "[" (after leading
        whitespace); otherwise one parsed object per non-blank line.
    """
    content = pathlib.Path(obj).read_text(encoding="utf-8-sig")
    if content.lstrip().startswith("["):
        yield from json.loads(content)
        return
    for raw_line in content.splitlines():
        stripped = raw_line.strip()
        if stripped:
            yield json.loads(stripped)

def extract_spans(task, use_predictions=False):
    """Collect ``(start, end, label)`` tuples from a Label Studio task.

    Args:
        task: Task dict as produced by a Label Studio export.
        use_predictions: Read regions from ``task["predictions"]`` instead of
            ``task["annotations"]`` (where gold labels are expected).

    Returns:
        One tuple per region of type "labels" whose first label is in
        TARGET_LABELS, in the order the regions appear; regions from all
        annotations/predictions of the task are concatenated.
    """
    source_key = "predictions" if use_predictions else "annotations"
    collected = []
    for annotation in task.get(source_key, []):
        for region in annotation.get("result", []):
            if region.get("type") != "labels":
                continue
            value = region["value"]
            region_labels = value.get("labels", [])
            # Only the first label of a region is considered.
            if region_labels and region_labels[0] in TARGET_LABELS:
                collected.append((value["start"], value["end"], region_labels[0]))
    return collected

def make_docbin(nlp, tasks, use_predictions=False):
    """Convert Label Studio tasks into a spaCy ``DocBin`` with entity annotations.

    Args:
        nlp: spaCy pipeline whose tokenizer is used via ``nlp.make_doc``.
        tasks: Iterable of task dicts; the text is read from ``task["data"]["text"]``
            or, failing that, ``task["text"]``. Tasks without text are skipped.
        use_predictions: Forwarded to ``extract_spans``.

    Returns:
        The DocBin containing one Doc per task with text. A span is dropped when it
        is empty, ends beyond the text, overlaps an already accepted span, or
        cannot be aligned to tokens (``char_span`` returns None).
    """
    doc_bin = DocBin(store_user_data=False)
    docs_with_ents = docs_without_ents = 0
    for task in tasks:
        text = (task.get("data") or {}).get("text") or task.get("text") or ""
        if not text:
            continue
        doc = nlp.make_doc(text)
        accepted = []
        candidates = extract_spans(task, use_predictions=use_predictions)
        # Per-character occupancy map used to reject overlapping spans.
        text_length = len(text)
        occupied = [False] * (text_length + 1)
        for start, end, label in sorted(candidates, key=lambda item: (item[0], item[1])):
            if not (start < end <= text_length):
                continue
            if any(occupied[start:end]):
                continue
            # "contract" shrinks the span to the tokens fully inside [start, end).
            entity = doc.char_span(start, end, label=label, alignment_mode="contract")
            if entity is None:
                continue
            for position in range(start, end):
                occupied[position] = True
            accepted.append(entity)
        doc.ents = accepted
        if accepted:
            docs_with_ents += 1
        else:
            docs_without_ents += 1
        doc_bin.add(doc)
    print(f"Built {docs_with_ents} docs with entities; {docs_without_ents} had none.")
    return doc_bin


# Conversion settings: language of the blank tokenizer, annotated inputs, output folder.
lang, train_in, dev_in, out_dir = "en", "./annotated/train.json", "./annotated/dev.json", "."
use_pred = False  # Default value: read gold labels from "annotations"

# Create tokenizer base for Doc creation
nlp = spacy.blank("en") if lang == "en" else spacy.blank("de")

# Load the tasks
train_tasks = list(iter_tasks(train_in))
dev_tasks = list(iter_tasks(dev_in))

# Create output directory if it doesn't exist.
# Named `spacy_out_dir` rather than `out`: `out` is the widgets.Output that the
# on_save upload callbacks write into, and rebinding it to a Path breaks them.
spacy_out_dir = pathlib.Path(out_dir)
spacy_out_dir.mkdir(parents=True, exist_ok=True)

# Create DocBin objects
train_db = make_docbin(nlp, train_tasks, use_predictions=use_pred)
dev_db = make_docbin(nlp, dev_tasks, use_predictions=use_pred)

# Save to disk
train_db.to_disk(spacy_out_dir / "train.spacy")
dev_db.to_disk(spacy_out_dir / "dev.spacy")

print(f"Wrote {spacy_out_dir/'train.spacy'} and {spacy_out_dir/'dev.spacy'}")

## Step 6: Set up model config:

In [None]:
# Full spaCy training config for a ["transformer", "ner"] pipeline, kept as a string
# and written to config.cfg by the code below. paths.train / paths.dev are left null
# here and are supplied on the command line of the training step
# (--paths.train / --paths.dev). The transformer weights are "distilbert-base-uncased".
config_content = """[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = "pytorch"
seed = 0

[nlp]
lang = "en"
pipeline = ["transformer","ner"]
batch_size = 128
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
pooling = {"@layers":"reduce_mean.v1"}
upstream = "*"

[components.transformer]
factory = "transformer"
max_batch_items = 4096
set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}

[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v3"
name = "distilbert-base-uncased"
mixed_precision = false

[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96

[components.transformer.model.grad_scaler_config]

[components.transformer.model.tokenizer_config]
use_fast = true

[components.transformer.model.transformer_config]

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[training]
accumulate_gradient = 3
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null

[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 2000
buffer = 256
get_length = null

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 0.00005

[training.score_weights]
ents_f = 1.0
ents_p = 0.0
ents_r = 0.0
ents_per_type = null

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
"""

# Persist the config string next to the notebook so `spacy train` can read it.
with open("config.cfg", mode="w") as cfg_file:
    cfg_file.write(config_content)

print("Config file generated!")

## Step 7: Train the model:

In [None]:
# Train with config.cfg on ./train.spacy and ./dev.spacy; results are written under ./output.
# NOTE(review): the paths and GPU id are hard-coded here although the notebook-vars cell
# defines CONFIG_PATH, TRAIN_SPACY, DEV_SPACY, OUTPUT_DIR and GPU_ID (-1 = CPU) — keep in sync.
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy --gpu-id 0

## Step 8: Save the model to Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!cp -r /content/output/model-best /content/drive/MyDrive/resume-parser-models/core-web-lg/