neccessary package install

In [5]:
!pip install -q datasets transformers seqeval sentence-transformers accelerate scikit-learn faiss-cpu evaluate
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
import zipfile
import os

# Unzip Jobdesc.zip
with zipfile.ZipFile('Jobdesc.zip', 'r') as zip_ref:
    zip_ref.extractall('JobDesc')
print(os.listdir('JobDesc'))

# Unzip ResumesJsonAnnotated.zip
with zipfile.ZipFile('ResumeJSONAnnotated.zip', 'r') as zip_ref:
    zip_ref.extractall('ResumeJSONAnnotated')
print(os.listdir('ResumeJSONAnnotated'))

['Technology Skills.txt', 'Alternate Titles.txt', 'Task Ratings.txt', 'Emerging Tasks.txt', 'Education, Training, and Experience Categories.txt', 'Abilities to Work Context.txt', 'Occupation Level Metadata.txt', 'Work Values.txt', 'RIASEC Keywords.txt', 'Education, Training, and Experience.txt', 'Task Categories.txt', 'Job Zone Reference.txt', 'Task Statements.txt', 'Work Styles.txt', 'Abilities.txt', 'Read Me.txt', 'Survey Booklet Locations.txt', 'Skills.txt', 'Skills to Work Context.txt', 'Abilities to Work Activities.txt', 'Work Context Categories.txt', 'Interests Illustrative Occupations.txt', 'IWA Reference.txt', 'Occupation Data.txt', 'Knowledge.txt', 'Related Occupations.txt', 'Level Scale Anchors.txt', 'Scales Reference.txt', 'Interests Illustrative Activities.txt', 'Content Model Reference.txt', 'Tools Used.txt', 'DWA Reference.txt', 'Work Context.txt', 'Job Zones.txt', 'Work Activities.txt', 'UNSPSC Reference.txt', 'Basic Interests to RIASEC.txt', 'Skills to Work Activities.t

In [7]:
import os, json, glob, textwrap

In [8]:
# Kaggle/Dataturks JSON
RESUME_KAGGLE_PATH = 'ResumeNER.json'
# extracted folder from ResumeJSONAnnotated.zip
RESUME_HF_DIR = 'ResumesJsonAnnotated'
# extracted job descriptions
JOBDESC_DIR = 'Jobdesc'

print('Kaggle resume exists:', os.path.exists(RESUME_KAGGLE_PATH))
print('HF resume dir exists:', os.path.exists(RESUME_HF_DIR))
print('Jobdesc dir exists:', os.path.exists(JOBDESC_DIR))

# Show samples
if os.path.exists(RESUME_KAGGLE_PATH):
    with open(RESUME_KAGGLE_PATH, 'r', encoding='utf-8') as f:
        sample = json.loads(f.readline())
    print('\nKaggle sample keys:', list(sample.keys()))
    print('\nSample text (first 400 chars):\n', textwrap.fill(sample.get('content','')[:400], width=120))

if os.path.exists(RESUME_HF_DIR):
    hf_files = glob.glob(os.path.join(RESUME_HF_DIR, '*.json'))[:3]
    print('\nHuggingFace JSON files found (example 3):', hf_files)
    if hf_files:
        with open(hf_files[0], 'r', encoding='utf-8') as f:
            hf_sample = json.load(f)
        print('\nHF sample keys:', list(hf_sample.keys()))
        print('\nHF sample content (first 400 chars):\n', textwrap.fill(hf_sample.get('content','')[:400], width=120))

if os.path.exists(JOBDESC_DIR):
    jd_files = glob.glob(os.path.join(JOBDESC_DIR, '*'))[:5]
    print('\nJobdesc files (example):', jd_files)

Kaggle resume exists: True
HF resume dir exists: False
Jobdesc dir exists: False

Kaggle sample keys: ['content', 'annotation', 'extras']

Sample text (first 400 chars):
 Abhishek Jha Application Development Associate - Accenture  Bengaluru, Karnataka - Email me on Indeed:
indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a  • To work for an organization which provides me the opportunity to improve
my skills and knowledge for my individual and company's growth in best possible ways.  Willing to relocate to:
Bangalore, Karnataka  WORK EXPERIENCE  Application Development Asso


In [9]:
from nltk.tokenize import word_tokenize
import re
from tqdm import tqdm

Convert Kaggle/Dataturks format with error handling

In [10]:
def convert_kaggle_dataturks(kaggle_path):
    records = []
    with open(kaggle_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            rec = json.loads(line)
            text = rec.get('content') or rec.get('text') or ''
            entities = []
            for ann in rec.get('annotation', []):
                labels = ann.get('label', [])
                label = labels[0] if labels else 'MISC'
                for p in ann.get('points', []):
                    start = p.get('start'); end = p.get('end')
                    if start is None or end is None:
                        # some variants embed offsets differently; fallback to finding the text
                        span = p.get('text', '').strip()
                        if span and span in text:
                            start = text.index(span)
                            end = start + len(span)
                        else:
                            continue
                    entities.append({'start': start, 'end': end, 'label': label.upper()})
            records.append({'text': text, 'entities': entities})
    return records

Convert HF annotated format with improved error handling

In [11]:
def convert_hf_annotated(hf_dir):
    records = []
    import glob, os
    for p in glob.glob(os.path.join(hf_dir, '*.json')):
        with open(p, 'r', encoding='utf-8') as f:
            rec = json.load(f)
        text = rec.get('content') or rec.get('text') or ''
        entities = []
        # HF variants sometimes use 'annotation' or 'annotations' or 'entities' keys
        ann_key = None
        for k in ['annotation','annotations','entities']:
            if k in rec:
                ann_key = k; break
        if ann_key is None:
            # trying nested structures
            ann_key = 'annotation' if 'annotation' in rec else None
        if ann_key:
            for ann in rec.get(ann_key, []):
                # different formats: ann might be in this format like {label:['X'], points:[{start,end,text}]}
                if 'points' in ann:
                    labels = ann.get('label', [])
                    label = labels[0] if labels else 'MISC'
                    for p in ann['points']:
                        start = p.get('start'); end = p.get('end')
                        if start is None or end is None:
                            span = p.get('text','').strip()
                            if span and span in text:
                                start = text.index(span); end = start + len(span)
                            else:
                                continue
                        entities.append({'start': start, 'end': end, 'label': label.upper()})
                else:
                    # ann may be like this format {start:.., end:.., label:..}
                    start = ann.get('start'); end = ann.get('end'); label = ann.get('label') or ann.get('tag') or 'MISC'
                    if start is None or end is None:
                        continue
                    entities.append({'start': start, 'end': end, 'label': label.upper()})
        records.append({'text': text, 'entities': entities})
    return records

Data Processing ⬇

In [12]:
# Run conversion on the provided paths (if they exist)
kaggle_records = []
hf_records = []
if os.path.exists(RESUME_KAGGLE_PATH):
    kaggle_records = convert_kaggle_dataturks(RESUME_KAGGLE_PATH)
    print('Converted Kaggle records:', len(kaggle_records))
if os.path.exists(RESUME_HF_DIR):
    hf_records = convert_hf_annotated(RESUME_HF_DIR)
    print('Converted HF records:', len(hf_records))

Converted Kaggle records: 220


In [13]:
# Inspect a sample mapping
if kaggle_records:
    print('\nKaggle sample entities:', kaggle_records[0]['entities'][:5])
if hf_records:
    print('\nHF sample entities:', hf_records[0]['entities'][:5])


Kaggle sample entities: [{'start': 1295, 'end': 1621, 'label': 'SKILLS'}, {'start': 993, 'end': 1153, 'label': 'SKILLS'}, {'start': 939, 'end': 956, 'label': 'COLLEGE NAME'}, {'start': 883, 'end': 904, 'label': 'COLLEGE NAME'}, {'start': 856, 'end': 860, 'label': 'GRADUATION YEAR'}]


Convert entity spans to BIO format with improved alignment

In [14]:
def spans_to_bio(text, entities):
    try:
        tokens = word_tokenize(text)
        if not tokens:
            return [], []
        # Compute character offsets for tokens
        offsets = []
        cursor = 0
        for token in tokens:
            idx = text.find(token, cursor)
            if idx == -1:
                # Fallback for tokenization mismatches
                idx = cursor
                cursor += 1
            else:
                cursor = idx + len(token)
            offsets.append((idx, idx + len(token)))
        labels = ['O'] * len(tokens)

        for ent in entities:
            s, e, lab = ent['start'], ent['end'], ent['label']
            start_idx = None
            end_idx = None
            # Token overlaps with entity span
            for i, (a, b) in enumerate(offsets):
                if b > s and a < e:
                    if start_idx is None:
                        start_idx = i
                    end_idx = i
            if start_idx is not None:
                labels[start_idx] = 'B-' + lab
                for j in range(start_idx + 1, end_idx + 1):
                    if j < len(labels):
                        labels[j] = 'I-' + lab

        return tokens, labels
    except Exception as e:
        print(f"Error in BIO tagging: {e}")
        return [], []

In [15]:
# Apply to combined records
combined_examples = []
for rec in kaggle_records + hf_records:
    text = rec['text']
    ents = rec['entities']
    toks, labs = spans_to_bio(text, ents)
    if len(toks) == 0:
        continue
    combined_examples.append({'tokens': toks, 'labels': labs, 'text': text})

In [16]:
print('Total tokenized examples:', len(combined_examples))
print('Sample tokens:', combined_examples[0]['tokens'][:20])
print('Sample labels:', combined_examples[0]['labels'][:20])

Total tokenized examples: 220
Sample tokens: ['Abhishek', 'Jha', 'Application', 'Development', 'Associate', '-', 'Accenture', 'Bengaluru', ',', 'Karnataka', '-', 'Email', 'me', 'on', 'Indeed', ':', 'indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a', '•', 'To', 'work']
Sample labels: ['B-NAME', 'I-NAME', 'B-DESIGNATION', 'I-DESIGNATION', 'I-DESIGNATION', 'O', 'B-COMPANIES WORKED AT', 'B-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'B-EMAIL ADDRESS', 'I-EMAIL ADDRESS', 'I-EMAIL ADDRESS', 'O', 'O', 'O']


Label Normalisatoin

In [17]:
label_map = {
    'NAME':'NAME','PERSON':'NAME',
    'EMAIL ADDRESS':'EMAIL','EMAIL':'EMAIL','E-MAIL':'EMAIL',
    'PHONE':'PHONE','MOBILE':'PHONE','CONTACT':'PHONE',
    'COMPANIES WORKED AT':'ORG','COMPANY':'ORG','ORGANIZATION':'ORG',
    'DESIGNATION':'TITLE','TITLE':'TITLE','ROLE':'TITLE',
    'SKILLS':'SKILL','TECHNICAL SKILLS':'SKILL','SKILL':'SKILL',
    'COLLEGE NAME':'ORG','DEGREE':'DEGREE','GRADUATION YEAR':'DATE',
    'LOCATION':'LOCATION','ADDRESS':'LOCATION'
}

Normalize entity labels to standard format

In [18]:
def normalize_label(lab):
    if lab == 'O':
        return 'O'
    if '-' in lab:
        pref, ent = lab.split('-',1)
    else:
        pref, ent = 'B', lab
    ent = ent.strip().upper()
    ent_norm = label_map.get(ent, ent)
    return pref + '-' + ent_norm

In [19]:
for ex in combined_examples:
    ex['labels'] = [normalize_label(l) for l in ex['labels']]

# list labels used
all_labels = sorted({lab for ex in combined_examples for lab in ex['labels'] if lab!='O'})
print('Entity labels found:', all_labels)

Entity labels found: ['B-DATE', 'B-DEGREE', 'B-EMAIL', 'B-LOCATION', 'B-MISC', 'B-NAME', 'B-ORG', 'B-SKILL', 'B-TITLE', 'B-UNKNOWN', 'B-YEARS OF EXPERIENCE', 'I-DEGREE', 'I-EMAIL', 'I-LOCATION', 'I-NAME', 'I-ORG', 'I-SKILL', 'I-TITLE', 'I-UNKNOWN', 'I-YEARS OF EXPERIENCE']


Loading data

In [20]:
from pathlib import Path
import json

In [21]:
out_path = 'ner_unified.jsonl'
with open(out_path, 'w', encoding='utf-8') as f:
    for ex in combined_examples:
        # keep tokens and labels
        f.write(json.dumps({'tokens': ex['tokens'], 'labels': ex['labels'], 'text': ex['text']}) + '\n')
print('Saved', out_path)

Saved ner_unified.jsonl


load creates dataset with 'tokens','labels','text' columns

In [22]:
# Load with datasets (token classification expects tokens or words)
from datasets import load_dataset
ds = load_dataset('json', data_files=out_path, field='data' if False else None)  # fallback to simple load
print(ds)

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'text'],
        num_rows: 220
    })
})


Model Setup

In [23]:
from transformers import AutoTokenizer
model_checkpoint = 'distilbert-base-cased'  # smaller/faster; switch to bert-base-cased if you prefer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Labl mapping creation

In [24]:
label_set = set()
dataset_key = 'train' if 'train' in ds else list(ds.keys())[0]

for labels_list in ds[dataset_key]['labels']:
    for label in labels_list:
        label_set.add(label)

label_list = ['O'] + sorted([l for l in label_set if l != 'O'])
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for l, i in label2id.items()}

print('Number of labels:', len(label_list))
print('Label list:', label_list[:10])  # Show first 10

Number of labels: 21
Label list: ['O', 'B-DATE', 'B-DEGREE', 'B-EMAIL', 'B-LOCATION', 'B-MISC', 'B-NAME', 'B-ORG', 'B-SKILL', 'B-TITLE']


In [25]:
# Tokenize & align (works on 'tokens' using is_split_into_words=True)
def tokenize_and_align_labels(batch):
    tokenized_inputs = tokenizer(batch['tokens'], is_split_into_words=True, truncation=True, padding='max_length', max_length=256)
    labels = []
    for i, lab in enumerate(batch['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label2id.get(lab[word_idx], label2id['O']))
            else:
                # For tokens created from the same word, use -100 to ignore or use I- prefix
                # We'll set -100 to ignore sub-tokens
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs['labels'] = labels
    return tokenized_inputs

Convert to dataset object that Trainer expects

In [26]:
try:
    tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
    print("Tokenization successful")
    print(tokenized_ds)
except Exception as e:
    print(f"Tokenization error: {e}")

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Tokenization successful
DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels', 'text', 'input_ids', 'attention_mask'],
        num_rows: 220
    })
})


In [27]:
!pip install evaluate



In [28]:
!pip show transformers

Name: transformers
Version: 4.55.4
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.12/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, sentence-transformers


Training Setup

In [29]:
from transformers import AutoModelForTokenClassification, Trainer, DataCollatorForTokenClassification
from transformers import TrainingArguments
from evaluate import load
from sklearn.model_selection import train_test_split
import numpy as np

In [30]:
# Split data for training and validation
dataset_key = list(tokenized_ds.keys())[0]
full_dataset = tokenized_ds[dataset_key]

In [31]:
# Create train/validation split
train_indices, val_indices = train_test_split(
    range(len(full_dataset)),
    test_size=0.2,
    random_state=42
)

In [32]:
train_dataset = full_dataset.select(train_indices)
val_dataset = full_dataset.select(val_indices)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

Training samples: 176
Validation samples: 44


model init

In [33]:
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detect if running in Colab or local environment

In [36]:
def get_base_path():
    if '/content' in os.getcwd() or os.path.exists('/content'):
        return '/content'
    return os.getcwd()

BASE_PATH = get_base_path()

In [49]:
args = TrainingArguments(
    output_dir=os.path.join(BASE_PATH, 'NER_Resume_Model'),
    learning_rate=0.01,
    per_device_train_batch_size=30,
    per_device_eval_batch_size=30,
    num_train_epochs=5,
    weight_decay=0.1,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    logging_steps=50,
    push_to_hub=False,
    load_best_model_at_end=True,
    optim="adamw_torch"
)

In [50]:
from evaluate import load
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load('seqeval')

In [43]:
from collections import Counter
import numpy as np

# Check class distribution in your training data
def check_class_distribution(dataset):
    all_labels = []
    for example in dataset:
        labels = [label for label in example['labels'] if label != -100]
        all_labels.extend(labels)

    label_counts = Counter(all_labels)
    total = sum(label_counts.values())

    print("Class distribution:")
    for label_id, count in label_counts.items():
        print(f"{id2label[label_id]}: {count} ({count/total*100:.2f}%)")

    return label_counts

check_class_distribution(train_dataset)

Class distribution:
B-NAME: 175 (0.64%)
I-NAME: 173 (0.64%)
O: 23365 (85.92%)
B-EMAIL: 158 (0.58%)
B-YEARS OF EXPERIENCE: 31 (0.11%)
B-TITLE: 265 (0.97%)
I-TITLE: 512 (1.88%)
B-LOCATION: 254 (0.93%)
I-EMAIL: 66 (0.24%)
B-SKILL: 75 (0.28%)
I-SKILL: 979 (3.60%)
B-ORG: 351 (1.29%)
B-DEGREE: 74 (0.27%)
I-DEGREE: 160 (0.59%)
I-ORG: 449 (1.65%)
B-DATE: 64 (0.24%)
I-LOCATION: 11 (0.04%)
I-YEARS OF EXPERIENCE: 33 (0.12%)


Counter({6: 175,
         15: 173,
         0: 23365,
         3: 158,
         11: 31,
         9: 265,
         18: 512,
         4: 254,
         13: 66,
         8: 75,
         17: 979,
         7: 351,
         2: 74,
         12: 160,
         16: 449,
         1: 64,
         14: 11,
         20: 33})

Compute evaluation metrics

In [51]:
def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=2)
    true_labels = [[id2label[l] for l in lab if l!=-100] for lab in labels]
    true_preds = []
    for i, lab in enumerate(labels):
        lab_ids = lab
        pred_ids = preds[i]
        cur_preds = []
        for j, lab_id in enumerate(lab_ids):
            if lab_id == -100:
                continue
            cur_preds.append(id2label[pred_ids[j]])
        true_preds.append(cur_preds)
    results = metric.compute(predictions=true_preds, references=true_labels)
    # flatten and return macro-F1 etc.
    return {
        'precision': results.get('overall_precision'),
        'recall': results.get('overall_recall'),
        'f1': results.get('overall_f1'),
        'accuracy': results.get('overall_accuracy')
    }

Trainer init

In [52]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


Training Start

In [48]:
trainer.train()
trainer.save_model('NER_Resume_Model')

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,2.837379,0.0,0.0,0.0,0.000584
2,No log,1.043439,0.0,0.0,0.0,0.855036
3,No log,0.800066,0.0,0.0,0.0,0.855036
4,No log,0.776743,0.0,0.0,0.0,0.855036
5,2.317200,0.777737,0.0,0.0,0.0,0.855036


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Load and test the model

In [None]:
from transformers import pipeline
ner_pipe = pipeline('ner', model='NER_Resume_Model', tokenizer=tokenizer, grouped_entities=True)

if combined_examples:
  sample_text = combined_examples[0]['text'][:800]
  print("Sample text:")
  print(sample_text[:200] + "...")
  print("\nNER Results:")
  print(ner_pipe(sample_text))
else:
  print("No sample data available for testing")

In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
sbert = SentenceTransformer('all-MiniLM-L6-v2')

Build a small resumes representation (skills text + summary). We'll extract 'skills' heuristically from NER SKILL labels.

In [None]:
def extract_skills_from_text(text):
    # naive approach: look for lines after 'SKILLS' or 'TECHNICAL SKILLS'
    m = re.search(r'(SKILLS|TECHNICAL SKILLS|CORE COMPETENCIES):?\s*(.+)(?:\n\n|\n[A-Z]|\Z)', text, re.IGNORECASE|re.DOTALL)
    if m:
        skills_text = match.group(1).strip()
        # Clean up and limit length
        skills_text = re.sub(r'\n+', ' ', skills_text)
        return skills_text[:500]
    else:
        # fallback: return last 200 chars as 'summary'
        skills_text = text[-400:]
    return skills_text

Resume embedding dataset build

In [None]:
resume_texts = []
for ex in combined_examples:
    try:
        skills = extract_skills_from_text(ex['text'])
        resume_texts.append(skills if skills.strip() else ex['text'][:400])
    except Exception:
        resume_texts.append(ex['text'][:400])

print('Built', len(resume_texts), 'resume text items')

In [None]:
# Load job descriptions from JOBDESC_DIR (simple text files or JSONs)
job_texts = []
if os.path.exists(JOBDESC_DIR):
    for p in glob.glob(os.path.join(JOBDESC_DIR, '*')):
        try:
            with open(p, 'r', encoding='utf-8') as f:
                job_texts.append(f.read()[:2000])
        except Exception:
            # try json
            j = json.load(open(p,'r',encoding='utf-8'))
            job_texts.append(j.get('description', j.get('text', ''))[:2000])

print('Loaded', len(job_texts), 'job descriptions')

In [None]:
# Encode (this may take ~1-2 min for moderate sizes)
resume_embs = sbert.encode(resume_texts, convert_to_tensor=True, show_progress_bar=True)
print("Resume embeddings created successfully")

In [None]:
job_embs = sbert.encode(job_texts, convert_to_tensor=True, show_progress_bar=True)
print("Job embeddings created successfully")

In [None]:
# Matching demo: top-3 jobs for first 5 resumes
if job_embs is not None and len(job_embs)>0:
    for i in range(min(5, len(resume_embs))):
        sims = util.cos_sim(resume_embs[i], job_embs)[0]
        topk = np.argpartition(-sims.cpu().numpy(), range(min(3, len(sims))))[:3]
        print('\nResume', i, 'top matches:')
        for idx in topk:
            print('  job', idx, 'score', float(sims[idx]))

print('\nJob matching cell ready. Tune with skill normalization and exact-skill overlap for better results.')

In [None]:
# Save the notebook file
nb['cells'] = [nbf.v4.new_markdown_cell(title)] + nb['cells'] if False else nb['cells']
notebook_path = 'ResumeJobMatch.ipynb'
nbf.write(nb, notebook_path)
print('Notebook written to:', notebook_path)
print('You can download it from this environment or upload to your Google Drive to open in Colab.')

In [None]:
!pip install evaluate