In [5]:
import json

# Step 1: Load the newline-delimited JSON file
data = []
with open("Entity Recognition in Resumes.json", "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))

print("Total resumes:", len(data))
print("Sample text:\n", data[0]["content"][:300])
print("Sample annotations:\n", data[0].get("annotation", [])[:2])


Total resumes: 220
Sample text:
 Abhishek Jha
Application Development Associate - Accenture

Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a

• To work for an organization which provides me the opportunity to improve my skills
and knowledge for my individual and company's growth in best possibl
Sample annotations:
 [{'label': ['Skills'], 'points': [{'start': 1295, 'end': 1621, 'text': '\n• Programming language: C, C++, Java\n• Oracle PeopleSoft\n• Internet Of Things\n• Machine Learning\n• Database Management System\n• Computer Networks\n• Operating System worked on: Linux, Windows, Mac\n\nNon - Technical Skills\n\n• Honest and Hard-Working\n• Tolerant and Flexible to Different Situations\n• Polite and Calm\n• Team-Player'}]}, {'label': ['Skills'], 'points': [{'start': 993, 'end': 1153, 'text': 'C (Less than 1 year), Database (Less than 1 year), Database Management (Less than 1 year),\nDatabase Management System (Less than 1 year), Java (Less than 1

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

resume_text = data[0]["content"]
annotations = data[0].get("annotation", [])

# Flatten all points into (start, end, label) tuples
entities = []
for ann in annotations:
    for point in ann['points']:
        entities.append((point['start'], point['end'], ann['label'][0]))

# Tokenize the full text with offsets
tokens_with_offsets = tokenizer(resume_text, return_offsets_mapping=True, truncation=True)

# Show first 10 tokens with offsets
for token, offset in zip(tokens_with_offsets.tokens(), tokens_with_offsets["offset_mapping"]):
    print(f"{token:15} -> {offset}")


[CLS]           -> (0, 0)
A               -> (0, 1)
##b             -> (1, 2)
##his           -> (2, 5)
##he            -> (5, 7)
##k             -> (7, 8)
J               -> (9, 10)
##ha            -> (10, 12)
Application     -> (13, 24)
Development     -> (25, 36)
Associate       -> (37, 46)
-               -> (47, 48)
A               -> (49, 50)
##cc            -> (50, 52)
##ent           -> (52, 55)
##ure           -> (55, 58)
Bengal          -> (60, 66)
##uru           -> (66, 69)
,               -> (69, 70)
Karnataka       -> (71, 80)
-               -> (81, 82)
Em              -> (83, 85)
##ail           -> (85, 88)
me              -> (89, 91)
on              -> (92, 94)
Indeed          -> (95, 101)
:               -> (101, 102)
indeed          -> (103, 109)
.               -> (109, 110)
com             -> (110, 113)
/               -> (113, 114)
r               -> (114, 115)
/               -> (115, 116)
A               -> (116, 117)
##b             -> (117, 118)
##his         

In [7]:
aligned_tokens = []
aligned_labels = []

for token, (tok_start, tok_end) in zip(tokens_with_offsets.tokens(), tokens_with_offsets["offset_mapping"]):
    if token in ["[CLS]", "[SEP]"]:
        continue  # Skip special tokens

    matched_label = "O"

    for ent_start, ent_end, ent_label in entities:
        if tok_start < ent_end and tok_end > ent_start:
            if tok_start == ent_start:
                matched_label = "B-" + ent_label
            else:
                matched_label = "I-" + ent_label
            break

    aligned_tokens.append(token)
    aligned_labels.append(matched_label)
for token, label in zip(aligned_tokens[:30], aligned_labels[:30]):
    print(f"{token:15} → {label}")


A               → B-Name
##b             → I-Name
##his           → I-Name
##he            → I-Name
##k             → I-Name
J               → I-Name
##ha            → I-Name
Application     → B-Designation
Development     → I-Designation
Associate       → I-Designation
-               → O
A               → B-Companies worked at
##cc            → I-Companies worked at
##ent           → I-Companies worked at
##ure           → I-Companies worked at
Bengal          → B-Location
##uru           → I-Location
,               → O
Karnataka       → O
-               → O
Em              → O
##ail           → O
me              → O
on              → O
Indeed          → B-Email Address
:               → I-Email Address
indeed          → I-Email Address
.               → I-Email Address
com             → I-Email Address
/               → I-Email Address


In [8]:
unique_labels = sorted(set(aligned_labels))
print(unique_labels)


['B-College Name', 'B-Companies worked at', 'B-Designation', 'B-Email Address', 'B-Graduation Year', 'B-Location', 'B-Name', 'B-Skills', 'I-College Name', 'I-Companies worked at', 'I-Designation', 'I-Email Address', 'I-Location', 'I-Name', 'I-Skills', 'O']


In [9]:
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for i, label in enumerate(unique_labels)}
print(label2id)


{'B-College Name': 0, 'B-Companies worked at': 1, 'B-Designation': 2, 'B-Email Address': 3, 'B-Graduation Year': 4, 'B-Location': 5, 'B-Name': 6, 'B-Skills': 7, 'I-College Name': 8, 'I-Companies worked at': 9, 'I-Designation': 10, 'I-Email Address': 11, 'I-Location': 12, 'I-Name': 13, 'I-Skills': 14, 'O': 15}


In [10]:
label_ids = [label2id[label] for label in aligned_labels]
print(label_ids[:30])

[6, 13, 13, 13, 13, 13, 13, 2, 10, 10, 15, 1, 9, 9, 9, 5, 12, 15, 15, 15, 15, 15, 15, 15, 3, 11, 11, 11, 11, 11]


In [11]:
all_samples = []
all_labels = set()

for item in data:
    resume_text = item["content"]
    annotations = item.get("annotation", [])

    entities = []
    for ann in annotations:
        label = ann.get("label")
        if label and len(label) > 0:
            for point in ann["points"]:
                # Skip noisy long email text (e.g. Indeed links)
                if "@" in point["text"] and len(point["text"]) > 50:
                    continue
                entities.append((point["start"], point["end"], label[0]))

    tokens_with_offsets = tokenizer(resume_text, return_offsets_mapping=True, truncation=True)

    aligned_tokens = []
    aligned_labels = []

    for token, (tok_start, tok_end) in zip(tokens_with_offsets.tokens(), tokens_with_offsets["offset_mapping"]):
        if token in ["[CLS]", "[SEP]"]:
            continue
        matched_label = "O"
        for ent_start, ent_end, ent_label in entities:
            if tok_start < ent_end and tok_end > ent_start:
                matched_label = "B-" + ent_label if tok_start == ent_start else "I-" + ent_label
                break
        aligned_tokens.append(token)
        aligned_labels.append(matched_label)
        all_labels.add(matched_label)

    # ✅ Skip samples with no useful labels
    if not any(l != "O" for l in aligned_labels):
        continue

    if aligned_tokens and len(aligned_tokens) == len(aligned_labels):
        input_ids = tokenizer.convert_tokens_to_ids(aligned_tokens)
        attention_mask = [1] * len(input_ids)
        all_samples.append({
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": aligned_labels
        })


In [12]:
unique_labels = sorted(all_labels)
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}
print("Unique labels:", unique_labels)

Unique labels: ['B-College Name', 'B-Companies worked at', 'B-Degree', 'B-Designation', 'B-Email Address', 'B-Graduation Year', 'B-Location', 'B-Name', 'B-Skills', 'B-Years of Experience', 'I-College Name', 'I-Companies worked at', 'I-Degree', 'I-Designation', 'I-Email Address', 'I-Graduation Year', 'I-Location', 'I-Name', 'I-Skills', 'I-Years of Experience', 'O']


In [13]:
for sample in all_samples:
    sample["labels"] = [label2id[label] for label in sample["labels"]]
print("Sample input_ids:", all_samples[0]["input_ids"][:30])

Sample input_ids: [138, 1830, 27516, 4638, 1377, 147, 2328, 22491, 3273, 9666, 118, 138, 19515, 3452, 3313, 7756, 12328, 117, 12247, 118, 18653, 11922, 1143, 1113, 10364, 131, 5750, 119, 3254, 120]


In [14]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

train_data, val_data = train_test_split(all_samples, test_size=0.2, random_state=42)
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
print("Train dataset size:", len(train_dataset))
print("Validation dataset size:", len(val_dataset))
print(train_dataset[0])

Train dataset size: 176
Validation dataset size: 44
{'input_ids': [156, 10131, 1179, 24930, 1742, 118, 18653, 11922, 1143, 1113, 10364, 131, 5750, 119, 3254, 120, 187, 120, 156, 10131, 1179, 118, 24930, 1742, 120, 124, 1161, 23249, 1477, 1161, 1559, 1830, 1559, 26752, 1545, 1161, 1559, 22433, 794, 5823, 125, 12577, 1116, 119, 1104, 4600, 1250, 2541, 1107, 13795, 5758, 1111, 6082, 113, 9059, 117, 8410, 114, 1105, 27833, 2394, 794, 13719, 118, 1113, 2541, 1114, 5537, 1216, 1112, 25327, 21906, 2737, 4184, 117, 9190, 5954, 4487, 2772, 117, 1130, 2137, 26643, 117, 1105, 6525, 7921, 18195, 794, 15843, 1107, 3780, 1822, 120, 1344, 118, 6448, 1126, 12512, 2913, 7700, 16548, 1116, 117, 1105, 4795, 5611, 1111, 4683, 1606, 5250, 2430, 2340, 2624, 5537, 794, 5823, 1363, 3044, 1113, 4297, 1329, 2740, 117, 4795, 2801, 117, 111, 20122, 1116, 794, 13719, 118, 1113, 2541, 1114, 145, 19974, 2162, 1571, 111, 24821, 1708, 1495, 117, 1259, 2771, 118, 19089, 25400, 794, 13719, 118, 1113, 2541, 1107, 3780, 1

In [15]:
#Load the model
from transformers import BertForTokenClassification

model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="ner_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    report_to="none"
)




In [17]:
from transformers import Trainer
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator  
)


trainer.train()





Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [18]:
# Step 5: Save the processed data
model.save_pretrained("ner_resume_model")
tokenizer.save_pretrained("ner_resume_model")


('ner_resume_model\\tokenizer_config.json',
 'ner_resume_model\\special_tokens_map.json',
 'ner_resume_model\\vocab.txt',
 'ner_resume_model\\added_tokens.json',
 'ner_resume_model\\tokenizer.json')

In [19]:
import torch
import random

# Pick a random resume
random_item = random.choice(data)
text = random_item["content"]

# Tokenize with offset mapping
tokens = tokenizer(text, return_offsets_mapping=True, return_tensors="pt", truncation=True)
input_ids = tokens["input_ids"]
attention_mask = tokens["attention_mask"]
offsets = tokens["offset_mapping"][0]

# Inference
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(outputs.logits, dim=2)[0]

# Convert predictions to labels
predicted_labels = [id2label[p.item()] for p in predictions]

# Combine tokens + offsets + labels
for token, (start, end), label in zip(tokenizer.convert_ids_to_tokens(input_ids[0]), offsets, predicted_labels):
    if label != "O":
        print(f"{text[start:end]} → {label}")


In [20]:
custom_text = custom_text = data[10]["content"]



tokens = tokenizer(custom_text, return_offsets_mapping=True, return_tensors="pt", truncation=True)
input_ids = tokens["input_ids"]
attention_mask = tokens["attention_mask"]
offsets = tokens["offset_mapping"][0]

model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(outputs.logits, dim=2)[0]

predicted_labels = [id2label[p.item()] for p in predictions]

for token, (start, end), label in zip(tokenizer.convert_ids_to_tokens(input_ids[0]), offsets, predicted_labels):
    word_piece = custom_text[start:end]
    print(f"{token:15} | {word_piece:20} | {label}")



[CLS]           |                      | O
As              | As                   | O
##ish           | ish                  | O
Rat             | Rat                  | O
##ha            | ha                   | O
Sub             | Sub                  | O
##ject          | ject                 | O
matter          | matter               | O
Ex              | Ex                   | O
##pert          | pert                 | O
-               | -                    | O
A               | A                    | O
##cc            | cc                   | O
##ent           | ent                  | O
##ure           | ure                  | O
Chennai         | Chennai              | O
,               | ,                    | O
Tamil           | Tamil                | O
Nadu            | Nadu                 | O
-               | -                    | O
Em              | Em                   | O
##ail           | ail                  | O
me              | me                   | O
on         

In [21]:
# Group token-level BIO predictions into full entities
entities = []
current_entity = ""
current_label = None

for (start, end), label in zip(offsets, predicted_labels):
    word = custom_text[start:end]
    if label.startswith("B-"):
        # Save previous if exists
        if current_entity:
            entities.append((current_entity.strip(), current_label))
        current_entity = word
        current_label = label[2:]  # Remove B-
    elif label.startswith("I-") and current_label == label[2:]:
        current_entity += " " + word
    else:
        if current_entity:
            entities.append((current_entity.strip(), current_label))
            current_entity = ""
            current_label = None

# Add last entity if needed
if current_entity:
    entities.append((current_entity.strip(), current_label))

# Print results
for ent, label in entities:
    print(f"{ent:30} → {label}")

print(set(predicted_labels))


{'O', 'I-Email Address'}


In [22]:
from collections import Counter

flat_labels = []
for sample in all_samples:
    flat_labels.extend(sample["labels"])

flat_labels_named = [id2label[i] for i in flat_labels]
print(Counter(flat_labels_named))


Counter({'O': 80353, 'I-Email Address': 5688, 'I-Skills': 4858, 'I-Designation': 1388, 'I-Companies worked at': 1025, 'I-College Name': 881, 'I-Name': 864, 'I-Degree': 597, 'I-Location': 428, 'B-Companies worked at': 414, 'B-Designation': 409, 'B-Location': 353, 'B-Name': 221, 'B-Email Address': 202, 'B-Skills': 159, 'B-College Name': 157, 'B-Degree': 141, 'B-Graduation Year': 119, 'I-Years of Experience': 82, 'B-Years of Experience': 27, 'I-Graduation Year': 16})


In [None]:
%pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py): started
  Building wheel for seqeval (setup.py): finished with status 'done'
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16247 sha256=9b9b336b851b6b0ea99c5d54eff02381a845d5b90e5d304f9c6c4e7d57723746
  Stored in directory: c:\users\rouna\appdata\local\pip\cache\wheels\5f\b8\73\0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [23]:
from seqeval.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=-1)

    true_labels = []
    true_predictions = []

    for pred, label in zip(predictions, labels):
        temp_labels = []
        temp_preds = []
        for p_i, l_i in zip(pred, label):
            if l_i != -100:  # ignore padded values
                temp_labels.append(id2label[l_i])
                temp_preds.append(id2label[p_i])
        true_labels.append(temp_labels)
        true_predictions.append(temp_preds)

    print(classification_report(true_labels, true_predictions))
    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
    }


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics  # ✅ New line here
)
trainer.evaluate()

  _warn_prf(average, modifier, msg_start, len(result))


                     precision    recall  f1-score   support

       College Name       0.09      0.29      0.14        24
Companies worked at       0.35      0.33      0.34        82
             Degree       0.37      0.26      0.30        27
        Designation       0.48      0.44      0.46        97
      Email Address       0.25      0.31      0.28        51
    Graduation Year       0.00      0.00      0.00        25
           Location       0.57      0.48      0.53        64
               Name       0.64      0.63      0.64        46
             Skills       0.04      0.09      0.05        35
Years of Experience       0.20      0.20      0.20         5

          micro avg       0.33      0.36      0.34       456
          macro avg       0.30      0.30      0.29       456
       weighted avg       0.37      0.36      0.36       456



{'eval_loss': 0.36635497212409973,
 'eval_model_preparation_time': 0.0009,
 'eval_precision': 0.32669322709163345,
 'eval_recall': 0.35964912280701755,
 'eval_f1': 0.3423799582463466,
 'eval_runtime': 11.8277,
 'eval_samples_per_second': 3.72,
 'eval_steps_per_second': 0.507}

In [None]:
custom_text = """
Rounak Laddha  
Machine Learning Engineer - Walmart  
Chicago, IL  
Email: rounak@example.com  
Skills: Python, SQL, Pandas, NumPy
"""


tokens = tokenizer(custom_text, return_offsets_mapping=True, return_tensors="pt", truncation=True)
input_ids = tokens["input_ids"]
attention_mask = tokens["attention_mask"]
offsets = tokens["offset_mapping"][0]

model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(outputs.logits, dim=2)[0]

predicted_labels = [id2label[p.item()] for p in predictions]

# Group BIO labels into full spans
entities = []
current_entity = ""
current_label = None

for (start, end), label in zip(offsets, predicted_labels):
    word = custom_text[start:end]
    if label.startswith("B-"):
        if current_entity:
            entities.append((current_entity.strip(), current_label))
        current_entity = word
        current_label = label[2:]
    elif label.startswith("I-") and current_label == label[2:]:
        current_entity += " " + word
    else:
        if current_entity:
            entities.append((current_entity.strip(), current_label))
            current_entity = ""
            current_label = None

if current_entity:
    entities.append((current_entity.strip(), current_label))

print(predicted_labels[:30])


['B-Name', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
custom_text = data[42]["content"]  # or use any resume-style text

# Step 1: Tokenize with offset mapping
tokens = tokenizer(custom_text, return_offsets_mapping=True, return_tensors="pt", truncation=True)
input_ids = tokens["input_ids"]
attention_mask = tokens["attention_mask"]
offsets = tokens["offset_mapping"][0]

# Step 2: Predict token-level labels
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(outputs.logits, dim=2)[0]

predicted_labels = [id2label[p.item()] for p in predictions]

# ✅ Step 3: Diagnostic print to check what’s going on
print("\n🧪 Token-Level Prediction Preview:")
for token, (start, end), label in zip(tokenizer.convert_ids_to_tokens(input_ids[0]), offsets, predicted_labels):
    word = custom_text[start:end].strip()
    print(f"{token:15} | {word:20} | {label}")

# ✅ Step 4: Group BIO labels into full entity spans
entities = []
current_entity = ""
current_label = None

for (start, end), label in zip(offsets, predicted_labels):
    word = custom_text[start:end].strip()
    if not word:  # if offset gives blank, fallback to token itself
        continue

    if label.startswith("B-"):
        if current_entity:
            entities.append((current_entity.strip(), current_label))
        current_entity = word
        current_label = label[2:]

    elif label.startswith("I-") and current_label == label[2:]:
        current_entity += " " + word

    else:
        if current_entity:
            entities.append((current_entity.strip(), current_label))
            current_entity = ""
            current_label = None

# Add final entity if one was being built
if current_entity:
    entities.append((current_entity.strip(), current_label))

# ✅ Step 5: Print final extracted entities
print("\n📄 Extracted Entities:")
if entities:
    for ent, label in entities:
        print(f"{ent:30} → {label}")
else:
    print("⚠️ No entities extracted. Try another input or check predictions above.")



🧪 Token-Level Prediction Preview:
[CLS]           |                      | B-Name
Man             | Man                  | O
##ish           | ish                  | O
##a             | a                    | O
B               | B                    | I-Name
##hart          | hart                 | I-Name
##i             | i                    | O
Software        | Software             | O
Auto            | Auto                 | I-Designation
##mation        | mation               | I-Designation
Engineer        | Engineer             | I-Designation
Pune            | Pune                 | B-Location
,               | ,                    | O
Maharashtra     | Maharashtra          | O
-               | -                    | O
Em              | Em                   | O
##ail           | ail                  | O
me              | me                   | O
on              | on                   | O
Indeed          | Indeed               | O
:               | :                    | O
in

In [None]:
from collections import Counter
import json

label_counter = Counter()
with open("Entity Recognition in Resumes.json", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 100: break
        item = json.loads(line)
        for ann in item.get("annotation", []):
            label_counter.update(ann.get("label", []))

print(label_counter.most_common())


[('Companies worked at', 463), ('Skills', 360), ('Designation', 278), ('Location', 223), ('College Name', 189), ('Graduation Year', 181), ('Degree', 156), ('Email Address', 113), ('Name', 101), ('Years of Experience', 22), ('UNKNOWN', 1)]


In [32]:
def align_tokens_with_labels(text, annotations, tokenizer):
    # Extract entity spans from annotations
    entities = []
    for ann in annotations:
        for point in ann['points']:
            start = point['start']
            end = point['end']
            label = ann['label'][0]
            entities.append((start, end, label))
    
    # Tokenize text with offsets
    tokens_data = tokenizer(text, return_offsets_mapping=True, truncation=True)
    input_tokens = tokens_data.tokens()
    offsets = tokens_data["offset_mapping"]

    aligned_tokens = []
    aligned_labels = []

    for token, (tok_start, tok_end) in zip(input_tokens, offsets):
        if token in ["[CLS]", "[SEP]"] or tok_start == tok_end:
            continue  # Skip special and null-offset tokens

        label = "O"
        for ent_start, ent_end, ent_label in entities:
            if tok_start == ent_start and tok_end <= ent_end:
                label = "B-" + ent_label
                break
            elif tok_start > ent_start and tok_end <= ent_end:
                label = "I-" + ent_label
                break

        aligned_tokens.append(token)
        aligned_labels.append(label)

    return aligned_tokens, aligned_labels
def clean_annotations(annotations, labels_to_remove=["Years of Experience"]):
    cleaned = []
    for ann in annotations:
        if ann["label"] and ann["label"][0] not in labels_to_remove:
            cleaned.append(ann)
    return cleaned




In [33]:
resume_text = data[0]["content"]
annotations = clean_annotations(data[0].get("annotation", []))
aligned_tokens, aligned_labels = align_tokens_with_labels(resume_text, annotations, tokenizer)

# Optional: print first few aligned pairs
for token, label in zip(aligned_tokens[:30], aligned_labels[:30]):
    print(f"{token:15} → {label}")


A               → B-Name
##b             → I-Name
##his           → I-Name
##he            → I-Name
##k             → I-Name
J               → I-Name
##ha            → O
Application     → B-Designation
Development     → I-Designation
Associate       → O
-               → O
A               → B-Companies worked at
##cc            → I-Companies worked at
##ent           → I-Companies worked at
##ure           → O
Bengal          → B-Location
##uru           → O
,               → O
Karnataka       → O
-               → O
Em              → O
##ail           → O
me              → O
on              → O
Indeed          → B-Email Address
:               → I-Email Address
indeed          → I-Email Address
.               → I-Email Address
com             → I-Email Address
/               → I-Email Address


In [34]:
all_tokens = []
all_labels = []

for item in data:
    text = item["content"]
    annotations = clean_annotations(item.get("annotation", []))
    tokens, labels = align_tokens_with_labels(text, annotations, tokenizer)
    all_tokens.append(tokens)
    all_labels.append(labels)

print("Aligned resumes:", len(all_tokens))
print("Example tokens:", all_tokens[0][:10])
print("Example labels:", all_labels[0][:10])



Aligned resumes: 220
Example tokens: ['A', '##b', '##his', '##he', '##k', 'J', '##ha', 'Application', 'Development', 'Associate']
Example labels: ['B-Name', 'I-Name', 'I-Name', 'I-Name', 'I-Name', 'I-Name', 'O', 'B-Designation', 'I-Designation', 'O']


In [35]:
from collections import Counter

# Get unique labels from your aligned labels
all_unique_labels = set(label for seq in all_labels for label in seq)
label2id = {label: idx for idx, label in enumerate(sorted(all_unique_labels))}
id2label = {idx: label for label, idx in label2id.items()}

print("Label to ID mapping:", label2id)


Label to ID mapping: {'B-College Name': 0, 'B-Companies worked at': 1, 'B-Degree': 2, 'B-Designation': 3, 'B-Email Address': 4, 'B-Graduation Year': 5, 'B-Location': 6, 'B-Name': 7, 'B-Skills': 8, 'I-College Name': 9, 'I-Companies worked at': 10, 'I-Degree': 11, 'I-Designation': 12, 'I-Email Address': 13, 'I-Graduation Year': 14, 'I-Location': 15, 'I-Name': 16, 'I-Skills': 17, 'O': 18}


In [36]:
MAX_LEN = 128
input_ids_list = []
attention_masks_list = []
label_ids_list = []

for tokens, labels in zip(all_tokens, all_labels):
    # Convert tokens to input IDs
    encoding = tokenizer(tokens, is_split_into_words=True, padding='max_length',
                         truncation=True, max_length=MAX_LEN, return_tensors='pt')

    # Convert labels to IDs and pad
    label_ids = [label2id[label] for label in labels]
    label_ids = label_ids[:MAX_LEN] + [label2id['O']] * (MAX_LEN - len(label_ids))

    input_ids_list.append(encoding['input_ids'][0])
    attention_masks_list.append(encoding['attention_mask'][0])
    label_ids_list.append(torch.tensor(label_ids))
from torch.utils.data import TensorDataset

dataset = TensorDataset(
    torch.stack(input_ids_list),
    torch.stack(attention_masks_list),
    torch.stack(label_ids_list)
)

print("Dataset size:", len(dataset))


Dataset size: 220


In [38]:
from datasets import Dataset

# Convert to list of dicts
dataset_dicts = []
for input_ids, attention_mask, label_ids in zip(input_ids_list, attention_masks_list, label_ids_list):
    dataset_dicts.append({
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": label_ids
    })

# Create Hugging Face Dataset
hf_dataset = Dataset.from_list(dataset_dicts)
hf_dataset = hf_dataset.train_test_split(test_size=0.2)
train_dataset = hf_dataset["train"]
eval_dataset = hf_dataset["test"]
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.8548,0.723421
2,0.7045,0.6616
3,0.6653,0.637141
4,0.6172,0.62967


TrainOutput(global_step=88, training_loss=0.7655076384544373, metrics={'train_runtime': 129.5204, 'train_samples_per_second': 5.435, 'train_steps_per_second': 0.679, 'total_flos': 45995297390592.0, 'train_loss': 0.7655076384544373, 'epoch': 4.0})

In [39]:
model.save_pretrained("ner_resume_model")
tokenizer.save_pretrained("ner_resume_model")


('ner_resume_model\\tokenizer_config.json',
 'ner_resume_model\\special_tokens_map.json',
 'ner_resume_model\\vocab.txt',
 'ner_resume_model\\added_tokens.json',
 'ner_resume_model\\tokenizer.json')

In [40]:
from transformers import BertForTokenClassification, AutoTokenizer

model = BertForTokenClassification.from_pretrained("ner_resume_model")
tokenizer = AutoTokenizer.from_pretrained("ner_resume_model")


In [41]:
import torch
from transformers import AutoTokenizer, BertForTokenClassification

def predict_entities(text, model, tokenizer, label_map):
    # Tokenize
    tokens_data = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, return_offsets_mapping=True)
    input_ids = tokens_data["input_ids"]
    attention_mask = tokens_data["attention_mask"]
    offsets = tokens_data["offset_mapping"][0]

    # Predict
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    predicted_labels = [label_map[pred] for pred in predictions]

    # Extract named entities using BIO tags
    entities = []
    current_entity = {"label": None, "text": ""}
    for token, label, (start, end) in zip(tokens, predicted_labels, offsets):
        if token in ["[CLS]", "[SEP]", "[PAD]"]:
            continue
        if label.startswith("B-"):
            if current_entity["text"]:
                entities.append(current_entity)
            current_entity = {"label": label[2:], "text": text[start:end]}
        elif label.startswith("I-") and current_entity["label"] == label[2:]:
            current_entity["text"] += " " + text[start:end]
        else:
            if current_entity["text"]:
                entities.append(current_entity)
                current_entity = {"label": None, "text": ""}
    if current_entity["text"]:
        entities.append(current_entity)
    
    return entities


In [43]:
label_map = model.config.id2label  # Already maps like {0: 'B-Name', 1: 'I-Name', ...}


sample_resume = "Abhishek Jha Application Development Associate - Accenture Bangalore, Karnataka"
entities = predict_entities(sample_resume, model, tokenizer, label_map)

for ent in entities:
    print(f"{ent['label']}: {ent['text']}")


In [44]:
print("Predicted labels:", predicted_labels[:30])
print("Tokens:", tokens[:30])


Predicted labels: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Tokens: ['P', '##rade', '##ep', 'Kumar', 'Security', 'Ana', '##ly', '##st', 'in', 'In', '##fo', '##sy', '##s', '-', 'Career', 'Con', '##tour', 'Hyderabad', ',', 'Telangana', ',', 'Telangana', '-', 'Em', '##ail', 'me', 'on', 'Indeed', ':', 'indeed']


In [45]:
# Step 1: Print part of the first training resume
print("Sample from training data:\n")
print(data[0]['content'][:300])  # Preview the first 300 characters


Sample from training data:

Abhishek Jha
Application Development Associate - Accenture

Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a

• To work for an organization which provides me the opportunity to improve my skills
and knowledge for my individual and company's growth in best possibl


In [46]:
sample_resume = """
Abhishek Jha
Application Development Associate - Accenture

Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a

• To work for an organization which provides me the opportunity to improve my skills
and knowledge for my individual and company's growth in best possibl
"""

entities = predict_entities(sample_resume, model, tokenizer, label_map)

# Show predicted entities
print("\nPredicted Entities:")
for ent in entities:
    print(f"{ent['label']}: {ent['text']}")



Predicted Entities:


In [47]:
trainer.train()  # just rerun this


Epoch,Training Loss,Validation Loss
1,0.5946,0.617099
2,0.5544,0.645837
3,0.5245,0.652991
4,0.4988,0.65087


TrainOutput(global_step=88, training_loss=0.5347282967784188, metrics={'train_runtime': 160.4383, 'train_samples_per_second': 4.388, 'train_steps_per_second': 0.548, 'total_flos': 45995297390592.0, 'train_loss': 0.5347282967784188, 'epoch': 4.0})

In [49]:
model.save_pretrained("ner_resume_model_final")
tokenizer.save_pretrained("ner_resume_model_final")


('ner_resume_model_final\\tokenizer_config.json',
 'ner_resume_model_final\\special_tokens_map.json',
 'ner_resume_model_final\\vocab.txt',
 'ner_resume_model_final\\added_tokens.json',
 'ner_resume_model_final\\tokenizer.json')

In [50]:
from transformers import BertForTokenClassification, AutoTokenizer

# Load your retrained model and tokenizer
model = BertForTokenClassification.from_pretrained("ner_resume_model_final")
tokenizer = AutoTokenizer.from_pretrained("ner_resume_model_final")

# Get label mapping
label_map = model.config.id2label

# Resume sample for testing
sample_resume = """
Abhishek Jha
Application Development Associate - Accenture

Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a
"""

# Predict entities
entities = predict_entities(sample_resume, model, tokenizer, label_map)

# Show results
print("\nPredicted Entities:")
for ent in entities:
    print(f"{ent['label']}: {ent['text']}")



Predicted Entities:


In [51]:
from collections import Counter

flat_labels = [label for sublist in all_labels for label in sublist]
label_counts = Counter(flat_labels)

print("Label distribution in your training set:")
for label, count in label_counts.items():
    print(f"{label:20}: {count}")


Label distribution in your training set:
B-Name              : 221
I-Name              : 721
O                   : 81853
B-Designation       : 390
I-Designation       : 1121
B-Companies worked at: 293
I-Companies worked at: 834
B-Location          : 207
B-Email Address     : 202
I-Email Address     : 5642
B-College Name      : 156
I-College Name      : 784
B-Graduation Year   : 12
B-Skills            : 151
I-Skills            : 4863
B-Degree            : 132
I-Degree            : 521
I-Location          : 274
I-Graduation Year   : 5


In [52]:
import random

balanced_tokens = []
balanced_labels = []

for tokens, labels in zip(all_tokens, all_labels):
    new_tokens = []
    new_labels = []

    for token, label in zip(tokens, labels):
        if label != "O":
            new_tokens.append(token)
            new_labels.append(label)
        else:
            # Keep only 20% of 'O' tokens randomly
            if random.random() < 0.2:
                new_tokens.append(token)
                new_labels.append(label)

    if new_tokens:  # skip empty examples
        balanced_tokens.append(new_tokens)
        balanced_labels.append(new_labels)

print(f"Before balancing: {len(all_tokens)} samples")
print(f"After balancing:  {len(balanced_tokens)} samples")


Before balancing: 220 samples
After balancing:  220 samples


In [53]:
all_tokens = balanced_tokens
all_labels = balanced_labels
input_ids_list = []
attention_masks_list = []
label_ids_list = []

for tokens, labels in zip(all_tokens, all_labels):
    encoding = tokenizer(tokens, is_split_into_words=True, padding='max_length',
                         truncation=True, max_length=128, return_tensors='pt')
    label_ids = [label2id[label] for label in labels]
    label_ids = label_ids[:128] + [label2id['O']] * (128 - len(label_ids))

    input_ids_list.append(encoding['input_ids'][0])
    attention_masks_list.append(encoding['attention_mask'][0])
    label_ids_list.append(torch.tensor(label_ids))
from datasets import Dataset

dataset_dicts = []
for input_ids, attention_mask, label_ids in zip(input_ids_list, attention_masks_list, label_ids_list):
    dataset_dicts.append({
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": label_ids
    })

hf_dataset = Dataset.from_list(dataset_dicts)
hf_dataset = hf_dataset.train_test_split(test_size=0.2)
train_dataset = hf_dataset["train"]
eval_dataset = hf_dataset["test"]
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./ner_resume_model_balanced",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

model.save_pretrained("ner_resume_model_balanced")



Epoch,Training Loss,Validation Loss
1,1.1194,1.037767
2,1.0609,0.95383
3,0.9709,0.962109
4,0.99,0.999167
5,0.8929,0.993549
6,0.7547,1.015952


In [54]:
# Save the tokenizer
tokenizer.save_pretrained("ner_resume_model_balanced")
from transformers import BertForTokenClassification, AutoTokenizer

model = BertForTokenClassification.from_pretrained("ner_resume_model_balanced")
tokenizer = AutoTokenizer.from_pretrained("ner_resume_model_balanced")
label_map = model.config.id2label
sample_resume = """
Abhishek Jha
Application Development Associate - Accenture

Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a
"""

entities = predict_entities(sample_resume, model, tokenizer, label_map)

print("\nPredicted Entities:")
for ent in entities:
    print(f"{ent['label']}: {ent['text']}")



Predicted Entities:


In [55]:
print("\nDEBUG: Showing first 30 predicted labels")
print(predicted_labels[:30])
print(tokens[:30])



DEBUG: Showing first 30 predicted labels
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['P', '##rade', '##ep', 'Kumar', 'Security', 'Ana', '##ly', 'In', '##fo', '##sy', '##s', '-', 'Career', 'Con', ',', '/', '##ep', 'Kumar', '/', '##55', '##d', '##9', '##8', 'Security', 'Ana', '##ly', '##tour', 'Security', 'is', ',']


In [77]:
%pip install spacy

import spacy
nlp = spacy.load("en_core_web_sm")
print("✅ spaCy is working perfectly now!")


Collecting spacyNote: you may need to restart the kernel to use updated packages.

  Using cached spacy-3.8.7-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Using cached thinc-8.3.6-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting blis<1.4.0,>=1.3.0 (from thinc<8.4.0,>=8.3.4->spacy)
  Using cached blis-1.3.0-cp312-cp312-win_amd64.whl.metadata (7.6 kB)
Collecting numpy>=1.19.0 (from spacy)
  Using cached numpy-2.2.6-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting typer<1.0.0,>=0.3.0 (from spacy)
  Downloading typer-0.9.4-py3-none-any.whl.metadata (14 kB)
Using cached spacy-3.8.7-cp312-cp312-win_amd64.whl (13.9 MB)
Using cached thinc-8.3.6-cp312-cp312-win_amd64.whl (1.7 MB)
Using cached blis-1.3.0-cp312-cp312-win_amd64.whl (6.3 MB)
Using cached numpy-2.2.6-cp312-cp312-win_amd64.whl (12.6 MB)
Downloading typer-0.9.4-py3-none-any.whl (45 kB)
Installing collected packages: numpy, typer, blis, thinc, spacy

  Attempting uninstall: numpy

 

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
contourpy 1.2.0 requires numpy<2.0,>=1.20, but you have numpy 2.2.6 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.6 which is incompatible.
tensorflow 2.19.0 requires numpy<2.2.0,>=1.26.0, but you have numpy 2.2.6 which is incompatible.


✅ spaCy is working perfectly now!


In [78]:
import spacy
nlp = spacy.load("en_core_web_sm")
print("✅ SpaCy is finally working!")


✅ SpaCy is finally working!


In [80]:
import spacy
from spacy.pipeline import EntityRuler

# Load model
nlp = spacy.load("en_core_web_sm",exclude=["ner"])
# Add EntityRuler
ruler = nlp.add_pipe("entity_ruler")

# Define your patterns
patterns = [
    {"label": "Name", "pattern": [{"LOWER": "abhishek"}, {"LOWER": "jha"}]},
    {"label": "Designation", "pattern": "Application Development Associate"},
    {"label": "Company", "pattern": "Accenture"},
    {"label": "Location", "pattern": "Bengaluru"},
    {"label": "Email", "pattern": [{"TEXT": {"REGEX": r"indeed\.com/.+"}}]}
]

ruler.add_patterns(patterns)

# Sample test
doc = nlp("""
Abhishek Jha
Application Development Associate - Accenture
Bengaluru, Karnataka - Email me on Indeed: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a
""")

# Print results
for ent in doc.ents:
    print(f"{ent.label_}: {ent.text}")


Name: Abhishek Jha
Designation: Application Development Associate
Company: Accenture
Location: Bengaluru
Email: indeed.com/r/Abhishek-Jha/10e7a8cb732bc43a


In [81]:
from seqeval.metrics import classification_report, f1_score

y_true = [['B-Name', 'I-Name', 'O', 'B-Skills', 'O']]
y_pred = [['B-Name', 'I-Name', 'O', 'O', 'O']]

print(classification_report(y_true, y_pred))
print("F1 Score:", f1_score(y_true, y_pred))
import spacy
from spacy.scorer import Scorer
from spacy.training import Example

nlp = spacy.load("en_core_web_sm")  # or your rule-based model

# Define gold data
example = Example.from_dict(
    nlp.make_doc("Abhishek Jha works at Accenture."),
    {"entities": [(0, 13, "Name"), (23, 32, "Company")]}
)

scorer = Scorer()
scorer.score([example])


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

        Name       1.00      1.00      1.00         1
      Skills       0.00      0.00      0.00         1

   micro avg       1.00      0.50      0.67         2
   macro avg       0.50      0.50      0.50         2
weighted avg       0.50      0.50      0.50         2

F1 Score: 0.6666666666666666




{'token_acc': 1.0,
 'token_p': 1.0,
 'token_r': 1.0,
 'token_f': 1.0,
 'sents_p': None,
 'sents_r': None,
 'sents_f': None,
 'tag_acc': None,
 'pos_acc': None,
 'morph_acc': None,
 'morph_micro_p': None,
 'morph_micro_r': None,
 'morph_micro_f': None,
 'morph_per_feat': None,
 'dep_uas': None,
 'dep_las': None,
 'dep_las_per_type': None,
 'ents_p': None,
 'ents_r': None,
 'ents_f': None,
 'ents_per_type': None,
 'cats_score': 0.0,
 'cats_score_desc': 'macro F',
 'cats_micro_p': 0.0,
 'cats_micro_r': 0.0,
 'cats_micro_f': 0.0,
 'cats_macro_p': 0.0,
 'cats_macro_r': 0.0,
 'cats_macro_f': 0.0,
 'cats_macro_auc': 0.0,
 'cats_f_per_type': {},
 'cats_auc_per_type': {}}