In [1]:
import spacy
from spacy.tokens import DocBin
from spacy.training.example import Example
import random
import torch

import pandas as pd
import spacy
from spacy.tokens import DocBin
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_csv("data/ner_dataset.csv", encoding="latin1").ffill()
print(df.head())

    Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1  Sentence: 1             of   IN   O
2  Sentence: 1  demonstrators  NNS   O
3  Sentence: 1           have  VBP   O
4  Sentence: 1        marched  VBN   O


In [5]:
# --------------------------
# STEP 2: Group Words by Sentence
# --------------------------
sentences = []
current_sentence = []
current_labels = []
prev_sent_id = df.iloc[0]["Sentence #"]

for _, row in df.iterrows():
    sent_id = row["Sentence #"]
    word = row["Word"]
    tag = row["Tag"]

    if sent_id != prev_sent_id:
        sentences.append((current_sentence, current_labels))
        current_sentence = []
        current_labels = []
        prev_sent_id = sent_id

    current_sentence.append(word)
    current_labels.append(tag)

# add last sentence
if current_sentence:
    sentences.append((current_sentence, current_labels))

print(f"Total sentences: {len(sentences)}")

Total sentences: 47959


In [6]:
# --------------------------
# STEP 3: Convert to spaCy Example Format
# --------------------------
nlp = spacy.blank("en")

def create_docs(sentences):
    docs = []
    for tokens, labels in sentences:
        doc = nlp.make_doc(" ".join(tokens))
        ents = []
        start = 0
        for token, label in zip(tokens, labels):
            end = start + len(token)
            if label != "O":
                label_type = label.split("-")[-1]  # B-PER → PER
                span = doc.char_span(start, end, label=label_type, alignment_mode="contract")
                if span:
                    ents.append(span)
            start = end + 1  # +1 for space
        doc.ents = ents
        docs.append(doc)
    return docs

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [7]:
# --------------------------
# STEP 4: Train/Val/Test Split
# --------------------------
train_sents, test_sents = train_test_split(sentences, test_size=0.2, random_state=42)
val_sents, test_sents = train_test_split(test_sents, test_size=0.5, random_state=42)

print(f"Train: {len(train_sents)}, Val: {len(val_sents)}, Test: {len(test_sents)}")

Train: 38367, Val: 4796, Test: 4796


In [12]:
# --------------------------
# STEP 5: Save to DocBin
# --------------------------
def save_to_docbin(sentences, path):
    docs = create_docs(sentences)
    db = DocBin(docs=docs)
    db.to_disk(path)

save_to_docbin(train_sents, "data/train.spacy")
save_to_docbin(val_sents, "data/val.spacy")
save_to_docbin(test_sents, "data/test.spacy")

print("✅ Saved train.spacy, val.spacy, test.spacy")

✅ Saved train.spacy, val.spacy, test.spacy


In [None]:
# pretty-print sample train data
print(create_docs([train_sents[0]])[0].to_json(indent=2))

The        O        
58         O        
-          O        
year       O        
-          O        
old        O        
former     O        
analyst    O        
says       O        
he         O        
provided   O        
information O        
to         O        
an         O        
official   O        
at         O        
the        O        
Israeli    B   gpe  
embassy    O        
and        O        
to         O        
two        O        
members    O        
of         O        
a          O        
lobbying   O        
group      O        
called     O        
the        O        
American   O        
Israel     B   geo  
Public     B   org  
Affairs    B   org  
Committee  B   org  
.          O        
Entities: [('Israeli', 'gpe'), ('Israel', 'geo'), ('Public', 'org'), ('Affairs', 'org'), ('Committee', 'org')]


In [1]:
import spacy
from spacy.tokens import DocBin
from spacy.training.example import Example
import random
import torch

# --------------------------
# STEP 1: Check GPU
# --------------------------
print("GPU available:", torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# --------------------------
# STEP 2: Load Dataset (.spacy files from previous step)
# --------------------------
def load_docbin(path, nlp):
    db = DocBin().from_disk(path)
    return list(db.get_docs(nlp.vocab))

nlp = spacy.blank("en")

train_docs = load_docbin("data/train.spacy", nlp)
val_docs = load_docbin("data/val.spacy", nlp)

print(f"Loaded: {len(train_docs)} train docs, {len(val_docs)} val docs")

# --------------------------
# STEP 3: Build Pipeline with Transformer + NER
# --------------------------
# Add transformer (DistilBERT fits well in 8GB VRAM)
nlp.add_pipe("transformer", config={"model": {"name": "distilbert-base-uncased"}})
ner = nlp.add_pipe("ner", last=True)

# Add labels from training data
for doc in train_docs:
    for ent in doc.ents:
        ner.add_label(ent.label_)

# --------------------------
# STEP 4: Convert Docs to Examples
# --------------------------
train_examples = [Example.from_dict(doc, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]}) for doc in train_docs]
val_examples = [Example.from_dict(doc, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]}) for doc in val_docs]

# --------------------------
# STEP 5: Initialize & Train
# --------------------------
optimizer = nlp.initialize(lambda: train_examples)

n_iter = 1   # increase for full training (start small for testing)
for i in range(n_iter):
    random.shuffle(train_examples)
    losses = {}
    for batch in spacy.util.minibatch(train_examples, size=8):  # batch size fits 8GB VRAM
        nlp.update(batch, sgd=optimizer, losses=losses, drop=0.1)
    print(f"Iteration {i+1}/{n_iter}, Losses: {losses}")

# --------------------------
# STEP 6: Save Model
# --------------------------
output_dir = "./ner_model"
nlp.to_disk(output_dir)
print(f"✅ Model saved to {output_dir}")

# --------------------------
# STEP 7: Evaluate on Validation Set
# --------------------------
nlp2 = spacy.load(output_dir)

correct = 0
total = 0
for ex in val_examples:
    doc = nlp2(ex.text)
    pred = set([(ent.text, ent.label_) for ent in doc.ents])
    gold = set([(ent.text, ent.label_) for ent in ex.reference.ents])
    total += len(gold)
    correct += len(pred & gold)

print(f"Validation Precision: {correct/total:.2f} ({correct}/{total})")

# --------------------------
# STEP 8: Quick Test
# --------------------------
doc = nlp2("Elon Musk founded SpaceX in the USA.")
print("Entities:", [(ent.text, ent.label_) for ent in doc.ents])


GPU available: True
Using device: cuda


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Loaded: 38367 train docs, 4796 val docs


  _torch_pytree._register_pytree_node(


Iteration 1/1, Losses: {'transformer': 0.0, 'ner': 51975.42819693507}
✅ Model saved to ./ner_model
Validation Precision: 0.85 (13376/15722)
Entities: [('Elon', 'per'), ('Musk', 'org'), ('USA', 'org')]
