In [1]:
# STEP 1: Install Dependencies

!pip install -U spacy pandas scikit-learn tqdm
!python -m spacy download en_core_web_sm

Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (11 kB)
Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m131.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scikit_learn-1.8.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m134.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, pandas
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
 

In [2]:
# STEP 2: Import Libraries

import pandas as pd
import spacy
from spacy.training import Example
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
# STEP 3: Load Dataset

df = pd.read_csv("NER_dataset.csv", encoding='latin1')

# Expected columns:
# Sentence | Entity | Label


In [4]:
# STEP 4: Convert to spaCy Format

training_data = []
current_sentence_words = []
current_sentence_tags = []

# Using tqdm to show progress
for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing dataset to spaCy format"):
    word = str(row['Word'])
    ner_tag = str(row['Tag'])  # e.g., 'O', 'B-PER', 'I-PER'

    # Check if a new sentence starts
    if pd.notna(row['Sentence #']):
        # Process the previous sentence if it exists
        if current_sentence_words:
            full_text = " ".join(current_sentence_words)
            entities = []
            current_entity_start_char = -1
            current_entity_label = None
            word_start_char = 0  # Tracks character offset in full_text

            for word_idx, (w, tag) in enumerate(zip(current_sentence_words, current_sentence_tags)):
                if tag.startswith('B-'):
                    # If an entity was ongoing, close it before starting a new one
                    if current_entity_start_char != -1:
                        entities.append((current_entity_start_char, word_start_char - 1, current_entity_label))
                    current_entity_start_char = word_start_char
                    current_entity_label = tag[2:]  # Extract label, e.g., 'PER' from 'B-PER'
                elif tag.startswith('I-'):
                    # Continue entity, but handle potential errors in IOB sequencing
                    # If no B-tag preceded or label mismatch, treat as new B-tag for robustness
                    if current_entity_start_char == -1 or current_entity_label != tag[2:]:
                        if current_entity_start_char != -1:
                            entities.append((current_entity_start_char, word_start_char - 1, current_entity_label))
                        current_entity_start_char = word_start_char
                        current_entity_label = tag[2:]
                elif tag == 'O':
                    if current_entity_start_char != -1:  # Entity just ended
                        entities.append((current_entity_start_char, word_start_char - 1, current_entity_label))
                    current_entity_start_char = -1  # Reset
                    current_entity_label = None

                word_start_char += len(w) + 1  # +1 for the space after the word

            # After processing all words in a sentence, check if an entity was still ongoing
            if current_entity_start_char != -1:
                entities.append((current_entity_start_char, word_start_char - 1, current_entity_label))

            training_data.append((full_text, {"entities": entities}))

        # Reset for the new sentence
        current_sentence_words = [word]
        current_sentence_tags = [ner_tag]
    else:
        # Continue adding words and tags to the current sentence
        current_sentence_words.append(word)
        current_sentence_tags.append(ner_tag)

# After the loop, process the very last sentence
if current_sentence_words:
    full_text = " ".join(current_sentence_words)
    entities = []
    current_entity_start_char = -1
    current_entity_label = None
    word_start_char = 0

    for word_idx, (w, tag) in enumerate(zip(current_sentence_words, current_sentence_tags)):
        if tag.startswith('B-'):
            if current_entity_start_char != -1:
                entities.append((current_entity_start_char, word_start_char - 1, current_entity_label))
            current_entity_start_char = word_start_char
            current_entity_label = tag[2:]
        elif tag.startswith('I-'):
            if current_entity_start_char == -1 or current_entity_label != tag[2:]:
                if current_entity_start_char != -1:
                    entities.append((current_entity_start_char, word_start_char - 1, current_entity_label))
                current_entity_start_char = word_start_char
                current_entity_label = tag[2:]
        elif tag == 'O':
            if current_entity_start_char != -1:
                entities.append((current_entity_start_char, word_start_char - 1, current_entity_label))
            current_entity_start_char = -1
            current_entity_label = None
        word_start_char += len(w) + 1

    if current_entity_start_char != -1:
        entities.append((current_entity_start_char, word_start_char - 1, current_entity_label))

    training_data.append((full_text, {"entities": entities}))


Processing dataset to spaCy format: 100%|██████████| 2307/2307 [00:00<00:00, 28834.43it/s]


In [5]:
# STEP 5: Train-Test Split

train_data, test_data = train_test_split(
    training_data, test_size=0.2, random_state=42
)

In [6]:
# STEP 6: Load spaCy Model

nlp = spacy.load("en_core_web_sm")
ner = nlp.get_pipe("ner")

In [7]:
# STEP 7: Add Labels

for _, annotations in train_data:
    for ent in annotations["entities"]:
        ner.add_label(ent[2])

In [8]:
# STEP 8: Train Model

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.resume_training()

    for epoch in range(20):
        losses = {}
        for text, annotations in tqdm(train_data):
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.5, losses=losses)
        print(f"Epoch {epoch+1}, Loss:", losses)

100%|██████████| 80/80 [00:01<00:00, 47.50it/s]


Epoch 1, Loss: {'ner': np.float32(286.62292)}


100%|██████████| 80/80 [00:01<00:00, 48.91it/s]


Epoch 2, Loss: {'ner': np.float32(187.12917)}


100%|██████████| 80/80 [00:01<00:00, 50.48it/s]


Epoch 3, Loss: {'ner': np.float32(282.95312)}


100%|██████████| 80/80 [00:02<00:00, 37.75it/s]


Epoch 4, Loss: {'ner': np.float32(164.0441)}


100%|██████████| 80/80 [00:01<00:00, 43.85it/s]


Epoch 5, Loss: {'ner': np.float32(140.36494)}


100%|██████████| 80/80 [00:02<00:00, 29.19it/s]


Epoch 6, Loss: {'ner': np.float32(92.44096)}


100%|██████████| 80/80 [00:02<00:00, 37.54it/s]


Epoch 7, Loss: {'ner': np.float32(96.154396)}


100%|██████████| 80/80 [00:01<00:00, 50.01it/s]


Epoch 8, Loss: {'ner': np.float32(90.816246)}


100%|██████████| 80/80 [00:01<00:00, 44.94it/s]


Epoch 9, Loss: {'ner': np.float32(69.128586)}


100%|██████████| 80/80 [00:01<00:00, 45.41it/s]


Epoch 10, Loss: {'ner': np.float32(66.30458)}


100%|██████████| 80/80 [00:01<00:00, 51.16it/s]


Epoch 11, Loss: {'ner': np.float32(52.829304)}


100%|██████████| 80/80 [00:01<00:00, 51.09it/s]


Epoch 12, Loss: {'ner': np.float32(43.43938)}


100%|██████████| 80/80 [00:01<00:00, 50.78it/s]


Epoch 13, Loss: {'ner': np.float32(52.92191)}


100%|██████████| 80/80 [00:01<00:00, 50.95it/s]


Epoch 14, Loss: {'ner': np.float32(47.334465)}


100%|██████████| 80/80 [00:01<00:00, 49.67it/s]


Epoch 15, Loss: {'ner': np.float32(34.687294)}


100%|██████████| 80/80 [00:01<00:00, 44.05it/s]


Epoch 16, Loss: {'ner': np.float32(35.89217)}


100%|██████████| 80/80 [00:01<00:00, 46.82it/s]


Epoch 17, Loss: {'ner': np.float32(25.828316)}


100%|██████████| 80/80 [00:01<00:00, 50.84it/s]


Epoch 18, Loss: {'ner': np.float32(41.544777)}


100%|██████████| 80/80 [00:01<00:00, 50.62it/s]


Epoch 19, Loss: {'ner': np.float32(29.973618)}


100%|██████████| 80/80 [00:01<00:00, 50.68it/s]

Epoch 20, Loss: {'ner': np.float32(20.888296)}





In [9]:
# STEP 9: Save Model

nlp.to_disk("/content/ner_model")

In [10]:
# STEP 10: Load Model for Testing

nlp_test = spacy.load("/content/ner_model")

In [11]:
# STEP 11: Test on New Sentence

text = "Microsoft was founded by Bill Gates in USA"
doc = nlp_test(text)

print("\nPredicted Entities:")
for ent in doc.ents:
    print(ent.text, "->", ent.label_)


Predicted Entities:
Microsoft -> org
Bill Gates in USA -> org


In [13]:
# STEP 12: Evaluate on Test Data

correct = 0
total = 0

for text, annotations in test_data:
    doc = nlp_test(text)
    predicted = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    actual = annotations["entities"]

    for ent in actual:
        total += 1
        if ent in predicted:
            correct += 1

accuracy = correct / total if total > 0 else 0
print("\nTest Accuracy:", accuracy)


Test Accuracy: 0.8888888888888888
