<a href="https://colab.research.google.com/github/V1PASH/PYTORCH/blob/main/NER_MODEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install spacy tqdm datasets

In [None]:
# Download the small English pipeline that a later cell loads with
# spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

In [None]:
import spacy
from datasets import load_dataset

In [None]:
from datasets import load_dataset
# Load the "conll2003" dataset; later cells index the result by split name
# ("train", "test", "validation").
dataset =load_dataset("conll2003")

In [None]:
# Display the loaded dataset object.
dataset

In [None]:
# Pull the three splits out of the loaded dataset in one assignment.
train_data, test_data, validation_data = (
    dataset["train"],
    dataset["test"],
    dataset["validation"],
)

In [None]:
# Display the training split.
train_data

In [None]:
# Peek at the first three training rows.
train_data[:3]

In [None]:
# List the string names behind the integer ids stored in the "ner_tags" column.
train_data.features["ner_tags"].feature.names

In [None]:
# Map each dataset tag name to the entity label used for training.
# The B-/I- prefix is dropped, and LOC is renamed to GPE; "O" stays "O".
label_dict = {
    "O": "O",
    **{
        f"{prefix}-{tag_type}": entity_label
        for tag_type, entity_label in (
            ("PER", "PER"),
            ("ORG", "ORG"),
            ("LOC", "GPE"),
            ("MISC", "MISC"),
        )
        for prefix in ("B", "I")
    },
}

In [None]:
# Display the tag-name -> entity-label mapping.
label_dict

In [None]:
from spacy.tokens import DocBin
from tqdm import tqdm

In [None]:
!python -m spacy link en_core_web_sm en


In [None]:
import spacy
# Load the pretrained English pipeline by package name. It is used for the
# quick entity demo in the next cells; `nlp` is reassigned to a blank pipeline
# further down.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Run the loaded pipeline on a sample sentence.
doc=nlp("Elon Musk Founded Tesla in America")

In [None]:
# Show the entities found in the sample sentence.
doc.ents

In [None]:
def convert_to_spacy(dataset_split, label_names=None, label_map=None):
    """Convert token/tag rows into ``(text, {"entities": [...]})`` training tuples.

    Each row must provide ``row["tokens"]`` (list of words) and
    ``row["ner_tags"]`` (list of integer tag ids). The words are joined with
    single spaces to form the text, and every word whose mapped label is not
    ``"O"`` becomes one ``(start_char, end_char, label)`` entry. Consecutive
    words of the same entity are emitted as separate one-word spans.

    Character offsets are computed by walking through *all* words, so a word
    that also appears earlier in the sentence (or as a substring of an earlier
    word) still gets the offset of its own position.

    Args:
        dataset_split: Iterable of rows as described above.
        label_names: Optional sequence mapping tag id -> tag name. Defaults to
            the split's own ``features["ner_tags"].feature.names`` when the
            split exposes ``features``, otherwise to those of the global
            ``dataset["train"]``.
        label_map: Optional dict mapping tag name -> entity label. Defaults to
            the notebook-level ``label_dict``. Names missing from the map are
            treated as ``"O"``.

    Returns:
        A list of ``(text, {"entities": [(start, end, label), ...]})`` tuples,
        one per row, in input order.
    """
    if label_names is None:
        features = getattr(dataset_split, "features", None)
        if features is None:
            features = dataset["train"].features
        label_names = features["ner_tags"].feature.names
    if label_map is None:
        label_map = label_dict

    output_data = []
    for row in dataset_split:
        words = row["tokens"]
        tag_ids = row["ner_tags"]
        text = " ".join(words)

        entities = []
        offset = 0  # character position of the current word inside `text`
        for word, tag_id in zip(words, tag_ids):
            word_end = offset + len(word)
            label = label_map.get(label_names[tag_id], "O")
            if label != "O":
                entities.append((offset, word_end, label))
            # Skip the single joining space to reach the next word.
            offset = word_end + 1

        output_data.append((text, {"entities": entities}))
    return output_data


In [None]:
# Convert directly from the raw `dataset` splits rather than from the
# variables being assigned, so the cell can be re-run safely: the inputs are
# never overwritten by their own converted output.
train_data = convert_to_spacy(dataset["train"])
test_data = convert_to_spacy(dataset["test"])
validation_data = convert_to_spacy(dataset["validation"])

In [None]:
import json

# Write the converted training and validation examples to JSON files.
for out_path, records in (
    ("train_data.json", train_data),
    ("validation_data.json", validation_data),
):
    with open(out_path, "w") as f:
        json.dump(records, f)


In [None]:
# Print the first three converted (text, annotations) pairs.
print(train_data[:3])

In [None]:
# Print the first five converted examples, one (text, annotations) pair per line.
for text, ann in train_data[:5]:
    print(text, ann)

In [None]:
# Start from a blank English pipeline (this replaces the earlier `nlp`) and
# add an untrained "ner" component as its last pipe.
nlp=spacy.blank("en")
ner=nlp.add_pipe("ner",last=True)

In [None]:
# Register every entity label that occurs in the training annotations with the
# NER component. ent is a (start, end, label) tuple, so ent[2] is the label.
for _, annotations in train_data:
    for ent in annotations["entities"]:
        ner.add_label(ent[2])

In [None]:
# Show the labels now registered on the NER component.
ner.labels

In [None]:
# Print the names of the components in the pipeline.
print(nlp.pipe_names)

In [None]:
# Same label check as above, but looking the component up by name.
print(nlp.get_pipe("ner").labels)

In [None]:
# Initialize the pipeline, then run it on a sample sentence and print the
# (text, label) of each entity it returns.
# NOTE(review): no training has happened yet at this point, so the output is
# presumably not meaningful - it only shows that the pipeline runs.
nlp.initialize()
doc = nlp("Elon Musk founded SpaceX in 2002.")
print([(ent.text, ent.label_) for ent in doc.ents])

In [None]:
import random
from spacy.training.example import Example

# Build the Example objects once up front: the tokenisation and the gold
# annotations are the same in every epoch, so recreating them inside the loop
# is wasted work. Shuffling this list also leaves `train_data` itself in its
# original order.
train_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in train_data
]

# Update only the "ner" component; select_pipes(enable=...) is the spaCy v3
# replacement for the deprecated disable_pipes(...).
with nlp.select_pipes(enable="ner"):
    optimizer = nlp.resume_training()
    for itn in range(30):
        random.shuffle(train_examples)
        losses = {}  # accumulated by nlp.update over the epoch
        for example in train_examples:
            # Pass the optimizer explicitly so the one created above is the
            # one that is actually used.
            nlp.update([example], sgd=optimizer, drop=0.3, losses=losses)
        print(f"epoch {itn+1}, Loss: {losses}")

In [None]:
import pandas as pd
from spacy.training.example import Example
from spacy.scorer import Scorer

def evaluate_ner(nlp, examples):
    """Score the pipeline's entity predictions and return them as a table.

    Args:
        nlp: The pipeline to evaluate; it is called on every text.
        examples: Iterable of ``(text, annotations)`` pairs, where
            ``annotations`` is the gold dict passed to ``Example.from_dict``.

    Returns:
        A DataFrame with the columns "Metric" and "Score (%)" holding
        precision, recall and F1 (taken from the scorer's ``ents_p``,
        ``ents_r`` and ``ents_f``), each as a percentage rounded to two
        decimals.
    """
    scored_examples = []
    for text, annotations in examples:
        predicted_doc = nlp(text)
        scored_examples.append(Example.from_dict(predicted_doc, annotations))

    scores = Scorer().score(scored_examples)

    # Display name -> key in the scorer's result dict.
    metric_keys = {"Precision": "ents_p", "Recall": "ents_r", "F1-score": "ents_f"}
    return pd.DataFrame(
        {
            "Metric": list(metric_keys),
            "Score (%)": [round(scores[key] * 100, 2) for key in metric_keys.values()],
        }
    )
# Evaluate the trained pipeline on the converted test split and print the table.
results_df = evaluate_ner(nlp, test_data)

print(results_df)


In [None]:
# Display the metrics table.
results_df

In [None]:
# Save the trained pipeline under the path "ner_model_n".
nlp.to_disk("ner_model_n")

In [None]:
# Reload the pipeline from the path it was just saved to.
nlp = spacy.load("ner_model_n")

In [None]:
# Run the reloaded pipeline on a sample sentence and print each entity's
# (text, label).
doc = nlp("Elon Musk founded SpaceX in 2002.")
print([(ent.text, ent.label_) for ent in doc.ents])

In [None]:
from spacy import displacy


# Render the entities found in a sample sentence inline in the notebook.
text = "Microsoft was founded by Bill Gates"
doc = nlp(text)

displacy.render(doc,style="ent",jupyter=True)


In [None]:
# Same visualisation on a longer sentence containing several entities.
text = "Microsoft was founded by Bill Gates and Apple was founded by Steve Jobs"
doc = nlp(text)

displacy.render(doc,style="ent",jupyter=True)

In [None]:
import spacy
from spacy.training import Example
from spacy.scorer import Scorer

def evaluate_model(nlp, dataset):
    """Score the pipeline on ``(text, annotations)`` pairs.

    Args:
        nlp: The pipeline to evaluate; it is run on every text.
        dataset: Iterable of ``(text, annotations)`` pairs, where
            ``annotations`` is the gold dict passed to ``Example.from_dict``.

    Returns:
        The dict returned by ``Scorer().score`` for these examples.
    """
    scorer = Scorer()
    # The predicted side has to come from running the full pipeline.
    # nlp.make_doc only tokenises, so using it here would score documents
    # that carry no predictions at all.
    examples = [Example.from_dict(nlp(text), ann) for text, ann in dataset]
    return scorer.score(examples)

In [None]:
import random
from spacy.training.example import Example

# Build the Example objects once: they do not change between epochs, and
# shuffling this list leaves `train_data` itself untouched.
finetune_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in train_data
]

# Update only the "ner" component; select_pipes(enable=...) is the spaCy v3
# replacement for the deprecated disable_pipes(...).
with nlp.select_pipes(enable="ner"):
    optimizer = nlp.create_optimizer()
    for itn in range(2):
        random.shuffle(finetune_examples)
        losses = {}  # accumulated by nlp.update over the epoch
        for example in finetune_examples:
            # create_optimizer() only returns the optimizer, so it must be
            # passed in explicitly for it to be the one that is used.
            nlp.update([example], sgd=optimizer, drop=0.3, losses=losses)
        print(f"epoch {itn+1}, Loss: {losses}")

In [None]:
# Build one gold label per whitespace-separated word of every test sentence.
true_labels = []
for sentence, gold in test_data:
    word_tags = ["O" for _ in sentence.split()]
    for span_start, _span_end, span_label in gold["entities"]:
        # The number of spaces before the span start is used as the index of
        # the word the span begins in; only that word receives the label.
        word_tags[sentence[:span_start].count(" ")] = span_label
    true_labels.append(word_tags)

In [None]:
# Build one predicted label per whitespace-separated word of every test
# sentence, so the result lines up with the per-word gold labels.
pred_labels = []
for text, _ in test_data:
    doc = nlp(text)
    entity_labels = ["O"] * len(text.split())
    for ent in doc.ents:
        # Counting the spaces before a character offset gives the index of the
        # word at that offset. A predicted entity may span several words, so
        # label every word from its first to its last, not just the first.
        first_word = text[:ent.start_char].count(" ")
        last_word = text[:ent.end_char].count(" ")
        for word_idx in range(first_word, last_word + 1):
            entity_labels[word_idx] = ent.label_
    pred_labels.append(entity_labels)


In [None]:
import itertools
from sklearn.metrics import classification_report

# Flatten the per-sentence label lists into one long list each, keeping order.
true_labels_flat = [tag for sentence_tags in true_labels for tag in sentence_tags]
pred_labels_flat = [tag for sentence_tags in pred_labels for tag in sentence_tags]

# Word-level precision / recall / F1 per label.
print(classification_report(true_labels_flat, pred_labels_flat))


In [None]:
# Compare the set of labels present in the gold data with the set the model
# actually predicted.
print("Dataset Labels:", set(itertools.chain(*true_labels)))
print("Model Predictions:", set(itertools.chain(*pred_labels)))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Use a sorted union of gold and predicted labels: iterating a set of strings
# gives an order that can differ between runs, and a label that only shows up
# in the predictions would otherwise be left out of the matrix.
labels = sorted(set(true_labels_flat) | set(pred_labels_flat))

cm = confusion_matrix(true_labels_flat, pred_labels_flat, labels=labels)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", xticklabels=labels, yticklabels=labels, cmap="Blues")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("NER Model Confusion Matrix")
plt.show()


In [None]:
import collections

# Count how often each gold label occurs across all words.
entity_counts = collections.Counter(true_labels_flat)

# One bar per label, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(entity_counts.keys(), entity_counts.values(), color="skyblue")
ax.set_xlabel("Entity Type")
ax.set_ylabel("Frequency")
ax.set_title("Entity Distribution in Dataset")
plt.xticks(rotation=45)
plt.show()


In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# The person label produced by `label_dict` is "PER". "PERSON" never occurs in
# the gold or predicted labels, which made both binary vectors all zeros.
target_class = "PER"
assert target_class in set(true_labels_flat), f"{target_class!r} not found in gold labels"

# One-vs-rest encoding of the chosen class. Note that pred_binary holds hard
# 0/1 decisions, not confidence scores.
true_binary = [1 if label == target_class else 0 for label in true_labels_flat]
pred_binary = [1 if label == target_class else 0 for label in pred_labels_flat]

precision, recall, _ = precision_recall_curve(true_binary, pred_binary)

plt.figure(figsize=(8, 6))
plt.plot(recall, precision, marker='.', label=f'Precision-Recall Curve for {target_class}')
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title(f"Precision-Recall Curve for {target_class}")
plt.legend()
plt.show()
