<a href="https://colab.research.google.com/github/NilofarMoradiFarisar/transformers/blob/main/NER_13_5_2024Trier_Uni.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Rule-Based Systems

In [2]:
import re

In [3]:
def identify_entities(text):
    entities = []

    # Define patterns for each entity type
    person_pattern = re.compile(r'\b(?:Mr\.|Ms\.|Mrs\.|Dr\.|Miss)?\s[A-Z][a-z]+(?: [A-Z][a-z]+)?\b')
    location_pattern = re.compile(r'\b(?:in|at|from|to)\s[A-Z][a-z]+(?: [A-Z][a-z]+)?\b')
    organization_pattern = re.compile(r'\b(?:The|A|An)\s[A-Za-z]+\b')

    # Match patterns in the text and extract entities
    for match in person_pattern.finditer(text):
        entities.append((match.group(), "PERSON"))

    for match in location_pattern.finditer(text):
        entities.append((match.group(), "LOCATION"))

    for match in organization_pattern.finditer(text):
        entities.append((match.group(), "ORGANIZATION"))

    return entities

In [4]:
# Sample text for NER
text = "Barack Obama was born in Hawaii and became the President of the United States. He attended Harvard University and worked at the White House."

In [5]:
# Identify entities using the rule-based system
entities = identify_entities(text)

In [6]:
# Print identified entities and their types
for entity, entity_type in entities:
    print("Entity:", entity, "- Type:", entity_type)

Entity:  Obama - Type: PERSON
Entity:  Hawaii - Type: PERSON
Entity:  President - Type: PERSON
Entity:  United States - Type: PERSON
Entity:  Harvard University - Type: PERSON
Entity:  White House - Type: PERSON
Entity: in Hawaii - Type: LOCATION


In [1]:
import re

def identify_entities(text):
    entities = []

    # Define patterns for each entity type
    person_pattern = re.compile(r'\b(?:Mr\.|Ms\.|Mrs\.|Dr\.|Miss)?\s[A-Z][a-z]+(?: [A-Z][a-z]+)?\b')
    location_pattern = re.compile(r'\b(?:in|at|from|to)\s[A-Z][a-z]+(?: [A-Z][a-z]+)?\b')
    organization_pattern = re.compile(r'\b(?:The|A|An)\s[A-Za-z]+\b')

    # Match patterns in the text and extract entities
    for match in person_pattern.finditer(text):
        entities.append((match.group(), "PERSON"))

    for match in location_pattern.finditer(text):
        entities.append((match.group(), "LOCATION"))

    for match in organization_pattern.finditer(text):
        entities.append((match.group(), "ORGANIZATION"))

    return entities

# Sample text for NER
text = "Barack Obama was born in Hawaii and became the President of the United States. He attended Harvard University and worked at the White House."

# Identify entities using the rule-based system
entities = identify_entities(text)

# Print identified entities and their types
for entity, entity_type in entities:
    print("Entity:", entity, "- Type:", entity_type)


Entity:  Obama - Type: PERSON
Entity:  Hawaii - Type: PERSON
Entity:  President - Type: PERSON
Entity:  United States - Type: PERSON
Entity:  Harvard University - Type: PERSON
Entity:  White House - Type: PERSON
Entity: in Hawaii - Type: LOCATION


# Toolkit (NLTK) library

In [None]:
import nltk

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
from nltk.tag import pos_tag

In [None]:
from nltk.chunk import ne_chunk

In [None]:
# Sample text for NER
text = "Barack Obama was born in Hawaii and became the 44th President of the United States."

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Tokenize the text into words
words = word_tokenize(text)

In [None]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Perform part-of-speech tagging
tagged_words = pos_tag(words)

In [None]:
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


True

In [None]:
nltk.download('words')

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
# Perform named entity recognition
named_entities = ne_chunk(tagged_words)

In [None]:
# Print named entities
for entity in named_entities:
    if isinstance(entity, nltk.Tree):
        print("Entity:", " ".join([word for word, tag in entity.leaves()]), "- Type:", entity.label())

Entity: Barack - Type: PERSON
Entity: Obama - Type: PERSON
Entity: Hawaii - Type: GPE
Entity: United States - Type: GPE


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

nltk.download('punkt')
# Sample text for NER
text = "Barack Obama was born in Hawaii and became the 44th President of the United States."

# Tokenize the text into words
words = word_tokenize(text)

# Perform part-of-speech tagging
tagged_words = pos_tag(words)

# Perform named entity recognition
named_entities = ne_chunk(tagged_words)

# Print named entities
for entity in named_entities:
    if isinstance(entity, nltk.Tree):
        print("Entity:", " ".join([word for word, tag in entity.leaves()]), "- Type:", entity.label())

Entity: Barack - Type: PERSON
Entity: Obama - Type: PERSON
Entity: Hawaii - Type: GPE
Entity: United States - Type: GPE


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# NER with spaCy:

In [None]:
import spacy

In [None]:
# Load spaCy's pre-trained English NER model
nlp = spacy.load("en_core_web_sm")

In [None]:
# Sample text for NER
text = "Barack Obama was born in Hawaii and became the 44th President of the United States."

In [None]:
# Process the text with spaCy NER model
doc = nlp(text)

In [None]:
# Print named entities and their labels
for ent in doc.ents:
    print("Entity:", ent.text, "- Type:", ent.label_)

Entity: Barack Obama - Type: PERSON
Entity: Hawaii - Type: GPE
Entity: 44th - Type: ORDINAL
Entity: the United States - Type: GPE


In [None]:
import spacy

# Load spaCy's pre-trained English NER model
nlp = spacy.load("en_core_web_sm")

# Sample text for NER
text = "Barack Obama was born in Hawaii and became the 44th President of the United States."

# Process the text with spaCy NER model
doc = nlp(text)

# Print named entities and their labels
for ent in doc.ents:
    print("Entity:", ent.text, "- Type:", ent.label_)

# NER with Hugging Face's Transformers using BERT

In [None]:
from transformers import pipeline

# Load BERT-based NER model from Hugging Face's Transformers
ner_pipeline = pipeline("ner", model="bert-base-uncased", tokenizer="bert-base-uncased")

# Sample text for NER
text = "Barack Obama was born in Hawaii and became the 44th President of the United States."

# Perform NER with BERT-based model
ner_results = ner_pipeline(text)

# Print named entities and their labels
for result in ner_results:
    print("Entity:", result["word"], "- Type:", result["entity"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Entity: barack - Type: LABEL_0
Entity: obama - Type: LABEL_0
Entity: was - Type: LABEL_0
Entity: born - Type: LABEL_0
Entity: in - Type: LABEL_0
Entity: hawaii - Type: LABEL_0
Entity: and - Type: LABEL_0
Entity: became - Type: LABEL_0
Entity: the - Type: LABEL_0
Entity: 44th - Type: LABEL_1
Entity: president - Type: LABEL_1
Entity: of - Type: LABEL_0
Entity: the - Type: LABEL_0
Entity: united - Type: LABEL_0
Entity: states - Type: LABEL_0
Entity: . - Type: LABEL_1


In [1]:
! pip install transformers



In [2]:
from transformers import AutoModel
checkpoint = "bert-base-uncased"
model = AutoModel.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [3]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  