<a href="https://colab.research.google.com/github/Vilmo18/Fine-Tuning-LLMs/blob/main/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
!pip install flair
!pip install datasets



In [21]:
!pip install sentencepiece



In [22]:
from flair.data import Sentence
from flair.models import SequenceTagger

In [None]:
# Load the sequence tagger identified by 'flair/ner-english-ontonotes-large'.
# `anonymize_entities` below reads this module-level `tagger`.
tagger = SequenceTagger.load('flair/ner-english-ontonotes-large')

In [None]:
# Entity types that are masked with a "[TAG]" placeholder.
# The OntoNotes label for locations is "LOC"; "LOCATION" never matches that
# label set and is kept only for backward compatibility with the original list.
included_tags = {
    "EVENT", "FAC", "GPE", "LANGUAGE", "LAW", "LOC", "LOCATION", "NORP",
    "ORG", "PERSON", "PRODUCT", "WORK_OF_ART",
}

# Numeric / temporal entity types that are intentionally left in the text.
# Not referenced elsewhere in the visible notebook; kept as a record of what
# is deliberately not masked.
excluded_tags = {
    "CARDINAL", "DATE", "MONEY", "ORDINAL", "PERCENT", "QUANTITY", "TIME",
}

def anonymize_entities(text):
    """Replace selected named entities in *text* with bracketed tag placeholders.

    Runs the module-level ``tagger`` on the text and substitutes every
    predicted ``'ner'`` span whose tag is in ``included_tags`` with
    ``"[TAG]"`` (for example ``"[PERSON]"``). Everything outside those spans
    is copied through unchanged.

    Args:
        text: The raw string to anonymize.

    Returns:
        The text with each selected entity replaced by its tag in brackets.
    """
    sentence = Sentence(text)
    tagger.predict(sentence)

    # Map the (start, end) offsets used to slice `text` to the placeholder.
    # Keying on the offsets means a span reported twice is replaced only once.
    token_replacements = {
        (entity.start_position, entity.end_position): f"[{entity.tag}]"
        for entity in sentence.get_spans('ner')
        if entity.tag in included_tags
    }

    # Collect the untouched slices and placeholders, then join once instead of
    # growing a string with repeated `+=`.
    pieces = []
    last_idx = 0
    for (start, end), replacement in sorted(token_replacements.items()):
        pieces.append(text[last_idx:start])
        pieces.append(replacement)
        last_idx = end
    pieces.append(text[last_idx:])

    return "".join(pieces)


# Demo input: adjacent string literals are concatenated into one sentence block.
text = (
    "Francesco Totti threatens to quit the Italian Players' Association . "
    "The Italy and Roma striker is upset over the early start to the new season . "
    "He says: We are the principal actors, but ... are never listened to."
)
# Anonymize the demo input and show the result.
anonymized_text = anonymize_entities(text)
print(anonymized_text)


In [None]:
from datasets import load_dataset
# Load the "3.0.0" configuration of the "abisee/cnn_dailymail" dataset.
ds = load_dataset("abisee/cnn_dailymail", "3.0.0")

In [None]:
# Bare expression: shows the loaded dataset object as the notebook cell output.
ds

In [None]:
# Dump the "article" and "highlights" fields of every training record to
# plain-text files, one record after another, each followed by a newline.
# UTF-8 is requested explicitly so the output does not depend on the
# platform's default encoding (which may fail on non-ASCII text).
with open("article.txt", "w", encoding="utf-8") as articles, \
        open("highlights.txt", "w", encoding="utf-8") as highlights:
    for it in ds["train"]:
        articles.write(it["article"] + "\n")
        highlights.write(it["highlights"] + "\n")

In [None]:
!pip install sentencepiece

In [None]:
!wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt

In [None]:
import sentencepiece as spm

# Train a SentencePiece model on `highlights.txt`; with `--model_prefix=m`
# the trainer writes `m.model` and `m.vocab`.
# `m.vocab` is only a reference file and is not used for segmentation.
spm.SentencePieceTrainer.Train('--input=highlights.txt --model_prefix=m --vocab_size=100')

# Create a processor instance and load the trained model file (m.model).
sp = spm.SentencePieceProcessor()
sp.load('m.model')

# Sample anonymized sentence used for the encode / decode demo below.
_sample = '[PERSON] threatens to quit [ORG] '

# encode: text => pieces, then text => ids
print(sp.encode_as_pieces(_sample))
_sample_ids = sp.encode_as_ids(_sample)
print(_sample_ids)

# decode: ids => text (round trip of the sample)
print(sp.decode_ids(_sample_ids))

In [None]:
# encode: text => subword pieces (strings, not ids)
print(sp.encode_as_pieces(anonymized_text))

In [None]:
# Bare expression: shows the anonymized demo text as the notebook cell output.
anonymized_text