<a href="https://colab.research.google.com/github/Vilmo18/Fine-Tuning-LLMs/blob/main/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install flair
!pip install datasets
!pip install sentencepiece

In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger
from datasets import load_dataset
import sentencepiece as spm


In [None]:
# Load the pretrained Flair sequence tagger identified by the model name below.
# The result is kept in the module-level `tagger`, which anonymize_entities()
# calls to predict NER spans.
# NOTE(review): presumably fetches the model weights on first use — confirm.
tagger = SequenceTagger.load('flair/ner-english-ontonotes-large')

In [None]:
# Entity labels that anonymize_entities() replaces with a "[LABEL]" placeholder.
# The OntoNotes tag set labels locations as "LOC"; without it, location
# entities were never anonymized. "LOCATION" is kept as well so this set stays
# a superset of the previous one.
included_tags = {
    "EVENT", "FAC", "GPE", "LANGUAGE", "LAW", "LOC", "LOCATION", "NORP",
    "ORG", "PERSON", "PRODUCT", "WORK_OF_ART"
}

# Numeric / temporal labels that are left in the text as-is.
# This set is not referenced by any code visible in this notebook; it documents
# the labels that are intentionally absent from `included_tags`.
excluded_tags = {
    "CARDINAL", "DATE", "MONEY", "ORDINAL", "PERCENT", "QUANTITY", "TIME"
}

def anonymize_entities(text, tags=None):
    """Replace named entities in ``text`` with bracketed label placeholders.

    The text is wrapped in a Flair ``Sentence`` and tagged with the
    module-level ``tagger``. Every predicted ``'ner'`` span whose label is in
    ``tags`` is replaced by ``"[<LABEL>]"`` (for example ``"[PERSON]"``); all
    text outside the replaced spans is copied through unchanged.

    Args:
        text: The string to anonymize.
        tags: Optional collection of entity labels to replace. When omitted,
            the module-level ``included_tags`` set is used.

    Returns:
        The anonymized string. Empty, ``None`` or whitespace-only input is
        returned unchanged without running the tagger.
    """
    # Nothing to tag: skip building an empty Sentence and the model call.
    if not text or text.isspace():
        return text

    if tags is None:
        tags = included_tags

    sentence = Sentence(text)
    tagger.predict(sentence)

    # Map (start, end) span positions -> placeholder. The positions are used
    # below as slice bounds into `text`.
    replacements = {
        (entity.start_position, entity.end_position): f"[{entity.tag}]"
        for entity in sentence.get_spans('ner')
        if entity.tag in tags
    }

    # Walk the spans left to right, copying the untouched text between them.
    # Pieces are collected in a list and joined once instead of growing a
    # string with repeated concatenation.
    pieces = []
    cursor = 0
    for (start, end), placeholder in sorted(replacements.items()):
        pieces.append(text[cursor:start])
        pieces.append(placeholder)
        cursor = end
    pieces.append(text[cursor:])  # tail after the last replaced span

    return ''.join(pieces)


# Demo: anonymize one sample passage and print the result.
# Both `text` and `anonymized_text` stay defined at module level; later cells
# read `anonymized_text` again.
text = "Francesco Totti threatens to quit the Italian Players' Association . The Italy and Roma striker is upset over the early start to the new season . He says: We are the principal actors, but ... are never listened to."
anonymized_text = anonymize_entities(text)
print(anonymized_text)


[PERSON] threatens to quit [ORG] . The [GPE] and [GPE] striker is upset over the early start to the new season . He says: We are the principal actors, but ... are never listened to.


In [None]:

# Load the "abisee/cnn_dailymail" dataset; "3.0.0" is passed as the second
# argument (presumably the dataset configuration/version — confirm).
# A later cell iterates ds["train"] and reads each record's "article" and
# "highlights" fields.
ds = load_dataset("abisee/cnn_dailymail", "3.0.0")

In [None]:
# Bare expression: the notebook displays the repr of the loaded dataset object.
ds

In [None]:
# Dump the training split to two plain-text files: the "article" field of each
# record goes to article.txt and the "highlights" field to highlights.txt
# (highlights.txt is the SentencePiece training input used further down).
# The files are written as UTF-8 explicitly instead of relying on the
# platform's default encoding.
# NOTE(review): a field that itself contains newlines spans several lines in
# the output, so lines of the two files are not guaranteed to pair up.
# NOTE(review): no cell visible in this notebook reads article.txt — confirm
# it is still needed.
with open("article.txt", "w", encoding="utf-8") as articles, \
        open("highlights.txt", "w", encoding="utf-8") as highlights:
    for record in ds["train"]:
        articles.write(record["article"] + "\n")
        highlights.write(record["highlights"] + "\n")

In [None]:
# Download the botchan.txt sample file from the SentencePiece repository.
# NOTE(review): no cell visible in this notebook reads botchan.txt — confirm
# this download is still needed.
!wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt

In [None]:
# "EVENT", "FAC", "GPE", "LANGUAGE", "LAW", "LOCATION", "NORP",
    # "ORG", "PERSON", "PRODUCT", "WORK_OF_ART"

# Register every placeholder that anonymize_entities() can emit as a
# user-defined symbol, so that each "[LABEL]" is kept as a single piece instead
# of being split into sub-word pieces. Previously only [PERSON], [ORG] and
# [GPE] were registered. Those three stay first so they keep the ids seen in
# the recorded output; the remaining labels follow in sorted order so the
# vocabulary layout is reproducible.
placeholder_symbols = ["[PERSON]", "[ORG]", "[GPE]"]
placeholder_symbols += sorted(
    f"[{tag}]" for tag in included_tags if f"[{tag}]" not in placeholder_symbols
)

# Train on the highlights dump; writes m.model / m.vocab next to the notebook.
spm.SentencePieceTrainer.Train(
    "--input=highlights.txt --model_prefix=m "
    f"--user_defined_symbols={','.join(placeholder_symbols)}"
)

# Make a segmenter instance and load the trained model file (m.model).
sp = spm.SentencePieceProcessor()
sp.load('m.model')

# Sample sentence that already contains placeholders, used to check that they
# survive tokenization as single pieces.
text = '[PERSON] threatens to quit [ORG]. The [GPE] and [GPE] striker is upset over the early start to the new season . He says: We are the principal actors, but ... are never listened to.'

# encode: text => pieces, and text => ids
print(sp.encode_as_pieces(text))
token_ids = sp.encode_as_ids(text)
print(token_ids)

# decode: ids => text (round trip of the ids printed above)
print(sp.decode_ids(token_ids))

['▁', '[PERSON]', '▁threat', 'ens', '▁to', '▁quit', '▁', '[ORG]', '.', '▁The', '▁', '[GPE]', '▁and', '▁', '[GPE]', '▁striker', '▁is', '▁upset', '▁over', '▁the', '▁early', '▁start', '▁to', '▁the', '▁new', '▁season', '▁.', '▁He', '▁says', ':', '▁We', '▁are', '▁the', '▁principal', '▁actor', 's', ',', '▁but', '▁...', '▁are', '▁never', '▁listen', 'ed', '▁to', '.']
[16, 3, 1406, 1341, 10, 2508, 16, 4, 32, 24, 16, 5, 15, 16, 5, 948, 23, 4084, 77, 8, 704, 625, 10, 8, 104, 325, 6, 52, 36, 49, 671, 44, 8, 6686, 1900, 7, 9, 67, 3396, 44, 618, 4851, 20, 10, 32]
[PERSON] threatens to quit [ORG]. The [GPE] and [GPE] striker is upset over the early start to the new season . He says: We are the principal actors, but ... are never listened to.


In [None]:
# encode: text => pieces (tokenize the anonymized demo sentence produced by
# anonymize_entities() in an earlier cell)
print(sp.encode_as_pieces(anonymized_text))

['▁', '[PERSON]', '▁threatens', '▁to', '▁quit', '▁', '[ORG]', '▁.', '▁The', '▁', '[GPE]', '▁and', '▁', '[GPE]', '▁striker', '▁is', '▁upset', '▁over', '▁the', '▁early', '▁start', '▁to', '▁the', '▁new', '▁season', '▁.', '▁He', '▁says', ':', '▁We', '▁are', '▁the', '▁principal', '▁actors', ',', '▁but', '▁...', '▁are', '▁never', '▁listened', '▁to', '.']


In [None]:
# Bare expression: display the anonymized demo sentence for reference.
anonymized_text

'[PERSON] threatens to quit [ORG] . The [GPE] and [GPE] striker is upset over the early start to the new season . He says: We are the principal actors, but ... are never listened to.'