In [1]:
import pandas as pd
from tqdm import tqdm
from difflib import SequenceMatcher
import re
import pickle
import nltk.data

c:\Users\Bilal\.conda\envs\textMining\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
c:\Users\Bilal\.conda\envs\textMining\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [None]:
def matcher(string, pattern):
    '''
    Locate the first exact occurrence of ``pattern`` inside ``string``.

    The pattern is stripped of surrounding whitespace before searching.

    Returns a tuple ``(match_list, string)``:
      * ``match_list`` contains a single ``(start, end)`` tuple (``end`` is
        exclusive) when the pattern occurs in ``string``; it is empty when
        the pattern does not occur or is empty after stripping.
      * ``string`` is the input with that first occurrence overwritten by
        ``"X"`` characters of the same length, so character offsets into it
        remain valid; it is returned unchanged when nothing matched.
    '''
    match_list = []
    pattern = pattern.strip()

    # An empty pattern trivially "matches" with zero width; reporting that
    # would hand callers a meaningless (0, 0) span.
    if not pattern:
        return match_list, string

    # Only a full-length match is of interest, which is exactly a substring
    # search; str.find reports the earliest occurrence.
    start = string.find(pattern)
    if start != -1:
        match_list.append((start, start + len(pattern)))
        string = string.replace(pattern, "X" * len(pattern), 1)

    return match_list, string

In [None]:
def mark_sentence(s, match_list):
    '''
    Assign an entity label to the whitespace-separated words of a sentence.

    s: the sentence text.
    match_list: iterable of ``(start, end, e_type)`` tuples, where
        ``s[start:end]`` is an entity span and ``e_type`` its label.

    Returns a dict mapping each word to its label, ``'O'`` for words outside
    every span. Labels are stored exactly as given (no ``B-``/``I-``
    prefixes are added). Because the mapping is keyed by the word text, a
    word that occurs several times in ``s`` has a single entry, and the
    label assigned last wins for all of its occurrences.
    '''
    word_dict = {word: 'O' for word in s.split()}

    for start, end, e_type in match_list:
        # Always split the span: using the raw slice as a key would insert
        # a non-token entry for an empty or whitespace-padded span.
        for word in s[start:end].split():
            word_dict[word] = e_type
    return word_dict

In [None]:
def clean(text):
    '''
    Insert exactly one space before each listed punctuation character so
    that whitespace tokenization separates it from the preceding word.

    Characters not in the filter list (for example ``,`` and ``-``) are
    left untouched. Returns the new string.
    '''
    filters = frozenset(["!", "#", "$", "%", "&", "(", ")", "/", "*", ".", ":", ";", "<", "=", ">", "?", "@", "[",
                         "\\", "]", "_", "`", "{", "}", "~", "'"])
    # Build the result in one pass: replacing on the whole string once per
    # occurrence would prepend one space per occurrence to every instance
    # of a repeated character.
    return "".join(" " + ch if ch in filters else ch for ch in text)

In [None]:
def create_data(df, filepath):
    '''
    Write the sentences of ``df`` to ``filepath`` as a two-column tagging file.

    df: object exposing ``text`` and ``annotation`` sequences of equal
        length; each annotation is an iterable of entries whose item 0 is
        the entity text and item 1 its label.
    filepath: destination path; the file is overwritten.

    For every sentence the text is passed through ``clean``, each entity is
    located with ``matcher`` and the words are labelled with
    ``mark_sentence``. One ``"<word> <label>"`` line is written per entry of
    that mapping, followed by an empty line that terminates the sentence.
    '''
    # Explicit UTF-8 so non-ASCII text is written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open(filepath, 'w', encoding='utf-8') as f:
        for text, annotation in zip(df.text, df.annotation):
            text = clean(text)
            # Working copy in which matched entities get masked, so that a
            # repeated entity text is located at its next occurrence. The
            # mask keeps the length, so offsets remain valid for ``text``.
            masked_text = text
            match_list = []
            for entity in annotation:
                found, masked_text = matcher(masked_text, entity[0])
                if found:
                    start, end = found[0]
                    match_list.append((start, end, entity[1]))

            tagged = mark_sentence(text, match_list)

            for word, tag in tagged.items():
                f.write(word + ' ' + tag + '\n')
            f.write('\n')

In [None]:
# Two independent, empty placeholder frames with the sentence-level columns.
train_val_data, test_data = (
    pd.DataFrame(columns=['text', 'annotation']) for _ in range(2)
)

In [None]:
# Read the two annotation exports from the working directory.
train_val_annotation, test_annotations = (
    pd.read_json(json_path) for json_path in ('train_val.json', 'test.json')
)

In [4]:
def extract_gt(df):
    '''
    Keep, for every distinct ``id``, only the rows carrying that id's most
    recent ``updated_at`` value.

    Returns a list with one DataFrame per id, in order of first appearance
    of the id in ``df``.
    '''
    def _latest_rows(record_id):
        # Rows of this id, then the subset stamped with the newest update.
        same_id = df['id'] == record_id
        newest = df.loc[same_id, 'updated_at'].max()
        return df[same_id & (df['updated_at'] == newest)]

    return [_latest_rows(record_id) for record_id in df['id'].unique()]

In [None]:
# Reduce both annotation exports to the latest annotation of every id.
train_val_gt, test_gt = map(extract_gt, (train_val_annotation, test_annotations))

In [None]:
def framer(gt, tokenizer=None):
    '''
    Split every annotated document into sentences and attach to each
    sentence the entities that lie completely inside it.

    gt: iterable of per-document frames; ``line['text']`` must yield the
        document text as its first value and ``line['label']`` a list of
        dicts with the keys ``'start'``, ``'end'``, ``'text'`` and
        ``'labels'`` as its first value (offsets are character positions in
        the document text).
    tokenizer: optional object providing ``span_tokenize(text)`` that yields
        ``(start, end)`` character offsets of the sentences. Defaults to the
        NLTK English Punkt tokenizer.

    Returns a DataFrame with the columns ``'text'`` (the sentence) and
    ``'annotation'`` (list of ``(entity_text, label)`` tuples, the label
    being the first item of ``'labels'``). Sentences without any entity are
    omitted.
    '''
    if tokenizer is None:
        tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    records = []
    for line in gt:
        text = list(line['text'])[0]
        raw_labels = list(line['label'])[0]

        # Sort once per document; a non-list value (e.g. a missing label
        # cell) is treated as "no entities".
        if isinstance(raw_labels, (list, tuple)):
            entities = sorted(raw_labels, key=lambda x: x['start'])
        else:
            entities = []

        # Use the tokenizer's real character offsets: summing sentence
        # lengths ignores the whitespace between sentences, so a running
        # counter drifts further from the annotation offsets with every
        # sentence.
        for sent_start, sent_end in tokenizer.span_tokenize(text):
            sentence_entities = [
                (l['text'], l['labels'][0])
                for l in entities
                if l['start'] >= sent_start and l['end'] <= sent_end
            ]
            if sentence_entities:
                records.append({'text': text[sent_start:sent_end],
                                'annotation': sentence_entities})

    # Build the frame once; growing a DataFrame row by row is quadratic and
    # DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records, columns=['text', 'annotation'])


In [None]:
# Sentence-level frames for the train/validation pool and for the test set.
train_val_data, test_data = (
    framer(doc_frames) for doc_frames in (train_val_gt, test_gt)
)

In [None]:
# 80/20 split by row order (no shuffling): first part trains, rest validates.
split_at = int(len(train_val_data) * 0.80)
train_data, val_data = train_val_data[:split_at], train_val_data[split_at:]

In [None]:
# Output files of the three splits, all inside the flair_data folder.
train_filepath, val_filepath, test_filepath = (
    'flair_data/train.txt',
    'flair_data/val.txt',
    'flair_data/test.txt',
)

In [None]:
# Write every split to its column-format file, in train/val/test order.
for split_df, split_path in (
    (train_data, train_filepath),
    (val_data, val_filepath),
    (test_data, test_filepath),
):
    create_data(split_df, split_path)

In [2]:
import torch

# Prefer the GPU, but fall back to the CPU when PyTorch sees no CUDA device,
# so code that moves tensors/models to `device` also works on CPU-only hosts.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import WordEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from pathlib import Path
from flair.data import Corpus
from flair.datasets import ColumnCorpus

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Column layout of the data files: column 0 holds the word, column 1 its
# entity tag (the "<word> <label>" lines written by create_data).
columns = {0 : 'text', 1 : 'ner'}
# directory where the data resides
data_folder = 'flair_data/'
# Build the corpus from the train / dev / test files inside data_folder
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file = 'train.txt',
                              test_file = 'test.txt',
                              dev_file = 'val.txt')

# Word representations fed to the tagger; only GloVe vectors are used here
embedding_types = [
    WordEmbeddings('glove'),  # You can add more embeddings if needed
]

# Create Stacked Embeddings
embeddings = StackedEmbeddings(embeddings=embedding_types)

# Create a SequenceTagger model whose tag set is collected from the corpus
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    
model = SequenceTagger(hidden_size=256, embeddings=embeddings, tag_dictionary=tag_dictionary, tag_type=tag_type)

# Create a ModelTrainer and train the tagger on the corpus. Note that a new
# SequenceTagger is trained here, no previously trained tagger is loaded.
trainer = ModelTrainer(model, corpus)

# First argument is the output folder; presumably this is where the
# best-model.pt loaded by the later cells is written — TODO confirm.
trainer.train(
    'flair_models',
    learning_rate=0.1,
    mini_batch_size=64,
    max_epochs=150,
)

In [None]:
from flair.data import Sentence

# Load the trained tagger from disk.
# NOTE(review): absolute, machine-specific path — presumably the same
# flair_models folder that trainer.train writes to; confirm before sharing.
custom_ner_model = SequenceTagger.load(r'C:\Users\Bilal\Desktop\TUe\Y2\Q1\TM\flair_models\best-model.pt')

# Wrap a sample paragraph in a single Sentence object for tagging
sentence = Sentence("Brandon Aguilera Zamora (born 28 June 2003) is a Costa Rican professional footballer who plays as a midfielder for Premier League club Nottingham Forest and the Costa Rica national team. Career. Club. In July 2022 Premier League side Nottingham Forest announced they had signed Aguilera on a four-year deal from Alajuelense, and would immediately be loaned to fellow Costa Rican side Guanacasteca for six months. In January 2023, Aguilera joined Primeira Liga club Estoril on loan until the end of the season. He played with the teams under 23 squad. International. A youth international for Costa Rica since 2018, Aguilera made his senior team debut against the United States on 30 March 2022. In November 2022 he was named to the 26-man Costa Rica squad for the 2022 FIFA World Cup. Honours. Alajuelense")

# Run NER on the sentence (the predicted labels are attached to `sentence`)
custom_ner_model.predict(sentence)

# Show the input text followed by the labels the model assigned
print(sentence.text)
print(sentence.labels)

In [5]:
from flair.data import Sentence

# Load the trained tagger from disk.
# NOTE(review): same absolute, machine-specific checkpoint path as the
# sample-prediction cell; this cell repeats that load.
custom_ner_model = SequenceTagger.load(r'C:\Users\Bilal\Desktop\TUe\Y2\Q1\TM\flair_models\best-model.pt')

2023-11-09 20:00:15,402 SequenceTagger predicts: Dictionary with 10 tags: O, PLAYER, BIRTHDATE, COUNTRY, NATIONALITY, POSITION, CLUB, REFERENCE, <START>, <STOP>


In [10]:
import json


def output_to_json(s, output_file):
    '''
    Write the labels attached to a sentence to ``output_file`` as JSON.

    s: object exposing ``text`` (the sentence string) and ``labels``; every
        label must expose ``labeled_identifier``, which this function parses
        as ``<prefix> "<entity text>"/<label>``.
    output_file: destination path; the file is overwritten.

    The JSON object has the keys ``'text'`` (the sentence) and ``'label'``,
    a list of ``{'end', 'text', 'start', 'labels'}`` dicts. Offsets are
    obtained by searching the sentence left to right, each search starting
    where the previous hit ended. Labels whose text cannot be found from
    that position on are skipped.
    '''
    output = {'text': s.text}
    result = []
    cursor = 0
    for label in s.labels:
        # Split on the LAST slash only, so entity text that itself contains
        # '/' (e.g. "2022/23") does not break the unpacking.
        head, tag = label.labeled_identifier.rsplit('/', 1)
        # Drop the leading prefix, keep the whole (possibly multi-word) text.
        entity = head.split(' ', 1)[1]
        if len(entity) >= 2 and entity[0] == '"' and entity[-1] == '"':
            entity = entity[1:-1]
        if not entity:
            continue

        start = output['text'].find(entity, cursor)
        if start == -1:
            # Not found: skip instead of emitting negative offsets and
            # moving the search cursor backwards for the following labels.
            continue
        end = start + len(entity)
        cursor = end
        result.append({'end': end, 'text': entity, 'start': start, 'labels': [tag]})

    output['label'] = result
    with open(output_file, 'w', encoding='utf-8') as fp:
        json.dump(output, fp)

In [8]:
# Benchmark set: read the export and keep the latest annotation of every id.
bench = extract_gt(
    pd.read_json(r'C:\Users\Bilal\Desktop\TUe\Y2\Q1\TM\c1_data\test_c1.json')
)

In [9]:
# Tag the text of every benchmark document and dump one JSON file per document.
for idx, doc_frame in enumerate(bench):
    s = Sentence(doc_frame.text.iloc[0])
    custom_ner_model.predict(s)
    output_to_json(s, r'C:\Users\Bilal\Desktop\TUe\Y2\Q1\TM\c1_output\file'+str(idx))

In [None]:
import re

def augmented_text(sentence):
    '''
    Return the sentence text with labelled entities wrapped in
    ``[LABEL]...[LABEL]`` markers.

    sentence: object exposing ``text`` and ``labels``; every label must
        expose ``labeled_identifier``, parsed here as
        ``<prefix> "<entity text>"/<label>``.

    Each distinct entity text is processed once, using the label of its
    first appearance in ``sentence.labels``; all standalone occurrences of
    that text are wrapped. Two wrapped entities with the same label that
    are separated by a single space are merged into one marked span.
    '''
    outp = sentence.text
    seen = set()
    for l in sentence.labels:
        # Split on the LAST slash so entity text containing '/' is kept whole.
        head, tag = l.labeled_identifier.rsplit('/', 1)
        # Everything after the prefix is the (possibly multi-word) entity.
        entity = head.split(None, 1)[1]
        if len(entity) >= 2 and entity[0] == '"' and entity[-1] == '"':
            entity = entity[1:-1]
        # Empty text would insert markers between every character.
        if not entity or entity in seen:
            continue
        seen.add(entity)

        # Only wrap whole occurrences: without the boundary checks "Rica"
        # would also be tagged inside "Rican". A boundary is required only
        # on sides where the entity itself starts/ends with a word character.
        prefix = r'(?<!\w)' if re.match(r'\w', entity[0]) else ''
        suffix = r'(?!\w)' if re.match(r'\w', entity[-1]) else ''
        wrapped = f'[{tag}]{entity}[{tag}]'
        # A function replacement keeps backslashes in the text literal.
        outp = re.sub(prefix + re.escape(entity) + suffix, lambda _m: wrapped, outp)
        # Merge neighbouring entities that carry the same label.
        outp = outp.replace(f'[{tag}] [{tag}]', ' ')
    return outp