In [1]:
import pandas as pd
from tqdm import tqdm
from difflib import SequenceMatcher
import re
import pickle
import nltk.data

c:\Users\Bilal\.conda\envs\textMining\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
c:\Users\Bilal\.conda\envs\textMining\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-246-g3d31191b-gcc_10_3_0.dll


In [None]:
def matcher(string, pattern):
    '''
    Return the start and end index of any pattern present in the text.
    '''
    match_list = []
    pattern = pattern.strip()
    seqMatch = SequenceMatcher(None, string, pattern, autojunk=False)
    match = seqMatch.find_longest_match(0, len(string), 0, len(pattern))
    if (match.size == len(pattern)):
        start = match.a
        end = match.a + match.size
        match_tup = (start, end)
        string = string.replace(pattern, "X" * len(pattern), 1)
        match_list.append(match_tup)
    
    return match_list, string

In [None]:
def mark_sentence(s, match_list):
    '''
    Marks all the entities in the sentence as per the BIO scheme. 
    '''
    word_dict = {}
    for word in s.split():
        word_dict[word] = 'O'
        
    for start, end, e_type in match_list:
        temp_str = s[start:end]
        tmp_list = temp_str.split()
        if len(tmp_list) > 1:
            word_dict[tmp_list[0]] = e_type
            for w in tmp_list[1:]:
                word_dict[w] = e_type
        else:
            word_dict[temp_str] = e_type
    return word_dict

In [None]:
def clean(text):
    '''
    Just a helper fuction to add a space before the punctuations for better tokenization
    '''
    filters = ["!", "#", "$", "%", "&", "(", ")", "/", "*", ".", ":", ";", "<", "=", ">", "?", "@", "[",
               "\\", "]", "_", "`", "{", "}", "~", "'"]
    for i in text:
        if i in filters:
            text = text.replace(i, " " + i)
            
    return text

In [None]:
def create_data(df, filepath):
    '''
    The function responsible for the creation of data in the said format.
    '''
    with open(filepath , 'w') as f:
        for text, annotation in zip(df.text, df.annotation):
            text = clean(text)
            text_ = text        
            match_list = []
            for i in annotation:
                a, text_ = matcher(text, i[0])
                if a:
                    match_list.append((a[0][0], a[0][1], i[1]))
                    match_list.append((a[0][0], a[0][1], i[1]))

            d = mark_sentence(text, match_list)

            for i in d.keys():
                f.writelines(i + ' ' + d[i] +'\n')
            f.writelines('\n')

In [None]:
train_val_data = pd.DataFrame(columns=['text', 'annotation'])
test_data = pd.DataFrame(columns=['text', 'annotation'])

In [None]:
train_val_annotation = pd.read_json('train_val.json')
test_annotations = pd.read_json('test.json')

In [None]:
def extract_gt(df):

    ground_truth = []

    for id in df['id'].unique():
        latest_update = df[df['id'] == id]['updated_at'].max()
        gt = df[(df['id'] == id)&(df['updated_at'] == latest_update)]
        ground_truth.append(gt)

    return ground_truth

In [None]:
train_val_gt = extract_gt(train_val_annotation)
test_gt = extract_gt(test_annotations)

In [None]:
def framer(gt):
    data = pd.DataFrame(columns=['text', 'annotation'])
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    
    for line in gt:
        index = 0

        text = list(line['text'])[0]  # Assuming 'text' is a key in your dictionary
        
        for s in tokenizer.tokenize(text):
            sorted_line = sorted(list(line['label'])[0], key=lambda x: x['start'])
            sentence_entities = []
            
            for l in sorted_line:
                start = l['start']
                end = l['end']
                
                if start >= index and end <= len(s) + index:
                    entity_text = l['text']
                    entity_label = l['labels'][0]
                    sentence_entities.append((entity_text, entity_label))
            
            if sentence_entities:
                data = data.append({'text': s, 'annotation': sentence_entities}, ignore_index=True)
            
            index += len(s)
        
    return data


In [None]:
train_val_data = framer(train_val_gt)
test_data = framer(test_gt)

In [None]:
train_data = train_val_data[:int(len(train_val_data)*0.80)]
val_data = train_val_data[int(len(train_val_data)*0.80):]

In [None]:
train_filepath = 'flair_data/train.txt'
val_filepath = 'flair_data/val.txt'
test_filepath = 'flair_data/test.txt'

In [None]:
create_data(train_data, train_filepath)
create_data(val_data, val_filepath)
create_data(test_data, test_filepath)

In [None]:
def remove_low_ratio(input_dir, input_file, ratio):
    text = []
    token = []
    with open(f'{input_dir}/{input_file}', 'r') as fp:
        output_file = f'{input_dir}/{input_file[:-4]}_balanced.txt'
        with open(output_file, 'a') as fp_b:
            for line in fp:
                text_token = line.strip().split()
                if not text_token:
                    null = token.count('O')
                    n_labels = len(token) - null
                    s_ratio = n_labels / len(token)
                    if s_ratio >= ratio:
                        for i, j in zip(text, token):
                            fp_b.write(f'{i} {j}\n')
                        fp_b.write('\n')
                    text = []
                    token = []
                else:
                    if len(text_token) == 2:
                        text.append(text_token[0])
                        token.append(text_token[1])
                    else:
                        continue

In [None]:
remove_low_ratio('flair_data', 'test.txt', 0.3)
remove_low_ratio('flair_data', 'val.txt', 0.3)
remove_low_ratio('flair_data', 'train.txt', 0.3)

In [None]:
import torch

device = torch.device('cuda')

In [2]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import WordEmbeddings, StackedEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
from pathlib import Path
from flair.data import Corpus
from flair.datasets import ColumnCorpus

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# define columns
columns = {0 : 'text', 1 : 'ner'}
# directory where the data resides
data_folder = 'flair_data/'
# initializing the corpus
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file = 'train.txt',
                              test_file = 'test.txt',
                              dev_file = 'val.txt')

embedding_types = [
    WordEmbeddings('glove'),  # You can add more embeddings if needed
]

# Create Stacked Embeddings
embeddings = StackedEmbeddings(embeddings=embedding_types)

# Create a SequenceTagger model
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    
model = SequenceTagger(hidden_size=256, embeddings=embeddings, tag_dictionary=tag_dictionary, tag_type=tag_type)

# Create a ModelTrainer and fine-tune the model
trainer = ModelTrainer(model, corpus)

trainer.train(
    'flair_models',
    learning_rate=0.1,
    mini_batch_size=64,
    max_epochs=150,
)

In [3]:
from flair.data import Sentence

# Load your fine-tuned model
custom_ner_model = SequenceTagger.load(r'flair_models\best-model.pt')

# Create a Sentence for NER
sentence = Sentence("Brandon Aguilera Zamora (born 28 June 2003) is a Costa Rican professional footballer who plays as a midfielder for Premier League club Nottingham Forest and the Costa Rica national team. Career. Club. In July 2022 Premier League side Nottingham Forest announced they had signed Aguilera on a four-year deal from Alajuelense, and would immediately be loaned to fellow Costa Rican side Guanacasteca for six months. In January 2023, Aguilera joined Primeira Liga club Estoril on loan until the end of the season. He played with the teams under 23 squad. International. A youth international for Costa Rica since 2018, Aguilera made his senior team debut against the United States on 30 March 2022. In November 2022 he was named to the 26-man Costa Rica squad for the 2022 FIFA World Cup. Honours. Alajuelense")

# Run NER on the sentence
custom_ner_model.predict(sentence)

# Access NER results
print(sentence)

2023-10-16 16:06:50,107 SequenceTagger predicts: Dictionary with 10 tags: O, PLAYER, BIRTHDATE, COUNTRY, NATIONALITY, POSITION, CLUB, REFERENCE, <START>, <STOP>
Sentence[148]: "Brandon Aguilera Zamora (born 28 June 2003) is a Costa Rican professional footballer who plays as a midfielder for Premier League club Nottingham Forest and the Costa Rica national team. Career. Club. In July 2022 Premier League side Nottingham Forest announced they had signed Aguilera on a four-year deal from Alajuelense, and would immediately be loaned to fellow Costa Rican side Guanacasteca for six months. In January 2023, Aguilera joined Primeira Liga club Estoril on loan until the end of the season. He played with the teams under 23 squad. International. A youth international for Costa Rica since 2018, Aguilera made his senior team debut against the United States on 30 March 2022. In November 2022 he was named to the 26-man Costa Rica squad for the 2022 FIFA World Cup. Honours. Alajuelense" → ["Brandon"/PLA

In [29]:
import re

def augmented_text(sentence):
    outp = sentence.text
    used_text = []
    for l in sentence.labels:
        text_token = l.labeled_identifier
        text_token = text_token.split()
        text, token = text_token[1].split('/')
        text = text.replace('"', '')
        if text not in used_text:
            outp = outp.replace(text, f'[{token}]{text}[{token}]')
            outp = outp.replace(f'[{token}] [{token}]', ' ')
            used_text.append(text)
        else: 
            continue
    return outp

In [35]:
s = Sentence("Thomas Alun Lockyer (born 3 December 1994) is a Welsh professional footballer who plays as a centre-back for club Luton Town and the Wales national team. Club career. Bristol Rovers. Lockyer was born and raised in Cardiff. Lockyer started his career at Radyr Rangers. He joined Cardiff City as a youth aged 11, but was released aged 16, as Cardiff deemed him to be too small to play his preferred position of centre-back. Lockyer then signed a scholarship for Bristol Rovers in 2011 and went on to make his debut on 12 January 2013, replacing Ellis Harrison, 85 minutes into a 3–0 win over Fleetwood Town at Highbury Stadium. He made his home debut two months later on 12 March 2013, replacing Oliver Norburn in the 74th minute in a 2–0 win over Port Vale. He signed his first professional contract in May 2013, after making his previous appearances whilst still a youth team player. Lockyer scored his first league goal in a 1–0 win for Bristol Rovers over Northampton Town on 31 August 2013. On 17 March 2014, Lockyer signed a new two-year contract extension with Rovers. After Rovers were relegated out of League Two in the 2013–14 season, Lockyer became a very important part of a very successful season after seeing the club get promoted first time of asking. In that season he managed to score one goal vs Grimsby Town. As Rovers returned to League Two, he was a first-team regular and was rewarded by being named the Football League Young Player of the Month for December 2015 after consistent good performances. He made his 200th appearance for Rovers on 19 August 2017, in a 3–2 victory over Bury, in which he scored the opener. He departed Bristol Rovers following the expiration of his contract at the end of the 2018–19 season. Charlton Athletic. Lockyer joined Charlton Athletic, who were newly promoted to the Championship, on 28 June 2019 on a two-year contract. His first and only goal for the club was the equaliser in a 2–2 home draw with West Bromwich Albion on 11 January 2020. After playing every minute of 43 league appearances in the 2019–20 season and missing only three matches due to suspension, Lockyer triggered a relegation release clause in his contract in August 2020, allowing him to join another club for free. Luton Town. Lockyer signed for another Championship club, Luton Town, on 1 September 2020 on a free transfer. His debut came two weeks later in a 1–0 EFL Cup second round victory over Reading. In January 2022, Lockyer scored his first goal for the club with his side's first in a 2–1 win over Bristol City, saying after the match that he had dreamed of scoring against his old rivals. During the 2022–23 season, Lockyer continued to grow as a player, earning the captaincy following an injury to Sonny Bradley. As the season developed, he earned comparisons from his management team to legendary defenders Franco Baresi and Franz Beckenbauer. On 23 April 2023 Lockyer was named in the EFL Championship Team of the Season. At Luton's end of season awards, he took home five awards, most notably the Player of the Season award.&lt;ref name=\"22\/23 POTS\"&gt; &lt;\/ref&gt; On 16 May 2023, he scored Luton's all important second goal to defeat Sunderland and send the club to Wembley Stadium for the Championship play-off final.  Lockyer was in the starting line-up for the 2023 Championship play-off final against Coventry City, but was stretchered off the pitch after collapsing in the 11th minute of the first half and taken to hospital. During Luton's victory celebrations following the penalty shoot-out, the Luton players paraded a Lockyer named playing shirt as they received the trophy and the club posted a photo of Lockyer celebrating in his hospital bed. Luton’s manager Rob Edwards admitted after the game that it felt wrong to celebrate and his thoughts were primarily with Lockyer. On 31 May, it was confirmed by the club that he would be leaving hospital the following day, with a follow-up report a week later stating that Lockyer had suffered atrial fibrillation of the heart, but given the all clear to resume his playing career following surgery. On 6 July 2023, it was confirmed that Lockyer had renewed his contract with Luton Town, one week after his previous contract ran out. International career. In October 2015, Lockyer made his Wales under-21 debut in a 0–0 draw against Denmark. Lockyer was called up to the Welsh senior team in June 2017, remaining an unused substitute during a 1–1 draw with Serbia. He received his second call-up for the senior squad on 25 August 2017, for the upcoming qualifiers against Austria and Moldova. He made his debut for the senior team on 14 November 2017 as a half-time substitute during a 1–1 draw with Panama. In May 2021 he was selected for the Wales squad for the delayed UEFA Euro 2020 tournament. On 9 November 2022, more than a year since he last played for his country, Lockyer was called up to the Wales squad for the 2022 FIFA World Cup. Honours. Bristol Rovers Luton Town Individual")
custom_ner_model.predict(s)
print(augmented_text(s))

[PLAYER]Thomas Alun Lockyer[PLAYER] (born [BIRTHDATE]3 December 1994[BIRTHDATE]) is a [NATIONALITY]Welsh[NATIONALITY] professional footballer who plays as a [POSITION]centre-back[POSITION] for club [CLUB]Luton Town[CLUB] and t[REFERENCE]he[REFERENCE] [COUNTRY]Wales[COUNTRY] national team. Club career. [CLUB]Bristol Rovers[CLUB]. [PLAYER]Lockyer[PLAYER] was born and raised in [CLUB]Cardiff[CLUB]. [PLAYER]Lockyer[PLAYER] started [REFERENCE]his[REFERENCE] career at Radyr Rangers. [REFERENCE]He[REFERENCE] joined [CLUB]Cardiff City[CLUB] as a youth aged 11, but was released aged 16, as [CLUB]Cardiff[CLUB] deemed [REFERENCE]him[REFERENCE] to be too small to play [REFERENCE]his[REFERENCE] preferred position of [POSITION]centre-back[POSITION]. [PLAYER]Lockyer[PLAYER] t[REFERENCE]he[REFERENCE]n signed a scholarship for [CLUB]Bristol Rovers[CLUB] in 2011 and went on to make [REFERENCE]his[REFERENCE] debut on 12 January 201[BIRTHDATE]3[BIRTHDATE], replacing [PLAYER]Ellis Harrison[PLAYER], 85 minu