# SpaCy Model

### Importing necessary libraries

In [None]:
import pandas as pd
import spacy

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import string
from collections import Counter
from tqdm import tqdm
import re

from sklearn.model_selection import train_test_split
from fuzzywuzzy import fuzz

In [None]:
# one-time setup: download the nl_core_news_lg spaCy pipeline that is loaded below
!python -m spacy download nl_core_news_lg
# NLTK resources for the tokenizer/stopword imports above
# NOTE(review): no visible cell uses word_tokenize or stopwords — confirm these downloads are still needed
nltk.download('punkt')
nltk.download('stopwords')

### Load data and model

In [None]:
# labelled dataset; later cells read its 'Cleaned Text' and 'True Organization' columns
df = pd.read_csv('final_data.csv')

# pretrained pipeline, used as-is for the baseline predictions
nlp = spacy.load("nl_core_news_lg")
# entity labels of interest
# NOTE(review): not referenced by any visible cell — the label 'ORG' is hard-coded where it is needed
ner_categories = ['ORG']

### Split into train and test set

In [None]:
# hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible
# NOTE(review): the hard-coded `actuals` list further down is zipped row-by-row with predictions
# made on this test split, so changing the seed or test_size presumably invalidates it
train, test = train_test_split(df, test_size=0.3, random_state=42)

print("Train set size:", len(train))
print("Test set size:", len(test))

### Prepare data

In [None]:
def get_spacy_format(truelabels):
    """Convert labelled rows into spaCy-style ``[text, annotations]`` entries.

    Every whole-token occurrence of a row's organization name in its text is
    recorded as an ``ORG`` character span.

    Args:
        truelabels: DataFrame with a 'Cleaned Text' column (document text) and
            a 'True Organization' column (gold organization name).

    Returns:
        A list of ``[text, {'entities': [(start, end, 'ORG'), ...]}]`` entries,
        one per row that has both a text and an organization name. The entity
        list is empty when the name does not occur in the text.
    """
    spacy_data = []

    for _, row in tqdm(truelabels.iterrows(), total=len(truelabels), desc="Creating formatted data"):
        text = row['Cleaned Text']
        org_name = row['True Organization']
        # a row without a label or without text cannot yield a usable example
        if pd.isnull(org_name) or pd.isnull(text):
            continue

        # Match the name literally, and only when it is not glued to a
        # neighbouring word character. Lookarounds are used instead of \b
        # because \b needs a word character on one side of the position: a
        # name that starts or ends with punctuation (e.g. "Foo B.V." followed
        # by a space) would otherwise never match. For names that start and
        # end with a word character the two forms are equivalent.
        pattern = rf"(?<!\w){re.escape(org_name)}(?!\w)"

        # one (start, end, label) span per occurrence
        entities = [(match.start(), match.end(), 'ORG') for match in re.finditer(pattern, text)]

        spacy_data.append([text, {'entities': entities}])
    return spacy_data

In [None]:
# build spaCy-format examples for both splits, then show the first test entry as a sanity check
train_data = get_spacy_format(train)
test_data = get_spacy_format(test)
print(test_data[:1])

In [None]:
# run this cell to see which text spans were labelled as organizations in the training data
for data in train_data:
    # each entry is [text, {'entities': [(start, end, label), ...]}]
    text = data[0]
    entities = data[1]['entities']
    
    print("Organization Name Indices:")
    for entity in entities:
        start_index, end_index, entity_type = entity
        # slice the labelled span back out of the text to verify the offsets
        org_name = text[start_index:end_index]
        print(f"Organization Name: {org_name}, Start Index: {start_index}, End Index: {end_index}")
    print("\n")

## Pretrained Model

In [None]:
def extract_organization_name(text):
    """Return the ORG entity text that the global ``nlp`` pipeline finds most often.

    Args:
        text: document text to run through the pipeline.

    Returns:
        The most frequent ORG entity string (the first one seen wins a tie),
        or None when the pipeline finds no ORG entity.
    """
    # tally every ORG mention produced by the pipeline
    org_counts = Counter(ent.text for ent in nlp(text).ents if ent.label_ == 'ORG')
    if not org_counts:
        return None

    # most_common(1) yields a single (name, count) pair
    (top_name, _count), = org_counts.most_common(1)
    return top_name

In [None]:
# Work on an independent copy: `test` comes out of train_test_split, so adding a
# column through an alias would mutate the split itself and can trigger pandas'
# SettingWithCopyWarning.
df_notfinetuned = test.copy()

# baseline prediction per document: most frequent ORG entity found by the pipeline
df_notfinetuned['Predicted_Organization'] = df_notfinetuned['Cleaned Text'].apply(extract_organization_name)

### Evaluation

In [None]:
def calculate_pretrained_accuracy(df_notfinetuned):
    """Score predictions by case-insensitive exact or substring match.

    A row counts as correct when the normalised prediction equals the true
    organization name, or when either string contains the other. Each matching
    pair is printed.

    Args:
        df_notfinetuned: DataFrame with 'Predicted_Organization' and
            'True Organization' columns.

    Returns:
        The accuracy over all rows, formatted as ``"Accuracy: xx.xx%"``
        (``"Accuracy: 0.00%"`` for an empty frame).
    """
    predicted_orgs = list(df_notfinetuned['Predicted_Organization'])
    true_orgs = list(df_notfinetuned['True Organization'])

    correct_predictions = 0
    for pred, truth in zip(predicted_orgs, true_orgs):
        # A missing prediction or label can never be a hit. Check before str()
        # so None/NaN are not turned into the words "none"/"nan" and then
        # substring-matched against real names.
        if pd.isnull(pred) or pd.isnull(truth):
            continue

        # normalise so the comparison ignores case and surrounding whitespace
        pred = str(pred).lower().strip()
        truth = str(truth).lower().strip()

        # "" is a substring of every string, so empty values must not count;
        # 'none' is the no-prediction sentinel used by the other metrics here
        if not pred or not truth or pred == 'none':
            continue

        # exact match is the special case where both containments hold
        if pred in truth or truth in pred:
            print(pred, truth)
            correct_predictions += 1

    # calculate accuracy, guarding against an empty frame
    total_predictions = len(predicted_orgs)
    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    return f"Accuracy: {accuracy * 100:.2f}%"

In [None]:
def calculate_pretrained_fuzzy_accuracy(df_notfinetuned):
    """Score predictions with fuzzy partial-ratio matching.

    A row counts as correct when ``fuzz.partial_ratio`` between the lower-cased,
    stripped prediction and true name is at least 80. Rows whose prediction
    normalises to 'none' are never scored.

    Args:
        df_notfinetuned: DataFrame with 'Predicted_Organization' and
            'True Organization' columns.

    Returns:
        The accuracy over all rows, formatted as ``"Accuracy: xx.xx%"``.
    """
    threshold = 80
    predictions = list(df_notfinetuned['Predicted_Organization'])
    labels = list(df_notfinetuned['True Organization'])

    hits = 0
    for raw_pred, raw_truth in zip(predictions, labels):
        guess = str(raw_pred).lower().strip()
        target = str(raw_truth).lower().strip()

        # a stringified None means the model produced nothing
        if guess == 'none':
            continue
        if fuzz.partial_ratio(guess, target) >= threshold:
            hits += 1

    # the denominator is every row, including those without a prediction
    row_count = len(predictions)
    if row_count > 0:
        accuracy = hits / row_count
    else:
        accuracy = 0
    return f"Accuracy: {accuracy * 100:.2f}%"

In [None]:
# a notebook cell only echoes its last expression, so print both scores explicitly
print("Substring match -", calculate_pretrained_accuracy(df_notfinetuned))
print("Fuzzy match -", calculate_pretrained_fuzzy_accuracy(df_notfinetuned))

### Prediction presence for precision and recall

In [None]:
def prediction_presence(df_notfinetuned):
    """Flag, per row, whether the model produced any organization at all.

    Args:
        df_notfinetuned: DataFrame with a 'Predicted_Organization' column.

    Returns:
        A list with one int per row: 1 when the prediction is a non-empty
        string other than 'none' (after lower-casing and stripping), else 0.
    """
    normalised = (
        str(raw).lower().strip()
        for raw in list(df_notfinetuned['Predicted_Organization'])
    )
    # an empty string and the stringified None both mean "no prediction"
    return [int(bool(value) and value != 'none') for value in normalised]

# 1/0 flag per row of df_notfinetuned: did the model return any organization?
spacy_prediction_presence = prediction_presence(df_notfinetuned)

In [None]:
# ground-truth flags, one per test document: 1 if the organization is present in the text, 0 otherwise
# (must be in the same row order as the predictions; see how `actuals` is calculated in the current_method notebook)
actuals = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

# alias for the pretrained model's presence flags, used in the precision/recall cell
spacy_pretr_preds = spacy_prediction_presence

In [None]:
def calculate_precision_recall(predictions, actuals):
    """Compute precision and recall for aligned binary (1/0) sequences.

    Args:
        predictions: iterable of predicted flags (1 = positive).
        actuals: iterable of ground-truth flags, aligned with ``predictions``.

    Returns:
        ``(precision, recall)``; each is 0 when its denominator is zero.

    Raises:
        ValueError: if the two inputs differ in length.
    """
    # materialise so generators are accepted and the lengths can be compared
    predictions = list(predictions)
    actuals = list(actuals)

    # zip() stops at the shorter input, so a mismatch would silently drop rows
    # and produce metrics for a different set of documents than intended
    if len(predictions) != len(actuals):
        raise ValueError(
            f"predictions and actuals must have the same length "
            f"(got {len(predictions)} and {len(actuals)})"
        )

    # single pass: count each (actual, predicted) combination
    outcomes = Counter(zip(actuals, predictions))
    TP = outcomes[(1, 1)]
    FP = outcomes[(0, 1)]
    FN = outcomes[(1, 0)]

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0

    return precision, recall

In [None]:
# precision/recall of "did the model predict anything" against the hard-coded presence labels
precision_s, recall_s = calculate_precision_recall(spacy_pretr_preds, actuals)

print(f"Precision SpaCy pretr: {precision_s:.2f}")
print(f"Recall SpaCy pretr: {recall_s:.2f}")

In [None]:
# run to see example of spacy labeling entities

# doc = nlp(df_notfinetuned['Cleaned Text'][0])

# spacy.displacy.render(doc, style='ent')

## Finetuned Model

### Prepare for training

In [None]:
import random
from spacy.training import Example
from spacy.util import minibatch

In [None]:
# use best hyperparameters from hyperparameter tuning
n_iter = 17      # number of passes over the training examples
batch_size = 16  # examples per nlp.update() call
# NOTE(review): the tuned learning rate is commented out and not applied anywhere visible
# learning_rate = 0.0001693938290758659
# reload the pipeline and rebind the global `nlp`; every later cell that reads `nlp`
# (including extract_organization_name) uses this object from here on
nlp = spacy.load("nl_core_news_lg")
ner_categories = ['ORG']

In [None]:
# make sure the pipeline has an NER component to fine-tune
if "ner" not in nlp.pipe_names:
    # spaCy v3 (which this notebook targets via spacy.training.Example):
    # add_pipe takes the registered factory name and returns the new component.
    # The v2 pattern of create_pipe() followed by add_pipe(component) raises.
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

In [None]:
# create an optimizer for the loaded pipeline; kept in `optimizer` for the training cell
optimizer = nlp.create_optimizer()

### Train model

In [None]:
# convert training data into Example objects (unannotated doc + reference annotations)
examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in tqdm(train_data)
]

# train the NER model
losses = {}  # stays defined (and empty) even if n_iter is 0
for itn in tqdm(range(n_iter)):
    # Start each epoch with a fresh dict: nlp.update() adds to whatever is
    # already in `losses`, so reusing one dict reports a running total over
    # all epochs rather than a per-epoch loss.
    losses = {}
    random.shuffle(examples)
    for batch in minibatch(examples, size=batch_size):
        # pass the optimizer created above explicitly so it is actually the one used
        nlp.update(batch, sgd=optimizer, losses=losses)
    # tqdm.write prints without breaking the progress bar
    tqdm.write(f"epoch {itn + 1}/{n_iter}: {losses}")

# losses of the final epoch
print(losses)

100%|██████████| 627/627 [00:05<00:00, 105.42it/s]
100%|██████████| 17/17 [1:01:28<00:00, 216.97s/it]

{'tok2vec': 0.0, 'morphologizer': 0.0, 'tagger': 0.0, 'parser': 0.0, 'lemmatizer': 0.0, 'ner': 30600.62902485548}





## Evaluation of Results

### Run model on test data

In [None]:
def calculate_accuracy(test_data, nlp):
    """Return the share of examples whose top predicted ORG matches the gold span.

    For each example the most frequent predicted ORG entity is compared,
    case-insensitively, with the first annotated entity span. It counts as
    correct when the two are equal or one contains the other. Examples without
    annotated entities are never correct but still count in the denominator.

    Args:
        test_data: sequence of ``(text, {'entities': [(start, end, label), ...]})``.
        nlp: callable that maps a text to an object whose ``ents`` expose
            ``text`` and ``label_`` (e.g. a spaCy pipeline).

    Returns:
        Accuracy as a fraction between 0 and 1; 0 when ``test_data`` is empty.
    """
    total_examples = len(test_data)
    if total_examples == 0:
        # avoid ZeroDivisionError, matching calculate_fuzzy_accuracy
        return 0

    correct_predictions = 0
    for text, annotations in test_data:
        # unlabelled examples can never be correct, so skip them before
        # paying for the model call
        if 'entities' not in annotations or not annotations['entities']:
            continue

        # only the first annotated span is used as the gold organization
        start, end, _label = annotations['entities'][0]
        true_org = text[start:end].lower()

        predicted_orgs = [ent.text for ent in nlp(text).ents if ent.label_ == 'ORG']
        if not predicted_orgs:
            continue

        most_common_org = Counter(predicted_orgs).most_common(1)[0][0].lower()
        # equality is the special case where both containments hold
        if most_common_org and (most_common_org in true_org or true_org in most_common_org):
            correct_predictions += 1

    return correct_predictions / total_examples

# substring-match accuracy of the current `nlp` pipeline on the test examples
accuracy = calculate_accuracy(test_data, nlp)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 44.98%


In [None]:
def calculate_fuzzy_accuracy(test_data, nlp, threshold=80):
    """Return the share of examples whose top predicted ORG fuzzily matches the gold span.

    The most frequent predicted ORG entity is compared with the first annotated
    entity span using ``fuzz.partial_ratio`` on lower-cased, stripped strings.

    Args:
        test_data: sequence of ``(text, {'entities': [(start, end, label), ...]})``.
        nlp: callable that maps a text to an object whose ``ents`` expose
            ``text`` and ``label_``.
        threshold: minimum partial-ratio score that counts as a match.

    Returns:
        Accuracy as a fraction of all examples; 0 when ``test_data`` is empty.
    """
    example_count = len(test_data)
    hits = 0

    for text, annotations in test_data:
        # the pipeline is run on every text, labelled or not
        org_counts = Counter(ent.text for ent in nlp(text).ents if ent.label_ == 'ORG')

        # examples without a gold span cannot score
        if 'entities' not in annotations or not annotations['entities']:
            continue

        # only the first annotated span is treated as the gold organization
        span_start, span_end, _label = annotations['entities'][0]
        gold = text[span_start:span_end]

        top_org = org_counts.most_common(1)[0][0] if org_counts else None
        if not top_org:
            continue

        # fuzzy matching score on normalised strings
        score = fuzz.partial_ratio(top_org.lower().strip(), gold.lower().strip())
        if score >= threshold:
            hits += 1

    if example_count > 0:
        return hits / example_count
    return 0

# fuzzy-match accuracy of the current `nlp` pipeline on the test examples (overwrites `accuracy`)
accuracy = calculate_fuzzy_accuracy(test_data, nlp, threshold=80)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 46.10%


### Precision and recall

### Get list of document level accuracy
Per-document 0/1 correctness scores, used to compare models with paired t-tests. See the current_method notebook for usage.

In [None]:
# Build one 0/1 correctness flag per test document, using the same substring
# criterion as calculate_accuracy.
total_examples = len(test_data)
doc_acc = []

print(total_examples)

for text, annotations in test_data:
    # run the pipeline on every document, labelled or not
    doc = nlp(text)
    predicted_orgs = [ent.text for ent in doc.ents if ent.label_ == 'ORG']

    # default to "wrong"; flipped only when a match is found
    hit = 0
    if 'entities' in annotations and annotations['entities']:
        # the first annotated span is taken as the single true organization
        start, end, label = annotations['entities'][0]
        true_org = text[start:end]

        if predicted_orgs:
            most_common_org = Counter(predicted_orgs).most_common(1)[0][0]
            guess = most_common_org.lower()
            gold = true_org.lower()
            # equal, or one string contained in the other
            if guess == gold or guess in gold or gold in guess:
                hit = 1
    doc_acc.append(hit)


print(doc_acc)


## Second Dataset

In [None]:
# second dataset; it is passed to get_spacy_format below, which reads the
# 'Cleaned Text' and 'True Organization' columns
seconddata = pd.read_csv('final_seconddata.csv')

#### Get labeled format

In [None]:
# convert the second dataset into [text, {'entities': [...]}] entries
second_labeled = get_spacy_format(seconddata)

Creating formatted data: 100%|██████████| 458/458 [00:00<00:00, 4087.22it/s]


In [None]:
# keep only the documents in which the organization name was actually located
second_labeled = [entry for entry in second_labeled if entry[1]['entities']]

print(len(second_labeled))

222


### Try base model

In [None]:
df2_notfinetuned = seconddata

# By this point the global `nlp` has been fine-tuned in place, and
# extract_organization_name reads that global. Swap in a freshly loaded
# pretrained pipeline so this really is the base model, then restore the
# fine-tuned pipeline for the cells that follow.
finetuned_nlp = nlp
nlp = spacy.load("nl_core_news_lg")
try:
    df2_notfinetuned['Predicted_Organization'] = df2_notfinetuned['Cleaned Text'].apply(extract_organization_name)
finally:
    nlp = finetuned_nlp

#### Evaluation

In [None]:
# Evaluate the predictions made on the second dataset (df2_notfinetuned), not
# the first test split. Print both scores, since a notebook cell only echoes
# its last expression.
print("Substring match -", calculate_pretrained_accuracy(df2_notfinetuned))
print("Fuzzy match -", calculate_pretrained_fuzzy_accuracy(df2_notfinetuned))

### Finetuned Model

In [None]:
# keep and print both scores; a bare expression is only echoed when it is the
# last line of the cell, so the first result would otherwise be lost
second_accuracy = calculate_accuracy(second_labeled, nlp)
second_fuzzy_accuracy = calculate_fuzzy_accuracy(second_labeled, nlp, threshold=80)
print(f"Accuracy: {second_accuracy * 100:.2f}%")
print(f"Fuzzy accuracy: {second_fuzzy_accuracy * 100:.2f}%")