In [1]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [2]:
with open('input_data.txt','r', encoding='utf-8', errors='ignore') as f:
    text = f.read()

In [3]:
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, "|", ent.label_, "|", spacy.explain(ent.label_))

Gran | PERSON | People, including fictional
Russia | GPE | Countries, cities, states
Technopolis | ORG | Companies, agencies, institutions, etc.
Elcoteq | ORG | Companies, agencies, institutions, etc.
tens | CARDINAL | Numerals that do not fall under another type
Tallinn | ORG | Companies, agencies, institutions, etc.
Postimees | ORG | Companies, agencies, institutions, etc.
the years 2009-2012 | DATE | Absolute or relative dates or periods
Basware | NORP | Nationalities or religious or political groups
20 % -40 % | PERCENT | Percentage, including "%"
10 % | PERCENT | Percentage, including "%"
HDI | ORG | Companies, agencies, institutions, etc.
the last quarter of 2010 | DATE | Absolute or relative dates or periods
Componenta | ORG | Companies, agencies, institutions, etc.
EUR131 | PERSON | People, including fictional
EUR76 | ORG | Companies, agencies, institutions, etc.
the same period a year earlier | DATE | Absolute or relative dates or periods
zero | CARDINAL | Numerals that do not

In [4]:
from spacy import displacy
displacy.render(doc, style="ent", jupyter=True)

In [5]:
# Set to store unique entity types
unique_entity_types = set()

# Collect unique entity types
for ent in doc.ents:
    unique_entity_types.add(ent.label_)

# Print unique entity types with their expansions
print("Unique Entity Types and Expansions:\n")
for entity_type in unique_entity_types:
    print(f"{entity_type}")

Unique Entity Types and Expansions:

GPE
FAC
PERCENT
CARDINAL
ORG
PRODUCT
MONEY
DATE
PERSON
QUANTITY
NORP


Using a website "https://tecoholic.github.io/ner-annotator/" to annotate custom entities.

In [6]:
import json

# Load the JSON data from file
file_path = 'annotations.json'  # Replace with your file path
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Initialize list to store TRAIN_DATA
TRAIN_DATA = []

# Iterate through each item in the JSON data
for item in data['annotations']:
    text = item[0].strip()  # Get the text and remove any trailing spaces
    entities = item[1]['entities']  # Get the list of annotations for the text

    # Convert entities to the required format
    formatted_entities = []
    for start, end, label in entities:
        formatted_entities.append((start, end, label))

    # Append to TRAIN_DATA
    TRAIN_DATA.append((text, {"entities": formatted_entities}))
    
# Now TRAIN_DATA is in the required format
for example in TRAIN_DATA:
    print(example)


('According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .', {'entities': [(13, 17, 'PERSON'), (71, 77, 'GPE')]})
('Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .', {'entities': [(0, 11, 'ORG'), (12, 38, 'STRATEGY'), (39, 84, 'FACILITY'), (123, 167, 'SECTOR'), (170, 188, 'STATEMENT')]})
('The international electronic industry company Elcoteq has laid off tens of employees from its Tallinn facility ; contrary to earlier layoffs the company contracted the ranks of its office workers , the daily Postimees reported .', {'entities': [(46, 53, 'ORG'), (67, 84, 'QUANTITY'), (94, 110, 'FACILITY'), (133, 140, 'EVENT'), (202, 217, 'PERSON')]})
('With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of ra

In [7]:
import random
from spacy.training import Example

# Custom labels based on spaCy's existing entity types
labels = ["DATE","PERSON","GPE", "PERCENT","CARDINAL","PRODUCT","QUANTITY","ORG","MONEY","STRATEGY",
          "FACILITY","SECTOR","STATEMENT","EVENT","OFFER"]

# Initialize a blank English model
nlp = spacy.blank('en')

# Add NER component to the pipeline
ner = nlp.add_pipe('ner')

# Add custom labels to the NER component
for label in labels:
    ner.add_label(label)


# Disable other pipeline components to only train NER
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    for itn in range(20):  # Try Adjusting the number of iterations to 20
        # Shuffle training data for each iteration
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], losses=losses, sgd=optimizer)

In [8]:
# Save the trained model to disk
output_dir = 'spacy_custom_model_fin'
nlp.to_disk(output_dir)

print(f"Saved model to {output_dir}")

Saved model to spacy_custom_model_fin


In [9]:
model_path = 'spacy_custom_model_fin'

# Load the custom-trained model
nlp = spacy.load(model_path)

# Example text for prediction
# text = "Foundries division reports its sales increased by 9.7 % to EUR 63.1 mn from EUR 57.5 mn in the corresponding period in 2006 , and sales of the Machine Shop division increased by 16.4 % to EUR 41.2 mn from EUR 35.4 mn in the corresponding period in 2006 ."
# text = "For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m ."
text = "MegaFon 's subscriber base increased 16.1 % in 2009 to 50.5 million users in 31 December, while its market share by the number of customers amounted to 24 % as of 2009 , up from 23 % as of 2008 , according to TeliaSonera estimates ."

# Process the text with the loaded model
doc = nlp(text)

# Display the entities predicted by the model
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}")


Entity: 16.1 %, Label: PERCENT
Entity: in 2009, Label: DATE
Entity: 31 December, while, Label: PERCENT
Entity: customers, Label: PERSON
Entity: 24 %, Label: PERCENT
Entity: 23 %, Label: PERCENT
Entity: TeliaSonera estimates, Label: ORG


In [10]:
# Load the custom-trained model
model_path = 'spacy_custom_model_fin'  # Replace with the actual path to your trained model
nlp = spacy.load(model_path)

# Example evaluation data (replace with your own evaluation data)
eval_data = TRAIN_DATA

# Function to calculate evaluation metrics
def calculate_metrics(predicted_entities, true_entities):
    # Implement your logic here to calculate precision, recall, F1-score
    # Example calculation:
    correct = len(predicted_entities.intersection(true_entities))
    precision = correct / len(predicted_entities) if len(predicted_entities) > 0 else 0
    recall = correct / len(true_entities) if len(true_entities) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1_score

# Evaluate the model
total_precision, total_recall, total_f1_score = 0, 0, 0
total_documents = len(eval_data)

for text, annotations in eval_data:
    # Process the text with the loaded model
    doc = nlp(text)
    
    # Extract predicted entities
    predicted_entities = set((ent.start_char, ent.end_char, ent.label_) for ent in doc.ents)
    
    # Extract true entities from ground truth annotations
    true_entities = set((start, end, label) for start, end, label in annotations['entities'])
    
    # Calculate metrics for this document
    precision, recall, f1_score = calculate_metrics(predicted_entities, true_entities)
    
    # Accumulate metrics for total evaluation
    total_precision += precision
    total_recall += recall
    total_f1_score += f1_score
    
    # Optionally, print or log metrics for each document
    print(f"Text: {text}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1_score:.4f}")
    print()

# Calculate average metrics across all documents
avg_precision = total_precision / total_documents
avg_recall = total_recall / total_documents
avg_f1_score = total_f1_score / total_documents

# Print or log average metrics
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1-score:", avg_f1_score)


Text: A purchase agreement for 7,200 tons of gasoline with delivery at the Hamina terminal , Finland , was signed with Neste Oil OYj at the average Platts index for this September plus eight US dollars per month .
Precision: 1.0000, Recall: 1.0000, F1-score: 1.0000

Text: Lifetree was founded in 2000 , and its revenues have risen on an average by 40 % with margins in late 30s .
Precision: 1.0000, Recall: 1.0000, F1-score: 1.0000

Text: Foundries division reports its sales increased by 9.7 % to EUR 63.1 mn from EUR 57.5 mn in the corresponding period in 2006 , and sales of the Machine Shop division increased by 16.4 % to EUR 41.2 mn from EUR 35.4 mn in the corresponding period in 2006 .
Precision: 0.9000, Recall: 1.0000, F1-score: 0.9474

Text: Operating profit totalled EUR 21.1 mn , up from EUR 18.6 mn in 2007 , representing 9.7 % of net sales .
Precision: 1.0000, Recall: 1.0000, F1-score: 1.0000

Text: Consolidated net sales increased 16 % to reach EUR74 .8 m , while operating profit 