In [7]:
import spacy
import pandas as pd
import json
import re
from spacy import displacy


In [8]:
# Load the pre-trained NER model
# "en_core_web_sm" is loaded once here and reused by every later cell that
# parses text (entity extraction and both displaCy renders).
nlp = spacy.load("en_core_web_sm")




In [9]:
# Sample text data
# Four short passages used as NER input. Each is a triple-quoted string, so the
# leading newline and the four-space indentation are part of the text itself.
texts = [
    # 1. Technology product announcement
    """
    TechCorp, a leading tech company based in New York, announced on Monday that CEO Jane Doe plans to introduce a new AI-powered smartphone.
    The device, named 'Nexus X,' will debut in Paris next month. Meanwhile, critics like Dr. Mark Smith argue that the launch could impact environmental policies.
    The company also partnered with HealthOrg, a nonprofit, to promote digital wellness. Apple declined to comment on the news.
    The event will coincide with the Global Tech Summit 2024.
    """,

    # 2. Sports contract news
    """
    Lionel Messi, the Argentine footballer, signed a $20 million contract with Miami FC on July 15, 2023.
    The deal was announced during a press conference at Hard Rock Stadium in Florida.
    Fans from across South America flooded social media to celebrate the move.
    """,

    # 3. Politics and international organisations
    """
    President John Harper met with German Chancellor Angela Weber in Berlin last Friday to discuss NATO policies.
    The United Nations will host a climate summit in Geneva, Switzerland, in December 2025.
    Critics warn that the new tax law (HB 1420) might face delays in Congress.
    """,

    # 4. History, literature and science
    """
    Marie Curie, born in Warsaw in 1867, discovered radium and won the Nobel Prize in Chemistry in 1911.
    In "The Great Gatsby," Jay Gatsby hosts lavish parties in West Egg, New York, reflecting the excesses of the Jazz Age.
    A recent study in Nature Journal links sleep deprivation to decreased cognitive performance.
    """
]

# Data Preparation

In [10]:
# Preprocessing function
def preprocess_text(text, lowercase=False, keep_chars=".,$%'-"):
    """Remove noisy symbols from ``text`` while keeping the cues NER depends on.

    The previous version lowercased everything and deleted every character
    outside ``[a-zA-Z0-9.,\\s]``. For entity recognition that is harmful:
    capitalization is what distinguishes "Apple" from "apple" for a cased
    pipeline, deleting "-" fuses "AI-powered" into "AIpowered", and deleting
    "$" drops the currency marker from "$20 million". Case is therefore
    preserved by default and a few meaningful symbols are kept.

    Args:
        text: The raw input string.
        lowercase: If True, lowercase the text before filtering. Defaults to
            False so proper nouns keep their capitalization.
        keep_chars: Non-alphanumeric, non-whitespace characters to preserve.
            Pass ``".,"`` together with ``lowercase=True`` to reproduce the
            original behaviour exactly.

    Returns:
        The filtered string. ASCII letters, digits, whitespace and the
        characters in ``keep_chars`` are retained; everything else is removed.
    """
    if lowercase:
        text = text.lower()
    # re.escape keeps characters such as "-" or "$" literal inside the class,
    # so callers can pass any punctuation without building an accidental range.
    disallowed = re.compile(r"[^a-zA-Z0-9\s" + re.escape(keep_chars) + r"]")
    return disallowed.sub("", text)

In [11]:
# Apply preprocessing
# Rebind `texts` to the cleaned version of every sample, in the same order.
texts = list(map(preprocess_text, texts))


# NER Implementation

In [12]:
# Function to extract named entities
def extract_entities(texts):
    """Parse each text with the loaded pipeline and gather its entities.

    Args:
        texts: Iterable of strings to analyse.

    Returns:
        A flat list of ``{"Entity": <entity text>, "Label": <entity label>}``
        dicts, one per entity, ordered by input text and then by position in
        that text. Empty when ``texts`` is empty.
    """
    return [
        {"Entity": span.text, "Label": span.label_}
        for passage in texts
        for span in nlp(passage).ents
    ]

In [13]:
# Extract entities
# Run entity extraction over the preprocessed samples. The result is a flat
# list of {"Entity", "Label"} dicts that feeds both the DataFrame and the
# JSON export further down.
extracted_entities = extract_entities(texts)


# Visualization & Evaluation

In [14]:
# Convert to DataFrame
# Name the columns explicitly: with an empty entity list pd.DataFrame([]) has
# no columns at all, so the CSV written from it would lack its header row.
# For non-empty input this selects the same two keys in the same order.
entities_df = pd.DataFrame(extracted_entities, columns=["Entity", "Label"])


In [15]:
# Display entities visually for the first sample text
# style="ent" selects the entity highlighter; jupyter=True requests inline
# rendering in the notebook output. Note this parses texts[0] again.
displacy.render(nlp(texts[0]), style="ent", jupyter=True)

# NER Output

In [16]:
# Save to CSV (no index column, just the entity rows)
entities_df.to_csv("ner_output.csv", index=False)

# Save to JSON: serialise the raw list of entity dicts with 4-space indentation
with open("ner_output.json", "w") as json_file:
    json_file.write(json.dumps(extracted_entities, indent=4))

In [19]:
# Save annotated text as an HTML file
html_file = "annotated_sample_text.html"

# Render the first sample as a complete HTML page (page=True). With
# jupyter=False the markup is returned as a string instead of being displayed.
html_output = displacy.render(nlp(texts[0]), style="ent", page=True, jupyter=False)

# Write the HTML output to a file. The encoding is set explicitly because
# open() otherwise falls back to the platform's locale encoding, which can
# raise UnicodeEncodeError or produce mojibake for non-ASCII text.
with open(html_file, "w", encoding="utf-8") as file:
    file.write(html_output)

print(f"Annotated sample text saved as {html_file}")


Annotated sample text saved as annotated_sample_text.html
