In [9]:
import requests
import spacy
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import tree2conlltags

In [10]:
# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [11]:
API_KEY = '706c3d2594104d618b00448e2ebec994'
url = f'https://newsapi.org/v2/top-headlines?country=us&apiKey={API_KEY}'

response = requests.get(url)
data = response.json()

if 'articles' not in data or len(data['articles']) == 0:
    raise ValueError("No articles found in the response.")

article = data['articles'][0]
article_content = article.get('content') or article.get('description') or article.get('title')

if not article_content:
    raise ValueError("No article content found or the content is empty. Article data: " + str(article))

print("News Article Content:")
print(article_content)

News Article Content:
Iran Arrests Dozens in Search for Suspects in Killing of Hamas Leader - The New York Times


In [12]:
def nltk_ner(text):
    words = word_tokenize(text)
    pos_tags = pos_tag(words)
    tree = ne_chunk(pos_tags)
    iob_tagged = tree2conlltags(tree)
    entities = set()
    for word, pos, ner in iob_tagged:
        if ner != 'O':
            entities.add((word, ner))
    return entities

nltk_entities = nltk_ner(article_content)
print("\nEntities extracted using NLTK (Rule-based):")
print(nltk_entities)


Entities extracted using NLTK (Rule-based):
{('Suspects', 'B-ORGANIZATION'), ('Times', 'I-GPE'), ('York', 'I-GPE'), ('Iran', 'B-GPE'), ('New', 'B-GPE'), ('Dozens', 'B-GPE'), ('Leader', 'I-ORGANIZATION'), ('Search', 'B-GPE'), ('Hamas', 'B-ORGANIZATION')}


In [13]:
# ML-based NER using SpaCy
nlp = spacy.load("en_core_web_sm")
doc = nlp(article_content)

spacy_entities = [(ent.text, ent.label_) for ent in doc.ents]
print("\nEntities extracted using SpaCy (ML-based):")
print(spacy_entities)


Entities extracted using SpaCy (ML-based):
[('Iran', 'GPE'), ('Dozens', 'CARDINAL'), ('Search for Suspects', 'ORG'), ('Killing', 'GPE'), ('Hamas', 'ORG'), ('The New York Times', 'ORG')]


In [14]:
nltk_entity_set = set(nltk_entities)
spacy_entity_set = set(spacy_entities)

common_entities = nltk_entity_set.intersection(spacy_entity_set)
nltk_unique_entities = nltk_entity_set - spacy_entity_set
spacy_unique_entities = spacy_entity_set - nltk_entity_set

print("\nCommon Entities:")
print(common_entities)
print("\nUnique Entities to NLTK:")
print(nltk_unique_entities)
print("\nUnique Entities to SpaCy:")
print(spacy_unique_entities)


Common Entities:
set()

Unique Entities to NLTK:
{('Suspects', 'B-ORGANIZATION'), ('York', 'I-GPE'), ('Iran', 'B-GPE'), ('Hamas', 'B-ORGANIZATION'), ('New', 'B-GPE'), ('Dozens', 'B-GPE'), ('Leader', 'I-ORGANIZATION'), ('Search', 'B-GPE'), ('Times', 'I-GPE')}

Unique Entities to SpaCy:
{('Search for Suspects', 'ORG'), ('Iran', 'GPE'), ('Killing', 'GPE'), ('Dozens', 'CARDINAL'), ('The New York Times', 'ORG'), ('Hamas', 'ORG')}


In [15]:
print("\nDiscussion:")
print("The NLTK rule-based approach, using chunking and regular expressions, may not be as accurate as the SpaCy ML-based approach.")
print("SpaCy's model is trained on large datasets and can recognize a wider variety of entities and contexts.")
print("NLTK might miss some entities or incorrectly classify them, especially in more complex sentences.")
print("However, the rule-based approach can be useful in simpler contexts or where model-based approaches are not feasible.")


Discussion:
The NLTK rule-based approach, using chunking and regular expressions, may not be as accurate as the SpaCy ML-based approach.
SpaCy's model is trained on large datasets and can recognize a wider variety of entities and contexts.
NLTK might miss some entities or incorrectly classify them, especially in more complex sentences.
However, the rule-based approach can be useful in simpler contexts or where model-based approaches are not feasible.
