In [2]:
#NLTK
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag, ne_chunk
from nltk.tokenize import PunktSentenceTokenizer

In [3]:
# Download the NLTK data packages this notebook relies on. (The NLTK library
# itself is imported in the cell above; this cell only fetches data.)
# Packages that are already present are reported as up-to-date.
NLTK_RESOURCES = (
    "punkt",                           # sentence / word tokenizer models
    "punkt_tab",                       # tokenizer tables looked up by newer NLTK releases
    "averaged_perceptron_tagger",      # POS tagger model
    "averaged_perceptron_tagger_eng",  # POS tagger model name looked up by newer NLTK releases
    "maxent_ne_chunker",               # named-entity chunker model
    "maxent_ne_chunker_tab",           # chunker tables looked up by newer NLTK releases
    "words",                           # word list used by the named-entity chunker
)

# One pass over the full list so a fresh kernel has everything the later
# tokenizing, tagging and chunking cells need before they run.
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package punkt to /Users/anan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/anan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/anan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/anan/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/anan/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [4]:
# Read the whole news article into memory as one UTF-8 string.
article_path = "news article-1.txt"
with open(article_path, mode="r", encoding="utf-8") as article_file:
    text = article_file.read()

## 1. Detect sentences in the given news article

In [6]:
# Build a Punkt tokenizer using the article itself as its training text,
# then split that same article into sentences.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text=text)
sentences = custom_sent_tokenizer.tokenize(text)

# Numbered listing of the detected sentences, starting at 1.
print("Sentences Detected")
for number, sentence in enumerate(sentences, start=1):
    print(f"{number}. {sentence}")

Sentences Detected
1. Anglo-French Channel Tunnel operator Eurotunnel Monday announced a deal giving its creditor banks 45.5 percent of the company in return for wiping out one billion pounds ($1.56 billion) of its debt.
2. The long-awaited restructuring brings to an end months of wrangling between Eurotunnel and the 225 banks to which it owes nearly nine billion pounds ($14.1 billion).
3. The deal, announced simultaneously in Paris and London, brings the company back from the brink of insolvency but leaves shareholders owning only 54.5 percent of the company.
4. "The restructuring plan provides Eurotunnel with the medium-term financial stability to allow it to consolidate its substantial commercial achievements to date and to develop its operations," Eurotunnel co-chairman Alastair Morton said.
5. The firm was now making a profit before interest, he added.
6. Although shareholders will see their interests diluted, they were offered the prospect of a brighter future after months of unc

### 2. Perform Part-of-Speech (POS) tagging on each sentence

In [8]:
# Sentence-split the article for the POS-tagging step. The tokenizer is built
# from the article text itself and the result is stored in `tokenized`.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text=text)
tokenized = custom_sent_tokenizer.tokenize(text)

def process_content(sentences=None):
    """Print the part-of-speech tags of every sentence, one list per sentence.

    Each sentence is split into word tokens and tagged with ``nltk.pos_tag``;
    the resulting list of ``(token, tag)`` pairs is printed on its own line.

    Args:
        sentences: Iterable of sentence strings to tag. When omitted, the
            notebook-level ``tokenized`` list is used, so a bare
            ``process_content()`` call works as before.

    Returns:
        None. The tagged sentences are written to standard output.

    Raises:
        Any exception raised by NLTK (for example a ``LookupError`` when a
        required data package is missing) is propagated, so a failed run is
        visible as a failed cell rather than a one-line message.
    """
    if sentences is None:
        sentences = tokenized
    for sentence in sentences:
        words = nltk.word_tokenize(sentence)
        print(nltk.pos_tag(words))

process_content()

[('Anglo-French', 'JJ'), ('Channel', 'NNP'), ('Tunnel', 'NNP'), ('operator', 'NN'), ('Eurotunnel', 'NNP'), ('Monday', 'NNP'), ('announced', 'VBD'), ('a', 'DT'), ('deal', 'NN'), ('giving', 'VBG'), ('its', 'PRP$'), ('creditor', 'NN'), ('banks', 'NNS'), ('45.5', 'CD'), ('percent', 'NN'), ('of', 'IN'), ('the', 'DT'), ('company', 'NN'), ('in', 'IN'), ('return', 'NN'), ('for', 'IN'), ('wiping', 'VBG'), ('out', 'RP'), ('one', 'CD'), ('billion', 'CD'), ('pounds', 'NNS'), ('(', '('), ('$', '$'), ('1.56', 'CD'), ('billion', 'CD'), (')', ')'), ('of', 'IN'), ('its', 'PRP$'), ('debt', 'NN'), ('.', '.')]
[('The', 'DT'), ('long-awaited', 'JJ'), ('restructuring', 'NN'), ('brings', 'NNS'), ('to', 'TO'), ('an', 'DT'), ('end', 'JJ'), ('months', 'NNS'), ('of', 'IN'), ('wrangling', 'VBG'), ('between', 'IN'), ('Eurotunnel', 'NNP'), ('and', 'CC'), ('the', 'DT'), ('225', 'CD'), ('banks', 'NNS'), ('to', 'TO'), ('which', 'WDT'), ('it', 'PRP'), ('owes', 'VBZ'), ('nearly', 'RB'), ('nine', 'CD'), ('billion', 'CD')

### 3. Find named entities, including person names and locations

In [10]:
import nltk

# Fetch the data packages used by the named-entity step, in a fixed order.
# The last download stays a bare expression so the cell still displays the
# boolean status it returns.
for package in ("maxent_ne_chunker", "maxent_ne_chunker_tab", "words",
                "averaged_perceptron_tagger"):
    nltk.download(package)
nltk.download("punkt")

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/anan/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /Users/anan/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to /Users/anan/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/anan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /Users/anan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
# Sentence-split the article for the named-entity step. The tokenizer is built
# from the article text itself and the result is stored in `tokenized`.
custom_sent_tokenizer = PunktSentenceTokenizer(train_text=text)
tokenized = custom_sent_tokenizer.tokenize(text)

def extract_person_and_locations(sentences=None, labels=("PERSON", "GPE", "LOCATION")):
    """Collect person and place named entities from a list of sentences.

    Each sentence is word-tokenized, POS-tagged and passed to
    ``nltk.ne_chunk`` (multi-class mode, ``binary=False``). Every chunk whose
    label is in ``labels`` is recorded together with its text.

    Args:
        sentences: Iterable of sentence strings. When omitted, the
            notebook-level ``tokenized`` list is used, so a bare call works
            as before.
        labels: Iterable of chunk labels to keep. The default covers people
            ("PERSON") and both place labels the chunker can emit:
            geo-political entities ("GPE") and other locations ("LOCATION").

    Returns:
        A list of ``(label, entity_text)`` tuples in document order, where
        ``entity_text`` is the chunk's tokens joined by single spaces.
        Repeated mentions are kept; no de-duplication is performed.
    """
    if sentences is None:
        sentences = tokenized
    wanted_labels = frozenset(labels)

    entities = []
    for sentence in sentences:
        tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
        tree = nltk.ne_chunk(tagged, binary=False)
        # subtrees() also yields the sentence root; its label is not one of
        # the wanted labels, so only named-entity chunks pass the filter.
        for subtree in tree.subtrees():
            if subtree.label() in wanted_labels:
                entity = " ".join(token for token, _tag in subtree.leaves())
                entities.append((subtree.label(), entity))
    return entities

# Run the extraction over the tokenized article and list every hit as
# "<LABEL>: <entity text>", in the order the entities were found.
results = extract_person_and_locations()

print("Extracted Named Entities:")
for entity_label, entity_text in results:
    print(f"{entity_label}: {entity_text}")

Extracted Named Entities:
GPE: Paris
GPE: London
PERSON: Alastair Morton
PERSON: Eurotunnel
GPE: Eurotunnel
GPE: European
GPE: French
PERSON: Patrick Ponsolle
GPE: Eurotunnel
GPE: London
PERSON: Eurotunnel
PERSON: Eurotunnel
GPE: Eurotunnel
