In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [6]:
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [7]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

In [8]:
sent = preprocess(ex)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [9]:
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [10]:
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [11]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In [12]:
ne_tree = ne_chunk(pos_tag(word_tokenize(ex)))
print(ne_tree)

NameError: name 'ne_chunk' is not defined

In [13]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [14]:
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')
pprint([(X.text, X.label_) for X in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [15]:
pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


In [16]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
ny_bb = url_to_string('https://en.wikipedia.org/wiki/Muhammad_Ayub_Khan')
article = nlp(ny_bb)
len(article.ents)

2260

In [17]:
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'PERSON': 620,
         'PRODUCT': 14,
         'ORG': 401,
         'LOC': 33,
         'CARDINAL': 189,
         'DATE': 456,
         'GPE': 366,
         'MONEY': 5,
         'LANGUAGE': 3,
         'FAC': 10,
         'NORP': 73,
         'WORK_OF_ART': 22,
         'EVENT': 12,
         'ORDINAL': 31,
         'LAW': 14,
         'QUANTITY': 4,
         'PERCENT': 6,
         'TIME': 1})

In [18]:
items = [x.text for x in article.ents]
Counter(items).most_common(3)

[('Pakistan', 129), ('Khan', 48), ('Ayub', 47)]

In [19]:
sentences = [x for x in article.sents]
print(sentences[20])

As president, Khan appointed Muhammad Musa to replace him as commander-in-chief.


In [20]:
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')

In [21]:
displacy.render(nlp(str(sentences[20])), style='dep', jupyter = True, options = {'distance': 120})

In [22]:
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences[20])) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[('president', 'NOUN', 'president'),
 ('Khan', 'PROPN', 'Khan'),
 ('appointed', 'VERB', 'appoint'),
 ('Muhammad', 'PROPN', 'Muhammad'),
 ('Musa', 'PROPN', 'Musa'),
 ('replace', 'VERB', 'replace'),
 ('commander', 'NOUN', 'commander'),
 ('chief', 'NOUN', 'chief')]

In [23]:
dict([(str(x), x.label_) for x in nlp(str(sentences[20])).ents])

{'Khan': 'PERSON', 'Muhammad Musa': 'PERSON'}

In [24]:
print([(x, x.ent_iob_, x.ent_type_) for x in sentences[20]])

[(As, 'O', ''), (president, 'O', ''), (,, 'O', ''), (Khan, 'B', 'PERSON'), (appointed, 'O', ''), (Muhammad, 'B', 'PERSON'), (Musa, 'I', 'PERSON'), (to, 'O', ''), (replace, 'O', ''), (him, 'O', ''), (as, 'O', ''), (commander, 'O', ''), (-, 'O', ''), (in, 'O', ''), (-, 'O', ''), (chief, 'O', ''), (., 'O', '')]


In [25]:
from datasets import load_dataset 
from transformers import pipeline

In [26]:
summarizer = pipeline(
    task="summarization",
    model="t5-small",
    min_length=20,
    max_length=40,
    truncation=True,
    model_kwargs={"cache_dir": '/Documents/Huggin_Face/'},
)

In [40]:
# Generate the summary
summary = summarizer(str(sentences[18:25]), max_length=150, min_length=30, do_sample=False)[0]['summary_text']

bullet_points = summary.split(". ")

print("Actual Text: " + str(sentences[18:25]))

# Print the generated summary
print("Summary: " + summary)

Actual Text: [From 1953 to 1958, he served in the civilian government as Defence and Home Minister and supported president Iskandar Ali Mirza's decision to impose martial law against prime minister Feroze Khan's administration on 7 October 1958., Two weeks later, after a breakdown in civil–military relations, Khan seized presidency in a military coup, the first in the country's history., As president, Khan appointed Muhammad Musa to replace him as commander-in-chief., He aligned Pakistan with the United States, and allowed American access to air bases inside Pakistan, most notably the airbase outside of Peshawar, from which spy missions over the Soviet Union were launched., Relations with neighboring China were strengthened but his alignment with the US worsened relations with the Soviet Union in 1962., He launched Operation Gibraltar against India in 1965, leading to an all-out war., It resulted in a stalemate and peace was restored via the Tashkent Declaration.]
Summary: former prime

In [32]:
print(sentences[18:20])

[From 1953 to 1958, he served in the civilian government as Defence and Home Minister and supported president Iskandar Ali Mirza's decision to impose martial law against prime minister Feroze Khan's administration on 7 October 1958., Two weeks later, after a breakdown in civil–military relations, Khan seized presidency in a military coup, the first in the country's history.]
