# Text Summarization of the Webpage


### TASK-01

In [1]:
import bs4 as bs
import urllib.request
import sys
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
#import spacy
import pandas as pd
import numpy as np
import string
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

In [2]:
def scrape_webpage(url):
  '''
  Function for scraping webpage
  '''
  # Open connection
  connection = urllib.request.urlopen(url)

  # Read data
  data = connection.read()

  # Parse xml
  parsed = bs.BeautifulSoup(data, 'lxml')

  # Get paragraphs
  paragraphs = parsed.find_all('p')

  # Get text from paragraphs
  text = [p.text for p in paragraphs]

  # Merge paragraph text into single string
  text = '\n'.join(text)
  return text

#### 1.1  Web Scraping 

In [3]:
url = 'https://www.nbcnews.com/news/world/hong-kong-finance-summit-covid-rcna55498'

text = scrape_webpage(url)

print(text[:5000], '...')


Profile
Sections
tv
Featured
More From NBC
Follow NBC News
HONG KONG — After mass unrest in 2019, a pandemic that left it isolated from the world and the imposition of a national security law that has crushed dissent, Hong Kong is ready to turn the page. 
“Social disturbance is clearly in the past,” the city’s leader, John Lee, said Wednesday at the Four Seasons Hotel, where about 200 finance industry executives from around the world were gathered for a summit. “It has given way to stability, to growing business and community confidence in Hong Kong’s future. Law and order has returned; the worst is behind us.”
But the summit — meant to signal that the Chinese territory long known as a regional financial powerhouse is again open for business — has been shadowed by some awkward realities. 
Hong Kong’s border with mainland China, its main economic driver, remains tightly closed because of pandemic restrictions. Participants in the summit, many of them American companies, have been criti

In [4]:

# Get sentences
sentences_original = sent_tokenize(text)
print('')
print('Sentence examples, original')
for i in range(0, 5):
  print(i, sentences_original[i])

print('')
print('Sentence count', len(sentences_original))


Sentence examples, original
0 
Profile
Sections
tv
Featured
More From NBC
Follow NBC News
HONG KONG — After mass unrest in 2019, a pandemic that left it isolated from the world and the imposition of a national security law that has crushed dissent, Hong Kong is ready to turn the page.
1 “Social disturbance is clearly in the past,” the city’s leader, John Lee, said Wednesday at the Four Seasons Hotel, where about 200 finance industry executives from around the world were gathered for a summit.
2 “It has given way to stability, to growing business and community confidence in Hong Kong’s future.
3 Law and order has returned; the worst is behind us.”
But the summit — meant to signal that the Chinese territory long known as a regional financial powerhouse is again open for business — has been shadowed by some awkward realities.
4 Hong Kong’s border with mainland China, its main economic driver, remains tightly closed because of pandemic restrictions.

Sentence count 46


#### 1.2 PreProcess The Data

In [5]:
sentences = [re.sub('\[\d+\]', ' ', sent) for sent in sentences_original]
print('')
print('Sentence examples, rm cite')
for i in range(0, 5):
  print(i, sentences[i])

print('')
print('Sentence count', len(sentences))


Sentence examples, rm cite
0 
Profile
Sections
tv
Featured
More From NBC
Follow NBC News
HONG KONG — After mass unrest in 2019, a pandemic that left it isolated from the world and the imposition of a national security law that has crushed dissent, Hong Kong is ready to turn the page.
1 “Social disturbance is clearly in the past,” the city’s leader, John Lee, said Wednesday at the Four Seasons Hotel, where about 200 finance industry executives from around the world were gathered for a summit.
2 “It has given way to stability, to growing business and community confidence in Hong Kong’s future.
3 Law and order has returned; the worst is behind us.”
But the summit — meant to signal that the Chinese territory long known as a regional financial powerhouse is again open for business — has been shadowed by some awkward realities.
4 Hong Kong’s border with mainland China, its main economic driver, remains tightly closed because of pandemic restrictions.

Sentence count 46


In [6]:
sentences = [sent.lower() for sent in sentences]
print('')
print('Sentence examples, lower')
for i in range(0, 5):
  print(i, sentences[i])

print('')
print('Sentence count', len(sentences))


Sentence examples, lower
0 
profile
sections
tv
featured
more from nbc
follow nbc news
hong kong — after mass unrest in 2019, a pandemic that left it isolated from the world and the imposition of a national security law that has crushed dissent, hong kong is ready to turn the page.
1 “social disturbance is clearly in the past,” the city’s leader, john lee, said wednesday at the four seasons hotel, where about 200 finance industry executives from around the world were gathered for a summit.
2 “it has given way to stability, to growing business and community confidence in hong kong’s future.
3 law and order has returned; the worst is behind us.”
but the summit — meant to signal that the chinese territory long known as a regional financial powerhouse is again open for business — has been shadowed by some awkward realities.
4 hong kong’s border with mainland china, its main economic driver, remains tightly closed because of pandemic restrictions.

Sentence count 46


In [7]:
sentences = [sent.strip() for sent in sentences]
print('')
print('Sentence examples, strip')
for i in range(0, 5):
  print(i, sentences[i])


print('')
print('Sentence count', len(sentences))


Sentence examples, strip
0 profile
sections
tv
featured
more from nbc
follow nbc news
hong kong — after mass unrest in 2019, a pandemic that left it isolated from the world and the imposition of a national security law that has crushed dissent, hong kong is ready to turn the page.
1 “social disturbance is clearly in the past,” the city’s leader, john lee, said wednesday at the four seasons hotel, where about 200 finance industry executives from around the world were gathered for a summit.
2 “it has given way to stability, to growing business and community confidence in hong kong’s future.
3 law and order has returned; the worst is behind us.”
but the summit — meant to signal that the chinese territory long known as a regional financial powerhouse is again open for business — has been shadowed by some awkward realities.
4 hong kong’s border with mainland china, its main economic driver, remains tightly closed because of pandemic restrictions.

Sentence count 46


In [8]:
#No punctuations
sentences = [re.sub(r'[^\w\s]', ' ', sent) for sent in sentences]
print('')
print('Sentence examples, no punct')
for i in range(0, 5):
  print(i, sentences[i])

print('')
print('Sentence count', len(sentences))


Sentence examples, no punct
0 profile
sections
tv
featured
more from nbc
follow nbc news
hong kong   after mass unrest in 2019  a pandemic that left it isolated from the world and the imposition of a national security law that has crushed dissent  hong kong is ready to turn the page 
1  social disturbance is clearly in the past   the city s leader  john lee  said wednesday at the four seasons hotel  where about 200 finance industry executives from around the world were gathered for a summit 
2  it has given way to stability  to growing business and community confidence in hong kong s future 
3 law and order has returned  the worst is behind us  
but the summit   meant to signal that the chinese territory long known as a regional financial powerhouse is again open for business   has been shadowed by some awkward realities 
4 hong kong s border with mainland china  its main economic driver  remains tightly closed because of pandemic restrictions 

Sentence count 46


In [9]:
# tokenising
tokens = [word_tokenize(sent) for sent in sentences]
print('')
print('Sentence examples, len > 0')
for i in range(0, 5):
  print(i, tokens[i])

print('')
print('Sentence count', len(tokens))


Sentence examples, len > 0
0 ['profile', 'sections', 'tv', 'featured', 'more', 'from', 'nbc', 'follow', 'nbc', 'news', 'hong', 'kong', 'after', 'mass', 'unrest', 'in', '2019', 'a', 'pandemic', 'that', 'left', 'it', 'isolated', 'from', 'the', 'world', 'and', 'the', 'imposition', 'of', 'a', 'national', 'security', 'law', 'that', 'has', 'crushed', 'dissent', 'hong', 'kong', 'is', 'ready', 'to', 'turn', 'the', 'page']
1 ['social', 'disturbance', 'is', 'clearly', 'in', 'the', 'past', 'the', 'city', 's', 'leader', 'john', 'lee', 'said', 'wednesday', 'at', 'the', 'four', 'seasons', 'hotel', 'where', 'about', '200', 'finance', 'industry', 'executives', 'from', 'around', 'the', 'world', 'were', 'gathered', 'for', 'a', 'summit']
2 ['it', 'has', 'given', 'way', 'to', 'stability', 'to', 'growing', 'business', 'and', 'community', 'confidence', 'in', 'hong', 'kong', 's', 'future']
3 ['law', 'and', 'order', 'has', 'returned', 'the', 'worst', 'is', 'behind', 'us', 'but', 'the', 'summit', 'meant', 'to

In [10]:

stop = set(stopwords.words('english'))

tokens_flat_1 = [tok for sent in tokens for tok in sent if tok not in stop]

lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(w) for w in tokens_flat_1]
lemmatized_words
print("No of words after lemmatizing:", len(lemmatized_words))
freq_dist = nltk.FreqDist(lemmatized_words)
print("Most frequent words:")
freq_dist.most_common(10)

No of words after lemmatizing: 824
Most frequent words:


[('hong', 31),
 ('kong', 31),
 ('business', 16),
 ('said', 14),
 ('law', 11),
 ('security', 8),
 ('national', 7),
 ('summit', 7),
 ('china', 7),
 ('lee', 6)]

#### 1.4 Score the sentences

In [11]:
sentence_scores = []
for sent in tokens:
  sent_score = 0
  for tok in sent:
    if tok not in stop:
      sent_score += freq_dist[tok]
  sentence_scores.append(sent_score)

print('sentence_scores', sentence_scores)
print('sentence_scores, len', len(sentence_scores))


sentence_scores [189, 54, 89, 61, 83, 32, 20, 26, 95, 34, 151, 39, 23, 96, 4, 76, 9, 13, 90, 28, 162, 24, 75, 300, 78, 49, 73, 219, 29, 55, 33, 137, 142, 83, 191, 108, 91, 156, 93, 35, 12, 31, 22, 182, 201, 6]
sentence_scores, len 46


#### 1.5 Rank the sentences and Text summarization


In [12]:
N = len(sentence_scores)
assert len(sentence_scores) == len(sentences)
assert len(sentence_scores) == len(sentences_original), f'{len(sentence_scores)} vs {len(sentences_original)}'
sentence_pairs = list(zip(range(N), sentence_scores, sentences))
sentence_pairs.sort(key = lambda x: x[1], reverse=True)
print('Top, preprocess:  ', sentence_pairs)

i, score, sent = sentence_pairs[0]
print('Top, origial:     ', sentences_original[i])

Top, preprocess:   [(23, 300, 'while it may not be realistic to expect businesses to turn away from china s huge market  global business leaders  need to recognize that there s a new situation in hong kong  there s a new reality   said brian kern  the lead researcher for a report on doing business in hong kong that was published last month by the hong kong democracy council  a nonprofit group based in washington '), (27, 219, 'an article in the chinese state backed nationalist tabloid global times on thursday said the gathering  served as a strong rebuttal to the hype and bad mouthing of the city s status  \nhong kong is  not going anywhere  as an international financial center  said allan zeman  chairman of the lan kwai fong group  a hong kong based real estate developer '), (44, 201, ' as long as we remove the travel restrictions  business and leisure travelers are going to come back   said heiwai tang  an economics professor at the university of hong kong   and then we ll be back to