In [None]:
## install relevant libraries

!pip install -q bert-extractive-summarizer
!pip install -q neuralcoref
!pip install -q transformers
!pip3 install -q news-please
!pip3 install -q cchardet
!pip install -q keybert
!pip install -q beautifulsoup4

In [None]:
## import necessary libraries

from colab import *
import requests
from bs4 import BeautifulSoup
import re
import urllib.parse
from urllib.parse import urlparse
from newsplease import NewsPlease
from keybert import KeyBERT
from transformers import pipeline

**Tweet**

In [None]:
tweet = "Pakistan PM Imran Khan tests positive for Covid19 48 hours after getting the Chinese Sinopharm vaccine shot. Under observation."
print(tweet)

Pakistan PM Imran Khan tests positive for Covid19 48 hours after getting the Chinese Sinopharm vaccine shot. Under observation.


# **Step 1**: Extracting keywords from the tweet

In [None]:
from keybert import KeyBERT
model = KeyBERT('distilbert-base-nli-mean-tokens')

In [None]:
# Ask the KeyBERT model for single-word keyphrases (n-gram range 1..1), keep
# only the first element of every returned item, and glue them together with
# "+" so the result can be dropped straight into a search URL.
scored_keywords = model.extract_keywords(tweet, keyphrase_ngram_range=(1, 1))
keywords = "+".join(item[0] for item in scored_keywords)

print(keywords)

vaccine+chinese+covid19+pakistan+sinopharm


# **Step 2**: Extracting relevant news articles for the keywords

In [None]:
url = "https://www.google.com/search?q=" + keywords + "&tbm=nws"
url

'https://www.google.com/search?q=vaccine+chinese+covid19+pakistan+sinopharm&tbm=nws'

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import urllib.parse
from urllib.parse import urlparse
def googleSearch(url, timeout=10):
    """Download a search results page and return the external links on it.

    Every ``<a>`` element of the page is inspected. The first ``http(s)://``
    URL found inside its ``href`` is cut at the first ``&`` (which drops the
    tracking parameters appended to the link), and the result is kept unless
    its host contains ``google.com``.

    Args:
        url: Address of the results page to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        list[str]: Links in page order (duplicates are not removed). The list
        is empty when the request fails, the status code is not 200, or no
        link qualifies.
    """
    g_clean = []

    try:
        html = requests.get(url, timeout=timeout)
        if html.status_code == 200:
            soup = BeautifulSoup(html.text, 'lxml')
            for anchor in soup.find_all('a'):
                href = anchor.get('href')
                if not href:
                    # Anchors without an href carry no link to extract.
                    continue

                match = re.search(r"(?P<url>https?://[^\s]+)", href)
                if match is None:
                    # Relative links such as "/search?q=..." have no absolute URL.
                    continue

                link = match.group(0).split('&')[0]
                try:
                    netloc = urlparse(link).netloc
                except ValueError:
                    # Malformed URL (e.g. a broken IPv6 literal): skip this link only.
                    continue

                # Plain substring test: the previous regex 'google.com' treated
                # the dot as a wildcard and also rejected hosts like "google-com.*".
                if 'google.com' in netloc:
                    continue
                g_clean.append(link)
    except Exception as ex:
        # Best-effort scraping: report the problem and return what was collected.
        print(str(ex))

    # Returning here rather than from a `finally` block lets KeyboardInterrupt
    # and SystemExit propagate instead of being silently discarded.
    return g_clean

In [None]:
news_urls = googleSearch(url)

In [None]:
len(news_urls), news_urls[:3]

(12,
 ['https://www.aljazeera.com/news/2021/3/18/pakistan-receives-second-batch-of-500000-vaccines-from-china',
  'https://www.nytimes.com/2021/03/16/world/covid-vaccine-china-hamster-ovaries.html',
  'https://www.aljazeera.com/news/2021/3/20/pakistan-prime-minister-imran-khan-tests-positive-for-covid'])

**Scraping the news articles from these URLs**

In [None]:
from newsplease import NewsPlease

news_content = []

# A dedicated loop variable is used on purpose: iterating with `url` would
# overwrite the Google search URL built earlier in the notebook, so re-running
# the search cell afterwards would query an article page instead.
for news_url in news_urls:
    try:
        article = NewsPlease.from_url(news_url)
        parts = [article.title, article.description, article.maintext]
    except Exception:
        # Best-effort scraping: skip pages that cannot be downloaded or parsed.
        continue

    # Join only the fields that are present; str(None) would otherwise put the
    # literal word "None" into the text handed to the summarizer.
    content = " ".join(str(part) for part in parts if part)
    if content:
        news_content.append(content)

    # Five articles are enough for the summarization step.
    if len(news_content) >= 5:
        break

In [None]:
news_content

["Pakistan receives second batch of 500,000 vaccines from China Second donation of Sinopharm coronavirus vaccines brings the country’s total supply to one million shots. Second donation of Sinopharm coronavirus vaccines brings the country’s total supply to one million shots.\nPakistan has received a Chinese donation of 500,000 doses of Sinopharm vaccine, bringing the country’s total supply to one million shots, Health Minister Dr Faisal Sultan said.\nThe South Asian nation of 220 million people launched COVID-19 vaccinations for the public on March 10, starting with older people. Health workers started receiving shots in early February.\n“These 500,000 doses will ensure smooth continuation of our vaccine drive, currently under way for senior citizens,” Sultan said in a tweet on Wednesday.\nThe next batch of Sinopharm vaccine was received today. These 500,000 doses will ensure smooth continuation of our vaccination drive, currently underway for senior citizens. We're grateful to China f

# **Step 3**: Summarizing the scraped news content

In [None]:
from transformers import pipeline

summarizer = pipeline("summarization")

  f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions."


In [None]:
summarized_news = []

for article_text in news_content:
    try:
        # truncation=True clips articles that exceed the model's input limit;
        # without it such articles raise and are lost.
        summary = summarizer(article_text, max_length=250, min_length=30,
                             do_sample=False, truncation=True)
        summarized_news.append(summary[0]['summary_text'])
    except Exception:
        # Skip articles the model cannot handle instead of storing an empty
        # placeholder, which would only add stray spaces to the combined text.
        continue


In [None]:
# Second pass: concatenate the per-article summaries (space separated) and
# summarize that text once more to obtain a single overall summary string.
merged_summaries = " ".join(summarized_news)
combined_summary = summarizer(merged_summaries, max_length=250, min_length=30, do_sample=False)[0]['summary_text']


In [None]:
combined_summary

' Pakistan receives second batch of 500,000 vaccines from China . Second donation of Sinopharm coronavirus vaccines brings the country’s total supply to one million . Sixty-one more people died overnight, taking the death toll to 13,656, according to the health ministry . Pakistan PM Imran Khan tests positive for COVID-19, his office said .'



---



# **Combining everything together in 1 function**

In [None]:
model = KeyBERT('distilbert-base-nli-mean-tokens')
summarizer = pipeline("summarization")

In [None]:
# defining helper functions

def return_keywords(tweet):
    """Build a "+"-separated keyword string for ``tweet``.

    The module-level ``model`` is asked for keyphrases restricted to single
    words (n-gram range ``(1, 1)``); the first element of each returned item
    is kept and the pieces are joined with ``"+"``.

    Args:
        tweet: Text to extract keywords from.

    Returns:
        str: The joined keywords, or an empty string when none are returned.
    """
    scored_phrases = model.extract_keywords(tweet, keyphrase_ngram_range=(1, 1))
    return "+".join(scored[0] for scored in scored_phrases)


def get_news_urls(keywords, timeout=10):
    """Search Google for ``keywords`` and return the result target URLs.

    Result links on the page have the form ``/url?q=<target>&sa=...``; the
    ``q`` parameter of each such link is extracted, percent-decoded and
    returned once, in page order.

    Args:
        keywords: Search terms separated by ``"+"``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        list[str]: Unique target URLs. The list is empty when the request
        fails, the status code is not 200, or the page has no result links.
    """
    # Percent-encode everything except "+" (the word separator used by the
    # callers), so characters such as "#" or "&" in a tweet cannot cut the
    # query short or inject extra parameters.
    query = urllib.parse.quote(keywords, safe="+")
    # The news-tab filter is left disabled, exactly as before.
    url = "https://www.google.com/search?q=" + query # + "&tbm=nws"
    g_clean = []
    seen = set()

    try:
        html = requests.get(url, timeout=timeout)
        if html.status_code == 200:
            soup = BeautifulSoup(html.text, 'lxml')
            for anchor in soup.find_all('a'):
                href = anchor.get('href')
                if not href:
                    continue
                try:
                    parsed = urllib.parse.urlparse(href)
                except ValueError:
                    # Malformed href: ignore this anchor only.
                    continue
                if parsed.path != '/url':
                    # Not a result redirect (navigation, pagination, ...).
                    continue

                # parse_qs also undoes the percent-encoding of the target, so
                # "page.aspx%3Fid%3D1" comes back as "page.aspx?id=1".
                targets = urllib.parse.parse_qs(parsed.query).get('q')
                if not targets:
                    continue
                target = targets[0]
                if target not in seen:
                    seen.add(target)
                    g_clean.append(target)
    except Exception as ex:
        # Best-effort scraping: report the problem and return what was collected.
        print(str(ex))

    # Returning here rather than from a `finally` block lets KeyboardInterrupt
    # and SystemExit propagate instead of being silently discarded.
    return g_clean

def get_news_content(news_urls, top_n = 5):
    """Download articles and return their text, stopping after ``top_n`` hits.

    Each URL is fetched with ``NewsPlease.from_url``. The article's title,
    description and main text are joined with single spaces; fields that are
    missing are left out. URLs that fail to download or parse, or that yield
    no text at all, are skipped. Every URL that contributes an article is
    printed.

    Args:
        news_urls: Iterable of article URLs, tried in order.
        top_n: Maximum number of articles to collect.

    Returns:
        list[str]: One text per collected article, at most ``top_n`` items.
    """
    news_content = []

    for url in news_urls:
        # Check the limit before fetching so that top_n <= 0 downloads nothing.
        if len(news_content) >= top_n:
            break

        try:
            article = NewsPlease.from_url(url)
            parts = [article.title, article.description, article.maintext]
        except Exception:
            # Best-effort scraping: skip pages that cannot be fetched or parsed.
            continue

        # Join only the fields that are present; str(None) would otherwise put
        # the literal word "None" into the text handed to the summarizer.
        content = " ".join(str(part) for part in parts if part)
        if not content:
            continue

        news_content.append(content)
        print(url)

    return news_content

def get_summarized_news(news_content):
    """Summarize each article, then summarize the summaries into one text.

    Every item of ``news_content`` is passed to the module-level
    ``summarizer``. Articles for which it raises are skipped. The successful
    summaries are joined with single spaces and summarized a second time.

    Args:
        news_content: Iterable of article texts.

    Returns:
        str: The combined summary, or an empty string when no article could
        be summarized (including when ``news_content`` is empty).
    """
    summarized_news = []

    for article_text in news_content:
        try:
            # truncation=True clips articles that exceed the model's input
            # limit; without it such articles raise and are lost.
            summary = summarizer(article_text, max_length=250, min_length=30,
                                 do_sample=False, truncation=True)
            summarized_news.append(summary[0]['summary_text'])
        except Exception:
            # Skip articles the model cannot handle; an empty placeholder
            # would only add stray spaces to the combined text.
            continue

    if not summarized_news:
        # Nothing to combine: do not run the model on blank input.
        return ""

    # now summarizing all the summaries
    combined_summary = summarizer(" ".join(summarized_news), max_length=250,
                                  min_length=30, do_sample=False, truncation=True)
    return combined_summary[0]['summary_text']

In [None]:
def get_summary(tweet, top_n=4):
    """Run the whole pipeline for one tweet and return the combined summary.

    Args:
        tweet: Text of the tweet to look up.
        top_n: Maximum number of articles passed on to the summarizer.

    Returns:
        str: Whatever ``get_summarized_news`` returns for the scraped articles.
    """
    # Step 1: build the search query from every word of the tweet (the full
    # text is searched; no keyword model is involved here). split() without
    # arguments also breaks on newlines/tabs and ignores repeated spaces, so
    # the query never contains raw whitespace or empty "++" tokens.
    keywords = "+".join(tweet.split())
    # Step 2: get_news_urls
    news_urls = get_news_urls(keywords)
    # Step 3: get_news_content
    news_content = get_news_content(news_urls, top_n=top_n)
    # Step 4 (final step): get_summarized_news
    return get_summarized_news(news_content)

In [None]:
# Example claim taken from the fact-checking site https://www.altnews.in/

tweet = "It is mandatory to apply masks in all police station areas of Uttar Pradesh from 9 am to 30 days tomorrow. A person who will be caught without a mask should remain in temporary jail for 10 hours. Put on masks for two yards, of yourself and family with a corona-like illness Save."
summary = get_summary(tweet)

# Show the claim and the generated summary, each preceded by a divider line.
divider = "-------------"
print(divider, tweet, divider, summary, sep="\n")

https://www.hindustantimes.com/india-news/no-30-day-mask-checking-in-state-up-police-clarify-after-fake-message-goes-viral-101614332110316.html
https://zeenews.india.com/india/fact-check-up-police-clarifies-on-mask-checking-campaign-terms-social-media-post-as-fake-2344478.html
https://www.modernhealthcare.com/safety-quality/live-updates-covid-19-september-16-30
https://blockclubchicago.org/2020/06/06/police-lieutenant-says-cps-has-ditched-masks-other-coronavirus-safety-measures-amid-protests-we-have-zero-protocol-being-followed-right-now/
https://www.washingtonpost.com/world/2020/03/30/coronavirus-latest-news/


Your max_length is set to 250, but you input_length is only 138. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 250, but you input_length is only 184. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


-------------
It is mandatory to apply masks in all police station areas of Uttar Pradesh from 9 am to 30 days tomorrow. A person who will be caught without a mask should remain in temporary jail for 10 hours. Put on masks for two yards, of yourself and family with a corona-like illness Save.
-------------
 Fake message claimed that a 30-day mask checking campaign will be launched in every police station area in Uttar Pradesh . UP police refuted the claims and said it has neither undertaken any such campaign and nor has circulated any information regarding the same .


# Experiment

In [None]:
# Step 1: turn the tweet into a "+"-separated query made of all its words
keywords = "+".join(tweet.split(" "))

# Step 2: look up the result URLs for that query
news_urls = get_news_urls(keywords)

In [None]:
keywords

"India+won+the+match+by+5+wickets+and+has+levelled+the+series+by+1-1.+Great+to+see+Virat+Kohli+and+ishan+kishan+'s+knock."

In [None]:
news_urls

['https://www.hindustantimes.com/cricket/india-vs-england-live-cricket-score-3rd-t20-ind-vs-eng-live-match-today-latest-score-updates-ahmedabad-101615893593016.html',
 'https://www.mykhel.com/cricket/india-vs-england-2nd-t20i-live-updates-india-look-to-bounce-back-in-t20-series-163128.html',
 'http://newsonair.com/Text-Bulletin-Details.aspx%3Fid%3D36081',
 'https://www.indiafantasy.com/football/fantasy-football/pue-vs-atl-dream11-13-march/',
 'https://www.indiafantasy.com/nba/fantasy-nba/uta-vs-hou-dream11-team-prediction-nba-projections-fantasy-basketball-picks-for-jazz-vs-rockets-13-march/',
 'https://www.indiafantasy.com/cricket/fantasy-cricket/bd-l-vs-wi-l-dream11-12th-match-12-march/',
 'https://www.indiafantasy.com/india-fantasy-match-of-the-day/blm-vs-snd-dream11-prediction-match-of-the-day-9-march/',
 'https://www.indiafantasy.com/football/fantasy-football/mlc-vs-mac-dream11-12-march/',
 'https://www.hindustantimes.com/cricket/india-vs-england-last-7-8-years-he-s-got-at-least-5

In [None]:
get_news_urls(keywords)

['https://www.hindustantimes.com/cricket/india-vs-england-live-cricket-score-3rd-t20-ind-vs-eng-live-match-today-latest-score-updates-ahmedabad-101615893593016.html',
 'https://www.mykhel.com/cricket/india-vs-england-2nd-t20i-live-updates-india-look-to-bounce-back-in-t20-series-163128.html',
 'http://newsonair.com/Text-Bulletin-Details.aspx%3Fid%3D36081',
 'https://www.indiafantasy.com/football/fantasy-football/pue-vs-atl-dream11-13-march/',
 'https://www.indiafantasy.com/nba/fantasy-nba/uta-vs-hou-dream11-team-prediction-nba-projections-fantasy-basketball-picks-for-jazz-vs-rockets-13-march/',
 'https://www.indiafantasy.com/cricket/fantasy-cricket/bd-l-vs-wi-l-dream11-12th-match-12-march/',
 'https://www.indiafantasy.com/india-fantasy-match-of-the-day/blm-vs-snd-dream11-prediction-match-of-the-day-9-march/',
 'https://www.indiafantasy.com/football/fantasy-football/mlc-vs-mac-dream11-12-march/',
 'https://www.hindustantimes.com/cricket/india-vs-england-last-7-8-years-he-s-got-at-least-5

In [None]:
url = "https://www.google.com/search?q=" + keywords + "&tbm=nws"

In [None]:
# Download the results page for the search URL built in the previous cell.
html = requests.get(url)

# Parse the response body with the lxml parser and collect every <a> element
# so the raw anchors can be inspected in the following cells.
soup = BeautifulSoup(html.text, 'lxml')
a = soup.find_all('a') # a is a list-like collection of the page's anchor tags

In [None]:
a

[<a href="/?sa=X&amp;ved=0ahUKEwjXm7q3hb_vAhXSasAKHbUEBnoQOwgC"><span class="V6gwVd">G</span><span class="iWkuvd">o</span><span class="cDrQ7">o</span><span class="V6gwVd">g</span><span class="ntlR9">l</span><span class="iWkuvd tJ3Myc">e</span></a>,
 <a class="l" href="/?output=search&amp;ie=UTF-8&amp;tbm=nws&amp;sa=X&amp;ved=0ahUKEwjXm7q3hb_vAhXSasAKHbUEBnoQPAgE"><span class="V6gwVd">G</span><span class="iWkuvd">o</span><span class="cDrQ7">o</span><span class="V6gwVd">g</span><span class="ntlR9">l</span><span class="iWkuvd tJ3Myc">e</span></a>,
 <a href="/search?q=Indian+Railways+Cancels+All+Passenger+Trains+Till+March+31+%7C+ABP+News&amp;tbm=nws&amp;ie=UTF-8&amp;gbv=1&amp;sei=ZwFWYJfMItLVgQa1iZjQBw">here</a>,
 <a class="eZt8xd" href="/search?q=Indian+Railways+Cancels+All+Passenger+Trains+Till+March+31+%7C+ABP+News&amp;ie=UTF-8&amp;source=lnms&amp;sa=X&amp;ved=0ahUKEwjXm7q3hb_vAhXSasAKHbUEBnoQ_AUIBygA">All</a>,
 <a class="eZt8xd" href="/search?q=Indian+Railways+Cancels+All+Passenger+Tr

In [None]:
a[20]

<a href="/url?q=https://in.news.yahoo.com/indian-railways-cancels-trains-till-152422235.html&amp;sa=U&amp;ved=2ahUKEwjXm7q3hb_vAhXSasAKHbUEBnoQ0Y8FMAF6BAgJEAI&amp;usg=AOvVaw2Vns7uyMDrpztsFxV9rixI"><div class="lcJF1d SXn0g GXKcHe"><div style="width:120px;height:67px;position:static"></div></div></a>

In [None]:
# Print the target of every result redirect ("/url?q=<target>&amp;...") found
# in the serialized anchors.
for anchor in a:
    match = re.search(r'/url[?]q=(.*?)&amp', str(anchor))
    # re.search returns None for anchors that are not result redirects; test
    # for that explicitly instead of hiding every possible error behind a
    # bare `except`.
    if match is not None:
        print(match.group(1))

https://www.freepressjournal.in/india/fpj-fact-check-has-indian-railways-cancelled-all-passenger-trains-till-march-31-2021
https://www.freepressjournal.in/india/fpj-fact-check-has-indian-railways-cancelled-all-passenger-trains-till-march-31-2021
https://in.news.yahoo.com/indian-railways-cancels-trains-till-152422235.html
https://in.news.yahoo.com/indian-railways-cancels-trains-till-152422235.html
https://www.energyinfrapost.com/right-track-heres-why-indian-railways-needs-big-doses-of-private-investment/
https://www.energyinfrapost.com/indian-railways-eastern-western-dfcs-to-be-operational-by-indias-75th-year-of-independence/
https://www.energyinfrapost.com/indian-railways-eastern-western-dfcs-to-be-operational-by-indias-75th-year-of-independence/
https://www.india.com/news/india/irctc-indian-railways-latest-news-cancelled-trains-on-holi-cancelled-trains-on-uttar-pradesh-madhya-pradesh-bengal-route-check-full-list-of-cancelled-trains-here-4439205/
https://www.india.com/news/india/irctc-