## Extracting the gist of the top 2 stories from the US News website

### Importing the required packages
1. Beautiful Soup from bs4 for web scraping
2. `get` from requests for sending HTTP requests to the website
3. nltk for natural language processing

In [1]:
from bs4 import BeautifulSoup
from requests import get
import nltk

### Setting up the parameters for web request, namely website url and headers

In [2]:
# URL of the website to be scraped
url = "https://www.usnews.com"
# Browser-like user agent, so that the server treats the request as coming from a browser
user_agent = 'Mozilla/5.0'
# Request headers carrying the user agent (also used for the per-story requests below)
headers = {'User-Agent': user_agent}
# Fetch the US News home page. A timeout is set because, without one, the call
# can wait indefinitely on an unresponsive server.
res = get(url, headers=headers, timeout=10)
# Stop here with a clear HTTP error rather than parsing an error page later
res.raise_for_status()

In [3]:
# Build the parsed HTML tree of the home page from the response body,
# using Python's built-in "html.parser" backend
data = BeautifulSoup(markup=res.text, features='html.parser')

### For simplicity, the top 2 stories are located by the class name of the left-hand tab. This class name may need to be updated whenever the website's markup changes

In [4]:
# Class name of the left-side tab that holds the top stories.
# This value is tied to the site's current markup and may need updating when the website changes.
top_stories_class = "ArmRestTopStories__Part-s13c9i18-1 joBuNB Box-s1krs5yn-0 bFmVmh"
# Locate the tab; find() returns None when no element carries this class
result = data.find(class_=top_stories_class)
if result is None:
    # Fail with an explanatory message instead of an AttributeError on None below
    raise RuntimeError(
        "Top stories section not found; the class name "
        "'{0}' may be out of date".format(top_stories_class)
    )
# Headings of the top stories are the "h3" tags inside the tab
heading = result.find_all("h3")

In [5]:
# Collect the link of each top story from the href of the "a" tag inside its heading
story_url = []
for story_heading in heading:
    anchor = story_heading.find("a")
    # A heading without a link (or without an href) has no page to fetch, so skip it
    # rather than failing on None
    if anchor is None:
        continue
    href = anchor.get("href")
    if href:
        story_url.append(href)

### Request each story's webpage and extract a gist of it

In [6]:
def get_gist(url, num_sentences=5, max_paragraphs=4, timeout=10):
    """Return a short gist of the news article at *url*.

    The gist consists of the page's first "h1" text (headline), its first
    "h2" text (sub-headline) and the leading sentences of the article body,
    in that order, with each part preceded by a single space.

    Args:
        url: Address of the article page to fetch.
        num_sentences: Maximum number of body sentences to include. The
            default of 5 matches the number this function has always emitted.
        max_paragraphs: Maximum number of leading body paragraphs that are
            joined and split into sentences.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The gist as one string. Parts that are missing from the page (no
        headline, no sub-headline, no article body, fewer paragraphs or
        sentences than requested) are left out instead of raising.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    # Fetch the story page with the module-level request headers; bound the wait
    # and fail early on an HTTP error instead of parsing an error page
    response1 = get(url, headers=headers, timeout=timeout)
    response1.raise_for_status()
    # Parse the response into an HTML tree
    data1 = BeautifulSoup(response1.text, "html.parser")

    # Pieces of the gist, in output order
    parts = []

    # Headline ("h1") and sub-headline ("h2"); find() returns None when absent
    for tag_name in ("h1", "h2"):
        tag = data1.find(tag_name)
        if tag is not None:
            parts.append(tag.get_text())

    # Article body container, located by its class name
    content = data1.find(class_="ArticleBody__ArticleBox-s4gdqwu-2 dOjcJJ Box-s1krs5yn-0 ewBkVU")
    if content is not None:
        # Leading paragraphs without a class attribute (to leave out the external links)
        paragraphs = content.find_all("p", class_="", limit=max_paragraphs)
        body_text = " ".join(paragraph.get_text() for paragraph in paragraphs)
        # Split into sentences with nltk; slicing tolerates short articles
        sentences = nltk.sent_tokenize(body_text)
        parts.extend(sentences[:num_sentences])

    # Each part is preceded by one space, as in the original concatenation
    return "".join(" " + part for part in parts)

In [7]:
# Print the gist of each collected top story, numbered from 0
for story_number, story_link in enumerate(story_url):
    gist = get_gist(story_link)
    print("Story {0} : {1}\n".format(story_number, gist))

Story 0 :  Stay-At-Home Order Issued in Delaware Starting Tuesday Delaware Gov. John Carney has issued a stay-at-home order taking effect on Tuesday that closes non-essential businesses to attempt to blunt the intensity of the spread of the new coronavirus. WILMINGTON, Del. (AP) — Delaware Gov. John Carney on Sunday issued a stay-at-home order, closing "non-essential" businesses to attempt to blunt the intensity of the spread of the new coronavirus. Carney's emergency declaration takes effect at 8 a.m. Tuesday. Under the order, Delaware's 975,000 residents who otherwise don't work at the exempted businesses will be allowed only to leave their homes to get groceries or a prescription or see a doctor.

Story 1 :  Rand Paul Becomes First Senator to Test Positive for Coronavirus The Kentucky Republican said he’s asymptomatic and will return to the Senate after his quarantine period. Republican Sen. Rand Paul of Kentucky announced Sunday that he has tested positive for the coronavirus, beco