In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from bs4.element import Tag, NavigableString

import pandas as pd
from tqdm import tqdm

In [2]:
def read_url_content_bs4(url):
    source = requests.get(url)
    soup = BeautifulSoup(source.content,'html.parser')
    soup.prettify()
    return soup 

In [3]:
url= "https://www.thehindu.com/news/national/telangana/aimim-hopes-to-create-muslim-leadership-in-up/article38260971.ece"
content = read_url_content_bs4(url)

In [4]:
def extract_para(soup_data):
    """extract the paragraph content present in the html source data in bs4.BeautifulSoup format. 
    """
    ## Find div[ids] for content-body id
    ids = [tag['id'] for tag in soup_data.select('div[id]')]
    content_body_id_name = [i for i in ids if "content-body" in i][0]
    article_content = soup_data.find("div",content_body_id_name)
    ## Get the article element
    article=soup_data.find('article')
    summary=article.find(article_content)
    
    ## A empty list to stores content
    all_content=[]
    for i in summary.find_all("p"):
        for content in i.contents:
            if isinstance(content, Tag):
                all_content.append(content.string)
            elif isinstance(content, NavigableString):
                c = content.string.strip()
                all_content.append(c)
            else:
                print(type(content))
                break
    final_para = "".join(map(str,all_content))
    return final_para

In [5]:
def extract_headline(soup_data):
    tag = soup_data.find_all("h1")[0]
    headline = tag.text.strip()
    return headline

In [6]:
def extract_reporter(soup_data):
    author = soup_data.find("div",class_="author-container hidden-xs").find('a', class_="auth-nm lnk").text
    return author

In [7]:
def extract_state(soup_data):
    state = soup_data.find("div",class_="article-exclusive").find('a',class_="section-name").text.strip()
    return state

In [8]:
def extract_city(soup_data):
    city = soup_data.find("div",class_="ut-container").find('span',class_="blue-color ksl-time-stamp dateline").text.strip()
    return city

In [9]:
def extract_date(soup_data):
    date=soup_data.find("div",class_="ut-container").find('span',class_="blue-color ksl-time-stamp").find('none').text
    return date

In [10]:
def extract_updated_date(soup_data):
    updated_date=soup_data.find("div",class_="teaser-text update-time").find('span').find('none').text
    return updated_date

In [11]:
def extract_content_from_url(url):
    content = read_url_content_bs4(url)
    data = {}
    data["headline"] = extract_headline(content)
    data["author"] = extract_reporter(content)
    data["state"] = extract_state(content)
    data["city"] = extract_city(content)
    data["date"] = extract_date(content)
    data["updated date"] = extract_updated_date(content)
    data["paragraph"] = extract_para(content)
    return data

In [12]:
urls = ["https://www.thehindu.com/news/national/telangana/aimim-hopes-to-create-muslim-leadership-in-up/article38260971.ece", 
        "https://www.thehindu.com/news/national/other-states/dissent-within-goa-bjp-on-taking-mgp-along-in-new-govt/article65214905.ece", 
        "https://www.thehindu.com/news/national/other-states/four-militants-killed-in-kashmir/article65216933.ece?homepage=true",
        "https://www.thehindu.com/news/national/nia-arrests-six-over-trafficking-of-rohingya-muslims/article65217070.ece?homepage=true",
        "https://www.thehindu.com/news/national/char-dham-road-project-sc-asks-ex-judge-sikri-to-head-panel/article65214798.ece",
        "https://www.thehindu.com/news/national/lakhimpur-kheri-casefarmers-families-seek-urgent-hearing-of-plea-against-bail-to-union-ministers-son/article65213784.ece",
        "https://www.thehindu.com/news/national/students-stranded-in-sumy-finally-return/article65214079.ece",
        "https://www.thehindu.com/business/markets/gold-tumbles-200-silver-jumps-193/article65214314.ece",
        "https://www.thehindu.com/business/Industry/indias-sugar-exports-accelerate-on-global-price-rally-weak-rupee/article65212106.ece"]

In [13]:
content=read_url_content_bs4(url)

In [14]:
para=extract_para(content)
para

'The All India Majlis-e-Ittehadul Muslimeen threw itself into the fray in the election-bound State of Uttar Pradesh and has organised dozens of rallies and public meetings. The party has emphasised the importance of Muslim leadership in that State and underscored the socio-economic and political conditions of the minorities.Muslims in Uttar Pradesh constitute approximately 20% of the total population.At a recent public meeting in U.P., AIMIM party president Asaduddin Owaisi said, "It is our objective to create a Muslim leadership and at the same time see to it that the BJP does not come to power and that Yogi Adityanath does not become the Chief Minister again."While urging people to vote for his party\'s candidates, he said that the AIMIM would work with the people of U.P. He also stated that it is time for people of that State to come forward and become leaders and that he would stand behind them. He said that if the minorities want to remain politically relevant, they will have to d

In [15]:
content = [extract_content_from_url(i) for i in tqdm(urls)]

100%|█████████████████████████████████████████████| 9/9 [00:02<00:00,  4.02it/s]


In [16]:
df = pd.DataFrame(content)
df.shape

(9, 7)

In [17]:
df.head(10)

Unnamed: 0,headline,author,state,city,date,updated date,paragraph
0,AIMIM hopes to create Muslim leadership in U.P.,Staff Reporter,Telangana,HYDERABAD,"January 12, 2022 23:58 IST","January 12, 2022 23:59 IST",The All India Majlis-e-Ittehadul Muslimeen thr...
1,Dissent within Goa BJP on taking MGP along in ...,Shoumojit Banerjee,Other States,Pune,"March 11, 2022 20:28 IST","March 11, 2022 20:28 IST",BJP’s Goa election in-charge Devendra Fadnavis...
2,Four militants killed in three operations in J&K,Peerzada Ashiq,Other States,SRINAGAR,"March 12, 2022 11:07 IST","March 12, 2022 23:10 IST",Security personnel stand guard after a grenade...
3,NIA arrests six over trafficking of Rohingya M...,Devesh K. Pandey,National,NEW DELHI,"March 12, 2022 12:55 IST","March 12, 2022 13:36 IST",A Border Security Force official questions Roh...
4,Char Dham road project: Supreme Court asks ex-...,Legal Correspondent,National,NEW DELHI,"March 11, 2022 19:59 IST","March 12, 2022 00:47 IST",Rampant cutting of the hills near Rudraprayag ...
5,Lakhimpur witness ‘attacked’: plea,Legal Correspondent,National,NEW DELHI,"March 11, 2022 12:57 IST","March 12, 2022 00:48 IST",Ashish Mishra (centre) and other accused in th...
6,600 Indian students stranded in Sumy finally b...,Special Correspondent,National,NEW DELHI,"March 11, 2022 16:18 IST","March 12, 2022 00:49 IST",Students who were rescued from the conflict zo...
7,Gold tumbles ₹ 200; silver jumps ₹ 193,PTI,Markets,New Delhi,"March 11, 2022 17:20 IST","March 11, 2022 17:20 IST",Woman trying gold ornaments at Musaddilal Jewe...
8,India's sugar exports accelerate on global pri...,Reuters,Industry,MUMBAI,"March 10, 2022 21:17 IST","March 10, 2022 23:11 IST",A worker checks the flow of sugar inside the G...
