### Scraping Road Accident News from Dhaka Tribune using BeautifulSoup

In [1]:
# Import libraries

import requests
import json
import pandas as pd 
import time
from bs4 import BeautifulSoup as bs # 
from bs4 import BeautifulSoup

In [2]:
# The base URL
url = "https://www.dhakatribune.com/hashtag/road-accident"

In [3]:
# Creating a BeautifulSoup object
def make_soup(url):
    
    doc = requests.get(url)
    if str(doc) == "<Response [200]>":
        # create a soup object that contains the navigable html presentation of the page
        soup = bs(doc.content, 'html.parser')
        print(f"Retrieved url: {url}")
    else:
        print(f"{url} cannot be reached.")   
        
    return soup

In [4]:
soup = make_soup(url)

Retrieved url: https://www.dhakatribune.com/hashtag/road-accident


In [5]:
# What's the URL of the second page?
pagination = soup.find("div", attrs={"class": "page-pagination-area"})
next_page_url = "https://www.dhakatribune.com" + pagination.find_all('a')[1].get('href')
next_page_url

'https://www.dhakatribune.com/hashtag/road-accident/page/2'

In [6]:
# And what about the last page?
next_page_url = "https://www.dhakatribune.com" + pagination.find_all('a')[-1].get('href')
next_page_url

'https://www.dhakatribune.com/hashtag/road-accident/page/48'

In [7]:
# Let's put together all the URL that we're gonna use
next_urls = [] 

for i in range(2, 48):
    next_urls.append('https://www.dhakatribune.com/hashtag/road-accident/page/{}'.format(i))
    
all_urls = [url] + next_urls
all_urls

['https://www.dhakatribune.com/hashtag/road-accident',
 'https://www.dhakatribune.com/hashtag/road-accident/page/2',
 'https://www.dhakatribune.com/hashtag/road-accident/page/3',
 'https://www.dhakatribune.com/hashtag/road-accident/page/4',
 'https://www.dhakatribune.com/hashtag/road-accident/page/5',
 'https://www.dhakatribune.com/hashtag/road-accident/page/6',
 'https://www.dhakatribune.com/hashtag/road-accident/page/7',
 'https://www.dhakatribune.com/hashtag/road-accident/page/8',
 'https://www.dhakatribune.com/hashtag/road-accident/page/9',
 'https://www.dhakatribune.com/hashtag/road-accident/page/10',
 'https://www.dhakatribune.com/hashtag/road-accident/page/11',
 'https://www.dhakatribune.com/hashtag/road-accident/page/12',
 'https://www.dhakatribune.com/hashtag/road-accident/page/13',
 'https://www.dhakatribune.com/hashtag/road-accident/page/14',
 'https://www.dhakatribune.com/hashtag/road-accident/page/15',
 'https://www.dhakatribune.com/hashtag/road-accident/page/16',
 'https:

In [8]:
# Let's scrape the links of each news

links = []

for u in all_urls:
    
    soup = make_soup(u)
    container = soup.find("div", attrs={"class": "listing-page-news listing-page-info"})
        
    link = []
    
    for row in container.find_all('a'):
        l = row.attrs['href']
        link.append(l)
        
    links_ = ["https://www.dhakatribune.com" + x for x in link if '/articles/' not in x and '#' not in x]   
    links = links + links_
    
outputs = []
        
for x in links:
    if x not in outputs:
        outputs.append(x)

Retrieved url: https://www.dhakatribune.com/hashtag/road-accident
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/2
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/3
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/4
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/5
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/6
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/7
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/8
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/9
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/10
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/11
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/12
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/13
Retrieved url: https://www.dhakatribune.com/hashtag/ro

In [9]:
# Now the headlines of each news

headings = []

for u in all_urls:
    
    soup = make_soup(u)
    container = soup.find("div", attrs={"class": "listing-page-news listing-page-info"})
    
    for row in container.find_all('h4', attrs={"class": "news-title"}):
        # getting the heading
        heading = row.text.strip()
        headings.append(heading)

Retrieved url: https://www.dhakatribune.com/hashtag/road-accident
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/2
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/3
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/4
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/5
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/6
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/7
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/8
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/9
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/10
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/11
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/12
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/13
Retrieved url: https://www.dhakatribune.com/hashtag/ro

In [10]:
# Let's also look at the date when the news got published

times = []

for u in all_urls:
    
    soup = make_soup(u)
    container = soup.find("div", attrs={"class": "listing-page-news listing-page-info"})
    
    for row in container.find_all('h4'):
        # getting the publish time
        time = row.text
        times.append(time)
    
    times = [x.strip() for x in times if '\n' not in x]

Retrieved url: https://www.dhakatribune.com/hashtag/road-accident
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/2
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/3
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/4
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/5
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/6
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/7
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/8
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/9
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/10
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/11
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/12
Retrieved url: https://www.dhakatribune.com/hashtag/road-accident/page/13
Retrieved url: https://www.dhakatribune.com/hashtag/ro

In [11]:
# let's create a dataframe with all the info in hand

column_names = ['date_of_incident', 'time_of_incident', 'incident_type', 'location',
       'death_count', 'injury_count', 'type_of_vehicle1', 'type_of_vehicle2',
       'driver_age', 'description_text', 'published-time', 'link', 'full_text']

df = pd.DataFrame(columns = column_names)
df['description_text'] = headings
df['published-time'] = times
df['link'] = outputs
df.head()

Unnamed: 0,date_of_incident,time_of_incident,incident_type,location,death_count,injury_count,type_of_vehicle1,type_of_vehicle2,driver_age,description_text,published-time,link,full_text
0,,,,,,,,,,2 killed in Dhaka road crash,"Sat, Jun 26 2021",https://www.dhakatribune.com/accident/2021/06/...,
1,,,,,,,,,,Three killed in Muktagacha road...,"Wed, Jun 23 2021",https://www.dhakatribune.com/accident/2021/06/...,
2,,,,,,,,,,Truck-pickup van collision leaves 3...,"Sat, Jun 19 2021",https://www.dhakatribune.com/bangladesh/nation...,
3,,,,,,,,,,Bogra road crash kills 3,"Sat, Jun 19 2021",https://www.dhakatribune.com/bangladesh/nation...,
4,,,,,,,,,,Cop killed in Dhaka road crash,"Sat, Jun 19 2021",https://www.dhakatribune.com/bangladesh/dhaka/...,


In [12]:
# Let's populate the full_text column as well

In [13]:
%%time

links = df.link
full_text = []

for link in links:

        URL = link
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')

        cont = container = soup.find("div", attrs={"class": "report-content fr-view"})

        parahraphs = cont.find_all('p')
        para = []
        
        for i in list(range(len(parahraphs))):
            para.append(cont.find_all('p')[i].get_text())
            s = ' '.join(para)
        full_text.append(s)
        
df['full_text'] = full_text
        
display(df.tail())
df.to_csv('dhakatribune_1jan2016-29jun2021_v2.csv', index = False)

Unnamed: 0,date_of_incident,time_of_incident,incident_type,location,death_count,injury_count,type_of_vehicle1,type_of_vehicle2,driver_age,description_text,published-time,link,full_text
982,,,,,,,,,,6 killed in Pabna road accident,"Mon, Aug 15 2016",https://www.dhakatribune.com/bangladesh/2016/0...,The vice-president of Jatiyatabadi Chhatra Dal...
983,,,,,,,,,,Road accident kills 1 in Natore,"Fri, Aug 12 2016",https://www.dhakatribune.com/bangladesh/2016/0...,The vice-president of Jatiyatabadi Chhatra Dal...
984,,,,,,,,,,Traffic constable dies after truck runs...,"Thu, Aug 4 2016",https://www.dhakatribune.com/bangladesh/2016/0...,The vice-president of Jatiyatabadi Chhatra Dal...
985,,,,,,,,,,Traffic constable crushed under truck...,"Thu, Aug 4 2016",https://www.dhakatribune.com/bangladesh/2016/0...,The vice-president of Jatiyatabadi Chhatra Dal...
986,,,,,,,,,,Two motorcyclists killed in Manikganj...,"Sat, Jul 16 2016",https://www.dhakatribune.com/bangladesh/2016/0...,The vice-president of Jatiyatabadi Chhatra Dal...


Wall time: 2min 37s


Well, there are some errors and we gotta fix this! The texts extracted from links starting from index 736 are repetitive. There is a slight change in the HTML which causes this. The texts are in text format instead of being in paragraphs for these indices.  

In [14]:
%%time 

links_left = list(df.link[736:])
full_text_left = []

for link in links_left:

        URL = link
        page = requests.get(URL)
        soup = BeautifulSoup(page.content, 'html.parser')

        cont = container = soup.find("div", attrs={"class": "report-content fr-view"})
        parahraphs = cont.text.strip().replace('\n', ' ')
        
        full_text_left.append(parahraphs)
        
df.full_text[736:] = full_text_left

display(df.tail())
df.to_csv('dhakatribune_1jan2016-29jun2021_v2.csv', index = False)

Unnamed: 0,date_of_incident,time_of_incident,incident_type,location,death_count,injury_count,type_of_vehicle1,type_of_vehicle2,driver_age,description_text,published-time,link,full_text
982,,,,,,,,,,6 killed in Pabna road accident,"Mon, Aug 15 2016",https://www.dhakatribune.com/bangladesh/2016/0...,"Of the deceased, five were identified as Abul ..."
983,,,,,,,,,,Road accident kills 1 in Natore,"Fri, Aug 12 2016",https://www.dhakatribune.com/bangladesh/2016/0...,"The deceased was identified as Ali Hossain, 50..."
984,,,,,,,,,,Traffic constable dies after truck runs...,"Thu, Aug 4 2016",https://www.dhakatribune.com/bangladesh/2016/0...,The constable was identified as RG Jojon Akhan...
985,,,,,,,,,,Traffic constable crushed under truck...,"Thu, Aug 4 2016",https://www.dhakatribune.com/bangladesh/2016/0...,An on-duty traffic constable has been crushed ...
986,,,,,,,,,,Two motorcyclists killed in Manikganj...,"Sat, Jul 16 2016",https://www.dhakatribune.com/bangladesh/2016/0...,At least two motorcyclists have been killed as...


Wall time: 40.2 s


### Reference

* https://github.com/subrockmann/news_scraping/blob/main/Scraping_road_accidents.ipynb