In [1]:
"""
requests_html_tutorial.py
~~~~~~~~~~~~~~~~~~~~~~~~~

This module demonstrates the usage of the `requests-html` library in Python for web scraping.
It covers various real-world examples including rendering JavaScript, extracting data, and
working with HTML elements. This script is designed as an educational tool for understanding
web scraping using Python.

Author: [Your Name]
Version: 1.0
"""



import os
import mysql.connector
from mysql.connector import Error
from dotenv import load_dotenv
from database_connection import create_db_connection
from requests_html import HTMLSession
import pandas as pd 
import numpy as np

from news_insert import (execute_query,
                        insert_reporter, 
                        insert_category, 
                        insert_news,
                        insert_publisher,
                        insert_image,
                        insert_summary
                        )

police_dict = {}
police_dict['category'] = []
police_dict['title'] = []
police_dict['date_and_time'] = []
police_dict['reporter'] = []
police_dict['body'] = []
police_dict['image_link'] = []
police_dict['page_link'] = []

def process_and_insert_news_data(connection,category, title, body, image_link, page_link, author,
       date_and_time, category_description, reporter_mail, publisher_name,
       publisher_email, publisher_phone, head_office_address, website ):
    
    try:
        # Insert category if not exists
        category_id = insert_category(connection, category, category_description)
        
        # Insert reporter if not exists
        reporter_id = insert_reporter(connection, author, reporter_mail)
        
        # Insert publisher as a placeholder (assuming publisher is not provided)
        publisher_id = insert_publisher(connection, publisher_name, publisher_email,publisher_phone,head_office_address,website,
                                        "facebook.com/bdnews24" , "twitter.com/bdnews24" , 
                                        "linkedin.com/bdnews24" , "instagram.com/bdnews24")
        
        # Insert news article
        news_id = insert_news(connection, category_id, reporter_id, publisher_id, date_and_time, title, body, page_link)
        
        # Insert images
        image_id = insert_image(connection, news_id, image_link)
    
    except Error as e:
        print(f"Error while processing news data - {e}")

def render_javascript(url):
    """
    Demonstrates how to render JavaScript using the `requests-html` library.
    This function fetches the page content after JavaScript has been executed.

    Parameters:
    url : str
        The URL of the website to scrape.

    Returns:
    None
    """
    session = HTMLSession()
    try:
        response = session.get(url)
        # response.html.render()  # This will download Chromium if not found
        print("Rendered web page:", response.html.html)
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        session.close()

def extract_title_link(url):
    session = HTMLSession()
    try:
        link_lists = []
        response = session.get(url)
        title_link_lead = response.html.find("div.Cat-lead-wrapper > a")
        print(title_link_lead[0].attrs['href'])
        link_lists.append(title_link_lead[0].attrs['href'])

        second_link = response.html.find("div.rm-container > a")
        # print(second_link[0].attrs['href'])
        # print(second_link[1].attrs['href'])
        

        for i in range (len(second_link)):
            print(second_link[i].attrs['href'])
            link_lists.append(second_link[i].attrs['href'])


        
        return link_lists


    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        session.close()

def extract_information(url):
    """
    Extracts and prints specific information from a webpage using CSS selectors.

    Parameters:
    url : str
        The URL of the website to scrape.

    Returns:
    None
    """
    session = HTMLSession()
    try:



        response = session.get(url)
        police_dict['page_link'].append(url)
        print(f"news link",url)

        # Example: Extracting category
        category = response.html.find("ul.ignore-print > li > a")
        print(f"Category : {category[1].text}\n")
        police_dict['category'].append(category[1].text)


        #Extracting Title
        title = response.html.find("div.d-flex > h1")
        print(f"Title : {title[0].text}\n")
        police_dict['title'].append(title[0].text)

        # reporter
        reporter = response.html.find(".author")
        print(f"Reporter : {reporter[0].text}\n")
        police_dict['reporter'].append(reporter[0].text)

        #Extracting Time 
        date_and_time = response.html.find("div.pub-up > p > span")
        
        print(f"Date & Time : {date_and_time[1].text}\n")
        police_dict['date_and_time'].append(date_and_time[1].text)
        # print(len(times))

        #Extracting body
        news_body = response.html.find("div.details-brief")
        body = ""
        print("Body : ")
        for i in news_body:
            print(i.text)
            body += i.text
        police_dict['body'].append(body)
        print("\n")

        #extracting Image source
        img = response.html.find("div.details-img > picture > img.img-fluid")
        print("Image :", img[0].attrs['src'])
        police_dict['image_link'].append(img[0].attrs['src'])

        
        

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        session.close()

def main():
    """
    Main function to execute the web scraping examples.
    """

    pass



if __name__ == "__main__":
    conn = create_db_connection()
    link_lists = extract_title_link("https://bdnews24.com/bangladesh/")
    for i in link_lists:
        extract_information(i)
    df = pd.DataFrame.from_dict(police_dict)
    # Data preprocessing
    df['category_description'] = "All news of " + df['category']
    df['reporter_mail'] = df['reporter']+"@hotmail.com"
    df['publisher_name'] = "BD NEWS 24"
    df['publisher_email'] = "editor@bdnews24.com"
    df['publisher_phone'] = "+8801857678105"
    df['head_office_address'] = "17 Mohakhali C/A Red Crescent Concord Tower, 17th Floor Dhaka-1212, Bangladesh"
    df['website'] = "https://bdnews24.com/"

    #insert into database
    lenth = df.shape[0]
    for i in range (lenth):
        process_and_insert_news_data(conn,df['category'][i], df['title'][i],
                                  df['body'][i], df['image_link'][i], df['page_link'][i],df['reporter'][i],df['date_and_time'][i], 
                                  df['category_description'][i], df['reporter_mail'][i], df['publisher_name'][i],df['publisher_email'][i], 
                                  df['publisher_phone'][i], df['head_office_address'][i], df['website'][i])



MySQL Database connection successful
https://bdnews24.com/bangladesh/7ad8f731b41f
https://bdnews24.com/bangladesh/632a8aa248d7
https://bdnews24.com/bangladesh/b82b25a49e16
https://bdnews24.com/bangladesh/38ea49461a08
https://bdnews24.com/bangladesh/9d7b746486e3
https://bdnews24.com/bangladesh/36c058243d17
https://bdnews24.com/bangladesh/578802c40a3f
https://bdnews24.com/bangladesh/53b0be060efd
https://bdnews24.com/bangladesh/5cd8db42b186
news link https://bdnews24.com/bangladesh/7ad8f731b41f
Category : Bangladesh

Title : Indian defence minister tells military to watch out for conflicts in Bangladesh, China

Reporter : News Desk

Date & Time : 06 Sep 2024, 11:48 PM

Body : 
India’s Defence Minister Rajnath Singh has urged the country’s armed forces to analyse the ongoing conflicts between Israel-Hamas, Russia-Ukraine, and the current state of affairs in Bangladesh to prepare India for the “unexpected”.
He made the remarks on Thursday during a joint commanders’ conference in India’s Luc

In [11]:
from requests_html import HTMLSession
session  = HTMLSession()
response = session.get('https://bdnews24.com/bangladesh/53b0be060efd')
 # Example: Extracting category
category = response.html.find("ul.ignore-print > li > a")
# for link in category:
#     print(f"Category : {link.text} ")
# print(f"Category : {category[2].text}\n")
print(category[1].text)
title = response.html.find("div.d-flex > h1")
# print(f"Title : {title[0].text}\n")
print(title[0].text)
date_and_time = response.html.find("div.pub-up > p > span")
print(date_and_time[1].text)

body = response.html.find("div.details-brief")
print("Body : ")
for i in body:
    print(i.text)
print("\n")
img = response.html.find("div.details-img > picture > img.img-fluid")
print(img[0].attrs['src'])

reporter = response.html.find(".author")
print(reporter[0].text)

Bangladesh
Battery-run rickshaws cause traffic chaos on return to Dhaka’s main roads
06 Sep 2024, 01:37 AM
Body : 
Battery-powered rickshaws and easy bikes, banned from Dhaka’s main roads, have defied restrictions over the past few days.
Since Aug 5, these vehicles have been dominating main roads, contributing to traffic disarray with their erratic movements and use of wrong lanes.
Drivers and passengers of other vehicles, as well as police officers, have reported increasing concerns about the disorder caused by these three-wheeled vehicles.
During a recent visit to areas like Mirpur, Dhanmondi, Shyamoli, Mohakhali, and Badda, numerous battery-operated rickshaws and easy bikes were operating on every major road.
These rickshaws often cut in front of faster vehicles and navigate turns unpredictably, causing confusion and increasing risks on the roads.
Manjur Hossain, a battery-operated rickshaw-puller from Agargaon, said: “We would earlier avoid main roads during the day, but now with n

In [18]:
print(len(police_dict['category']))
print(len(police_dict['date_and_time']))
print(len(police_dict['title']))
print(len(police_dict['body']))
print(len(police_dict['reporter']))
print(len(police_dict['image_link']))
print(len(police_dict['page_link']))


9
8
9
8
8
8
9


In [56]:
link_lists

['https://bdnews24.com/bangladesh/632a8aa248d7',
 'https://bdnews24.com/bangladesh/578802c40a3f',
 'https://bdnews24.com/bangladesh/53b0be060efd',
 'https://bdnews24.com/bangladesh/5cd8db42b186',
 'https://bdnews24.com/bangladesh/e09b5b4fcc01',
 'https://bdnews24.com/bangladesh/33b6e8ab3111',
 'https://bdnews24.com/bangladesh/c2fa423723a2',
 'https://bdnews24.com/bangladesh/96827d3726a3',
 'https://bdnews24.com/bangladesh/0b17db78ab86',
 'https://bdnews24.com/bangladesh/433e76dbe3e7',
 'https://bdnews24.com/bangladesh/36c058243d17',
 'https://bdnews24.com/bangladesh/578802c40a3f',
 'https://bdnews24.com/bangladesh/53b0be060efd',
 'https://bdnews24.com/bangladesh/5cd8db42b186',
 'https://bdnews24.com/bangladesh/e09b5b4fcc01',
 'https://bdnews24.com/bangladesh/33b6e8ab3111',
 'https://bdnews24.com/bangladesh/c2fa423723a2',
 'https://bdnews24.com/bangladesh/96827d3726a3']

In [None]:
import os
import mysql.connector
from mysql.connector import Error
from dotenv import load_dotenv
from database_connection import create_db_connection
from requests_html import HTMLSession
import pandas as pd 
import numpy as np

from news_insert import (execute_query,
                        insert_reporter, 
                        insert_category, 
                        insert_news,
                        insert_publisher,
                        insert_image,
                        insert_summary
                        )
def process_and_insert_news_data(connection,category, title, body, image_link, page_link, author,
       time_date, category_description, reporter_mail, publisher_name,
       publisher_email, publisher_phone, head_office_address, website ):
    
    try:
        # Insert category if not exists
        category_id = insert_category(connection, category, category_description)
        
        # Insert reporter if not exists
        reporter_id = insert_reporter(connection, author, reporter_mail)
        
        # Insert publisher as a placeholder (assuming publisher is not provided)
        publisher_id = insert_publisher(connection, publisher_name, publisher_email,publisher_phone,head_office_address,website,
                                        "facebook.com/bdnews24" , "twitter.com/bdnews24" , 
                                        "linkedin.com/bdnews24" , "instagram.com/bdnews24")
        
        # Insert news article
        news_id = insert_news(connection, category_id, reporter_id, publisher_id, time_date, title, body, page_link)
        
        # Insert images
        image_id = insert_image(connection, news_id, image_link)
    
    except Error as e:
        print(f"Error while processing news data - {e}")

In [4]:
# import pandas as pd
police_dict

{'category': ['', '', '', '', '', '', '', '', ''],
 'title': ['Bangladesh wants to resolve Teesta water-sharing issues while following international rules: Yunus',
  '‘Fascist’ Awami League’s political future depends on the people, says Advisor Asif Mahmud',
  "Ganabhaban to be turned into ‘July Revolution Memorial Museum'",
  'Battery-run rickshaws cause traffic chaos on return to Dhaka’s main roads',
  'Awal commission’s exit leaves EC vacant for first time in over a decade',
  '‘Shahidi March’: pledge to build a Bangladesh free of discrimination',
  'Bangladesh asks India to conduct inquiries into all border killings',
  "Bangladeshi students rally to mark one month since Hasina's fall",
  'One month of mass uprising: Yunus’s message to nation'],
 'date_and_time': ['06 Sep 2024, 04:38 PM',
  '06 Sep 2024, 02:54 AM',
  '06 Sep 2024, 02:44 AM',
  '06 Sep 2024, 01:37 AM',
  '06 Sep 2024, 01:32 AM',
  '05 Sep 2024, 11:45 PM',
  '05 Sep 2024, 11:14 PM',
  '05 Sep 2024, 09:10 PM'],
 'repo

In [48]:
from requests_html import HTMLSession
session  = HTMLSession()
url = "https://bdnews24.com/bangladesh/"
link_lists = []
def extract_title_link(url): #use only once
    session = HTMLSession()
    try:
        # link_lists = []
        response = session.get(url)
        title_link_lead = response.html.find("div.Cat-lead-wrapper > a")
        print(title_link_lead[0].attrs['href'])
        link_lists.append(title_link_lead[0].attrs['href'])

        second_link = response.html.find("div.rm-container > a")
        # print(second_link[0].attrs['href'])
        # print(second_link[1].attrs['href'])
        

        for i in range (len(second_link)):
            print(second_link[i].attrs['href'])
            link_lists.append(second_link[i].attrs['href'])

       
        
        return link_lists


    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        session.close()

In [49]:
extract_title_link("https://bdnews24.com/bangladesh/")

https://bdnews24.com/bangladesh/632a8aa248d7
https://bdnews24.com/bangladesh/578802c40a3f
https://bdnews24.com/bangladesh/53b0be060efd
https://bdnews24.com/bangladesh/5cd8db42b186
https://bdnews24.com/bangladesh/e09b5b4fcc01
https://bdnews24.com/bangladesh/33b6e8ab3111
https://bdnews24.com/bangladesh/c2fa423723a2
https://bdnews24.com/bangladesh/96827d3726a3
https://bdnews24.com/bangladesh/0b17db78ab86


['https://bdnews24.com/bangladesh/632a8aa248d7',
 'https://bdnews24.com/bangladesh/578802c40a3f',
 'https://bdnews24.com/bangladesh/53b0be060efd',
 'https://bdnews24.com/bangladesh/5cd8db42b186',
 'https://bdnews24.com/bangladesh/e09b5b4fcc01',
 'https://bdnews24.com/bangladesh/33b6e8ab3111',
 'https://bdnews24.com/bangladesh/c2fa423723a2',
 'https://bdnews24.com/bangladesh/96827d3726a3',
 'https://bdnews24.com/bangladesh/0b17db78ab86']

In [50]:
link_lists

['https://bdnews24.com/bangladesh/632a8aa248d7',
 'https://bdnews24.com/bangladesh/578802c40a3f',
 'https://bdnews24.com/bangladesh/53b0be060efd',
 'https://bdnews24.com/bangladesh/5cd8db42b186',
 'https://bdnews24.com/bangladesh/e09b5b4fcc01',
 'https://bdnews24.com/bangladesh/33b6e8ab3111',
 'https://bdnews24.com/bangladesh/c2fa423723a2',
 'https://bdnews24.com/bangladesh/96827d3726a3',
 'https://bdnews24.com/bangladesh/0b17db78ab86']

In [None]:
police_dict = {}
police_dict['category'] = []
police_dict['title'] = []
police_dict['date_time'] = []
police_dict['reporter'] = []
police_dict['body'] = []
police_dict['image_link'] = []
police_dict['page_link'] = []

In [6]:
pip install -r requirements.txt

Defaulting to user installation because normal site-packages is not writeable
Collecting pymysql (from -r requirements.txt (line 10))
  Using cached PyMySQL-1.1.1-py3-none-any.whl.metadata (4.4 kB)
Using cached PyMySQL-1.1.1-py3-none-any.whl (44 kB)
Installing collected packages: pymysql
Successfully installed pymysql-1.1.1
Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install pandas


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [11]:
category = response.html.find("ul.ignore-print > li > a")
print(f"Category : {category[0].text}\n")

NameError: name 'response' is not defined

In [1]:
from requests_html import HTMLSession
session  = HTMLSession()
response = session.get('https://bdnews24.com/bangladesh/5cd8db42b186')
 # Example: Extracting category
category = response.html.find("ul.ignore-print > li > a")
# for link in category:
#     print(f"Category : {link.text} ")
# print(f"Category : {category[2].text}\n")
print(category[1].text)
title = response.html.find("div.d-flex > h1")
# print(f"Title : {title[0].text}\n")
print(title[0].text)
date_and_time = response.html.find("div.pub-up > p > span")
print(date_and_time[1].text)

body = response.html.find("div.details-brief")
print("Body : ")
for i in body:
    print(i.text)
print("\n")
img = response.html.find("div.details-img > picture > img.img-fluid")
print(img[0].attrs['src'])

reporter = response.html.find("div.author-name-wrap > p > span.author")
print(reporter[0].text)

Bangladesh
Awal commission’s exit leaves EC vacant for first time in over a decade
06 Sep 2024, 01:32 AM
Body : 
The Election Commission has fallen vacant once again following the resignation of Kazi Habibul Awal and his team appointed during Sheikh Hasina's tenure.
The incumbent commissioners stood down on Thursday, just a month after the Hasina government was brought down by pro-democracy demonstrators, leaving the posts empty until the establishment of a new commission.
The elections regulator was halfway through its five-year term.
This is not the first time the EC has been left vacant, though.
In 2007, under Justice MA Aziz, the six-strong commission was vacant for a week after all members resigned.
Justice Aziz resigned a day before the highly controversial Jan 22 elections, with senior election commissioner Justice Mahfuzur Rahman taking over as acting chief election commissioner.
Later, all five members resigned on Jan 31.
Subsequently, a new commission led by ATM Shamsul Huda 

In [13]:
from requests_html import HTMLSession
session  = HTMLSession()
response = session.get('https://bdnews24.com/bangladesh/433e76dbe3e7')
 # Example: Extracting category
category = response.html.find("ul.ignore-print > li > a")
# for link in category:
#     print(f"Category : {link.text} ")
# print(f"Category : {category[2].text}\n")
print(category[1].text)
title = response.html.find("div.d-flex > h1")
# print(f"Title : {title[0].text}\n")
print(title[0].text)
date_and_time = response.html.find("div.pub-up > p > span")
print(date_and_time[1].text)

body = response.html.find("div.details-brief")
print("Body : ")
for i in body:
    print(i.text)
print("\n")
img = response.html.find("div.details-img > picture > img.img-fluid")
print(img[0].attrs['src'])

reporter = response.html.find("div.author-name-wrap > p > span.author")
print(reporter[0].text)

Bangladesh
Bangladesh wants to resolve Teesta water-sharing issues while following international rules: Yunus
06 Sep 2024, 04:38 PM
Body : 
Bangladesh’s interim government will ‘work with’ India in order to resolve the issues over the long-pending Teesta water-sharing treaty, says Chief Advisor Muhammad Yunus.
The issue has been left hanging for an extended period, which is not good for either country, he told Indian news organisation PTI in an interview.
Yunus spoke with PTI from his official residence in Dhaka. Many Indian outlets published reports based on the interview on Friday.
The chief advisor urged the issue to be resolved according to international norms.
“By sitting over this issue [water sharing], it is not serving any purpose. If I know how much water I will get, even if I am not happy and sign it, it would be better. This issue has to be resolved,” he said.
Asked whether the interim government would push India on the issue, he said:
“Push is a big word; I am not saying it