### Scraping today's and yesterday's articles across multiple pages of the Champion News website by searching for a country name (or any keyword).

In [1]:
# importing modules
import re
import time
from datetime import date, timedelta, datetime

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Set up the Firefox web driver. No driver path is passed here, so Selenium has
# to be able to locate geckodriver on its own.
firefox_options = webdriver.FirefoxOptions()

# The previous arguments (--window-size, --disable-notifications, --no-sandbox,
# --verbose, --disable-gpu, --disable-software-rasterizer) are Chromium switches
# and are not Firefox command-line options, so the Firefox equivalents are used.
firefox_options.add_argument("--width=1920")
firefox_options.add_argument("--height=1080")
# Turn off web notifications so a permission prompt cannot cover page elements.
firefox_options.set_preference("dom.webnotifications.enabled", False)

# Backward-compatible alias for the name this notebook used before.
chrome_options = firefox_options

driver = webdriver.Firefox(options=firefox_options)

# Open the site's home page; the search is started from here in the next cell.
driver.get("https://championnews.com.ng")

In [3]:
# Keyword to search for (a country name or any other term)
SEARCH_KEYWORD = 'nigeria'

# Use explicit waits so the lookups below do not race the page load or the
# search box opening after the click.
search_wait = WebDriverWait(driver, 10)

# Clicking the search icon
search_icon = search_wait.until(
    EC.element_to_be_clickable((By.XPATH, "//span[@class='search-handler']"))
)
search_icon.click()

# Get the input elements
search_input = search_wait.until(
    EC.visibility_of_element_located((By.XPATH, "//input[@class='search-field']"))
)
# Not used for submitting (Enter is sent below); kept so the name stays defined.
search_button = driver.find_element(By.XPATH, "//input[@value='Search']")

# Send the keyword to the webpage and submit the search with Enter
search_input.send_keys(SEARCH_KEYWORD)
search_input.send_keys(Keys.RETURN)

In [4]:
# Define today's date and yesterday's date

today = date.today()
yesterday = today - timedelta(days=1)

# Month numbers keyed by the capitalised first three letters of the month name
_MONTH_NUMBERS = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}

# "<month name>[.] <day>, <year>", e.g. "Oct 29, 2023" or "October 29, 2023"
_POST_DATE_PATTERN = re.compile(r'([A-Za-z]{3,})\.?\s+(\d{1,2}),\s*(\d{4})')


def post_date_check(post_date, reference_date=None):
    """Return True when the date found in ``post_date`` is "today" or "yesterday".

    Parameters
    ----------
    post_date : str
        Text containing a date such as ``'Oct 29, 2023'``. Full month names
        (``'October 29, 2023'``) and other casings are accepted as well.
    reference_date : datetime.date, optional
        The day to treat as "today". When omitted, the module-level ``today``
        and ``yesterday`` values are used.

    Returns
    -------
    bool
        True if the parsed date equals the reference day or the day before it.
        False if no date is found, the month name is unknown, or the date is
        not a real calendar date (e.g. ``'Feb 30, 2023'``).
    """
    date_match = _POST_DATE_PATTERN.search(post_date or '')
    if not date_match:
        return False

    month, day, year = date_match.groups()
    # Only the first three letters identify the month, so "Sept" and "October" work too.
    month_number = _MONTH_NUMBERS.get(month[:3].capitalize())
    if month_number is None:
        return False

    try:
        article_date = date(int(year), month_number, int(day))
    except ValueError:
        # Day/year out of range for the calendar: treat as "not a recent article".
        return False

    if reference_date is None:
        newest, oldest = today, yesterday
    else:
        newest, oldest = reference_date, reference_date - timedelta(days=1)

    return article_date == newest or article_date == oldest


In [6]:
# Try the date filter on two fixed sample dates; the printed result depends on
# the day the notebook is run.
for sample_date in ('Oct 29, 2023', 'Oct 27, 2023'):
    print(post_date_check(sample_date))

True
False


In [7]:
# list for storing the article URLs
todays_and_yesturdays_article_urls = []
# list for storing each article's post date, because the post date is not
# available on the single-article page
articles_postdate = []

# How often to scroll to the bottom of a results page, and how long to pause
# after each scroll so that further results have time to load
SCROLL_PASSES = 5
SCROLL_PAUSE_SECONDS = 3

# Scraping articles with pagination
new_articles = True

# Loop through the result pages until a page contains an article that is
# neither from today nor from yesterday
while new_articles:

    # Scrolling down to load more search results
    for _ in range(SCROLL_PASSES):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_SECONDS)

    # collecting all articles from the current page
    article_list = driver.find_elements(By.XPATH, "//div//article")

    # Extract the URLs of today's and yesterday's articles
    for article in article_list:
        # Extract the date; an entry without one cannot be classified, so skip it
        try:
            date_text = article.find_element(By.CLASS_NAME, "time").text
        except NoSuchElementException:
            continue

        if not post_date_check(date_text):
            # Older article found: finish scanning this page, then stop paginating
            new_articles = False
            continue

        try:
            a_link = article.find_element(By.TAG_NAME, "h2").find_element(By.TAG_NAME, "a").get_attribute("href")
        except NoSuchElementException:
            # No headline link to follow for this entry
            continue

        # Keep the two lists aligned and avoid storing the same article twice
        if a_link and a_link not in todays_and_yesturdays_article_urls:
            todays_and_yesturdays_article_urls.append(a_link)
            articles_postdate.append(date_text)

    # No need to load another page once an older article has been seen
    if not new_articles:
        break

    # Find and click the pagination link for the next page
    try:
        next_page_link = driver.find_element(By.XPATH, "//div[@class='older']//a")
    except NoSuchElementException:
        print("No 'Next Page' link found. Exiting the loop.")
        break  # Exit the loop if there is no "Next Page" link
    next_page_link.click()

In [8]:
# Number of article URLs collected by the pagination loop
len(todays_and_yesturdays_article_urls)

4

In [9]:
# Number of collected post dates; should equal the number of URLs, since both
# lists are appended to together
len(articles_postdate)

4

In [15]:
# Set pandas display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Initialize an empty list to store one dictionary per article
data_list = []

# A single wait object is reused for every article page
wait = WebDriverWait(driver, 10)  # Adjust the timeout as needed

for article_url in todays_and_yesturdays_article_urls:
    driver.get(article_url)
    # Wait for the article page to load; the headline element found here is reused below
    article_element = wait.until(EC.presence_of_element_located((By.XPATH, "//h1[@class='single-post-title']")))

    # collecting the data
    headline = article_element.text
    p_content = driver.find_elements(By.XPATH, "//div[@class='continue-reading-content close']//p")

    # Join the paragraphs with a space so the last word of one paragraph is not
    # fused to the first word of the next; empty paragraphs are dropped
    paragraph_texts = [paragraph.text.strip() for paragraph in p_content]
    content = " ".join(text for text in paragraph_texts if text)

    content = content.replace("    ", " ")

    # Append the data to the list as a dictionary
    data_list.append({"Headline": headline, "Content": content})
    

In [16]:
# Create the DataFrame from the list of per-article dictionaries
# (one row per article, columns "Headline" and "Content")
df = pd.DataFrame(data_list)

In [17]:
# Attach the post dates gathered from the search results; this relies on
# articles_postdate being in the same order as the scraped articles
df['Post_date'] = articles_postdate

In [18]:
# Number of scraped articles in the DataFrame
len(df)

4

In [19]:
# Show one randomly chosen row as a spot check of the scraped data
df.sample(1)

Unnamed: 0,Headline,Content,Post_date
1,Idris urges Nigerians to unite for the Renewed Hope Agenda as the Supreme Court validates Presidential Election Result,"CHIDIMMA UCHEGBU – Abuja The Minister of Information and National Orientation, Alhaji Mohammed Idris has urged Nigerians to unite and support the Renewed Hope Agenda of President Bola Ahmed Tinubu as the Supreme Court validates presidential election results. The minister stated this at a press conference in Abuja in the aftermath of the ruling by the Supreme Court of Nigeria on the outcome of the 2023 presidential election. The event was held on October 27, 2023. The Hon. Minister said that the legal contest and distraction regarding the outcome of the Presidential election is over, and it is time for all Nigerians to come together and move the Nation forward, adding that since the President assumed office on May 29, he has worked tirelessly to actualize the Renewed Hope Agenda by implementing unprecedented reforms that seem tough and painful in the short term but are necessary foundations for the economic growth and prosperity that Nigerians deserve. Alhaji Idris said the government is aware of the present challenges caused by the removal of petrol subsidies and the ongoing liberalization of the foreign exchange regime and is taking urgent and deliberate steps to alleviate the pain and cushion the effects on Nigerians. He noted that the removal of petrol subsidies is freeing up significant resources for Federal and State Governments to invest in infrastructure and welfare programs for the benefit of citizens. He said that the ongoing liberalization of the foreign exchange regime is blocking loopholes and gaps that have been exploited in very evil ways over the years, leading to the loss of billions of dollars. 
The Minister highlighted over ten key programmes undertaken by the present administration, which include the provisional wage increment of N35,000 monthly for six months to enhance the Federal minimum wage, the establishment of an Infrastructure Support Fund, and the Launch of a 100 Billion Naira CNG bus rollout programme, payment of Cash Transfer of N25,000 monthly to the poorest and most vulnerable Nigerians for 3 months, the release of 200,000 metric tonnes of grains from strategic reserves to households across the 36 states and FCT to moderate prices, and 225,000 metric tonnes of fertilizer, seedlings, and other inputs to farmers. Others include the five Executive Orders, aimed at improving Nigeria’s business and fiscal environment and increasing foreign exchange supply, the establishment of a Presidential Fiscal Policy and Tax Reform Committee to reform the tax system and reduce the tax burden and complications on Nigerians, An access-to-credit programme for start-ups and MSMEs: providing N50 billion in Conditional Grants to 1 million Nano-businesses across Nigeria between now and March 2024; and a new single-digit interest-rate Fund to provide N75 billion to support manufacturing enterprises, among others. The Minister also added that the government had launched the 3MTT programme to develop 3 million technical talents by 2025, in line with the President’s vision for making Nigeria a global hub for digital jobs. He said the government has also launched the National Talent Export Programme (NATEP), to create one million service-export jobs over the next 5 years, and make Nigeria a global business outsourcing hub, while the new student loan program is expected to kick off in January 2024. The Minister assured that the Federal Government will not abdicate its responsibilities but noted that State and Local Governments have significant roles to play. 
He recalled that during a recent Executive Council of the Federation (ECF) meeting, several financing packages were approved to support States’ delivery of various programmes, including access to education for adolescent girls. The Minister assured that the government will continue to strive to rebuild the trust of citizens in the ways and workings of the Government and boost the credibility of public information and communication. The Minister was accompanied in the Press Briefing by some of the newly appointed Directors-General of Agencies and Parastatals under the Ministry. For a better society—————————————————————–Kindly follow us across all our social media platforms to stay up-to-date with the latest news and happenings in Nigeria and across the globe.Facebook – https://facebook.com/championnewsonlineInstagram – https://instagram.com/championnewsonlineTwitter– @championnewsng","Oct 29, 2023"


In [None]:
# Store the scraped data in an Excel file, without the DataFrame index column
df.to_excel('championnews.xlsx', index=False)

In [20]:
# Quit the WebDriver session now that scraping is finished
driver.quit()