# IDEAS PROJECT SCRAPER

This notebook contains the code for the scraper bot that crawls the World Bank IDEAS Project website and extracts data.
To ensure ethical scraping, only the pages listed in the website's sitemap are crawled for data.
The location of this sitemap is given in the site's [robots.txt](https://ideasproject.gov.ng/robots.txt) file.

Make sure to `cd` into the folder where you want your dataset to be stored, for ease of retrieval: the scraped pages are written as `.txt` files into the current working directory.

In [None]:
!pip install selenium
!pip install selenium_stealth
!pip install selenium webdriver-manager


Collecting selenium
  Downloading selenium-4.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.28.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.28.1-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.28.0-py3-none-any.whl (486 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.3/486.3 kB[0m [31m21.7 MB/s

In [None]:
# import libraries
from selenium import webdriver
from selenium_stealth import stealth
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import pandas as pd
import numpy as np
import re
import time
import random
import os

In [None]:
# Build the Chrome options shared by every scraper browser session
def initialize_driver_options():
    """Create the ChromeOptions object used to launch the scraper's browser.

    Returns:
        An ``Options`` instance with the command-line switches below added
        in the order they are listed.
    """
    chrome_flags = (
        # run in headless mode
        "--headless",
        # disable the AutomationControlled feature of the Blink rendering engine
        '--disable-blink-features=AutomationControlled',
        # disable pop-up blocking
        '--disable-popup-blocking',
        # start the browser window in maximized mode
        '--start-maximized',
        # disable extensions
        '--disable-extensions',
        # disable sandbox mode
        '--no-sandbox',
        # disable shared memory usage
        '--disable-dev-shm-usage',
    )

    chrome_options = Options()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    return chrome_options

In [None]:
# Pool of browser user-agent strings; one is drawn with random.choice for
# every new browser session.
# Several strings appear more than once, so a random draw is weighted towards
# the repeated ones. The order and the repeats are kept as they were.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
]

In [118]:
# Function to save page content
def save_page_content(url, content):
    """Write a page's scraped text to ``<name>.txt`` in the current directory.

    The file name is the last non-empty "/"-separated segment of ``url``.
    Trailing slashes are stripped first, so ``https://site/some-page/`` is
    saved as ``some-page.txt``. Without that step the last segment is empty
    and every URL ending in "/" is written to the same ``.txt`` file.

    Args:
        url: Address of the scraped page; only used to derive the file name.
        content: Text to write (encoded as UTF-8).

    Returns:
        The name of the file that was written.
    """
    # Extract filename from URL, ignoring any trailing "/"
    page_name = url.rstrip("/").split("/")[-1]
    if not page_name:
        # nothing usable in the URL (e.g. an empty string): fall back to a fixed name
        page_name = "index"
    filename = page_name + ".txt"

    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)
    return filename

In [None]:
def get_links(sitemap):
    """Collect the hyperlinks listed in the URL table of a sitemap page.

    A fresh headless Chrome session is started with a randomly chosen user
    agent and selenium-stealth enabled, the page is loaded, and the ``href``
    of every ``<a>`` inside the cells of each element with id
    ``sitemap__table`` is gathered.

    Args:
        sitemap: URL of the sitemap page to open.

    Returns:
        List of ``href`` values in document order; empty when no
        ``sitemap__table`` element is found.
    """
    # initialise driver options
    options = initialize_driver_options()

    # request for a random and different user agent everytime
    user_agent = random.choice(user_agent_list)
    options.add_argument(f'user-agent={user_agent}')

    # initialise the fortified scraper
    driver = webdriver.Chrome(options=options)

    site_map_list = []
    try:
        driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": user_agent})

        # enable stealth mode
        stealth(
            driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
        )

        driver.get(sitemap)
        time.sleep(2)

        # fetch the sitemap table(s) of urls
        tables = driver.find_elements(By.ID, 'sitemap__table')
        time.sleep(2)

        # loop through each table; the counter now advances per table
        # instead of always printing 1
        for i, table in enumerate(tables, start=1):
            time.sleep(2)
            print("in row", i)
            for row in table.find_elements(By.TAG_NAME, 'tr'):
                for cell in row.find_elements(By.TAG_NAME, 'td'):
                    for page in cell.find_elements(By.TAG_NAME, "a"):
                        site_map_list.append(page.get_attribute('href'))
    finally:
        # always close the browser, even when loading or parsing fails
        driver.quit()

    return site_map_list

In [None]:
# Top-level sitemap URL of the site; collect the links listed on that page
sitemap = "https://ideasproject.gov.ng/wp-sitemap.xml"
site_map_links = get_links(sitemap)

in row 1


In [None]:
# number of links found on the top-level sitemap page
len(site_map_links)

11

In [None]:
# accumulator: receives one list of page URLs per sitemap link that is crawled
links = []

In [None]:
# get main links in batches
# this batch covers the sitemap links from index 5 onward; the URL list returned
# for each one is added to `links` as a single element
links.extend(get_links(site) for site in site_map_links[5:])
links

in row 1
in row 1
in row 1
in row 1
in row 1
in row 1


[['https://ideasproject.gov.ng/ideas-project-world-bank-mission-visit-edo-state/',
  'https://ideasproject.gov.ng/capacity-building-for-procurement-and-financial-management-project-staff/',
  'https://ideasproject.gov.ng/human-resource-strategy-committee-workshop/',
  'https://ideasproject.gov.ng/communication-officers-and-innovation-grant-managers-training-workshop-on-ideas-project/',
  'https://ideasproject.gov.ng/innovation-grant-call-for-proposal/',
  'https://ideasproject.gov.ng/igf-press-release/',
  'https://ideasproject.gov.ng/human-resource-strategy-for-skills-development-implementation/',
  'https://ideasproject.gov.ng/innovation-grant-call-for-proposal-2/',
  'https://ideasproject.gov.ng/general-procurement-notice/',
  'https://ideasproject.gov.ng/kano-state-ideas-project-launch/',
  'https://ideasproject.gov.ng/validation-workshop-for-the-review-of-10-ntc-antc-curricular/',
  'https://ideasproject.gov.ng/kano-spiu-innovation-grant-call-for-proposal/',
  'https://ideasprojec

In [None]:
def visit_page(link):
    """Open a page in a stealth headless browser and save its text to disk.

    The text of every element with class ``wpb_wrapper`` is concatenated
    (without separators) and handed to ``save_page_content`` together with
    ``link``.

    Args:
        link: URL of the page to scrape.
    """
    # initialise driver options
    options = initialize_driver_options()

    # request for a random and different user agent everytime
    user_agent = random.choice(user_agent_list)
    options.add_argument(f'user-agent={user_agent}')

    # initialise the fortified scraper
    driver = webdriver.Chrome(options=options)

    try:
        driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": user_agent})

        # enable stealth mode
        stealth(
            driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
        )

        driver.get(link)
        time.sleep(2)

        wrappers = driver.find_elements(By.CLASS_NAME, "wpb_wrapper")
        content = "".join(wrapper.text for wrapper in wrappers)
        save_page_content(link, content)
    finally:
        # always close the browser, even when loading or saving fails
        driver.quit()

In [None]:
# visit each page one after the other and save their content
# NOTE(review): only the URL list at index 10 of `links` is processed here —
# presumably this cell was re-run with a different index for each batch; confirm
for link in links[10]:
  visit_page(link)


In [None]:
# find number of links scraped == number of documents created
# (stored as total_links so the built-in sum() is not shadowed for later cells)
total_links = sum(len(page_links) for page_links in links)
total_links

135

In [None]:
# remove empty files
source = "/content/sample_data"
for file in os.listdir(source):
    # only look at text files; ".txt" (with the dot) avoids matching names
    # that merely end in the letters "txt"
    if file.endswith(".txt"):
        path = os.path.join(source, file)
        # if file size is 0, it is empty
        if os.path.getsize(path) == 0:
            os.remove(path)

In [None]:
# number of files left after removing empty ones
# NOTE(review): this counts every entry in `source`, not only the .txt files —
# confirm the folder holds nothing else
# NOTE(review): the name `files` is re-bound by `from google.colab import files`
# in a later cell
files = os.listdir(source)
print(len(files))

80


In [None]:
# mount Google Drive at /content/drive (requires the Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [122]:
# switch the working directory to the Data folder on the mounted Drive
%cd /content/drive/MyDrive/Data

/content/drive/MyDrive/Data


In [None]:
# recursively archive the scraped_data/ folder into scraped_data.zip
!zip -r scraped_data.zip scraped_data/

  adding: scraped_data/ (stored 0%)
  adding: scraped_data/capacity-building-for-procurement-and-financial-management-project-staff.txt (deflated 56%)
  adding: scraped_data/communication-officers-and-innovation-grant-managers-training-workshop-on-ideas-project.txt (deflated 76%)
  adding: scraped_data/carousel-portfolio-item-layout.txt (deflated 21%)
  adding: scraped_data/essentials-wordpress-theme.txt (stored 0%)
  adding: scraped_data/ideas-project-lunch-in-edo-state.txt (deflated 59%)
  adding: scraped_data/coming-soon.txt (deflated 21%)
  adding: scraped_data/gombe-state-piu.txt (deflated 34%)
  adding: scraped_data/abia-state-piu.txt (deflated 34%)
  adding: scraped_data/services.txt (deflated 71%)
  adding: scraped_data/contact-us-simple.txt (deflated 33%)
  adding: scraped_data/about-nyesaf.txt (stored 0%)
  adding: scraped_data/latest-events.txt (deflated 23%)
  adding: scraped_data/benue-state-piu.txt (deflated 35%)
  adding: scraped_data/beautiful-and-modern-portfolio.txt (

In [None]:
# hand the archive to Colab's download helper
# (this import re-binds the name `files`, which an earlier cell used for a directory listing)
from google.colab import files
files.download('scraped_data.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Uniccon Scraper

In [None]:
def get_links(website):
    """Return the ``href`` of every ``<a>`` element found on a web page.

    A fresh headless Chrome session is started with a randomly chosen user
    agent and selenium-stealth enabled. The browser is always closed before
    returning; it used to be left running because the ``quit`` call was
    commented out.

    NOTE: this re-defines ``get_links`` from the IDEAS Project section, so
    once this cell has run the sitemap-table version is no longer reachable
    under this name.

    Args:
        website: URL of the page whose links should be listed.

    Returns:
        List with one ``href`` value per anchor, in document order
        (duplicates are kept).
    """
    # initialise driver options
    options = initialize_driver_options()

    # request for a random and different user agent everytime
    user_agent = random.choice(user_agent_list)
    options.add_argument(f'user-agent={user_agent}')

    # initialise the fortified scraper
    driver = webdriver.Chrome(options=options)

    try:
        driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": user_agent})

        # enable stealth mode
        stealth(
            driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
        )

        driver.get(website)
        time.sleep(2)

        # fetch all links in the webpage
        anchors = driver.find_elements(By.TAG_NAME, "a")
        time.sleep(2)

        # Reading an attribute of an already-located element needs no pause,
        # so the former 2-second sleep per link has been dropped.
        return [anchor.get_attribute('href') for anchor in anchors]
    finally:
        # always close the browser so Chrome processes do not pile up
        driver.quit()

In [None]:
# Mount Google Drive. The scraped pages are written to the current working
# directory, so move to the folder where the data should live before scraping.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/.shortcut-targets-by-id/182DL1ZoX52TDPWfu53xujONm-OQdyT_7/WiChat/Uniccon


In [None]:
# list the href of every anchor found on the Uniccon Group home page
get_links("https://uniccongroup.com/")

['https://uniccongroup.com/',
 'https://uniccongroup.com/blog',
 'https://uniccongroup.com/about-us',
 'https://uniccongroup.com/',
 'https://uniccongroup.com/blog',
 'https://uniccongroup.com/about-us',
 'https://www.youtube.com/watch?si=xEmazZ52w8zqcWq2&v=Tq-lSFCeBx8&feature=youtu.be',
 'https://www.youtube.com/watch?si=xEmazZ52w8zqcWq2&v=Tq-lSFCeBx8&feature=youtu.be',
 'https://chuksprimeorganics.com/',
 'https://omeife.ai/',
 'https://smartafrilabs.com/',
 'https://babasky.com/',
 'https://twitter.com/uniccongroup',
 'https://www.linkedin.com/company/uniccongroup/mycompany/',
 'https://www.facebook.com/Uniccongroup',
 'https://www.instagram.com/uniccongroup/',
 'https://uniccongroup.com/',
 'https://uniccongroup.com/',
 'https://uniccongroup.com/',
 'https://uniccongroup.com/blog',
 'https://uniccongroup.com/about-us',
 'https://twitter.com/uniccongroup',
 'https://www.linkedin.com/company/uniccongroup/mycompany/',
 'https://www.facebook.com/Uniccongroup',
 'https://www.instagram.c

In [148]:
def scrape(link):
    """Scrape the headings and paragraphs of a page and save them to disk.

    The page is opened in a stealth headless Chrome session. The text of all
    ``h1`` elements, then ``h2``, ``h3``, ``h4`` and finally ``p`` elements is
    collected, one element per line, and passed to ``save_page_content``
    together with ``link``. The browser is always closed afterwards; it used
    to be left running after every call.

    Args:
        link: URL of the page to scrape.
    """
    # initialise driver options
    options = initialize_driver_options()

    # request for a random and different user agent everytime
    user_agent = random.choice(user_agent_list)
    options.add_argument(f'user-agent={user_agent}')

    # initialise the fortified scraper
    driver = webdriver.Chrome(options=options)

    try:
        driver.execute_cdp_cmd('Network.setUserAgentOverride', {"userAgent": user_agent})

        # enable stealth mode
        stealth(
            driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
        )

        driver.get(link)

        # the saved text starts with a single space, as before
        pieces = [" "]
        # tags are visited in this fixed order, so all h1 text precedes all h2 text, etc.
        for tag_name in ("h1", "h2", "h3", "h4", "p"):
            for element in driver.find_elements(By.TAG_NAME, tag_name):
                pieces.append(element.text + "\n")
        content = "".join(pieces)
    finally:
        # always close the browser so Chrome processes do not pile up
        driver.quit()

    save_page_content(link, content)

In [121]:
# scraping the Uniccon Group home page (the text is saved through save_page_content)
scrape("https://uniccongroup.com")

In [None]:
# scraping the Uniccon Group "about us" page
scrape("https://uniccongroup.com/about-us")

In [123]:
# scraping the chuksprimeorganics.com home page (linked from the Uniccon home page)
scrape("https://chuksprimeorganics.com")

In [137]:
# scraping the omeife.ai home page (linked from the Uniccon home page)
scrape("https://omeife.ai/")

In [126]:
# scraping the smartafrilabs.com home page (linked from the Uniccon home page)
scrape("https://smartafrilabs.com")

In [149]:
# scraping the babasky.com home page (linked from the Uniccon home page)
scrape("https://babasky.com/")

We focus on utilizing unmanned aerial vehicle technlogy to provide innovative solutions. Our goal is to transform industries, empower businesses and drive progress through aerial intelligence in Nigeria.

















In [144]:
# scraping the smartmedicares.com home page
scrape("https://smartmedicares.com/")

In [142]:
# scraping the omeife.ai "about us" page
scrape("https://omeife.ai/about-us/")

In [143]:
# scraping the omeife.ai hackathon page
scrape("https://omeife.ai/hackathon/")

In [139]:
# scraping the omeife.ai features page
scrape("https://omeife.ai/features/")