# **Web Scraping**

In [None]:
!apt update
!apt install -y chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
%pip install selenium
%pip install pandas openpyxl

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,607 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,701 kB]
Get:13 http://archive.ubuntu.com/ubunt

In [None]:
import sys
# NOTE(review): sys.path is Python's module search path, while this entry is the
# path of the chromedriver executable; it presumably has no effect on how
# Selenium locates the driver — confirm before relying on it.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import json

In [None]:
def setup_driver():
    """Create a headless Chrome WebDriver configured for scraping.

    The browser is started headless with sandboxing and /dev/shm usage
    disabled, a 1920x1080 window, and a desktop Chrome user-agent string.

    Returns:
        The `webdriver.Chrome` instance. The caller is responsible for
        calling `quit()` on it when finished.
    """
    chrome_options = Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    # Chromium expects "width,height" here; the "1920x1080" form is not
    # parsed, so the requested size would not be applied.
    chrome_options.add_argument('--window-size=1920,1080')
    chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
    driver = webdriver.Chrome(options=chrome_options)
    return driver

In [None]:
def scrape_jobs(base_url, start_page=1, end_page=135, save_to_json=False, output_file="jobs.json"):
    """Scrape job cards from a paginated search and optionally save them as JSON.

    For every page in ``start_page..end_page`` (inclusive) the page is loaded
    in a headless browser, scrolled to the bottom, parsed with BeautifulSoup,
    and one record per job card is collected. Progress is printed as it goes.

    Args:
        base_url: Search URL. It must already contain a query string, because
            ``&start=<page - 1>`` is appended to select the page.
        start_page: First page to scrape (1-based).
        end_page: Last page to scrape (inclusive).
        save_to_json: When true, write the collected records to ``output_file``.
        output_file: Destination path of the JSON file.

    Returns:
        None. Results are only printed and, if requested, written to disk.

    Notes:
        Scraping stops at the first page with no job cards. Errors on a single
        page or card are printed and skipped. The browser is always closed,
        including when the run is interrupted.
    """
    driver = setup_driver()
    total_jobs = 0
    jobs_list = []

    # try/finally guarantees the browser process is shut down even if the run
    # is interrupted (e.g. KeyboardInterrupt when the cell is stopped), which
    # the `except Exception` handlers below do not catch.
    try:
        for page in range(start_page, end_page + 1):
            url = f"{base_url}&start={(page - 1)}"
            print(f"Scraping page {page}...")
            try:
                driver.get(url)
                time.sleep(2)
                # Scroll to the bottom of the page, then wait again before
                # reading the page source.
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                job_cards = soup.find_all('div', {'class': 'css-1gatmva e1v1l3u10'})

                if not job_cards:
                    print("No jobs found. Might be blocked.")
                    break

                for card in job_cards:
                    try:
                        job_title = card.find("h2", {"class": "css-m604qf"})
                        company = card.find('a', {'class': 'css-17s97q8'})
                        location = card.find('span', {'class': 'css-5wys0k'})
                        job_skills = card.select('div > div.css-y4udm8 > div')
                        jobs_type = card.find("div", {"class": "css-1lh32fc"})
                        published_time = card.select_one('div > div.css-laomuu > div > div')
                        # Missing elements become None (or an empty list for skills)
                        # rather than failing the whole card.
                        job_data = {
                            "job_title": job_title.text.strip() if job_title else None,
                            "company": company.text.strip() if company else None,
                            "location": location.text.strip() if location else None,
                            "job_skills": [skill.text.strip() for skill in job_skills] if job_skills else [],
                            "jobs_type": jobs_type.text.strip() if jobs_type else None,
                            "published_time": published_time.text.strip() if published_time else None,
                        }

                        jobs_list.append(job_data)
                        print(f"{job_data['job_title']} at {job_data['company']} - {job_data['location']}...\n")
                        total_jobs += 1

                    except Exception as e:
                        print(f"Error extracting job: {e}")
                        continue

                print(f"Page {page} done. Jobs scraped so far: {total_jobs}")

            except Exception as e:
                print(f"Error on page {page}: {e}")
                continue
    finally:
        driver.quit()

    if save_to_json:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(jobs_list, f, ensure_ascii=False, indent=4)
        print(f"Saved {total_jobs} jobs to {output_file}")

    print(f"Finished scraping {total_jobs} jobs.")


Our target website is Wuzzuf; let's apply our scraping function to it.

In [None]:
# Wuzzuf search URL for the query "data"; scrape_jobs appends "&start=<n>" to it.
BASE_URL = 'https://wuzzuf.net/search/jobs/?a=hpb&q=data'

# Scrape result pages 1 through 135 and write the collected jobs to jobs.json.
scrape_jobs(
    BASE_URL,
    start_page=1,
    end_page=135,
    save_to_json=True,
    output_file="jobs.json",
)

Scraping page 1...
Data Analysis Manager at WUZZUF - - Maadi, Cairo, Egypt...

Data Entry Clerk at Confidential - - Downtown, Cairo, Egypt...

Data Entry Clerk at Choozle - - Barka', Oman...

Data Entry Specialist at British House  - - Mokattam, Cairo, Egypt...

Data Entry at Global Tires Trading and Distribution - - New Cairo, Cairo, Egypt...

Data Entry at MARK ROVER - - Nasr City, Cairo, Egypt...

Remote Data Analyst & Processor at HIGHBASE TRADING W.L.L - - Cairo, Egypt...

Repair Order Specialist (Data Analyst) at OPPO Egypt  - - Nasr City, Cairo, Egypt...

Data Analyst and Sales Admin at TOP EDUCATION ACADEMY  - - Agouza, Giza, Egypt...

Data Governance Senior Analyst at Tasaheel Finance  - - Giza, Egypt...

Customs Clearance Data Entry Assistant (Shipments Data on Nafeza Platform) at Fastn Accurate For shipping and logistics services   - - Sheraton, Cairo, Egypt...

Data Entry Clerk at GLC Paints - - Obour City, Cairo, Egypt...

Data Entry – Product Submission at Dora Management

In [None]:
import pandas as pd

# Load the scraped records and re-export them as an Excel workbook.
# convert_dates=False: read_json treats column labels ending in "_time" as
# date-like by default and would attempt datetime conversion on
# "published_time", which holds scraped text; keep it as-is.
df = pd.read_json('jobs.json', convert_dates=False)
df.to_excel('jobs final.xlsx', index=False)

print("Done")

Done


**Last run of this code and update of the scraped data: 26/4/2025 4:00 PM**