# Data Jobs in Morocco: Glassdoor Web Scraping Using Selenium

> I followed this [comment](https://www.kaggle.com/code/srikardornala/web-scrapping-using-selenium-beginners-guide/comments#2287808) to install Google Chrome and ChromeDriver on Kaggle.

# Installing Google Chrome, ChromeDriver and Selenium

In [None]:
# Refresh the apt package index, then install the system packages listed below
# (presumably the shared libraries Chrome needs at runtime -- TODO confirm).
!apt-get update -y
!apt-get install -y \
libglib2.0-0 \
libnss3 \
libdbus-glib-1-2 \
libgconf-2-4 \
libfontconfig1 \
gconf2-common \
libwayland-server0 \
libgbm1 \
udev \
libu2f-udev 

In [None]:
# Ask apt to repair any broken or unmet package dependencies.
!apt --fix-broken install -y  

In [None]:
# Install Google Chrome: download the current stable .deb package from Google
# and install it with dpkg.
!wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!dpkg -i google-chrome-stable_current_amd64.deb

In [None]:
# Print the version of the Chrome binary that was just installed.
!google-chrome --version

In [None]:
# Install ChromeDriver: download the linux64 archive and unpack it under
# /usr/local/bin/ (the archive's chromedriver-linux64/ folder ends up there).
# NOTE(review): the ChromeDriver version is pinned to 121.0.6167.85, while the
# Chrome package is downloaded as "stable_current". ChromeDriver is generally
# expected to match the installed Chrome version -- compare with the output of
# `google-chrome --version` and update this URL if they differ.
!wget https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/121.0.6167.85/linux64/chromedriver-linux64.zip	
!unzip chromedriver-linux64.zip -d /usr/local/bin/

In [None]:
# Install the Selenium Python package with pip.
!pip install selenium

# Configuring driver options

In [None]:
import pandas as pd
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
import os
from bs4 import BeautifulSoup

In [None]:
# Command-line switches handed to Chrome, applied in this order: sandbox
# disabled, headless mode, GPU disabled, /dev/shm usage disabled, and a fixed
# remote-debugging port.
_chrome_flags = (
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--headless',
    '--disable-gpu',
    '--disable-dev-shm-usage',
    '--remote-debugging-port=9222',
)

options = Options()
for _flag in _chrome_flags:
    options.add_argument(_flag)

# Point Selenium at the chromedriver binary unpacked under /usr/local/bin/.
service = Service('/usr/local/bin/chromedriver-linux64/chromedriver')

# Start the browser session used by the scraping cells.
driver = webdriver.Chrome(service=service, options=options)

# Launching the driver and beginning the scraping

In [None]:
# Open the Glassdoor search results for data jobs in Morocco.
page_url = "https://www.glassdoor.co.in/Job/morocco-data-jobs-SRCH_IL.0,7_IN162_KO8,12.htm"
driver.get(page_url)

# Collect the job-card elements on the results page by their CSS class.
job_cards = driver.find_elements(By.CLASS_NAME, 'JobCard_trackingLink__zUSOo')

# Print the href of every card, one per line.
for href in (element.get_attribute('href') for element in job_cards):
    print(href)

In [None]:
# Visit every job posting and collect company, location, title and description.
#
# Reads:  job_cards (WebElements found on the listing page) and driver.
# Writes: jobs_list, a list of dicts with the keys
#         "company", "location", "job_title" and "description".

# Every find_element call below waits up to 10 seconds for its element.
# The setting stays fixed for the whole run; implicitly_wait() configures the
# lookup timeout and does not pause the script, so it is not called per job.
driver.implicitly_wait(10)  # seconds

# Read all hrefs BEFORE navigating anywhere: once driver.get() loads another
# page, the WebElements from the listing page become stale and reading their
# attributes raises StaleElementReferenceException.
job_urls = [card.get_attribute('href') for card in job_cards]

jobs_list = []
try:
    for job_url in job_urls:
        # Cards without an href cannot be opened.
        if not job_url:
            continue
        driver.get(job_url)

        # Read the text immediately so nothing depends on live elements later.
        company = driver.find_element(By.CLASS_NAME, 'EmployerProfile_employerName__8w0tV').text
        print(company)
        title = driver.find_element(By.CLASS_NAME, 'JobDetails_jobTitle__Rw_gn').text
        location = driver.find_element(By.CLASS_NAME, 'JobDetails_location__MbnUM').text

        # Parse the description section's HTML with BeautifulSoup and keep the
        # text of its paragraphs and list items, one per line.
        job_details_section = driver.find_element(By.CLASS_NAME, 'JobDetails_jobDescription__6VeBn')
        html_content = job_details_section.get_attribute("outerHTML")
        soup = BeautifulSoup(html_content, "html.parser")
        description = "\n".join(node.get_text() for node in soup.find_all(["p", "li"]))

        jobs_list.append({
            "company": company,
            "location": location,
            "job_title": title,
            "description": description,
        })
finally:
    # Always shut the browser down, even if a lookup above raised, so the
    # Chrome process is not leaked. Jobs collected so far stay in jobs_list.
    driver.quit()

# Saving the jobs list as an Excel file

In [None]:
# Destination of the exported workbook (directory and file name).
file_path = "/kaggle/working/jobs_list.xlsx"

# Build a DataFrame from the scraped job dicts and write it to the Excel file
# without the index column.
df = pd.DataFrame(jobs_list)
df.to_excel(file_path, index=False)

print(f"DataFrame has been saved to {file_path}")