# Scraping Job Data

This project focuses on collecting job-related data from Glassdoor using Selenium and BeautifulSoup. The data includes job titles, descriptions, and other relevant information to help analyze job market trends.

## Import Libraries

In [None]:
import json
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm


## Configure Selenium WebDriver

In [None]:
# Chrome launch options shared by every WebDriver instance created in this
# notebook (passed as `options=chrome_options`).
chrome_options = Options()

# The switches are registered one by one, in this order.
for _chrome_flag in (
    '--headless',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    "--disable-gpu",
    "start-maximized",
    "disable-infobars",
    "--disable-extensions",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
):
    chrome_options.add_argument(_chrome_flag)

# Part A: Scraping Job URLs

## 1. Define Functions for Scraping

In [5]:
def extract_jobs(driver):
    """Extract job URLs from the page currently loaded in ``driver``.

    The page source is parsed with BeautifulSoup and the first
    ``<script type="application/ld+json">`` element is decoded as JSON. Every
    entry of its ``itemListElement`` list that carries a non-empty ``url``
    contributes that URL to the result.

    Args:
        driver: Object exposing the loaded page's HTML through a
            ``page_source`` attribute (a Selenium WebDriver in this notebook).

    Returns:
        list: The URLs found, in document order. Empty when the page has no
        JSON-LD script or when it cannot be parsed; the problem is printed
        instead of raised so one bad page does not stop a long scraping run.
    """
    urls = []

    try:
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        json_ld = soup.find('script', type='application/ld+json')
        if json_ld is None:
            print("No JSON-LD found!")
            return urls

        # `.string` is None for an empty script tag; fall back to "" so the
        # failure is reported as a JSON decoding error below.
        data = json.loads(json_ld.string or "")

        # `or []` also covers an explicit null value for the key.
        for job in data.get('itemListElement') or []:
            # Skip malformed entries instead of aborting the rest of the list,
            # and never store a missing URL (it would end up as an empty cell
            # in the CSV and be fed to `driver.get` later).
            job_url = job.get('url') if isinstance(job, dict) else None
            if job_url:
                urls.append(job_url)

    except Exception as e:
        # Deliberately broad: scraping is best-effort and whatever was
        # collected before the failure is still returned.
        print(f"Error parsing jobs: {e}")

    return urls

def auto(company, driver):
    """Open the Glassdoor job-search page for a company and collect job URLs.

    Args:
        company: Company name; spaces are replaced by hyphens to build the
            search URL.
        driver: Selenium WebDriver used to load the page. It is left open so
            the caller decides when to quit it.

    Returns:
        tuple: ``(urls, driver)`` where ``urls`` is the list returned by
        ``extract_jobs`` for the loaded page and ``driver`` is the same
        driver that was passed in.
    """
    slug = company.replace(" ", "-")
    driver.get(f'https://www.glassdoor.com/Job/united-states-{slug}-jobs.htm')

    # The previous version created a WebDriverWait that was never used and
    # copied the result into a second list; both were dropped.
    return extract_jobs(driver), driver

## 2. Main Scraping Logic

In [None]:
# Load the company names that drive the search.
companies_name_followers = pd.read_csv("companies_name_by_followers.csv")
# A blank name cell is read as NaN (a float), which `auto` cannot turn into a
# URL slug, so such rows are dropped up front.
companies_name_list = companies_name_followers["name"].dropna().tolist()

urls_list = []
for company in tqdm(companies_name_list, desc='Scraping Companies'):
    # A fresh browser is started for each company.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # The loop variable is `company`; the earlier call passed the
        # undefined name `comp` and raised NameError on the first iteration.
        current_urls, driver = auto(company, driver)
        urls_list += current_urls
    finally:
        # Always close the browser, even when the scrape raises, so failed
        # iterations do not leave Chrome processes behind.
        driver.quit()
print(f'Total URLs collected: {len(urls_list)}')

## 3. Save Results

In [None]:
# Put the collected URLs into a single-column table and write it to disk
# without the index column.
df_job_urls = pd.DataFrame(data=urls_list, columns=["Job_URL"])
df_job_urls.to_csv('job_urls.csv', index=False)
print('Saved to job_urls.csv')

# Part B: Scraping Job Descriptions from the Job URLs

## 1. Define Functions for Scraping

In [None]:
def auto_job_description(url, driver):
    """Extract the job title and description from a single job page.

    The page is loaded, the 'Show more' button is clicked when one becomes
    clickable within 10 seconds, and the first
    ``<script type="application/ld+json">`` element of the resulting page is
    decoded to read the ``title`` and ``description`` fields.

    Args:
        url: Address of the job page to load.
        driver: Selenium WebDriver used to load the page. It is never closed
            here; the caller is responsible for quitting it.

    Returns:
        tuple | None: ``(url, job_name, job_description, driver)`` on success,
        with ``"N/A"`` substituted for a missing title or description.
        ``None`` when the page has no JSON-LD script or its content is not a
        JSON object; the reason is printed.
    """
    driver.get(url)
    wait = WebDriverWait(driver, 10)

    # Handle 'Show More' button if it exists
    try:
        show_more_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, "//button[span[text()='Show more']]"))
        )
        driver.execute_script("arguments[0].scrollIntoView(true);", show_more_button)
        # Perform the click via JavaScript
        driver.execute_script("arguments[0].click();", show_more_button)
        time.sleep(2)
    except Exception as e:
        # Best-effort: a missing button (timeout) or a failed click must not
        # prevent reading the page.
        print(f"No 'Show More' button or an error occurred on URL {url}: {e}")

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    json_ld = soup.find("script", type="application/ld+json")
    if json_ld is None:
        print(f"No JSON-LD found on URL: {url}")
        return None

    try:
        # `.string` is None for an empty script tag; "" makes that case fail
        # as a regular JSON decoding error instead of a TypeError.
        data = json.loads(json_ld.string or "")
    except json.JSONDecodeError as e:
        # Previously an unparsable script aborted the whole scraping loop.
        print(f"Invalid JSON-LD on URL {url}: {e}")
        return None

    if not isinstance(data, dict):
        # The fields are read with `.get`, so anything but a JSON object
        # (for example a top-level array) cannot be used.
        print(f"Unexpected JSON-LD structure on URL: {url}")
        return None

    job_name = data.get("title", "N/A")
    job_description = data.get("description", "N/A")
    return url, job_name, job_description, driver

## 2. Main Scraping Logic

In [None]:
job_urls = df_job_urls["Job_URL"].tolist()
# Column-oriented accumulator; turned into a DataFrame once scraping is done.
job_description_dict = {"Job_Url": [], "Job_Name": [], "Job_Description": []}

for url in tqdm(job_urls, desc="Scraping job description", unit="job url"):
    # A fresh browser is started for each job page.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        result = auto_job_description(url, driver)
    finally:
        # Close the browser on every path. Previously it was skipped when the
        # page yielded no result (the `continue` branch) or when scraping
        # raised, leaving Chrome processes running.
        driver.quit()

    if result is None:
        continue

    # The fourth element is the driver that was just closed.
    current_url, current_name, current_description, _ = result

    # Append to the accumulator defined above; the earlier code used the
    # undefined name `job_desc_dict` and raised NameError.
    job_description_dict["Job_Url"].append(current_url)
    job_description_dict["Job_Name"].append(current_name)
    job_description_dict["Job_Description"].append(current_description)

## 3. Clean Job Descriptions

In [None]:
def clean_text(text):
    """Strip HTML markup from a scraped job description.

    The markup is parsed with BeautifulSoup, the text nodes are joined with
    newlines, and every run of newlines is then collapsed into one space.

    Args:
        text: Raw description, possibly containing HTML.

    Returns:
        str: The plain-text description. An empty string is returned when
        ``text`` is not a string (for example ``None`` or NaN), which
        BeautifulSoup cannot parse.
    """
    if not isinstance(text, str):
        return ""

    soup = BeautifulSoup(text, "html.parser")
    text_content = soup.get_text(separator="\n")
    # Requires the module-level `import re`, which the file was missing.
    return re.sub(r'\n+', ' ', text_content)

## 4. Apply Cleaning & Save Results

In [None]:
# Build the results table from the collected columns, replacing every raw
# description with its cleaned text, then write it to disk without the index.
df_job_descriptions = pd.DataFrame(job_description_dict).assign(
    Job_Description=lambda frame: frame["Job_Description"].apply(clean_text)
)
df_job_descriptions.to_csv('job_descriptions.csv', index=False)
print('Saved to job_descriptions.csv')