# Example Web Scraper
## ABB 2 - Session 1

Code authored by: Shaw Talebi

Note: Live code example from class. Something went wrong toward the end and we didn't have time to debug :(.

See [example 1](https://github.com/ShawhinT/AI-Builders-Bootcamp-2/blob/main/session-1/example_1-scrape_job_board.ipynb) for working version of this project.

### imports

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

### Extract job posting urls

In [2]:
# URL of the job board
url = "https://aijobs.net"

# Send a GET request to fetch the HTML content
response = requests.get(url)

In [3]:
# Parse the HTML content with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

In [4]:
# Regex pattern to match job links with a numeric ID
job_url_pattern = re.compile(r"^/job/\d+-")

# Find all <a> tags with href that match the job URL pattern
job_link_list = []
for a_tag in soup.find_all('a', href=True):
    href = a_tag['href']
    if job_url_pattern.match(href):
        full_url = f"{url}{href}"
        job_link_list.append(full_url)

### Extract job info from urls
- Job title
- Company name
- salary range
- JD

In [5]:
def extract_job_details_from_url(url):
    """
    Fetches a job listing page from the given URL and extracts job details,
    including job title, company name, salary range, and job description.

    Args:
        url (str): The URL of the job listing page.

    Returns:
        dict: A dictionary containing the following keys:
            - 'job_title' (str): The job title.
            - 'company_name' (str): The name of the company.
            - 'salary_range' (str): The salary range, if available; otherwise 'Not specified'.
            - 'min_salary' (str): The minimum salary extracted from the range, if available.
            - 'max_salary' (str): The maximum salary extracted from the range, if available.
            - 'job_description' (str): The job description text; otherwise 'No description available'.
    """
    # Send a GET request to fetch the HTML content
    response = requests.get(url)
    
    # Check if the request was successful
    if response.status_code != 200:
        return {'error': f'Failed to fetch URL. Status code: {response.status_code}'}
    
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract Job Title
    job_title_tag = soup.find('h1', class_='display-5')
    job_title = job_title_tag.text.strip() if job_title_tag else 'Not specified'

    # Extract Company Name
    company_name_tag = soup.find('h2', class_='h5')
    company_name = company_name_tag.text.strip() if company_name_tag else 'Not specified'

    # Extract Salary Range (badge with class 'text-bg-success' containing 'USD')
    salary_tag = soup.find('span', class_='badge rounded-pill text-bg-success my-1')
    salary_range = salary_tag.text.strip() if salary_tag else 'Not specified'

    # Use regex to extract min and max salary
    salary_match = re.search(r'USD\s(\d+K)\s-\s(\d+K)', salary_range)
    if salary_match:
        min_salary = salary_match.group(1)  # First capturing group: min salary
        max_salary = salary_match.group(2)  # Second capturing group: max salary
    else:
        min_salary = max_salary = 'Not specified'

    # Extract Job Description
    job_description_section = soup.find('div', id='job-description')
    job_description = job_description_section.get_text(separator='\n').strip() if job_description_section else 'No description available'

    return {
        'job_title': job_title,
        'company_name': company_name,
        'salary_range': salary_range,
        'min_salary': min_salary,
        'max_salary': max_salary,
        'job_description': job_description
    }

In [6]:
job_list = []
for job_link in job_link_list:
    job_list.append(extract_job_details_from_url(url))

In [9]:
job_list[0]

{'job_title': 'Not specified',
 'company_name': 'Not specified',
 'salary_range': 'Not specified',
 'min_salary': 'Not specified',
 'max_salary': 'Not specified',
 'job_description': 'No description available'}

In [7]:
df = pd.DataFrame(job_list)

In [8]:
df.head()

Unnamed: 0,job_title,company_name,salary_range,min_salary,max_salary,job_description
0,Not specified,Not specified,Not specified,Not specified,Not specified,No description available
1,Not specified,Not specified,Not specified,Not specified,Not specified,No description available
2,Not specified,Not specified,Not specified,Not specified,Not specified,No description available
3,Not specified,Not specified,Not specified,Not specified,Not specified,No description available
4,Not specified,Not specified,Not specified,Not specified,Not specified,No description available
