In [11]:
import nest_asyncio

from playwright.async_api import async_playwright

BASE_URL = "https://foorilla.com/hiring/"

async def scrape_all_jobs():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(BASE_URL)
        await page.wait_for_selector('a[href*="/hiring/jobs/"]')

        # Infinite scroll until no new jobs load
        last_height = await page.evaluate("document.body.scrollHeight")
        while True:
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(1.5)
            new_height = await page.evaluate("document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # Extract job listings
        jobs = []
        links = await page.query_selector_all('a[href*="/hiring/jobs/"]')
        seen_urls = set()
        for link in links:
            url = await link.get_attribute("href")
            if not url or url in seen_urls:
                continue
            seen_urls.add(url)

            title_el = await link.query_selector(".title")
            company_el = await link.query_selector(".company")
            location_el = await link.query_selector(".region")

            jobs.append({
                "title": (await title_el.inner_text()) if title_el else None,
                "company": (await company_el.inner_text()) if company_el else None,
                "location": (await location_el.inner_text()) if location_el else None,
                "url": url
            })

        await browser.close()
        return jobs

nest_asyncio.apply()
if __name__ == "__main__":
    results = asyncio.get_event_loop().run_until_complete(scrape_all_jobs())
    print(f"Found {len(results)} jobs")
    for i, job in enumerate(results, 1):
        print(f"{i:03d}. {job['title']} | {job['company']} | {job['location']} \n    {job['url']}")


Found 2 jobs
001. None | None | None 
    /hiring/jobs/latest/export/?format=csv
002. None | None | None 
    /hiring/jobs/latest/export/?format=json


In [12]:
print(results)

[{'title': None, 'company': None, 'location': None, 'url': '/hiring/jobs/latest/export/?format=csv'}, {'title': None, 'company': None, 'location': None, 'url': '/hiring/jobs/latest/export/?format=json'}]


In [13]:
import asyncio
from playwright.async_api import async_playwright
import nest_asyncio

nest_asyncio.apply()  # needed in Codespaces/Jupyter interactive

BASE_URL = "https://foorilla.com/hiring/"

async def scrape_all_jobs():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(BASE_URL)
        await page.wait_for_selector('a[href*="/hiring/jobs/"]')

        # Infinite scroll
        last_height = await page.evaluate("document.body.scrollHeight")
        while True:
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await asyncio.sleep(1.5)
            new_height = await page.evaluate("document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        jobs = []
        seen_urls = set()

        links = await page.query_selector_all('a[href*="/hiring/jobs/"]')
        for link in links:
            url = await link.get_attribute("href")

            # Skip if export links or already seen
            if not url or "export" in url or url in seen_urls:
                continue
            seen_urls.add(url)

            title_el = await link.query_selector(".title")
            company_el = await link.query_selector(".company")
            location_el = await link.query_selector(".region")

            # Skip if no title found (not a real job card)
            if not title_el:
                continue

            jobs.append({
                "title": (await title_el.inner_text()) if title_el else None,
                "company": (await company_el.inner_text()) if company_el else None,
                "location": (await location_el.inner_text()) if location_el else None,
                "url": url
            })

        await browser.close()
        return jobs


# Run in Codespaces interactive
results = asyncio.get_event_loop().run_until_complete(scrape_all_jobs())
print(f"Found {len(results)} jobs")
for i, job in enumerate(results, 1):
    print(f"{i:03d}. {job['title']} | {job['company']} | {job['location']} \n    {job['url']}")


Found 0 jobs


In [14]:
import asyncio
from playwright.async_api import async_playwright
import nest_asyncio

nest_asyncio.apply()

async def scrape_hiring_jobs():
    jobs = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)  # change to False for debug
        page = await browser.new_page()
        await page.goto("https://foorilla.com/hiring", wait_until="networkidle")
        
        # Wait a bit for dynamic content to load
        await page.wait_for_timeout(3000)

        # Select all job cards
        job_cards = await page.query_selector_all("ul#hiring a.col.py-2")
        
        for card in job_cards:
            title = await card.query_selector(".title")
            company = await card.query_selector(".company:not(.region)")
            location = await card.query_selector(".region.company")

            jobs.append({
                "title": await title.inner_text() if title else None,
                "company": await company.inner_text() if company else None,
                "location": await location.inner_text() if location else None,
                "url": await card.get_attribute("href")
            })

        await browser.close()
    return jobs

# Run the scraper
async def main():
    job_list = await scrape_hiring_jobs()
    print(f"Found {len(job_list)} jobs")
    for i, job in enumerate(job_list, 1):
        print(f"{i:03d}. {job['title']} | {job['company']} | {job['location']}\n    {job['url']}")

asyncio.get_event_loop().run_until_complete(main())


Found 0 jobs


In [15]:
import asyncio
from playwright.async_api import async_playwright
import nest_asyncio

nest_asyncio.apply()

async def inspect_hiring_page():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)  # Change to False if you can see UI
        page = await browser.new_page()
        await page.goto("https://foorilla.com/hiring", wait_until="networkidle")
        await page.wait_for_timeout(3000)  # wait for JS to load jobs
        
        html_content = await page.content()
        print(html_content[:1500])  # preview start of DOM

        await browser.close()

asyncio.get_event_loop().run_until_complete(inspect_hiring_page())


<!DOCTYPE html><html class="h-100 overflow-hidden" lang="en" data-bs-theme="dark" style="--content-height: 670px;"><head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width,minimum-scale=1,initial-scale=1,maximum-scale=1">
<title>all coding</title>
<meta name="description" content="The career platform for coders, builders, hackers and makers.">
<link rel="canonical" href="https://foorilla.com/hiring/">
<link href="/s/css/style.min.css?v=0.9.17" rel="stylesheet">
<link href="/s/admin/css/vendor/select2/select2.min.css?v=0.9.17" rel="stylesheet">
<link href="/s/django_select2/django_select2.css?v=0.9.17" rel="stylesheet">
<link rel="shortcut icon" href="/s/img/favicon.ico">
<link rel="apple-touch-icon" sizes="180x180" href="/s/img/apple-icon-180x180.png">
<link rel="apple-touch-icon" sizes="180x180" href="/s/img/apple-icon-152x152.png">
<link rel="apple-touch-icon" sizes="180x180" href="/s/img/apple-icon-76x76.png">
<link rel="apple-touch-icon" sizes="180x180" href=

In [16]:
import asyncio
from playwright.async_api import async_playwright
import nest_asyncio

nest_asyncio.apply()

async def inspect_job_list():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)  # set to False if you want UI
        page = await browser.new_page()
        await page.goto("https://foorilla.com/hiring", wait_until="networkidle")
        await page.wait_for_selector("ul#hiring")  # wait until job list exists
        html_content = await page.inner_html("ul#hiring")  # only get job list HTML
        with open("hiring_jobs.html", "w", encoding="utf-8") as f:
            f.write(html_content)
        print("✅ Saved job list HTML to hiring_jobs.html")
        await browser.close()

asyncio.get_event_loop().run_until_complete(inspect_job_list())




TimeoutError: Page.wait_for_selector: Timeout 30000ms exceeded.
Call log:
  - waiting for locator("ul#hiring") to be visible


In [17]:
import asyncio
from playwright.async_api import async_playwright
import nest_asyncio

nest_asyncio.apply()

async def dump_full_body():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto("https://foorilla.com/hiring", wait_until="networkidle")
        await page.wait_for_timeout(5000)  # wait for JS to run
        
        html_content = await page.content()
        with open("full_body.html", "w", encoding="utf-8") as f:
            f.write(html_content)
        
        print("✅ Saved full page HTML to full_body.html")
        await browser.close()

asyncio.get_event_loop().run_until_complete(dump_full_body())


✅ Saved full page HTML to full_body.html


In [18]:
import asyncio
from playwright.async_api import async_playwright
import nest_asyncio

nest_asyncio.apply()

async def scrape_hiring_jobs():
    jobs = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto("https://foorilla.com/hiring", wait_until="networkidle")
        await page.wait_for_timeout(3000)  # let jobs load

        job_cards = await page.query_selector_all("a.col.py-2")
        
        for card in job_cards:
            title = await card.query_selector(".title")
            company = await card.query_selector(".company:not(.region)")
            location = await card.query_selector(".region.company")

            jobs.append({
                "title": await title.inner_text() if title else None,
                "company": await company.inner_text() if company else None,
                "location": await location.inner_text() if location else None,
                "url": await card.get_attribute("href")
            })

        await browser.close()
    return jobs

async def main():
    job_list = await scrape_hiring_jobs()
    print(f"Found {len(job_list)} jobs")
    for i, job in enumerate(job_list, 1):
        print(f"{i:03d}. {job['title']} | {job['company']} | {job['location']}\n    {job['url']}")

asyncio.get_event_loop().run_until_complete(main())


Found 0 jobs


In [19]:
import asyncio
from urllib.parse import urljoin
from playwright.async_api import async_playwright
import nest_asyncio

nest_asyncio.apply()

BASE = "https://foorilla.com"
START = f"{BASE}/hiring/"

async def scrape_all_jobs():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(START, wait_until="networkidle")

        # Wait for the jobs column container
        await page.wait_for_selector("#mc_1")

        # Infinite-scroll the jobs list container (not the window)
        last_count = -1
        while True:
            # Count current jobs
            count = await page.locator("#mc_1 li.list-group-item a.stretched-link").count()
            if count == last_count:
                # No new items; stop
                break
            last_count = count

            # Scroll the #mc_1 column to the bottom to trigger htmx "intersect once"
            await page.evaluate("""
                () => {
                  const el = document.querySelector('#mc_1');
                  if (el) el.scrollTo(0, el.scrollHeight);
                }
            """)
            await page.wait_for_timeout(1200)  # give htmx time to fetch & render

        # Select all job rows
        rows = page.locator("#mc_1 li.list-group-item")
        n = await rows.count()
        jobs = []

        for i in range(n):
            row = rows.nth(i)
            link = row.locator("a.stretched-link")
            if await link.count() == 0:
                continue

            title = (await link.inner_text()).strip()

            # URL is in hx-get (href is empty)
            hx_get = await link.get_attribute("hx-get")
            url = urljoin(BASE, hx_get) if hx_get else None

            # time-ago (right side small in the first hstack)
            time_ago = None
            time_small = row.locator(".hstack .flex-shrink-0.text-body-secondary small").first
            if await time_small.count():
                time_ago = (await time_small.inner_text()).strip()

            # location (right side .text-end small in the second hstack)
            location = None
            loc_small = row.locator(".hstack .text-end small").first
            if await loc_small.count():
                location = (await loc_small.inner_text()).strip()

            jobs.append({
                "title": title or None,
                "location": location or None,
                "time_ago": time_ago or None,
                "url": url
            })

        await browser.close()
        return jobs

# Run in Codespaces/Jupyter
results = asyncio.get_event_loop().run_until_complete(scrape_all_jobs())
print(f"Found {len(results)} jobs")
for i, job in enumerate(results, 1):
    print(f"{i:03d}. {job['title']} | {job['location']} | {job['time_ago']}\n    {job['url']}")


Found 98 jobs
001. Software Development Engineer 2-6 | India - Bangalore - Remote Office [R] | 3h ago
    https://foorilla.com/hiring/jobs/software-development-engineer-2-6-india-bangalore-remote-office-819036/
002. IT Infrastructure Engineer | Hong Kong | 3h ago
    https://foorilla.com/hiring/jobs/it-infrastructure-engineer-hong-kong-819211/
003. Senior Software Engineer | INBLR02 - Bangalore - Milesstone Buildcon, … | 3h ago
    https://foorilla.com/hiring/jobs/senior-software-engineer-inblr02-bangalore-milesstone-buildcon-india-819781/
004. Software System Engineer | Shenyang - PIC, China | 3h ago
    https://foorilla.com/hiring/jobs/software-system-engineer-shenyang-pic-china-819978/
005. Junior Automation Tester | RO - BUCHAREST BULEVARDUL ION MIHALACHE … | 3h ago
    https://foorilla.com/hiring/jobs/junior-automation-tester-ro-bucharest-bulevardul-ion-mihalache-15-17-romania-820019/
006. Staff Facilities Engineer (Electrical) | Singapore - Woodlands - NorthTech | 3h ago
    http

In [20]:
import time
import re
from urllib.parse import urljoin, urlencode
import requests
from bs4 import BeautifulSoup

BASE = "https://foorilla.com"
LIST_PATH = "/hiring/jobs/"  # HTMX list endpoint used by the site
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    )
}

def fetch_html(url, params=None, sleep=0.4):
    """GET a URL and return BeautifulSoup."""
    if params:
        url = f"{url}?{urlencode(params)}"
    r = requests.get(url, headers=HEADERS, timeout=20)
    r.raise_for_status()
    time.sleep(sleep)  # be polite; avoid hammering
    return BeautifulSoup(r.text, "html.parser")

def parse_list_items(soup):
    """Parse one listing chunk (HTML returned by /hiring/jobs/?page=N)."""
    jobs = []
    # Each job row looks like: <li class="list-group-item"> ... <a class="stretched-link" hx-get="/hiring/jobs/<slug>/">Title</a> ...
    rows = soup.select("li.list-group-item")
    for row in rows:
        a = row.select_one("a.stretched-link")
        if not a:
            continue
        title = a.get_text(strip=True)
        hx_get = a.get("hx-get")  # detail endpoint
        url = urljoin(BASE, hx_get) if hx_get else None

        # time-ago (first hstack right side)
        time_ago_el = row.select_one(".hstack .flex-shrink-0.text-body-secondary small")
        time_ago = time_ago_el.get_text(strip=True) if time_ago_el else None

        # location (second hstack right side)
        location_el = row.select_one(".hstack .text-end small")
        location = location_el.get_text(strip=True) if location_el else None

        # optional: salary badge lives alongside tags; we can sniff it
        # typically has class 'text-success-emphasis'
        salary_el = row.select_one(".text-success-emphasis")
        salary = salary_el.get_text(strip=True) if salary_el else None

        jobs.append({
            "title": title or None,
            "url": url,
            "location": location or None,
            "time_ago": time_ago or None,
            "salary": salary or None,
        })
    return jobs

def find_next_page(soup):
    """
    Pagination is exposed as a sentinel <li> that contains:
    hx-get="/hiring/jobs/?page=N"
    We grab the max N we see (usually one).
    """
    li = soup.select_one('li.list-group-item[hx-get*="?page="]')
    if not li:
        return None
    hx = li.get("hx-get")
    # hx looks like /hiring/jobs/?page=2
    m = re.search(r"[?&]page=(\d+)", hx or "")
    return int(m.group(1)) if m else None

def fetch_job_detail(detail_url):
    """Fetch job detail panel HTML (optional)."""
    s = fetch_html(detail_url)
    # Example: extract main header and description text
    # The detail panel markup can vary; adjust selectors as needed
    # Try a few best-effort selectors:
    title = s.select_one("h1, h2, .h3, .job-title")
    desc = s.select_one("article, .content, .prose, .job-description, .markdown")
    apply_link = s.select_one('a[href*="apply"], a[href*="careers"], a[target="_blank"]')
    return {
        "detail_title": title.get_text(strip=True) if title else None,
        "detail_snippet": (desc.get_text(" ", strip=True)[:500] + "…") if desc else None,
        "apply_link": apply_link.get("href") if apply_link else None,
    }

def scrape_all_jobs(include_details=False, max_pages=None):
    """
    Crawl /hiring/jobs/ page by page via HTMX endpoints.
    If include_details=True, fetch each job detail panel too.
    """
    page = 1
    all_jobs = []

    while True:
        soup = fetch_html(urljoin(BASE, LIST_PATH), params=None if page == 1 else {"page": page})
        # On first call (/hiring/jobs/) we get page 1; for subsequent pages, use ?page=N

        chunk_jobs = parse_list_items(soup)
        if not chunk_jobs:
            break
        all_jobs.extend(chunk_jobs)

        if include_details:
            for job in chunk_jobs:
                if job["url"]:
                    try:
                        detail = fetch_job_detail(job["url"])
                        job.update(detail)
                    except Exception as e:
                        # non-fatal; keep going
                        job["detail_error"] = str(e)

        if max_pages and page >= max_pages:
            break

        next_p = find_next_page(soup)
        if not next_p or next_p == page:
            break
        page = next_p

    return all_jobs

if __name__ == "__main__":
    jobs = scrape_all_jobs(include_details=False)  # set True to fetch detail panel too
    print(f"Collected {len(jobs)} jobs")
    for i, j in enumerate(jobs, 1):
        print(f"{i:03d}. {j['title']} | {j.get('location') or '—'} | {j.get('time_ago') or '—'} | {j.get('salary') or '—'}")
        print(f"     {j['url']}")


Collected 0 jobs


In [21]:
import re
import time
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

BASE = "https://foorilla.com"
START = f"{BASE}/hiring/"
LIST = f"{BASE}/hiring/jobs/"

UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0.0.0 Safari/537.36"
)

def get_csrf_and_session():
    s = requests.Session()
    s.headers.update({"User-Agent": UA})
    r = s.get(START, timeout=20)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    body = soup.find("body")
    # The page sets hx-headers via a JS expression in the body tag.
    # We’ll scrape the CSRF token from there if present; otherwise continue without it.
    csrf = None
    body_str = str(body) if body else ""
    m = re.search(r'"X-CSRFToken"\s*:\s*"([^"]+)"', body_str)
    if m:
        csrf = m.group(1)
    return s, csrf

def get_list_page(session, page=None, csrf=None):
    # HTMX-style headers the backend expects
    hdrs = {
        "User-Agent": UA,
        "Accept": "text/html, */*; q=0.01",
        "Referer": START,
        "HX-Request": "true",
        "HX-Target": "mc_1",
        "HX-Current-URL": START,
        # The site sets "X-Screen" to 'M' (mobile) or 'D' (desktop) via JS.
        # We can just send 'D' so we get desktop list markup.
        "X-Screen": "D",
    }
    if csrf:
        hdrs["X-CSRFToken"] = csrf

    url = LIST if not page or page == 1 else f"{LIST}?page={page}"
    r = session.get(url, headers=hdrs, timeout=20)
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")

def parse_jobs(soup):
    jobs = []
    rows = soup.select("li.list-group-item")
    for row in rows:
        a = row.select_one("a.stretched-link")
        if not a:
            continue
        title = a.get_text(strip=True)
        hx_get = a.get("hx-get")  # detail endpoint lives here
        url = urljoin(BASE, hx_get) if hx_get else None

        time_ago_el = row.select_one(".hstack .flex-shrink-0.text-body-secondary small")
        time_ago = time_ago_el.get_text(strip=True) if time_ago_el else None

        loc_el = row.select_one(".hstack .text-end small")
        location = loc_el.get_text(strip=True) if loc_el else None

        sal_el = row.select_one(".text-success-emphasis")
        salary = sal_el.get_text(strip=True) if sal_el else None

        jobs.append({
            "title": title or None,
            "url": url,
            "location": location or None,
            "time_ago": time_ago or None,
            "salary": salary or None,
        })
    return jobs

def scrape_all_jobs(max_pages=None, pause=0.5):
    session, csrf = get_csrf_and_session()
    all_jobs = []
    page = 1
    while True:
        soup = get_list_page(session, page=page, csrf=csrf)
        chunk = parse_jobs(soup)
        if not chunk:
            break
        all_jobs.extend(chunk)

        # If there’s another page, there will usually be a sentinel li with hx-get="?page=N"
        # We can just try the next integer page; stop when empty.
        page += 1
        if max_pages and page > max_pages:
            break
        time.sleep(pause)  # be polite

    # de-duplicate by URL
    seen = set()
    unique = []
    for j in all_jobs:
        key = j.get("url")
        if key and key not in seen:
            seen.add(key)
            unique.append(j)
    return unique

if __name__ == "__main__":
    jobs = scrape_all_jobs()  # add max_pages=3 to test quickly
    print(f"Collected {len(jobs)} jobs")
    for i, j in enumerate(jobs, 1):
        print(f"{i:03d}. {j['title']} | {j.get('location') or '—'} | {j.get('time_ago') or '—'} | {j.get('salary') or '—'}")
        print(f"     {j['url']}")


Collected 100 jobs
001. Software Development Engineer 2-6 | India - Bangalore - Remote Office[R] | 4h ago | —
     https://foorilla.com/hiring/jobs/software-development-engineer-2-6-india-bangalore-remote-office-819036/
002. IT Infrastructure Engineer | Hong Kong | 4h ago | —
     https://foorilla.com/hiring/jobs/it-infrastructure-engineer-hong-kong-819211/
003. Senior Software Engineer | INBLR02 - Bangalore - Milesstone Buildcon, … | 4h ago | —
     https://foorilla.com/hiring/jobs/senior-software-engineer-inblr02-bangalore-milesstone-buildcon-india-819781/
004. Software System Engineer | Shenyang - PIC, China | 4h ago | —
     https://foorilla.com/hiring/jobs/software-system-engineer-shenyang-pic-china-819978/
005. Junior Automation Tester | RO - BUCHAREST BULEVARDUL ION MIHALACHE … | 4h ago | —
     https://foorilla.com/hiring/jobs/junior-automation-tester-ro-bucharest-bulevardul-ion-mihalache-15-17-romania-820019/
006. Staff Facilities Engineer (Electrical) | Singapore - Woodlands 

In [23]:
import re
import time
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

BASE = "https://foorilla.com"
START = f"{BASE}/hiring/"
LIST = f"{BASE}/hiring/jobs/"

UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0.0.0 Safari/537.36"
)

def get_csrf_and_session():
    s = requests.Session()
    s.headers.update({"User-Agent": UA})
    r = s.get(START, timeout=20)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    body_str = str(soup.find("body") or "")
    m = re.search(r'"X-CSRFToken"\s*:\s*"([^"]+)"', body_str)
    csrf = m.group(1) if m else None
    return s, csrf

def get_list_page(session, page=None, csrf=None):
    hdrs = {
        "User-Agent": UA,
        "Accept": "text/html, */*; q=0.01",
        "Referer": START,
        "HX-Request": "true",
        "HX-Target": "mc_1",
        "HX-Current-URL": START,
        "X-Screen": "D",
    }
    if csrf:
        hdrs["X-CSRFToken"] = csrf
    url = LIST if not page or page == 1 else f"{LIST}?page={page}"
    r = session.get(url, headers=hdrs, timeout=20)
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")

def parse_jobs(soup):
    jobs = []
    rows = soup.select("li.list-group-item")
    for row in rows:
        a = row.select_one("a.stretched-link")
        if not a:
            continue
        title = a.get_text(strip=True)
        hx_get = a.get("hx-get")
        url = urljoin(BASE, hx_get) if hx_get else None
        jobs.append({"title": title or None, "url": url})
    return jobs

def extract_job_info(job_url):
    """Extract full job details from its detail page."""
    try:
        r = requests.get(job_url, headers={"User-Agent": UA}, timeout=20)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")

        job_title = soup.select_one("h1").get_text(strip=True) if soup.select_one("h1") else None
        company = soup.select_one(".text-body-secondary a")
        company_name = company.get_text(strip=True) if company else None
        description_el = soup.select_one(".job-description") or soup.select_one(".lead")
        description = description_el.get_text(strip=True) if description_el else None

        return {
            "job_title": job_title,
            "company": company_name,
            "description": description,
            "url": job_url
        }
    except Exception as e:
        print(f"Error fetching {job_url}: {e}")
        return None

def scrape_all_jobs(max_pages=None, pause=0.5):
    session, csrf = get_csrf_and_session()
    all_jobs = []
    page = 1
    while True:
        soup = get_list_page(session, page=page, csrf=csrf)
        chunk = parse_jobs(soup)
        if not chunk:
            break
        all_jobs.extend(chunk)
        page += 1
        if max_pages and page > max_pages:
            break
        time.sleep(pause)
    return all_jobs

if __name__ == "__main__":
    job_listings = scrape_all_jobs()
    job_info_list = []

    for job in job_listings:
        job_url = job.get("url")
        if not job_url:
            continue
        job_info = extract_job_info(job_url)
        try:
            print(job_info["job_title"])
            job_info_list.append(job_info)
        except Exception:
            print(f"Could not extract info from: {job_url}")
            continue

    print(f"\n✅ Extracted {len(job_info_list)} job details.")


None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None

✅ Extracted 100 job details.


In [25]:
import re
import time
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

BASE = "https://foorilla.com"
START = f"{BASE}/hiring/"
LIST  = f"{BASE}/hiring/jobs/"

UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0.0.0 Safari/537.36"
)

def get_session_and_csrf():
    s = requests.Session()
    s.headers.update({"User-Agent": UA})
    r = s.get(START, timeout=20)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    body = soup.find("body")
    csrf = None
    if body:
        m = re.search(r'"X-CSRFToken"\s*:\s*"([^"]+)"', str(body))
        if m:
            csrf = m.group(1)
    return s, csrf

def htmx_headers(csrf=None, target="mc_1"):
    hdrs = {
        "User-Agent": UA,
        "Accept": "text/html, */*; q=0.01",
        "Referer": START,
        "HX-Request": "true",
        "HX-Target": target,
        "HX-Current-URL": START,
        "X-Screen": "D",
    }
    if csrf:
        hdrs["X-CSRFToken"] = csrf
    return hdrs

def get_list_page(session, csrf, page=1):
    url = LIST if page == 1 else f"{LIST}?page={page}"
    r = session.get(url, headers=htmx_headers(csrf, target="mc_1"), timeout=20)
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")

def parse_list_items(soup):
    jobs = []
    for row in soup.select("li.list-group-item"):
        a = row.select_one("a.stretched-link")
        if not a:
            continue
        title = a.get_text(strip=True)
        hx_get = a.get("hx-get")  # detail endpoint (relative)
        url = urljoin(BASE, hx_get) if hx_get else None

        time_ago_el = row.select_one(".hstack .flex-shrink-0.text-body-secondary small")
        time_ago = time_ago_el.get_text(strip=True) if time_ago_el else None

        loc_el = row.select_one(".hstack .text-end small")
        location = loc_el.get_text(strip=True) if loc_el else None

        sal_el = row.select_one(".text-success-emphasis")
        salary = sal_el.get_text(strip=True) if sal_el else None

        jobs.append({
            "listing_title": title or None,
            "url": url,
            "listing_location": location or None,
            "listing_time_ago": time_ago or None,
            "listing_salary": salary or None,
        })
    return jobs

def extract_job_info(session, csrf, job_url):
    """
    Fetch the job detail *fragment* with HTMX headers (target=mc_2),
    then parse generously for title/company/description/apply link.
    """
    r = session.get(job_url, headers=htmx_headers(csrf, target="mc_2"), timeout=20)
    r.raise_for_status()
    s = BeautifulSoup(r.text, "html.parser")

    # Titles can be h1/h2/h3 or a prominent link
    title_el = s.select_one("h1, h2, h3, .job-title, .card-title, .h3")
    job_title = title_el.get_text(strip=True) if title_el else None

    # Company often shows in a muted text block with a link
    company_el = s.select_one(".text-body-secondary a, .company a, a[rel*=nofollow]")
    company = company_el.get_text(strip=True) if company_el else None

    # Description: try a few common containers
    desc_el = s.select_one(".job-description, article, .content, .prose, .markdown, .card-body")
    description = desc_el.get_text(" ", strip=True) if desc_el else None

    # Apply link: look for obvious external links
    apply_el = s.select_one("a[href*='apply'], a[href*='careers'], a[target='_blank']")
    apply_link = apply_el.get("href") if apply_el else None
    if apply_link and apply_link.startswith("/"):
        apply_link = urljoin(BASE, apply_link)

    # Location/salary sometimes repeated in detail; fall back to listing if missing
    loc_el = s.select_one(".text-end small, .location, .badge:has(svg)")
    location = loc_el.get_text(strip=True) if loc_el else None

    sal_el = s.select_one(".text-success-emphasis, .salary, .badge.text-success-emphasis")
    salary = sal_el.get_text(strip=True) if sal_el else None

    return {
        "job_title": job_title,
        "company": company,
        "description": description,
        "apply_link": apply_link,
        "location": location,
        "salary": salary,
        "detail_url": job_url,
    }

def scrape_all_jobs(max_pages=None, pause=0.5):
    session, csrf = get_session_and_csrf()
    all_jobs = []
    page = 1
    while True:
        soup = get_list_page(session, csrf, page=page)
        chunk = parse_list_items(soup)
        if not chunk:
            break
        all_jobs.extend(chunk)
        page += 1
        if max_pages and page > max_pages:
            break
        time.sleep(pause)
    return session, csrf, all_jobs

if __name__ == "__main__":
    # 1) collect all listing URLs via HTMX
    session, csrf, job_listings = scrape_all_jobs()
    print(f"Found {len(job_listings)} listings")

    # 2) your requested loop to fetch & store detail info
    job_info_list = []
    for j in job_listings:
        job_url = j.get("url")
        if not job_url:
            continue
        info = extract_job_info(session, csrf, job_url)
        # merge list metadata with detail info, and print title
        if info and info.get("job_title"):
            info.update({
                "listing_title": j.get("listing_title"),
                "listing_location": j.get("listing_location"),
                "listing_time_ago": j.get("listing_time_ago"),
                "listing_salary": j.get("listing_salary"),
            })
            print(info["job_title"])
            job_info_list.append(info)
        else:
            print(f"Could not extract info from: {job_url}")
            continue

    print(f"\n✅ Extracted {len(job_info_list)} job details.")


Found 100 listings
Software Development Engineer 2-6
IT Infrastructure Engineer
Senior Software Engineer
Software System Engineer
Junior Automation Tester
Staff Facilities Engineer (Electrical)
Java Fullstack Developer - 2-4 Years
Senior Technical Support Engineer (Integration)
Teamcenter Systems Administrator (Onsite)
Database Engineer- Sr. Consultant level- NoSQL
Software Engineer OOP (Hybrid)
Senior Solution Engineer, Insurance
Service Now Developer
Cloud Infrastructure Engineer - Hybrid Tempe
Cybersecurity Engineer, M&A Automation
Senior Data Scientist
Staff Software Engineer
Software Engineer
Consultant- Data Analyst
Senior Data Engineer II
Senior Software Engineer II , Retail Pricing
Senior Software Engineer II, Commercial & Wealth
Staff Full-Stack Software Engineer, Dev Agent Tools
Business Analyst II (Data & Compliance Analyst II) - Legacy, IS Operational Experience
Staff Software Engineer, Lending
Staff Software Engineer, Tokenization
Senior Staff Test Engineer / Team Leader
A

In [27]:
import pandas as pd
df = pd.DataFrame(job_info_list)
df.head()

Unnamed: 0,job_title,company,description,apply_link,location,salary,detail_url,listing_title,listing_location,listing_time_ago,listing_salary
0,Software Development Engineer 2-6,@ ...,,https://foorilla.com/hiring/jobs/crahcVTeWfYpX...,India - Bangalore - Remote Office[R],USD 158K-210K,https://foorilla.com/hiring/jobs/software-deve...,Software Development Engineer 2-6,India - Bangalore - Remote Office[R],4h ago,
1,IT Infrastructure Engineer,@ ...,,https://foorilla.com/hiring/jobs/huZ0Ey1wRFuF3...,"Hong Kong, Hong Kong",,https://foorilla.com/hiring/jobs/it-infrastruc...,IT Infrastructure Engineer,Hong Kong,4h ago,
2,Senior Software Engineer,@ ...,,https://foorilla.com/hiring/jobs/vWTDGd6YSM6nm...,"INBLR02 - Bangalore - Milesstone Buildcon, …",,https://foorilla.com/hiring/jobs/senior-softwa...,Senior Software Engineer,"INBLR02 - Bangalore - Milesstone Buildcon, …",4h ago,
3,Software System Engineer,@ ...,,https://foorilla.com/hiring/jobs/Pv3MdopQXSFpw...,"Shenyang - PIC, China",,https://foorilla.com/hiring/jobs/software-syst...,Software System Engineer,"Shenyang - PIC, China",4h ago,
4,Junior Automation Tester,@ ...,,https://foorilla.com/hiring/jobs/CbBm2MRQuus8c...,UK - LONDON 25 ROPEMAKER STREET …,,https://foorilla.com/hiring/jobs/junior-automa...,Junior Automation Tester,RO - BUCHAREST BULEVARDUL ION MIHALACHE …,4h ago,


In [30]:
import re
import json
import time
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

BASE = "https://foorilla.com"
START = f"{BASE}/hiring/"
LIST  = f"{BASE}/hiring/jobs/"

UA = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0.0.0 Safari/537.36"
)

def get_session_and_csrf():
    s = requests.Session()
    s.headers.update({"User-Agent": UA})
    r = s.get(START, timeout=20)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    csrf = None
    body = soup.find("body")
    if body:
        m = re.search(r'"X-CSRFToken"\s*:\s*"([^"]+)"', str(body))
        if m: csrf = m.group(1)
    return s, csrf

def htmx_headers(csrf=None, target="mc_1"):
    hdrs = {
        "User-Agent": UA,
        "Accept": "text/html, */*; q=0.01",
        "Referer": START,
        "HX-Request": "true",
        "HX-Target": target,
        "HX-Current-URL": START,
        "X-Screen": "D",
    }
    if csrf:
        hdrs["X-CSRFToken"] = csrf
    return hdrs

def get_list_page(session, csrf, page=1):
    url = LIST if page == 1 else f"{LIST}?page={page}"
    r = session.get(url, headers=htmx_headers(csrf, target="mc_1"), timeout=20)
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")

def parse_list_items(soup):
    jobs = []
    for row in soup.select("li.list-group-item"):
        a = row.select_one("a.stretched-link")
        if not a: 
            continue
        title = a.get_text(strip=True)
        hx_get = a.get("hx-get")
        url = urljoin(BASE, hx_get) if hx_get else None
        jobs.append({"listing_title": title or None, "url": url})
    return jobs

def _parse_json_ld_for_salary(soup):
    """
    Look for <script type="application/ld+json"> with JobPosting schema,
    and extract baseSalary.value.minValue/maxValue if present.
    """
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except Exception:
            continue

        # Sometimes it's a list of things
        candidates = data if isinstance(data, list) else [data]
        for obj in candidates:
            if not isinstance(obj, dict):
                continue
            # Heuristics to find JobPosting nodes
            t = obj.get("@type")
            if t == "JobPosting" or (isinstance(t, list) and "JobPosting" in t):
                job_data = obj
                salary_obj = (job_data.get("baseSalary") or {}).get("value") or {}
                # Some sites use "minValue"/"maxValue"; others use "value" only
                salary_min = salary_obj.get("minValue")
                salary_max = salary_obj.get("maxValue")
                if salary_min or salary_max:
                    return salary_min, salary_max
    return None, None

def _parse_visible_salary_text(soup):
    """
    Fallback: read visible salary text like 'USD 110K-162K' and map to min/max numbers (best effort).
    Returns strings if parsing is uncertain.
    """
    badge = soup.select_one(".text-success-emphasis, .salary, .badge.text-success-emphasis")
    text = badge.get_text(" ", strip=True) if badge else None
    if not text:
        return None, None

    # Try to normalize formats like 'USD 110K-162K' or '$110,000–$162,000'
    # Extract two numbers, allowing K/M suffix
    m = re.findall(r'(\d+(?:[\d,]*)(?:\.\d+)?)([KkMm]?)', text)
    vals = []
    for num, suf in m:
        n = float(num.replace(",", ""))
        if suf.lower() == 'k':
            n *= 1_000
        elif suf.lower() == 'm':
            n *= 1_000_000
        vals.append(int(n))
    if len(vals) >= 2:
        return vals[0], vals[1]
    return None, None

def extract_job_info(session, csrf, job_url):
    """
    Fetch the job detail fragment with HTMX headers (target=mc_2),
    parse title/company/description, and salary_min/max as requested.
    """
    r = session.get(job_url, headers=htmx_headers(csrf, target="mc_2"), timeout=20)
    r.raise_for_status()
    s = BeautifulSoup(r.text, "html.parser")

    # Title
    title_el = s.select_one("h1, h2, h3, .job-title, .card-title, .h3")
    job_title = title_el.get_text(strip=True) if title_el else None

    # Company (best-effort)
    company_el = s.select_one(".text-body-secondary a, .company a, a[rel*=nofollow]")
    company_name = company_el.get_text(strip=True) if company_el else None

    # Description
    desc_el = s.select_one(".job-description, article, .content, .prose, .markdown, .card-body")
    job_description = desc_el.get_text(" ", strip=True) if desc_el else None

    # Salary via JSON-LD first
    salary_min, salary_max = _parse_json_ld_for_salary(s)
    if salary_min is None and salary_max is None:
        # Fallback to visible text badge
        salary_min, salary_max = _parse_visible_salary_text(s)

    # If still None, populate 'N/A' to match your requested shape
    salary_min = salary_min if salary_min is not None else 'N/A'
    salary_max = salary_max if salary_max is not None else 'N/A'

    return {
        'company_name': company_name,
        'job_title': job_title,
        'job_description': job_description,
        'salary_min': salary_min,
        'salary_max': salary_max,
    }

def scrape_all_jobs(max_pages=None, pause=0.5):
    session, csrf = get_session_and_csrf()
    all_jobs = []
    page = 1
    while True:
        soup = get_list_page(session, csrf, page=page)
        chunk = parse_list_items(soup)
        if not chunk:
            break
        all_jobs.extend(chunk)
        page += 1
        if max_pages and page > max_pages:
            break
        time.sleep(pause)
    return session, csrf, all_jobs

if __name__ == "__main__":
    session, csrf, job_listings = scrape_all_jobs()
    print(f"Found {len(job_listings)} listings")

    job_info_list = []
    for j in job_listings:
        job_url = j.get("url")
        if not job_url:
            continue
        info = extract_job_info(session, csrf, job_url)
        try:
            print(info["job_title"], info["company_name"], info["job_description"] )
            job_info_list.append(info)
        except Exception:
            print(f"Could not extract info from: {job_url}")
            continue

    print(f"\n✅ Extracted {len(job_info_list)} job details.")


Found 100 listings
Software Development Engineer 2-6 @ ... None
Junior Automation Tester @ ... None
Staff Facilities Engineer (Electrical) @ ... None
Senior Software Engineer @ ... None
IT Infrastructure Engineer @ ... None
Software System Engineer @ ... None
Java Fullstack Developer - 2-4 Years @ ... None
Senior Technical Support Engineer (Integration) @ ... None
Teamcenter Systems Administrator (Onsite) @ ... None
Database Engineer- Sr. Consultant level- NoSQL @ ... None
Software Engineer OOP (Hybrid) @ ... None
Senior Solution Engineer, Insurance @ ... None
Service Now Developer @ ... None
Cloud Infrastructure Engineer - Hybrid Tempe @ ... None
Cybersecurity Engineer, M&A Automation @ ... None
Senior Data Scientist @ ... None
Staff Software Engineer @ ... None
Software Engineer @ ... None
Consultant- Data Analyst @ ... None
Senior Data Engineer II @ ... None
Senior Software Engineer II , Retail Pricing @ ... None
Senior Software Engineer II, Commercial & Wealth @ ... None
Staff Full

In [29]:
import pandas as pd
df = pd.DataFrame(job_info_list)
df.head()

Unnamed: 0,company_name,job_title,job_description,salary_min,salary_max
0,@ ...,Software Development Engineer 2-6,,158000.0,210000.0
1,@ ...,Junior Automation Tester,,,
2,@ ...,Staff Facilities Engineer (Electrical),,,
3,@ ...,Senior Software Engineer,,,
4,@ ...,IT Infrastructure Engineer,,,


In [32]:
import re, json, html
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def _clean_html_text(s, keep_newlines=False):
    """Collapse HTML to readable text."""
    if s is None:
        return None
    # unescape & strip
    t = BeautifulSoup(s, "html.parser").get_text("\n" if keep_newlines else " ", strip=True)
    # squash whitespace
    t = re.sub(r"[ \t\r\f\v]+", " ", t)
    t = re.sub(r"\n{3,}", "\n\n", t)
    return t.strip() or None

def _extract_from_json_ld(soup):
    """
    Return (company_name, job_title, job_description, salary_min, salary_max)
    from JobPosting JSON-LD if present; otherwise all Nones.
    """
    for script in soup.find_all("script", type="application/ld+json"):
        raw = script.string or ""
        try:
            data = json.loads(raw)
        except Exception:
            continue
        nodes = data if isinstance(data, list) else [data]
        for obj in nodes:
            if not isinstance(obj, dict):
                continue
            t = obj.get("@type")
            if t == "JobPosting" or (isinstance(t, list) and "JobPosting" in t):
                # company / title / description
                company = None
                org = obj.get("hiringOrganization")
                if isinstance(org, dict):
                    company = org.get("name") or org.get("legalName")
                title = obj.get("title")
                desc = obj.get("description")
                desc = _clean_html_text(desc, keep_newlines=True)

                # salary
                salary_min = salary_max = None
                base = obj.get("baseSalary") or {}
                if isinstance(base, dict):
                    val = base.get("value")
                    if isinstance(val, dict):
                        salary_min = val.get("minValue")
                        salary_max = val.get("maxValue")

                return company, title, desc, salary_min, salary_max
    return None, None, None, None, None

def _visible_salary_min_max(soup):
    badge = soup.select_one(".text-success-emphasis, .salary, .badge.text-success-emphasis")
    txt = badge.get_text(" ", strip=True) if badge else None
    if not txt:
        return None, None
    nums = []
    for num, suf in re.findall(r'(\d+(?:[\d,]*)(?:\.\d+)?)([KkMm]?)', txt):
        n = float(num.replace(",", ""))
        if suf.lower() == 'k':
            n *= 1_000
        elif suf.lower() == 'm':
            n *= 1_000_000
        nums.append(int(n))
    if len(nums) >= 2:
        return nums[0], nums[1]
    return None, None

def _fallback_company(soup):
    # Prefer company directory links; ignore bare '@' icons
    cand = soup.select_one('a[href*="/hiring/companies/"], .company a, .text-body-secondary a')
    if cand:
        txt = cand.get_text(strip=True)
        if txt and txt != "@":
            return txt
    # Try og:site_name (last resort)
    meta = soup.find("meta", attrs={"property": "og:site_name"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    return None

def _fallback_title(soup):
    h = soup.select_one("h1, h2, h3, .job-title, .card-title, .h3")
    return h.get_text(strip=True) if h else None

def _fallback_description(soup):
    el = soup.select_one(".job-description, article, .content, .prose, .markdown, .card-body")
    if el:
        return _clean_html_text(str(el), keep_newlines=True)
    # fallback to meta description
    meta = soup.find("meta", attrs={"name": "description"}) or soup.find("meta", attrs={"property": "og:description"})
    if meta and meta.get("content"):
        return meta["content"].strip()
    return None

def extract_job_info(session, csrf, job_url):
    """
    Fetch the HTMX detail fragment (target mc_2) and extract:
      company_name, job_title, job_description, salary_min, salary_max
    """
    hdrs = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html, */*; q=0.01",
        "Referer": "https://foorilla.com/hiring/",
        "HX-Request": "true",
        "HX-Target": "mc_2",
        "HX-Current-URL": "https://foorilla.com/hiring/",
        "X-Screen": "D",
    }
    if csrf:
        hdrs["X-CSRFToken"] = csrf

    r = session.get(job_url, headers=hdrs, timeout=20)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    # 1) Prefer JSON-LD (most reliable)
    comp_json, title_json, desc_json, smin_json, smax_json = _extract_from_json_ld(soup)

    # 2) Fallbacks for each field
    company_name   = comp_json or _fallback_company(soup)
    job_title      = title_json or _fallback_title(soup)
    job_description= desc_json or _fallback_description(soup)

    salary_min, salary_max = smin_json, smax_json
    if salary_min is None and salary_max is None:
        salary_min, salary_max = _visible_salary_min_max(soup)

    # Normalize to your requested shape
    return {
        'company_name': company_name or 'N/A',
        'job_title': job_title or 'N/A',
        'job_description': job_description or 'N/A',
        'salary_min': salary_min if salary_min is not None else 'N/A',
        'salary_max': salary_max if salary_max is not None else 'N/A',
    }

if __name__ == "__main__":
    session, csrf, job_listings = scrape_all_jobs()
    print(f"Found {len(job_listings)} listings")

    job_info_list = []
    for j in job_listings:
        job_url = j.get("url")
        if not job_url:
            continue
        info = extract_job_info(session, csrf, job_url)
        try:
            print(info["job_title"], info["company_name"], info["job_description"] )
            job_info_list.append(info)
        except Exception:
            print(f"Could not extract info from: {job_url}")
            continue

    print(f"\n✅ Extracted {len(job_info_list)} job details.")

Found 100 listings
Software Development Engineer 2-6 @ ... N/A
Junior Automation Tester @ ... N/A
Staff Facilities Engineer (Electrical) @ ... N/A
Senior Software Engineer @ ... N/A
Software System Engineer @ ... N/A
Java Fullstack Developer - 2-4 Years @ ... N/A
IT Infrastructure Engineer @ ... N/A
Senior Technical Support Engineer (Integration) @ ... N/A
Teamcenter Systems Administrator (Onsite) @ ... N/A
Database Engineer- Sr. Consultant level- NoSQL @ ... N/A
Software Engineer OOP (Hybrid) @ ... N/A
Senior Solution Engineer, Insurance @ ... N/A
Service Now Developer @ ... N/A
Cloud Infrastructure Engineer - Hybrid Tempe @ ... N/A
Cybersecurity Engineer, M&A Automation @ ... N/A
Senior Data Scientist @ ... N/A
Staff Software Engineer @ ... N/A
Software Engineer @ ... N/A
Consultant- Data Analyst @ ... N/A
Senior Data Engineer II @ ... N/A
Senior Software Engineer II , Retail Pricing @ ... N/A
Senior Software Engineer II, Commercial & Wealth @ ... N/A
Staff Full-Stack Software Engine

In [33]:
import re, json, time, os
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

BASE  = "https://foorilla.com"
START = f"{BASE}/hiring/"
LIST  = f"{BASE}/hiring/jobs/"
UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")

def session_and_csrf():
    s = requests.Session()
    s.headers.update({"User-Agent": UA})
    r = s.get(START, timeout=20)
    r.raise_for_status()
    m = re.search(r'"X-CSRFToken"\s*:\s*"([^"]+)"', r.text)
    csrf = m.group(1) if m else None
    return s, csrf

def htmx_headers(csrf=None, target="mc_1"):
    h = {
        "User-Agent": UA,
        "Accept": "text/html, */*; q=0.01",
        "Referer": START,
        "HX-Request": "true",
        "HX-Target": target,
        "HX-Current-URL": START,
        "X-Screen": "D",
        # some servers key off this too
        "X-Requested-With": "XMLHttpRequest",
    }
    if csrf: h["X-CSRFToken"] = csrf
    return h

def list_page(s, csrf, page=1):
    url = LIST if page == 1 else f"{LIST}?page={page}"
    r = s.get(url, headers=htmx_headers(csrf, "mc_1"), timeout=20)
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")

def parse_list(soup):
    out = []
    for row in soup.select("li.list-group-item"):
        a = row.select_one("a.stretched-link")
        if not a: continue
        hx = a.get("hx-get")
        if not hx: continue
        out.append(urljoin(BASE, hx))
    return out

def clean_text(s, keep_n=False):
    if not s: return None
    t = BeautifulSoup(s, "html.parser").get_text("\n" if keep_n else " ", strip=True)
    t = re.sub(r"[ \t\r\f\v]+", " ", t)
    t = re.sub(r"\n{3,}", "\n\n", t)
    return t.strip() or None

def from_json_ld(soup):
    for tag in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(tag.string or "")
        except Exception:
            continue
        nodes = data if isinstance(data, list) else [data]
        for obj in nodes:
            if not isinstance(obj, dict): continue
            t = obj.get("@type")
            if t == "JobPosting" or (isinstance(t, list) and "JobPosting" in t):
                org = obj.get("hiringOrganization") or {}
                company = org.get("name") or org.get("legalName")
                title = obj.get("title")
                desc  = clean_text(obj.get("description"), keep_n=True)
                base  = obj.get("baseSalary") or {}
                val   = base.get("value") if isinstance(base, dict) else {}
                smin  = (val or {}).get("minValue")
                smax  = (val or {}).get("maxValue")
                return company, title, desc, smin, smax
    return None, None, None, None, None

def visible_salary_minmax(soup):
    badge = soup.select_one(".text-success-emphasis, .salary, .badge.text-success-emphasis")
    txt = badge.get_text(" ", strip=True) if badge else None
    if not txt: return None, None
    nums = []
    for num, suf in re.findall(r'(\d+(?:[\d,]*)(?:\.\d+)?)([KkMm]?)', txt):
        n = float(num.replace(",", ""))
        if suf.lower() == "k": n *= 1_000
        elif suf.lower() == "m": n *= 1_000_000
        nums.append(int(n))
    if len(nums) >= 2: return nums[0], nums[1]
    return None, None

def fallback_company(soup):
    cand = soup.select_one('a[href*="/hiring/companies/"], .company a, .text-body-secondary a')
    if cand:
        txt = cand.get_text(strip=True)
        if txt and txt != "@":
            return txt
    meta = soup.find("meta", attrs={"property": "og:site_name"})
    return meta.get("content").strip() if meta and meta.get("content") else None

def fallback_title(soup):
    h = soup.select_one("h1, h2, h3, .job-title, .card-title, .h3")
    if h: 
        t = h.get_text(strip=True)
        if t: return t
    meta = soup.find("meta", attrs={"property": "og:title"})
    return meta.get("content").strip() if meta and meta.get("content") else None

def fallback_desc(soup):
    el = soup.select_one(".job-description, article, .content, .prose, .markdown, .card-body")
    if el:
        return clean_text(str(el), keep_n=True)
    meta = soup.find("meta", attrs={"name": "description"}) or soup.find("meta", attrs={"property": "og:description"})
    return meta.get("content").strip() if meta and meta.get("content") else None

def fetch_detail_fragment(s, csrf, url, debug_slug=None):
    r = s.get(url, headers=htmx_headers(csrf, "mc_2"), timeout=20)
    r.raise_for_status()
    html = r.text
    if debug_slug:
        with open(f"debug_detail_{debug_slug}.html", "w", encoding="utf-8") as f:
            f.write(html)
    return BeautifulSoup(html, "html.parser")

def extract_job_info(s, csrf, job_url, debug_slug=None):
    soup = fetch_detail_fragment(s, csrf, job_url, debug_slug=debug_slug)

    # 1) JSON-LD first
    c_json, t_json, d_json, smin_json, smax_json = from_json_ld(soup)

    # 2) Fallbacks
    company = c_json or fallback_company(soup) or "N/A"
    title   = t_json or fallback_title(soup) or "N/A"
    desc    = d_json or fallback_desc(soup) or "N/A"

    smin, smax = smin_json, smax_json
    if smin is None and smax is None:
        smin, smax = visible_salary_minmax(soup)

    return {
        "company_name": company,
        "job_title": title,
        "job_description": desc,
        "salary_min": smin if smin is not None else "N/A",
        "salary_max": smax if smax is not None else "N/A",
    }

# ---- run a small end-to-end to verify and produce debug files ----
if __name__ == "__main__":
    s, csrf = session_and_csrf()
    soup = list_page(s, csrf, page=1)
    job_urls = parse_list(soup)
    if not job_urls:
        print("No job URLs found on page 1.")
        raise SystemExit(1)

    # save first 3 detail fragments for inspection
    for idx, ju in enumerate(job_urls[:3], 1):
        info = extract_job_info(s, csrf, ju, debug_slug=f"{idx}")
        print(idx, info)

    # your requested loop
    job_info_list = []
    for ju in job_urls:
        info = extract_job_info(s, csrf, ju)
        try:
            print(info["job_title"])
            job_info_list.append(info)
        except Exception:
            print(f"Could not extract info from: {ju}")
            continue

    print(f"\n✅ Extracted {len(job_info_list)} job details.")


1 {'company_name': '@ ...', 'job_title': 'Software Development Engineer 2-6', 'job_description': 'N/A', 'salary_min': 158000, 'salary_max': 210000}
2 {'company_name': '@ ...', 'job_title': 'Staff Facilities Engineer (Electrical)', 'job_description': 'N/A', 'salary_min': 'N/A', 'salary_max': 'N/A'}
3 {'company_name': '@ ...', 'job_title': 'Senior Software Engineer', 'job_description': 'N/A', 'salary_min': 'N/A', 'salary_max': 'N/A'}
Software Development Engineer 2-6
Staff Facilities Engineer (Electrical)
Senior Software Engineer
Java Fullstack Developer - 2-4 Years
Junior Automation Tester
IT Infrastructure Engineer
Software System Engineer
Senior Technical Support Engineer (Integration)
Teamcenter Systems Administrator (Onsite)
Database Engineer- Sr. Consultant level- NoSQL
Software Engineer OOP (Hybrid)
Senior Solution Engineer, Insurance
Service Now Developer
Cloud Infrastructure Engineer - Hybrid Tempe
Cybersecurity Engineer, M&A Automation
Senior Data Scientist
Staff Software Engin

In [36]:
import re
import time
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup

BASE  = "https://foorilla.com"
START = f"{BASE}/hiring/"
LIST  = f"{BASE}/hiring/jobs/"
UA = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
      "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36")

# ---------- helpers ----------
def session_and_csrf():
    s = requests.Session()
    s.headers.update({"User-Agent": UA})
    r = s.get(START, timeout=20)
    r.raise_for_status()
    m = re.search(r'"X-CSRFToken"\s*:\s*"([^"]+)"', r.text)
    csrf = m.group(1) if m else None
    return s, csrf

def htmx_headers(csrf=None, target="mc_1"):
    h = {
        "User-Agent": UA,
        "Accept": "text/html, */*; q=0.01",
        "Referer": START,
        "HX-Request": "true",
        "HX-Target": target,
        "HX-Current-URL": START,
        "X-Screen": "D",
        "X-Requested-With": "XMLHttpRequest",
    }
    if csrf: h["X-CSRFToken"] = csrf
    return h

def list_page(s, csrf, page=1):
    url = LIST if page == 1 else f"{LIST}?page={page}"
    r = s.get(url, headers=htmx_headers(csrf, "mc_1"), timeout=20)
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")

def parse_list_items(soup):
    """Return list of dicts: {url, listing_salary, listing_location, listing_title}"""
    out = []
    for row in soup.select("li.list-group-item"):
        a = row.select_one("a.stretched-link")
        if not a: 
            continue
        hx_get = a.get("hx-get")
        if not hx_get:
            continue
        url = urljoin(BASE, hx_get)
        title = a.get_text(strip=True)

        # badge salary text like "USD 135K-180K"
        sal_el = row.select_one(".text-success-emphasis")
        listing_salary = sal_el.get_text(" ", strip=True) if sal_el else None

        loc_el = row.select_one(".hstack .text-end small")
        location = loc_el.get_text(strip=True) if loc_el else None

        out.append({
            "url": url,
            "listing_title": title,
            "listing_salary": listing_salary,
            "listing_location": location
        })
    return out

def fetch_detail_fragment_html(s, csrf, detail_url):
    r = s.get(detail_url, headers=htmx_headers(csrf, "mc_2"), timeout=20)
    r.raise_for_status()
    return r.text  # <-- THIS is fragment_html

def parse_salary_range_text(text):
    if not text:
        return None, None
    nums = []
    for num, suf in re.findall(r'(\d+(?:[\d,]*)(?:\.\d+)?)([KkMm]?)', text):
        n = float(num.replace(",", ""))
        if suf.lower() == 'k': n *= 1_000
        elif suf.lower() == 'm': n *= 1_000_000
        nums.append(int(n))
    if len(nums) >= 2:
        return nums[0], nums[1]
    return None, None

def infer_company_from_domain(url):
    try:
        host = urlparse(url).netloc.lower()
        host = re.sub(r'^(jobs?|careers?|boards?|app|apply)\.', '', host)
        ats_hosts = ('greenhouse.io','lever.co','workday.com','myworkdayjobs.com',
                     'smartrecruiters.com','ashbyhq.com','bamboohr.com','jobvite.com',
                     'icims.com','eightfold.ai','recruitee.com')
        if any(h in host for h in ats_hosts):
            return None
        parts = host.split('.')
        core = parts[-2] if len(parts) >= 2 else parts[0]
        core = re.sub(r'[^a-z0-9]+', ' ', core).strip()
        return core.title() if core else None
    except Exception:
        return None

# ---------- extraction from fragment ----------
def extract_job_info_from_fragment(fragment_html, listing_salary_text=None):
    soup = BeautifulSoup(fragment_html, "html.parser")

    # Title
    h = soup.select_one("h1, h2, h3, .job-title, .card-title, .h3")
    job_title = h.get_text(strip=True) if h else None

    # Company often anonymized as '@ ...'
    company_link = soup.select_one(".hstack strong a, .text-body-secondary a, .company a")
    company_text = (company_link.get_text(strip=True) if company_link else None)
    company_name = None if (not company_text or company_text.startswith('@')) else company_text

    # Build description from visible sections
    parts = []
    def get_section(label_regex, next_selector="ul"):
        label = soup.find(string=re.compile(label_regex, re.I))
        if label:
            parent = label.find_parent()
            if parent:
                nxt = parent.find_next(next_selector)
                if nxt:
                    if next_selector == "ul":
                        items = [li.get_text(" ", strip=True) for li in nxt.find_all("li")]
                        return "; ".join(items) if items else nxt.get_text(" ", strip=True)
                    else:
                        return nxt.get_text(" ", strip=True)
        return None

    tasks = get_section(r"^\s*Tasks:\s*$", "ul")
    if tasks and tasks != "N/A":
        parts.append("Tasks: " + tasks)

    skills = get_section(r"Skills/Tech-stack required:", "div")
    if skills and skills != "N/A":
        parts.append("Skills: " + skills)

    perks = get_section(r"Perks/Benefits:", "ul")
    if perks and perks != "N/A":
        parts.append("Perks: " + perks)

    job_description = "\n".join(parts) if parts else None

    # Salary min/max from listing badge (preferred), else from fragment badge
    salary_text = listing_salary_text
    if not salary_text:
        badge = soup.select_one(".text-success-emphasis, .salary, .badge.text-success-emphasis")
        salary_text = badge.get_text(" ", strip=True) if badge else None
    salary_min, salary_max = parse_salary_range_text(salary_text)

    # Apply link (for potential company inference)
    apply_el = soup.select_one('a.btn.btn-primary[href*="/apply/"]')
    apply_link = urljoin(BASE, apply_el["href"]) if apply_el and apply_el.has_attr("href") else None

    return {
        'company_name': company_name,  # may be None if anonymized
        'job_title': job_title,
        'job_description': job_description,
        'salary_min': salary_min if salary_min is not None else 'N/A',
        'salary_max': salary_max if salary_max is not None else 'N/A',
        'apply_link': apply_link
    }

def resolve_apply_destination(session, apply_url):
    try:
        r = session.head(apply_url, headers={"User-Agent": UA, "Referer": START}, allow_redirects=True, timeout=20)
        return r.url
    except Exception:
        try:
            r = session.get(apply_url, headers={"User-Agent": UA, "Referer": START}, allow_redirects=True, timeout=20)
            return r.url
        except Exception:
            return None

# ---------- main ----------
if __name__ == "__main__":
    s, csrf = session_and_csrf()

    # get first N pages (remove max_pages to crawl all)
    all_listings = []
    page = 1
    while True:
        soup = list_page(s, csrf, page=page)
        chunk = parse_list_items(soup)
        if not chunk:
            break
        all_listings.extend(chunk)
        page += 1
        # optional: limit pages while testing
        if page > 3:
            break
        time.sleep(0.4)

    print(f"Listings collected: {len(all_listings)}")

    job_info_list = []
    for j in all_listings:
        job_url = j["url"]
        # === THIS populates fragment_html ===
        fragment_html = fetch_detail_fragment_html(s, csrf, job_url)

        info = extract_job_info_from_fragment(
            fragment_html,
            listing_salary_text=j.get("listing_salary")
        )

        # If company hidden, try to infer via Apply redirect
        if not info['company_name'] and info.get('apply_link'):
            dest = resolve_apply_destination(s, info['apply_link'])
            inferred = infer_company_from_domain(dest) if dest else None
            if inferred:
                info['company_name'] = inferred

        # Print and store
        print(info['job_title'], "|", info['company_name'] or "N/A",
              "|", info['salary_min'], "-", info['salary_max'])
        job_info_list.append({
            'company_name': info['company_name'] or 'N/A',
            'job_title': info['job_title'] or 'N/A',
            'job_description': info['job_description'] or 'N/A',
            'salary_min': info['salary_min'],
            'salary_max': info['salary_max']
        })

    print(f"\n✅ Extracted {len(job_info_list)} jobs.")


Listings collected: 100
Software Development Engineer 2-6 | N/A | 158000 - 210000
Staff Facilities Engineer (Electrical) | N/A | N/A - N/A
Senior Software Engineer | N/A | N/A - N/A
Java Fullstack Developer - 2-4 Years | N/A | N/A - N/A
IT Infrastructure Engineer | N/A | N/A - N/A
Software System Engineer | N/A | N/A - N/A
Junior Automation Tester | N/A | N/A - N/A
Senior Technical Support Engineer (Integration) | N/A | N/A - N/A
Teamcenter Systems Administrator (Onsite) | N/A | 105000 - 125000
Database Engineer- Sr. Consultant level- NoSQL | N/A | 135000 - 196000
Software Engineer OOP (Hybrid) | N/A | 73000 - 85000
Senior Solution Engineer, Insurance | N/A | 269000 - 364000
Service Now Developer | N/A | N/A - N/A
Cloud Infrastructure Engineer - Hybrid Tempe | N/A | 126000 - 200000
Cybersecurity Engineer, M&A Automation | N/A | 116000 - 164000
Senior Data Scientist | N/A | 117000 - 165000
Staff Software Engineer | N/A | 149000 - 204000
Software Engineer | N/A | 85000 - 110000
Consultan

In [37]:
df = pd.DataFrame(job_info_list)
df.head()

Unnamed: 0,company_name,job_title,job_description,salary_min,salary_max
0,,Software Development Engineer 2-6,Tasks: * Build and maintain scalable microserv...,158000.0,210000.0
1,,Staff Facilities Engineer (Electrical),Tasks: * Conduct load analysis and risk assess...,,
2,,Senior Software Engineer,Tasks: * Adapt to changing project priorities;...,,
3,,Java Fullstack Developer - 2-4 Years,Tasks: * Assess risks and ensure compliance; *...,,
4,,IT Infrastructure Engineer,Tasks: * Apply patches and updates; * Build an...,,


In [38]:
df.to_csv("data/ai_job_data.csv", index=False)

In [39]:
df = pd.read_csv("data/ai_job_data.csv")
df = df.dropna()
df.head()

Unnamed: 0,company_name,job_title,job_description,salary_min,salary_max
61,Adp,Ops Test Analyst - China Lake CA,Tasks: * Analyze pre-test predictions and data...,83000.0,132000.0
63,Adp,Backup and Recovery Administrator,Tasks: * Administer backup and recovery operat...,85000.0,90000.0
67,Livehire,Associate Flood Modeller / Hydrologist,Tasks: * Conduct flood impact and waterway ass...,123000.0,123000.0
68,Adp,"Senior Manager, Retail Analytics","Tasks: * Collaborate with marketing, finance, ...",100000.0,125000.0
69,Adp,Cyber Threat Analyst 1,Tasks: * Analyze security events in SIEM envir...,107000.0,120000.0


In [42]:
def standardize_job_title(title):
    """
    Standardizes a given job title string by returning a predefined canonical job title without adding seniority levels.

    Args:
        title (str): The job title string to be standardized.

    Returns:
        str: A standardized job title based on predefined categories, handling compound roles
             and specialized AI-related job functions.
    """
    title = title.lower()
    
    # Define keyword mappings for specialized roles
    role_mappings = {
        "AI Scientist": ["ai scientist", "ml scientist", "ai/ml scientist"],
        "Prompt Engineer": ["prompt engineer"],
        "Deep Learning Engineer": ["deep learning", "dl engineer", "deep learning systems"],
        "Data Scientist": ["data scientist", "decision scientist", "analytics scientist"],
        "Decision Science Analyst": ["decision science", "decision analyst"],
        "Software Engineer": ["software engineer", "full stack", "backend", "frontend", "performance engineer"],
        "Data Engineer": ["data engineer", "etl developer", "pipeline engineer", "spark", "scala", "aws"],
        "ML Ops Engineer": ["ml ops", "model development", "ml operations", "applied data science"],
        "Security Engineer": ["security engineer", "data security"],
        "Electrical Engineer": ["electrical design", "data center design"],
        "Manager": ["manager", "product manager", "project manager"],
        "Director": ["director", "head of", "vp", "vice president"],
        "Intern": ["intern", "trainee"],
        "Researcher": ["researcher", "research engineer"],
    }

    # Check for keywords in the title and return the corresponding standardized title
    for standard_title, keywords in role_mappings.items():
        if any(keyword in title for keyword in keywords):
            return standard_title
    
    # Return "Other" if no match is found
    return "Other"

In [45]:
# Apply the regex-based function to the job_title column
df['standardized_job_title'] = df['job_title'].apply(standardize_job_title)

print("Num raw job titles:", df['job_title'].nunique())
print("Num standardized job titles:", df['standardized_job_title'].nunique())

# sum max salary for each standardized job title
s_jobs = df.groupby('standardized_job_title')['salary_max'].mean()
s_jobs = s_jobs.sort_values()

# convert to dataframe
df_jobs = s_jobs.reset_index()
df_jobs.columns = ["Job Title", "Mean Salary"]
df_jobs.head()


def extract_skills(description):
    """
    Extracts AI-related skills from a given job description.

    Args:
        description (str): The job description text to search for skills.

    Returns:
        list: A list of skills found in the job description, matched from a predefined set of common AI-related skills.

    Notes:
        - The function defines a list of common AI-related skills, including programming languages, frameworks,
          cloud platforms, and statistical concepts.
        - The input description is converted to lowercase to ensure case-insensitive matching.
        - Skills are detected using regular expressions to match whole words, avoiding partial matches (e.g., 
          "spark" will not match "sparking").
    """
    
    # Define a list of common AI-related skills
    skills_list = [
        "python", "r", "java", "c++", "sql", "scala", "spark", "hadoop", "tensorflow", "pytorch",
        "keras", "scikit-learn", "machine learning", "deep learning", "nlp", "natural language processing",
        "computer vision", "data analysis", "data engineering", "big data", "ai", "artificial intelligence",
        "cloud", "aws", "azure", "gcp", "docker", "kubernetes", "linux", "flask", "django", "pandas",
        "numpy", "matplotlib", "seaborn", "plotly", "etl", "api", "statistics", "probability", "regression",
        "classification", "clustering", "time series", "neural networks", "bayesian methods", "git", "mlops"
    ]

    description = description.lower()
    found_skills = [skill for skill in skills_list if re.search(rf"\b{re.escape(skill)}\b", description)]
    
    return found_skills

Num raw job titles: 10
Num standardized job titles: 2


In [47]:
# Apply the function to extract skills from each job description
df['extracted_skills'] = df['job_description'].apply(lambda x: extract_skills(str(x)))
df['extracted_skills'].head()

# create a list with all the skills from the JDs
all_skills = [skill for skills in df['extracted_skills'] for skill in skills]

# count skill occurances
skill_counts = Counter(all_skills)

# Convert the skill counts to a DataFrame
df_skills = pd.DataFrame(skill_counts.items(), columns=["Skill", "Count"]).sort_values(by="Count")
df_skills.head()

Unnamed: 0,Skill,Count
2,r,1
7,etl,1
6,aws,1
4,ai,1
9,kubernetes,1


In [48]:
from collections import Counter

import plotly.express as px
from dash import dcc, html, Dash

# Create the plots
bar_chart = dcc.Graph(
    id='top-roles',
    figure=px.bar(df.sort_values(by='salary_max', ascending=False), 
                  x='standardized_job_title', 
                  y='salary_max', 
                  color='company_name', 
                  title='Highest Paying AI Jobs',
                  labels={'salary_max': 'Maximum Salary', 'job_title': 'Job Title'},
                  height=500)
)

top_jobs_chart = dcc.Graph(
    id='top-jobs-chart',
    figure=px.bar(
        df_jobs[-5:],
        y='Job Title',      
        x='Mean Salary',
        title='Top 5 Roles',
        height=250
    ).update_layout(
        xaxis_title_font_size=12,    # Reduce x-axis label font size
        yaxis_title_font_size=12,    # Reduce y-axis label font size
        xaxis_tickfont_size=10,      # Reduce x-axis tick label font size
        yaxis_tickfont_size=10       # Reduce y-axis tick label font size
    )
)

top_skills_chart = dcc.Graph(
    id='top-skills-chart',
    figure=px.bar(
        df_skills[-5:],
        y='Skill',      
        x='Count',
        title='Top 5 Skills',
        height=250
    ).update_layout(
        xaxis_title_font_size=12,    # Reduce x-axis label font size
        yaxis_title_font_size=12,    # Reduce y-axis label font size
        xaxis_tickfont_size=10,      # Reduce x-axis tick label font size
        yaxis_tickfont_size=10       # Reduce y-axis tick label font size
    )
)

In [55]:
# Initialize the Dash app
app = Dash(__name__)

# App layout with styled divs for positioning
app.layout = html.Div([
    html.Div(bar_chart, style={'width': '70%', 'display': 'inline-block', 'vertical-align': 'top'}),
    html.Div([
        html.Div(top_jobs_chart, style={'height': '50%'}),
        html.Div(top_skills_chart, style={'height': '50%'})
    ], style={'width': '30%', 'display': 'inline-block', 'vertical-align': 'top'})
])

# Run the app
#app.run(jupyter_mode="external")

app.run(host="0.0.0.0", port=9000, debug=False)