In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Lower-case brand names of large product companies. A company whose name
# contains one of these as a whole word is classified as "Product MNC".
PRODUCT_MNCS = {
    "amazon", "google", "microsoft", "oracle", "meta", "facebook", "apple",
    "salesforce", "adobe", "sap", "vmware", "intuit", "uber", "netflix",
    "paypal",
}


def categorize(name):
    """Classify a company name as "Product MNC", "MNC" or "Startup".

    Args:
        name: Company name as scraped from a job listing.

    Returns:
        "Product MNC" if any whole word of the name (case-insensitive) is in
        PRODUCT_MNCS; otherwise "MNC" if the name contains one of the generic
        corporate keywords; otherwise "Startup".
    """
    n = name.lower()
    # Compare whole words, not substrings: a plain `in` test would flag
    # "Metadata Labs" (meta), "Kubernetes Works" (uber) or "Pineapple Foods"
    # (apple) as product MNCs. Splitting on every non-alphanumeric character
    # also keeps punctuation ("VMware, Inc.") from hiding a brand name.
    words = "".join(ch if ch.isalnum() else " " for ch in n).split()
    if not PRODUCT_MNCS.isdisjoint(words):
        return "Product MNC"
    # Generic keywords stay substring matches so that compound names such as
    # "Infotech" or "Technologies" are still recognised.
    if any(x in n for x in ("tech", "labs", "solutions", "services", "analytics", "systems")):
        return "MNC"
    return "Startup"

def scrape_adzuna():
    """Scrape ETL developer job listings from the Adzuna India search page.

    Returns:
        A list of dicts with the keys "Title", "Company", "CompanyType",
        "Source" and "Link". The list is empty when the request fails, the
        server answers with an HTTP error status, or no job card matches the
        expected CSS selectors.
    """
    from urllib.parse import urljoin

    url = "https://www.adzuna.in/search/etl%20developer%20informatica%20talend%20sql"
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # requests has no default timeout; bound the wait so a stalled server
        # cannot hang the whole run.
        resp = requests.get(url, headers=headers, timeout=15)
        # Surface 4xx/5xx responses instead of parsing an error page as if it
        # were a result list.
        resp.raise_for_status()
    except requests.RequestException as exc:
        print(f"⚠️ Adzuna request failed: {exc}")
        return []

    soup = BeautifulSoup(resp.text, "html.parser")
    jobs = []
    for card in soup.select(".job__card"):
        t = card.select_one("a.job__title")
        c = card.select_one(".job__company")
        if t is None or c is None:
            continue
        # An anchor without an href would raise KeyError on t["href"]; skip it.
        href = t.get("href")
        if not href:
            continue
        company = c.text.strip()
        jobs.append({
            "Title": t.text.strip(),
            "Company": company,
            "CompanyType": categorize(company),
            "Source": "Adzuna",
            # Resolve relative hrefs against the page URL; absolute hrefs are
            # returned unchanged by urljoin.
            "Link": urljoin(url, href),
        })
    return jobs

def scrape_glassdoor():
    """Scrape ETL developer job listings from the Glassdoor India search page.

    Returns:
        A list of dicts with the keys "Title", "Company", "CompanyType",
        "Source" and "Link". The list is empty when the request fails, the
        server answers with an HTTP error status, or no listing element
        carries the expected data attributes.
    """
    from urllib.parse import urljoin

    base = "https://www.glassdoor.co.in"
    url = "https://www.glassdoor.co.in/Job/india-etl-developer-informatica-and-talend-jobs-SRCH_IL.0,5_IN115_KO6,42.htm"
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # requests has no default timeout; bound the wait so a stalled server
        # cannot hang the whole run.
        resp = requests.get(url, headers=headers, timeout=15)
        # Surface 4xx/5xx responses instead of parsing an error page as if it
        # were a result list.
        resp.raise_for_status()
    except requests.RequestException as exc:
        print(f"⚠️ Glassdoor request failed: {exc}")
        return []

    soup = BeautifulSoup(resp.text, "html.parser")
    jobs = []
    for card in soup.select(".react-job-listing"):
        title = card.get("data-job-title")
        company = card.get("data-company-name")
        link = card.get("data-url")
        if not (title and company and link):
            continue
        company = company.strip()
        jobs.append({
            "Title": title.strip(),
            "Company": company,
            # Classify the stripped name, as the Adzuna scraper does.
            "CompanyType": categorize(company),
            "Source": "Glassdoor",
            # urljoin copes with root-relative, slash-less and already
            # absolute data-url values; plain concatenation only handled the
            # first.
            "Link": urljoin(base, link),
        })
    return jobs

def main():
    """Collect listings from both sources, rank them and export to Excel.

    Listings are ordered by company type (Product MNC, then MNC, then
    Startup) and, within a type, by source name. The result is written to
    ETL_Jobs_List.xlsx. If neither scraper returns anything, a notice is
    printed and no file is written.
    """
    listings = [*scrape_adzuna(), *scrape_glassdoor()]
    if len(listings) == 0:
        print("❌ No jobs found — site structure may have changed.")
        return

    # Lower number sorts first.
    priority = {"Product MNC":0, "MNC":1, "Startup":2}
    ranked = (
        pd.DataFrame(listings)
        .assign(Rank=lambda frame: frame["CompanyType"].map(priority))
        .sort_values(["Rank", "Source"])
        .drop(columns="Rank")  # helper column is only needed for ordering
    )
    ranked.to_excel("ETL_Jobs_List.xlsx", index=False)
    print(f"✅ Found and saved {len(ranked)} job listings to ETL_Jobs_List.xlsx")

# Run the scrape-and-export pipeline only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()


❌ No jobs found — site structure may have changed.
