In [1]:
import json
import os
import time
import uuid
from datetime import datetime, timezone
from urllib.parse import quote_plus

import requests
from azure.storage.blob import BlobServiceClient


In [2]:
# ---------- ADZUNA API ----------
# Credentials are read from the environment so they are never stored in (or
# committed with) the notebook. Export ADZUNA_APP_ID and ADZUNA_APP_KEY before
# starting the kernel; a missing variable raises KeyError naming it.
# NOTE(review): the keys that used to be hard-coded in this cell should be
# treated as leaked and rotated.
APP_ID = os.environ["ADZUNA_APP_ID"]
APP_KEY = os.environ["ADZUNA_APP_KEY"]

# ---------- AZURE ----------
# Full storage-account connection string (contains the account key), also
# supplied via the environment.
connection_string = os.environ["AZURE_STORAGE_CONNECTION_STRING"]
container_name = "jobdata"

# ---------- JOB QUERY SETTINGS ----------
# You can change 'what' to different roles/keywords or leave generic to collect many categories
what = ""          # empty = all, or "data analyst", "software engineer" etc.
results_per_page = 50   # max per page
pages_per_cycle = 2     # how many pages per city iteration (increase to collect more at once)

# Time control
num_cycles = 3     # how many cycles (rounds) you want to run
interval_between_requests = 1  # seconds between each API call to respect rate limits

# Connect to Azure
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(container_name)
print("Azure container client ready ->", container_name)


Azure container client ready -> jobdata


In [3]:
# Cities passed as the "where" filter of each Adzuna search.
# This is a starting set; extend the list to widen coverage.
cities = [
    "Delhi", "Mumbai", "Bengaluru", "Hyderabad", "Ahmedabad",
    "Chennai", "Kolkata", "Pune", "Jaipur", "Surat",
    "Lucknow", "Kanpur", "Nagpur", "Indore", "Bhopal",
    "Visakhapatnam", "Patna", "Vadodara", "Ghaziabad", "Ludhiana",
    "Agra", "Nashik", "Faridabad", "Meerut", "Rajkot",
    "Vasai-Virar", "Varanasi", "Srinagar", "Aurangabad", "Dhanbad",
    "Amritsar", "Navi Mumbai", "Howrah", "Ranchi", "Gwalior",
    "Jabalpur", "Coimbatore", "Vijayawada", "Madurai", "Raipur",
    "Kota", "Chandigarh", "Guwahati", "Solapur", "Hubli-Dharwad",
    "Mysore", "Noida", "Jamshedpur", "Bhilai", "Cuttack",
    "Kochi", "Dehradun",
]


In [4]:
def fetch_jobs_for_city(city, page=1, results_per_page=50, what=""):
    """
    Fetch one page of Adzuna (India) search results for a city.

    Args:
        city: value sent as the "where" query parameter.
        page: page number placed in the URL path.
        results_per_page: value sent as "results_per_page".
        what: keyword filter sent as "what" (empty string sends no keyword).

    Returns:
        The list found under "results" in the JSON response. An empty list is
        returned (after printing a warning) when the request fails, the status
        is not 200, or the body is not a JSON object, so one bad call does not
        abort a long ingestion run.
    """
    url = f"https://api.adzuna.com/v1/api/jobs/in/search/{page}"
    params = {
        "app_id": APP_ID,
        "app_key": APP_KEY,
        "results_per_page": results_per_page,
        "what": what,
        "where": city,
        "content-type": "application/json"
    }
    # requests handles the query-string encoding of params.
    try:
        resp = requests.get(url, params=params, timeout=15)
    except requests.RequestException as exc:
        # Log only the exception type: requests' error messages can embed the
        # full request URL, which includes app_key.
        print(f"Warning: request failed for {city} page {page}: {type(exc).__name__}")
        return []

    if resp.status_code != 200:
        print(f"Warning: status {resp.status_code} for {city} page {page}: {resp.text[:200]}")
        return []

    try:
        data = resp.json()
    except ValueError:
        # Body was not valid JSON (e.g. an HTML error page served with 200).
        print(f"Warning: non-JSON response for {city} page {page}: {resp.text[:200]}")
        return []

    if not isinstance(data, dict):
        return []
    # "results" may be absent or null; callers always get a list.
    return data.get("results") or []


In [5]:
def clean_title(title):
    """
    Strip a trailing location suffix such as " - Bangalore" or " – Delhi".

    The title is cut at the first " - " (spaced hyphen); only if none is
    present is it cut at the first " – " (spaced en dash). Titles without
    either separator are returned unchanged.

    Non-string values (e.g. None when the API sends a null title) are returned
    as-is instead of raising TypeError.
    """
    if not isinstance(title, str):
        return title
    # Hyphen is checked before en dash, matching the original precedence.
    for separator in (" - ", " – "):
        if separator in title:
            return title.split(separator, 1)[0]
    return title


def normalize_job_result(job):
    """
    Flatten one raw Adzuna job dict into the record stored in blob storage.

    Args:
        job: a single entry from the API's "results" list.

    Returns:
        dict with the keys job_id, title, company, location, salary_min,
        salary_max, contract_type, category, description_snippet, created,
        redirect_url, skills, source and ingested_at. Fields missing from
        ``job`` come back as None; ``title`` is passed through clean_title.
    """
    # `or` guards cover keys that are present but null: dict.get's default
    # only applies when the key is missing entirely.
    company = job.get("company") or {}
    location = job.get("location") or {}
    category = job.get("category") or {}

    return {
        "job_id": job.get("id"),
        "title": clean_title(job.get("title") or ""),
        "company": company.get("display_name"),
        "location": location.get("display_name"),
        "salary_min": job.get("salary_min"),
        "salary_max": job.get("salary_max"),
        "contract_type": job.get("contract_type"),
        "category": category.get("label"),  # INDUSTRY
        "description_snippet": job.get("description"),
        "created": job.get("created"),
        "redirect_url": job.get("redirect_url"),
        "skills": job.get("tags"),
        "source": "adzuna",
        # UTC timestamp; datetime.utcnow() is deprecated, so an aware "now" is
        # formatted instead (same output format as before).
        "ingested_at": datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    }


In [6]:
def upload_json_to_blob(obj, prefix="jobs"):
    """
    Serialize ``obj`` to JSON and upload it as a blob of its own.

    The blob name is "<prefix>_<job_id>_<uuid4>.json", passed through
    quote_plus before upload. The upload goes through the module-level
    ``container_client`` with overwrite=True.

    Returns:
        The (quoted) blob name that was uploaded.
    """
    payload = json.dumps(obj, ensure_ascii=False)
    job_id = obj.get('job_id', '')
    # A fresh UUID4 is generated on every call and made part of the name.
    raw_name = f"{prefix}_{job_id}_{uuid.uuid4()}.json"
    blob_name = quote_plus(raw_name)
    container_client.upload_blob(blob_name, payload, overwrite=True)
    return blob_name


In [7]:
# Ingestion driver: for each cycle, walk every city and every result page,
# normalise each returned job and upload it as its own blob.
print("Starting Adzuna -> Azure job ingestion loop")

count = 0  # running total of uploaded job files across all cycles
for cycle in range(num_cycles):
    print(f"\n=== Cycle {cycle+1}/{num_cycles} ===")
    for city in cities:
        # Blob prefix depends only on the city, so build it once per city.
        blob_prefix = f"adzuna_job_{city.replace(' ','_')}"
        for page in range(1, pages_per_cycle + 1):
            jobs = fetch_jobs_for_city(
                city,
                page=page,
                results_per_page=results_per_page,
                what=what,
            )
            if jobs:
                for job in jobs:
                    normalized = normalize_job_result(job)
                    blob_name = upload_json_to_blob(normalized, prefix=blob_prefix)
                    count += 1
                    if not count % 50:
                        print(f"Uploaded {count} job files so far...")
            # Pause after every API call, whether or not it returned jobs,
            # to respect rate limits.
            time.sleep(interval_between_requests)
    print(f"Cycle {cycle+1} done. Total uploaded so far: {count}")

print(f"\nFinished. Total job files uploaded: {count}")


Starting Adzuna -> Azure job ingestion loop

=== Cycle 1/3 ===


  "ingested_at": datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")


Uploaded 50 job files so far...
Uploaded 100 job files so far...
Uploaded 150 job files so far...
Uploaded 200 job files so far...
Uploaded 250 job files so far...
Uploaded 300 job files so far...
Uploaded 350 job files so far...
Uploaded 400 job files so far...
Uploaded 450 job files so far...
Uploaded 500 job files so far...
Uploaded 550 job files so far...
Uploaded 600 job files so far...
Uploaded 650 job files so far...
Uploaded 700 job files so far...
Uploaded 750 job files so far...
Uploaded 800 job files so far...
Uploaded 850 job files so far...
Uploaded 900 job files so far...
Uploaded 950 job files so far...
Uploaded 1000 job files so far...
Uploaded 1050 job files so far...
Uploaded 1100 job files so far...
Uploaded 1150 job files so far...
Uploaded 1200 job files so far...
Uploaded 1250 job files so far...
Uploaded 1300 job files so far...
Uploaded 1350 job files so far...
Uploaded 1400 job files so far...
Uploaded 1450 job files so far...
Uploaded 1500 job files so far...
