In [19]:
# ==============================================
# Cell 1: Imports
# ==============================================
import html
import os
import re

import numpy as np
import pandas as pd
import psycopg2
import requests
import sqlalchemy
from sqlalchemy import create_engine, text

In [20]:
# Record the installed SQLAlchemy version in the notebook output.
print(sqlalchemy.__version__)

2.0.39


In [2]:
# ==============================================
# Cell 2: PostgreSQL setup
# ==============================================

def _make_engine(user, password, host, database):
    """Create a PostgreSQL engine and verify that it can actually connect.

    The URL is assembled with ``URL.create`` rather than string formatting so
    that credentials containing URL-reserved characters (``@``, ``/``, ``:``)
    are escaped correctly.

    Args:
        user: Database role name.
        password: Password for ``user``.
        host: Host name, optionally followed by ``:port``.
        database: Name of the database to open.

    Returns:
        A SQLAlchemy ``Engine`` on which one test query has succeeded.

    Raises:
        sqlalchemy.exc.SQLAlchemyError: If the server cannot be reached or
            rejects the credentials.
    """
    hostname, _, port = host.partition(":")
    url = sqlalchemy.engine.URL.create(
        "postgresql+psycopg2",
        username=user,
        password=password,
        host=hostname,
        port=int(port) if port else None,
        database=database,
    )
    engine = create_engine(url)
    # create_engine() is lazy: run a trivial query so the "Connected" message
    # printed by the caller is only reached when a connection really works.
    with engine.connect() as conn:
        conn.execute(text("SELECT 1"))
    return engine


# Local DB setup (Marcus)
# Each setting can be overridden through an environment variable of the same
# name; the literal is only the fallback when the variable is not set.
L_DB_USER = os.getenv("L_DB_USER", '********')
L_DB_PASSWORD = os.getenv("L_DB_PASSWORD", '********1')
L_DB_HOST = os.getenv("L_DB_HOST", 'localhost')
L_DB_NAME = os.getenv("L_DB_NAME", 'Interim_Project_DB')

L_engine = _make_engine(L_DB_USER, L_DB_PASSWORD, L_DB_HOST, L_DB_NAME)
print("Connected to PostgreSQL")

# Online DB setup
OL_DB_USER = os.getenv("OL_DB_USER", '********')
OL_DB_PASSWORD = os.getenv("OL_DB_PASSWORD", '********')
OL_DB_HOST = os.getenv("OL_DB_HOST", 'jde08-ip-p2-angbj1976-c47c.c.aivencloud.com:15241')
OL_DB_NAME = os.getenv("OL_DB_NAME", 'Interim_Project_DB')

OL_engine = _make_engine(OL_DB_USER, OL_DB_PASSWORD, OL_DB_HOST, OL_DB_NAME)
print("Connected to Online PostgreSQL")

Connected to PostgreSQL
Connected to Online PostgreSQL


In [3]:
# ==============================================
# Cell 3: Adzuna API setup and fetch
# ==============================================

# === 1. Parameter setup for fetch job
# Credentials are read from the environment (ADZUNA_APP_ID / ADZUNA_APP_KEY)
# so they do not have to be stored in the notebook. The literals below are only
# a fallback for when the variables are not set.
# NOTE(review): once the environment variables are in place, remove the
# fallback literals and rotate the key, since it has been saved in this file.
APP_ID = os.getenv("ADZUNA_APP_ID", "9cda74c2")       # Replace with your actual App ID
APP_KEY = os.getenv("ADZUNA_APP_KEY", "861aa79c8d5a02718d174aeead5ab710")   # Replace with your actual App Key

# === 2. Function to fetch job results ===
def fetch_adzuna_jobs(country_code="sg", pages=5, results_per_page=20, timeout=30):
    """Download job listings from the Adzuna search API, page by page.

    Paging stops early, keeping whatever was collected so far, when a page
    cannot be fetched (network error, timeout, non-200 status, invalid JSON)
    or when a page contains no results.

    Args:
        country_code: Country segment used in the Adzuna URL (e.g. "sg").
        pages: Number of pages to request, starting at page 1.
        results_per_page: Page size sent to the API.
        timeout: Seconds to wait for each HTTP request. Without a timeout
            ``requests`` may block indefinitely on an unresponsive server.

    Returns:
        A list with the raw result dictionaries of all fetched pages.
    """
    all_results = []

    for page in range(1, pages + 1):
        url = f"https://api.adzuna.com/v1/api/jobs/{country_code}/search/{page}"

        params = {
            "app_id": APP_ID,
            "app_key": APP_KEY,
            "results_per_page": results_per_page,
            "content-type": "application/json"
        }

        try:
            response = requests.get(url, params=params, timeout=timeout)
        except requests.RequestException as exc:
            # Same policy as for HTTP errors below: report and stop paging.
            print(f"Error on page {page}: {exc}")
            break

        if response.status_code != 200:
            print(f"Error on page {page}: HTTP {response.status_code}")
            break

        try:
            data = response.json()
        except ValueError:
            # The JSON decode errors raised by response.json() derive from ValueError.
            print(f"Error on page {page}: response was not valid JSON")
            break

        results = data.get("results", [])
        if not results:
            print(f"No more results at page {page}. Stopping.")
            break

        all_results.extend(results)
        print(f"Fetched {len(results)} items from page {page}")

    return all_results


# === 3. Convert results to DataFrame ===
def results_to_dataframe(results):
    """Flatten raw Adzuna result dictionaries into a tabular DataFrame.

    ``company``, ``location`` and ``category`` are reduced to a single display
    value when they arrive as nested dictionaries; any other value is passed
    through unchanged.

    Args:
        results: Iterable of job dictionaries as returned by the API.

    Returns:
        A DataFrame with the columns title, company, location, salary_min,
        salary_max, category, date_posted, url and description. The columns
        are present even when ``results`` is empty, so downstream column
        selections do not fail on an empty crawl.
    """
    columns = [
        "title", "company", "location", "salary_min", "salary_max",
        "category", "date_posted", "url", "description",
    ]

    def nested_value(job, key, field):
        # Return job[key][field] for dict-valued entries, else job[key] as-is.
        value = job.get(key)
        return value.get(field) if isinstance(value, dict) else value

    records = [
        {
            "title": job.get("title"),
            "company": nested_value(job, "company", "display_name"),
            "location": nested_value(job, "location", "display_name"),
            "salary_min": job.get("salary_min"),
            "salary_max": job.get("salary_max"),
            "category": nested_value(job, "category", "label"),
            "date_posted": job.get("created"),
            "url": job.get("redirect_url"),
            "description": job.get("description"),
        }
        for job in results
    ]

    return pd.DataFrame(records, columns=columns)


# === 4. Run the job fetch + convert to DataFrame ===
# Country codes to crawl, written the way Adzuna expects them in the URL
# (e.g. "sg", "au", "uk", "us").
countries = ["sg", "au"]

# Collect the raw results of every country into one flat list.
all_results = []
for country in countries:
    print(f"Fetching jobs for {country.upper()}")
    results = fetch_adzuna_jobs(country_code=country, pages=5)
    all_results += results

job_df = results_to_dataframe(all_results)

# Show dataset
job_df.head()


Fetching jobs for SG
Fetched 20 items from page 1
Fetched 20 items from page 2
Fetched 20 items from page 3
Fetched 20 items from page 4
Fetched 20 items from page 5
Fetching jobs for AU
Fetched 20 items from page 1
Fetched 20 items from page 2
Fetched 20 items from page 3
Fetched 20 items from page 4
Fetched 20 items from page 5


Unnamed: 0,title,company,location,salary_min,salary_max,category,date_posted,url,description
0,UNPAID VOLUNTEER - Deputy Director of Learning...,Blockchain & Climate Institute/ BCI America Inc.,Singapore,,,Consultancy Jobs,2025-09-22T20:46:33Z,https://www.adzuna.sg/details/5410578355?utm_m...,Role Title : Deputy Director of Capacity Build...
1,Senior Principal- Telco,Infosys Singapore & Australia,"Central, Singapore",,,Consultancy Jobs,2023-09-29T20:08:52Z,https://www.adzuna.sg/details/4339263959?utm_m...,Infosys Consulting is the management consultin...
2,Digital Marketing Specialist,Tembusu Institute Pte. Ltd.,Singapore,33600.0,42000.0,"PR, Advertising & Marketing Jobs",2025-10-28T21:56:19Z,https://www.adzuna.sg/details/5468182210?utm_m...,Tembusu Institute Pte. Ltd. is looking to hire...
3,QAQC Engineer - Trackwork/Railway - URGENT,Talent Trader Group,"Central, Singapore",48000.0,55000.0,Engineering Jobs,2024-03-06T15:19:56Z,https://www.adzuna.sg/details/4595025158?utm_m...,Responsibilities: Plan inspection programs and...
4,Junior Sous Chef,Capella Singapore,Singapore,,,Hospitality & Catering Jobs,2025-11-23T06:00:03Z,https://www.adzuna.sg/details/5509912199?utm_m...,Position Overview The Junior Sous Chef assist ...


In [5]:
# Sanity check on the crawl: frame dimensions, then the smallest and largest
# value of each salary bound.
print(job_df.shape)
job_df[["salary_min", "salary_max"]].agg(["min", "max"])

(200, 9)


Unnamed: 0,salary_min,salary_max
min,0.0,1.0
max,500000.0,300000.0


In [6]:
# Persist the crawled jobs as CSV (the row index is not written)
job_df.to_csv("adzuna_jobs.csv", index=False)
print("Saved to adzuna_jobs.csv")


Saved to adzuna_jobs.csv


In [7]:
# Load the crawled raw data back from the CSV snapshot
df = pd.read_csv("adzuna_jobs.csv")

print("Raw DF loaded. Shape:", df.shape)
df.head()


Raw DF loaded. Shape: (200, 9)


Unnamed: 0,title,company,location,salary_min,salary_max,category,date_posted,url,description
0,UNPAID VOLUNTEER - Deputy Director of Learning...,Blockchain & Climate Institute/ BCI America Inc.,Singapore,,,Consultancy Jobs,2025-09-22T20:46:33Z,https://www.adzuna.sg/details/5410578355?utm_m...,Role Title : Deputy Director of Capacity Build...
1,Senior Principal- Telco,Infosys Singapore & Australia,"Central, Singapore",,,Consultancy Jobs,2023-09-29T20:08:52Z,https://www.adzuna.sg/details/4339263959?utm_m...,Infosys Consulting is the management consultin...
2,Digital Marketing Specialist,Tembusu Institute Pte. Ltd.,Singapore,33600.0,42000.0,"PR, Advertising & Marketing Jobs",2025-10-28T21:56:19Z,https://www.adzuna.sg/details/5468182210?utm_m...,Tembusu Institute Pte. Ltd. is looking to hire...
3,QAQC Engineer - Trackwork/Railway - URGENT,Talent Trader Group,"Central, Singapore",48000.0,55000.0,Engineering Jobs,2024-03-06T15:19:56Z,https://www.adzuna.sg/details/4595025158?utm_m...,Responsibilities: Plan inspection programs and...
4,Junior Sous Chef,Capella Singapore,Singapore,,,Hospitality & Catering Jobs,2025-11-23T06:00:03Z,https://www.adzuna.sg/details/5509912199?utm_m...,Position Overview The Junior Sous Chef assist ...


In [8]:
# Smallest and largest value of each salary bound in the reloaded data
df[["salary_min", "salary_max"]].agg(["min", "max"])

Unnamed: 0,salary_min,salary_max
min,0.0,1.0
max,500000.0,300000.0


In [9]:
# ==============================================
# Cell 4 : Standardizing of the Salary data to annual format
# ==============================================

# -----------------------------
# 1. Function to infer frequency
# -----------------------------
def infer_frequency(value, monthly_threshold=20000):
    """
    Infer whether a salary value is monthly or annual.

    Values strictly below ``monthly_threshold`` are treated as monthly,
    everything else as annual. Missing values are reported as "annual".

    Args:
        value: Salary amount, or a missing value (None / NaN).
        monthly_threshold: Cut-off between monthly and annual amounts.
            Defaults to 20,000.

    Returns:
        "monthly" or "annual".
    """
    if pd.isna(value):
        return "annual"
    return "monthly" if value < monthly_threshold else "annual"


# -----------------------------
# 2. Function to convert to annual
# -----------------------------
def to_annual(value, frequency):
    """
    Express a salary amount on a yearly basis.

    A missing amount yields NaN. An amount flagged as "monthly" is
    multiplied by 12; any other frequency leaves the amount untouched.
    """
    if pd.isna(value):
        return np.nan
    if frequency == "monthly":
        return value * 12
    return value


# ----------------------------------------
# Infer ONE pay frequency per row and apply it to both salary bounds
# ----------------------------------------
# Classifying salary_min and salary_max independently can annualise only one of
# them: min=18,000 / max=24,000 would become 216,000 / 24,000 (min > max).
# The larger available bound therefore decides the frequency of the whole row;
# rows with both bounds missing stay missing.
# NOTE: this overwrites the salary columns of `df`. Reload `df` before
# re-running the cell, otherwise small values would be multiplied by 12 again.
salary_columns = ["salary_min", "salary_max"]
row_frequency = df[salary_columns].max(axis=1).apply(infer_frequency)

# ----------------------------------------
# Convert salary_min and salary_max to annual
# ----------------------------------------
for column in salary_columns:
    df[column] = pd.Series(
        [to_annual(value, frequency) for value, frequency in zip(df[column], row_frequency)],
        index=df.index,
        dtype="float64",
    )

# Preview the resulting salary range
df[salary_columns].agg(["min", "max"])

Unnamed: 0,salary_min,salary_max
min,0.0,12.0
max,500000.0,300000.0


In [10]:
# ==============================================
# Cell 5 : Standardized cleaning function for company names
# ==============================================

def clean_text_column(name):
    """Standardise a company name.

    Steps: trim and lowercase, replace punctuation with spaces, collapse
    whitespace, drop common legal-form suffixes, restore an "&" that arrived
    as the entity residue "amp 038", and title-case the result.

    Letters outside a-z (e.g. "é", "ü") are kept; only punctuation, symbols
    and underscores are replaced.

    Args:
        name: Raw company name. Non-string values (None, NaN, numbers) are
            returned unchanged.

    Returns:
        The cleaned, title-cased name, or ``name`` itself if it is not a string.
    """
    if not isinstance(name, str):
        return name

    # Lowercase and strip
    x = name.strip().lower()

    # Replace punctuation with spaces. \w is Unicode-aware for str patterns, so
    # accented letters survive; "_" counts as \w and is removed explicitly.
    x = re.sub(r"[^\w\s]|_", " ", x)

    # Collapse multiple spaces
    x = " ".join(x.split())

    # Legal-form terms to drop; "pte ltd" must be tried before the bare "ltd"
    remove_terms = (
        r"\bpte ltd\b",
        r"\bltd\b",
        r"\bprivate limited\b",
        r"\bplc\b",
        r"\bllc\b",
        r"\binc\b",
    )

    for term in remove_terms:
        x = re.sub(term, "", x)

    # Fix leftover "amp 038" to &
    x = x.replace("amp 038", "&")

    # Trim after removal
    x = " ".join(x.split())

    # Title-case after cleaning
    return x.title()

# Clean the company names on a copy, so `df` keeps the original spellings
clean_df = df.copy()
clean_df["company"] = clean_df["company"].map(clean_text_column)
clean_df.head()

Unnamed: 0,title,company,location,salary_min,salary_max,category,date_posted,url,description
0,UNPAID VOLUNTEER - Deputy Director of Learning...,Blockchain Climate Institute Bci America,Singapore,,,Consultancy Jobs,2025-09-22T20:46:33Z,https://www.adzuna.sg/details/5410578355?utm_m...,Role Title : Deputy Director of Capacity Build...
1,Senior Principal- Telco,Infosys Singapore Australia,"Central, Singapore",,,Consultancy Jobs,2023-09-29T20:08:52Z,https://www.adzuna.sg/details/4339263959?utm_m...,Infosys Consulting is the management consultin...
2,Digital Marketing Specialist,Tembusu Institute,Singapore,33600.0,42000.0,"PR, Advertising & Marketing Jobs",2025-10-28T21:56:19Z,https://www.adzuna.sg/details/5468182210?utm_m...,Tembusu Institute Pte. Ltd. is looking to hire...
3,QAQC Engineer - Trackwork/Railway - URGENT,Talent Trader Group,"Central, Singapore",48000.0,55000.0,Engineering Jobs,2024-03-06T15:19:56Z,https://www.adzuna.sg/details/4595025158?utm_m...,Responsibilities: Plan inspection programs and...
4,Junior Sous Chef,Capella Singapore,Singapore,,,Hospitality & Catering Jobs,2025-11-23T06:00:03Z,https://www.adzuna.sg/details/5509912199?utm_m...,Position Overview The Junior Sous Chef assist ...


In [11]:
# ==============================================
# Cell 6 : Reference country table to match and input country code
# ==============================================

# Standardised country list (name + code) from the online Postgres database
query = "SELECT country_name, country_code FROM dim_countries"
dim_countries = pd.read_sql(query, OL_engine)

# Dataset-specific location clean-up: anything mentioning "singapore" becomes
# "Singapore", every other value becomes "Australia".
# NOTE(review): missing locations and any future third country also end up as
# "Australia" here - revisit before adding more countries to the crawl.
clean_df["location"] = clean_df["location"].map(
    lambda raw: "Singapore" if "singapore" in str(raw).lower() else "Australia"
)

# Attach the reference name/code by matching the location on the country name
clean_df_2 = pd.merge(
    clean_df,
    dim_countries.rename(
        columns={"country_name": "country_name_ref", "country_code": "country_code_ref"}
    ),
    how="left",
    left_on="location",
    right_on="country_name_ref",
)

# -----------------------------
# Rows without a country code are written to a CSV for follow-up and removed
# -----------------------------
failed_rows = clean_df_2[clean_df_2["country_code_ref"].isna()]

if failed_rows.empty:
    print("No rows failed to map a country code.")
else:
    failed_rows.to_csv("country_update_fail.csv", index=False)
    print(f"{len(failed_rows)} rows failed to map a country code and were saved to 'country_update_fail.csv'.")

# Keep the mapped rows only, restricted to the required columns
clean_df_2 = clean_df_2.loc[
    clean_df_2["country_code_ref"].notna(),
    ["title", "company", "salary_min", "salary_max", "description",
     "location", "country_code_ref"],
]

clean_df_2.head()

No rows failed to map a country code.


Unnamed: 0,title,company,salary_min,salary_max,description,location,country_code_ref
0,UNPAID VOLUNTEER - Deputy Director of Learning...,Blockchain Climate Institute Bci America,,,Role Title : Deputy Director of Capacity Build...,Singapore,SG
1,Senior Principal- Telco,Infosys Singapore Australia,,,Infosys Consulting is the management consultin...,Singapore,SG
2,Digital Marketing Specialist,Tembusu Institute,33600.0,42000.0,Tembusu Institute Pte. Ltd. is looking to hire...,Singapore,SG
3,QAQC Engineer - Trackwork/Railway - URGENT,Talent Trader Group,48000.0,55000.0,Responsibilities: Plan inspection programs and...,Singapore,SG
4,Junior Sous Chef,Capella Singapore,,,Position Overview The Junior Sous Chef assist ...,Singapore,SG


In [12]:
# ==============================================
# Cell 7 : Create index column for reverse traceability
# ==============================================
# Each id is "MAR_" followed by the 1-based row position, zero-padded to six digits.
# NOTE(review): the id depends only on the row position within this run, so the
# same id can refer to a different job after another crawl - confirm that this
# is acceptable wherever jobid is used as a key.
clean_df_2["jobid"] = [
    "MAR_" + str(position).zfill(6) for position in range(1, len(clean_df_2) + 1)
]
clean_df_2.head()

Unnamed: 0,title,company,salary_min,salary_max,description,location,country_code_ref,jobid
0,UNPAID VOLUNTEER - Deputy Director of Learning...,Blockchain Climate Institute Bci America,,,Role Title : Deputy Director of Capacity Build...,Singapore,SG,MAR_000001
1,Senior Principal- Telco,Infosys Singapore Australia,,,Infosys Consulting is the management consultin...,Singapore,SG,MAR_000002
2,Digital Marketing Specialist,Tembusu Institute,33600.0,42000.0,Tembusu Institute Pte. Ltd. is looking to hire...,Singapore,SG,MAR_000003
3,QAQC Engineer - Trackwork/Railway - URGENT,Talent Trader Group,48000.0,55000.0,Responsibilities: Plan inspection programs and...,Singapore,SG,MAR_000004
4,Junior Sous Chef,Capella Singapore,,,Position Overview The Junior Sous Chef assist ...,Singapore,SG,MAR_000005


In [13]:
# ==============================================
# Cell 8 : Find distinct company name to prep for final table and update
# Rows with existing company name in dim_companies to be skipped
# ==============================================

# 1. Get existing companies from dim_companies
existing_companies_query = 'SELECT "company_id", "company_name" FROM dim_companies'
dim_companies_df = pd.read_sql(existing_companies_query, L_engine)
dim_companies_df["company_name"] = dim_companies_df["company_name"].str.strip()
existing_set = set(dim_companies_df["company_name"])

# --------------------------------------------------
# 2. Prepare new companies from clean_df_2: one row per distinct name
df_to_insert = (
    clean_df_2[["company", "country_code_ref"]]
    .rename(columns={"company": "company_name", "country_code_ref": "country_code"})
    .drop_duplicates(subset=["company_name"])
)

# Keep only companies not already in dim_companies
df_to_insert = df_to_insert[~df_to_insert["company_name"].str.strip().isin(existing_set)]

# --------------------------------------------------
# 3. Insert new companies into dim_companies
if not df_to_insert.empty:
    df_to_insert.to_sql(
        "dim_companies",
        L_engine,
        if_exists="append",
        index=False
    )
    print(f"{len(df_to_insert)} new companies inserted.")
else:
    print("No new companies to insert.")

# --------------------------------------------------
# 4. Reload dim_companies to get all company_ids (including the new rows)
dim_companies_df = pd.read_sql("SELECT company_id, company_name FROM dim_companies", L_engine)
dim_companies_df["company_name"] = dim_companies_df["company_name"].str.strip()

# --------------------------------------------------
# 5. Map company_id back into clean_df_2
# A left merge repeats a job row once per matching lookup row. Reduce the
# lookup to one company_id per (stripped) name - the lowest id wins - so the
# number of job rows cannot change.
company_lookup = (
    dim_companies_df
    .sort_values("company_id")
    .drop_duplicates(subset=["company_name"], keep="first")
)

clean_df_2 = clean_df_2.merge(
    company_lookup,
    how="left",
    left_on="company",
    right_on="company_name"
)

# Drop the merge helper column and the columns not needed downstream
clean_df_2 = clean_df_2.drop(
    columns=["company_name", "company", "location", "country_code_ref"]
)

print("Company IDs mapped back into clean_df_2")
clean_df_2.head()

143 new companies inserted.
Company IDs mapped back into clean_df_2


Unnamed: 0,title,salary_min,salary_max,description,jobid,company_id
0,UNPAID VOLUNTEER - Deputy Director of Learning...,,,Role Title : Deputy Director of Capacity Build...,MAR_000001,4074
1,Senior Principal- Telco,,,Infosys Consulting is the management consultin...,MAR_000002,4075
2,Digital Marketing Specialist,33600.0,42000.0,Tembusu Institute Pte. Ltd. is looking to hire...,MAR_000003,4076
3,QAQC Engineer - Trackwork/Railway - URGENT,48000.0,55000.0,Responsibilities: Plan inspection programs and...,MAR_000004,4077
4,Junior Sous Chef,,,Position Overview The Junior Sous Chef assist ...,MAR_000005,4078


In [14]:
# ==============================================
# Cell 13 : upsert into fact_jobs keyed on joblistingid -
# rows whose joblistingid already exists are updated, all others are inserted.
# ==============================================

def _to_db_value(value):
    """Convert one cell into a value the database driver can bind.

    Missing values (None, NaN, pd.NA) become None so they are stored as NULL;
    NumPy scalars are unwrapped to plain Python numbers.
    """
    if pd.isna(value):
        return None
    return value.item() if hasattr(value, "item") else value


# Convert the salary columns to nullable integers. Round first: casting a float
# with a fractional part straight to 'Int64' raises instead of truncating.
clean_df_2['salary_min'] = pd.to_numeric(clean_df_2['salary_min']).round().astype('Int64')
clean_df_2['salary_max'] = pd.to_numeric(clean_df_2['salary_max']).round().astype('Int64')

# Map clean_df_2 columns to fact_jobs columns (the other names already match)
df_upsert = clean_df_2.rename(columns={
    "jobid": "joblistingid",
    "title": "job_title"
})

# Define SQL template
sql_template = """
INSERT INTO fact_jobs (joblistingid, company_id, job_title, salary_min, salary_max, description)
VALUES (:joblistingid, :company_id, :job_title, :salary_min, :salary_max, :description)
ON CONFLICT (joblistingid) 
DO UPDATE SET
    company_id = EXCLUDED.company_id,
    job_title = EXCLUDED.job_title,
    salary_min = EXCLUDED.salary_min,
    salary_max = EXCLUDED.salary_max,
    description = EXCLUDED.description;
"""

# One parameter dictionary per row, restricted to the columns the statement binds
upsert_columns = [
    "joblistingid", "company_id", "job_title",
    "salary_min", "salary_max", "description",
]
records = [
    {column: _to_db_value(value) for column, value in record.items()}
    for record in df_upsert[upsert_columns].to_dict("records")
]

# Run all upserts in one transaction; begin() commits on success and rolls back
# on error. Passing the list of dictionaries executes the statement once per row.
with L_engine.begin() as conn:
    if records:  # an empty parameter list must not be sent to execute()
        conn.execute(text(sql_template), records)

print(f"{len(df_upsert)} rows processed with upsert into fact_jobs.")


200 rows processed with upsert into fact_jobs.
