In [54]:
# Imports: regex utilities, pandas, and pathlib for file paths
import re
import pandas as pd
from pathlib import Path

In [55]:
# Source Excel file path for public sector vacancies
file_path = Path('../data/Actual/raw/public_sector.xlsx')

In [56]:
# Load the raw worksheet into a DataFrame
#   - sheet_name: source tab
#   - header: first row contains column names
df = pd.read_excel(file_path, sheet_name="Data1", header=0)

In [57]:
# Drop the first 9 rows (metadata in the sheet) and reset the index
df = df.iloc[9:].reset_index(drop=True)

In [58]:
# Quick preview to confirm header and data alignment
df.head()

Unnamed: 0.1,Unnamed: 0,Job Vacancies ; Public ; New South Wales ;,Job Vacancies ; Public ; Victoria ;,Job Vacancies ; Public ; Queensland ;,Job Vacancies ; Public ; South Australia ;,Job Vacancies ; Public ; Western Australia ;,Job Vacancies ; Public ; Tasmania ;,Job Vacancies ; Public ; Northern Territory ;,Job Vacancies ; Public ; Australian Capital Territory ;,Job Vacancies ; Public ; Australia ;,...,Job Vacancies ; Public ; Australia ;.2,Standard Error of Job Vacancies ; Public ; New South Wales ;,Standard Error of Job Vacancies ; Public ; Victoria ;,Standard Error of Job Vacancies ; Public ; Queensland ;,Standard Error of Job Vacancies ; Public ; South Australia ;,Standard Error of Job Vacancies ; Public ; Western Australia ;,Standard Error of Job Vacancies ; Public ; Tasmania ;,Standard Error of Job Vacancies ; Public ; Northern Territory ;,Standard Error of Job Vacancies ; Public ; Australian Capital Territory ;,Standard Error of Job Vacancies ; Public ; Australia ;
0,1979-05-15 00:00:00,,,,,,,,,12.3,...,11.7,,,,,,,,,
1,1979-08-15 00:00:00,,,,,,,,,11.0,...,12.9,,,,,,,,,
2,1979-11-15 00:00:00,,,,,,,,,15.4,...,14.1,,,,,,,,,
3,1980-02-15 00:00:00,,,,,,,,,15.9,...,14.7,,,,,,,,,
4,1980-05-15 00:00:00,,,,,,,,,13.1,...,14.2,,,,,,,,,


In [59]:
# Inspect column names, dtypes, and non-null counts after trimming header rows
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185 entries, 0 to 184
Data columns (total 21 columns):
 #   Column                                                                       Non-Null Count  Dtype 
---  ------                                                                       --------------  ----- 
 0   Unnamed: 0                                                                   185 non-null    object
 1   Job Vacancies ;  Public ;  New South Wales ;                                 161 non-null    object
 2   Job Vacancies ;  Public ;  Victoria ;                                        161 non-null    object
 3   Job Vacancies ;  Public ;  Queensland ;                                      162 non-null    object
 4   Job Vacancies ;  Public ;  South Australia ;                                 162 non-null    object
 5   Job Vacancies ;  Public ;  Western Australia ;                               158 non-null    object
 6   Job Vacancies ;  Public ;  Tasmania ;             

In [60]:
# Rename the first column to a canonical "Date" and parse as datetime
time_col = df.columns[0]
df = df.rename(columns={time_col: "Date"})
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
# Validate that "Date" is datetime and review missingness after parsing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185 entries, 0 to 184
Data columns (total 21 columns):
 #   Column                                                                       Non-Null Count  Dtype         
---  ------                                                                       --------------  -----         
 0   Date                                                                         185 non-null    datetime64[ns]
 1   Job Vacancies ;  Public ;  New South Wales ;                                 161 non-null    object        
 2   Job Vacancies ;  Public ;  Victoria ;                                        161 non-null    object        
 3   Job Vacancies ;  Public ;  Queensland ;                                      162 non-null    object        
 4   Job Vacancies ;  Public ;  South Australia ;                                 162 non-null    object        
 5   Job Vacancies ;  Public ;  Western Australia ;                               158 non-null    object

In [61]:
# Clean column names into consistent, concise identifiers
#  - Standard errors -> prefix with "SE_" and suffix "_Public"
#  - Job Vacancies -> "<State>_Public"; handle Trend/Seasonal special cases
#  - Otherwise keep name unchanged
# This simplifies downstream selection and plotting.
def clean_col(name: str):
    name = str(name).strip()
    if name.startswith("Standard Error of Job Vacancies"):
        m = re.search(r"Standard Error of Job Vacancies\s*;\s*Public\s*;\s*(.*?)\s*;", name)
        state = m.group(1) if m else name
        return f"SE_{state}_Public"
    if name.startswith("Job Vacancies"):
        if "Seasonally Adjusted" in name:
            return "Australia_Public_Seasonal"
        if "Trend" in name:
            return "Australia_Public_Trend"
        m = re.search(r"Job Vacancies\s*;\s*Public\s*;\s*(.*?)\s*;", name)
        state = m.group(1) if m else name
        return f"{state}_Public"
    return name

# Apply the cleaner and inspect the new schema
df = df.rename(columns={c: clean_col(c) for c in df.columns})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185 entries, 0 to 184
Data columns (total 21 columns):
 #   Column                                  Non-Null Count  Dtype         
---  ------                                  --------------  -----         
 0   Date                                    185 non-null    datetime64[ns]
 1   New South Wales_Public                  161 non-null    object        
 2   Victoria_Public                         161 non-null    object        
 3   Queensland_Public                       162 non-null    object        
 4   South Australia_Public                  162 non-null    object        
 5   Western Australia_Public                158 non-null    object        
 6   Tasmania_Public                         162 non-null    object        
 7   Northern Territory_Public               162 non-null    object        
 8   Australian Capital Territory_Public     162 non-null    object        
 9   Australia_Public                        180 non-null  

In [62]:
# Remove any duplicated columns that resulted from the raw sheet structure
# Keep the first occurrence and copy to avoid chained-assignment warnings
df = df.loc[:, ~df.columns.duplicated()].copy()
# Verify final columns and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185 entries, 0 to 184
Data columns (total 19 columns):
 #   Column                                  Non-Null Count  Dtype         
---  ------                                  --------------  -----         
 0   Date                                    185 non-null    datetime64[ns]
 1   New South Wales_Public                  161 non-null    object        
 2   Victoria_Public                         161 non-null    object        
 3   Queensland_Public                       162 non-null    object        
 4   South Australia_Public                  162 non-null    object        
 5   Western Australia_Public                158 non-null    object        
 6   Tasmania_Public                         162 non-null    object        
 7   Northern Territory_Public               162 non-null    object        
 8   Australian Capital Territory_Public     162 non-null    object        
 9   Australia_Public                        180 non-null  

In [63]:
# Filter to the analysis period starting Nov 1983 and reset the index
df = df[df["Date"] >= "1983-11-01"].reset_index(drop=True)
# Sanity-check after filtering
df.head()

Unnamed: 0,Date,New South Wales_Public,Victoria_Public,Queensland_Public,South Australia_Public,Western Australia_Public,Tasmania_Public,Northern Territory_Public,Australian Capital Territory_Public,Australia_Public,SE_New South Wales_Public,SE_Victoria_Public,SE_Queensland_Public,SE_South Australia_Public,SE_Western Australia_Public,SE_Tasmania_Public,SE_Northern Territory_Public,SE_Australian Capital Territory_Public,SE_Australia_Public
0,1983-11-15,3.6,2.7,1.0,0.6,0.8,0.4,0.2,0.9,10.2,,,,,,,,,
1,1984-02-15,3.7,2.9,1.4,0.6,0.0,0.6,0.2,1.0,10.4,,,,,,,,,
2,1984-05-15,3.5,2.5,1.3,0.6,0.4,0.4,0.2,1.0,9.9,,,,,,,,,
3,1984-08-15,5.1,2.2,1.3,0.8,1.1,0.5,0.3,1.0,12.3,,,,,,,,,
4,1984-11-15,,2.6,1.3,0.5,,0.5,0.3,1.3,14.1,,0.2,0.1,0.1,,0.1,0.1,0.1,2.1


In [64]:
# Split into two DataFrames:
#  - vac_df: vacancy values (non-SE columns)
#  - se_df: standard error columns with Date
se_cols = [c for c in df.columns if c.startswith("SE_")]
vac_cols =  [c for c in df.columns if c not in se_cols and c != "Series ID" and c != "Unit"]
vac_df = df[vac_cols].copy()
se_df = pd.concat([df[["Date"]], df[se_cols]], axis=1)

In [65]:
# Confirm schema for vacancy values (pre-conversion)
vac_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167 entries, 0 to 166
Data columns (total 10 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   Date                                 167 non-null    datetime64[ns]
 1   New South Wales_Public               161 non-null    object        
 2   Victoria_Public                      161 non-null    object        
 3   Queensland_Public                    162 non-null    object        
 4   South Australia_Public               162 non-null    object        
 5   Western Australia_Public             158 non-null    object        
 6   Tasmania_Public                      162 non-null    object        
 7   Northern Territory_Public            162 non-null    object        
 8   Australian Capital Territory_Public  162 non-null    object        
 9   Australia_Public                     162 non-null    object        
dtypes: datetime64[

In [66]:
# Convert numeric-like columns from object to float for both frames
for frame in (vac_df, se_df):
    num_cols = frame.columns.drop("Date")
    for col in num_cols:
        frame[col] = pd.to_numeric(frame[col], errors="coerce")

In [67]:
# Save cleaned wide-form datasets to disk
vac_path = "../data/actual/preprocessed/public_vacancies_clean.csv"
se_path = "../data/actual/preprocessed/public_vacancies_standard_error.csv"
vac_df.to_csv(vac_path, index=False)
se_df.to_csv(se_path, index=False)

In [68]:
# Convert wide-form vacancies into a tidy long-form table
vac_long = vac_df.melt(id_vars="Date", var_name="Region", value_name="Vacancies_thousands")
# Persist long-form to CSV for downstream viz/analysis
vac_long_path = "../data/actual/preprocessed/public_vacancies_clean_long.csv"
vac_long.to_csv(vac_long_path, index=False)

In [69]:
# Reload the saved CSVs to verify outputs on disk
#  - data: wide-form vacancies
#  - se_data: standard errors (wide-form)
#  - data3: long-form vacancies
data = pd.read_csv(vac_path)
se_data = pd.read_csv(se_path)
data3 = pd.read_csv(vac_long_path)

In [70]:
# Baseline missingness summary (counts per column) before interpolation
data.isna().sum()

Date                                   0
New South Wales_Public                 6
Victoria_Public                        6
Queensland_Public                      5
South Australia_Public                 5
Western Australia_Public               9
Tasmania_Public                        5
Northern Territory_Public              5
Australian Capital Territory_Public    5
Australia_Public                       5
dtype: int64

In [71]:
# Column we require to be non-null while others are all null
keep_col = "Date"

# Build a boolean mask for rows where:
#  - every column EXCEPT `keep_col` is null
#  - and `keep_col` itself is non-null
mask = data.loc[:, data.columns != keep_col].isna().all(axis=1) & data[keep_col].notna()

# Preview the rows that match this condition (view only, no mutation)
rows = data.loc[mask]
rows

Unnamed: 0,Date,New South Wales_Public,Victoria_Public,Queensland_Public,South Australia_Public,Western Australia_Public,Tasmania_Public,Northern Territory_Public,Australian Capital Territory_Public,Australia_Public
99,2008-08-15,,,,,,,,,
100,2008-11-15,,,,,,,,,
101,2009-02-15,,,,,,,,,
102,2009-05-15,,,,,,,,,
103,2009-08-15,,,,,,,,,


In [72]:
# Reuse the mask to identify rows to drop
mask = data.loc[:, data.columns != keep_col].isna().all(axis=1) & df[keep_col].notna()

# Count rows that satisfy the condition (for logging)
num_to_drop = int(mask.sum())

# Keep only rows that DO NOT match the mask (this drops the flagged rows)
data = data.loc[~mask].copy()

print(f"Dropped {num_to_drop} rows; new shape: {df.shape}")

Dropped 5 rows; new shape: (167, 19)


In [73]:
# Column we require to be non-null while others are all null
keep_col = "Date"

# Build a boolean mask for rows where:
#  - every column EXCEPT `keep_col` is null
#  - and `keep_col` itself is non-null
mask = data.loc[:, data.columns != keep_col].isna().all(axis=1) & data[keep_col].notna()

# Preview the rows that match this condition (view only, no mutation)
rows = data.loc[mask]
rows

Unnamed: 0,Date,New South Wales_Public,Victoria_Public,Queensland_Public,South Australia_Public,Western Australia_Public,Tasmania_Public,Northern Territory_Public,Australian Capital Territory_Public,Australia_Public


In [74]:
data.isnull().sum()

Date                                   0
New South Wales_Public                 1
Victoria_Public                        1
Queensland_Public                      0
South Australia_Public                 0
Western Australia_Public               4
Tasmania_Public                        0
Northern Territory_Public              0
Australian Capital Territory_Public    0
Australia_Public                       0
dtype: int64

In [75]:
# Inspect any rows that still contain at least one null value
rows_with_any_nulls = data[data.isna().any(axis=1)]
rows_with_any_nulls

Unnamed: 0,Date,New South Wales_Public,Victoria_Public,Queensland_Public,South Australia_Public,Western Australia_Public,Tasmania_Public,Northern Territory_Public,Australian Capital Territory_Public,Australia_Public
4,1984-11-15,,2.6,1.3,0.5,,0.5,0.3,1.3,14.1
6,1985-05-15,5.5,3.3,1.4,0.5,,0.7,0.3,1.5,14.9
7,1985-08-15,6.7,3.9,1.1,0.8,,0.4,0.2,1.5,16.6
17,1988-02-15,7.8,,2.0,0.6,,0.5,0.3,1.6,17.6


In [76]:
# Attempt a simple linear interpolation over time for numeric gaps.
# Note: For object dtypes, call data.infer_objects(copy=False) before interpolating.
data = data.interpolate(method="linear", inplace=False)


  data = data.interpolate(method="linear", inplace=False)


In [77]:
data.isna().sum()

Date                                   0
New South Wales_Public                 0
Victoria_Public                        0
Queensland_Public                      0
South Australia_Public                 0
Western Australia_Public               0
Tasmania_Public                        0
Northern Territory_Public              0
Australian Capital Territory_Public    0
Australia_Public                       0
dtype: int64

In [78]:
data.to_csv(vac_path, index=False)