In [1]:
import pandas as pd
import regex as re

In [2]:
# Load the scraped job listings from the Excel workbook next to the notebook
source_workbook = "NZ_Admin_JOBS.xlsx"
df = pd.read_excel(source_workbook)

In [3]:
# Replace the imported headers positionally with snake_case names
new_column_names = [
    "position",
    "url",
    "company",
    "location",
    "elapsed_time_since_posted",
    "classification",
]
df.columns = new_column_names

In [4]:
# Normalise job titles to Title Case
titled_positions = df["position"].str.title()
df["position"] = titled_positions

In [5]:
# (row count, column count) of the frame
dim = (len(df.index), len(df.columns))
print(dim)

(2708, 6)


In [6]:
# Number of distinct URLs; compare with the row count to spot duplicate listings
unique_url_count = df["url"].nunique()
print(unique_url_count)

2708


In [7]:
def extract_and_clean(row):
    """Split one raw location cell into location, area and benefit.

    The cell is split at the first comma: the text before it holds the
    ``location:`` / ``area:`` labelled values, the text after it is kept as
    the benefit. Labelled values that are repeated twice back-to-back
    (e.g. ``"AucklandAuckland"``) are collapsed to a single copy.

    Parameters
    ----------
    row : str
        Raw cell text. Non-string values (e.g. NaN for an empty cell) are
        treated as missing.

    Returns
    -------
    pd.Series
        Three values ``[location, area, benefit]``. ``area`` is ``pd.NA``
        when no ``area:`` label is present, ``benefit`` is ``pd.NA`` when the
        cell has no comma, and all three are ``pd.NA`` for non-string input.
    """
    def deduplicate(s):
        # Collapse "XX" to "X", but only when the two halves really match.
        # Blindly keeping the first half would truncate any value that is
        # not repeated (and would corrupt data if this is applied twice).
        middle = len(s) // 2
        first_half = s[:middle].strip()
        second_half = s[middle:].strip()
        return first_half if first_half == second_half else s

    # Empty cells come through as NaN/None and cannot be split
    if not isinstance(row, str):
        return pd.Series([pd.NA, pd.NA, pd.NA])

    # Everything after the first comma is the benefit text, if any
    location_area_part, comma, benefit_text = row.partition(',')
    benefit_part = benefit_text.strip() if comma else pd.NA

    # Everything after the first "area:" label belongs to the area
    location_part, area_label, area_text = location_area_part.partition("area:")

    location = deduplicate(location_part.replace("location:", "").strip())
    if area_label:
        area = deduplicate(area_text.replace("area:", "").strip())
    else:
        area = pd.NA

    return pd.Series([location, area, benefit_part])

# Parse every raw 'location' cell, then store the three parsed pieces as the
# 'location', 'area' and 'benefit' columns (overwriting the raw 'location')
location_fields = df['location'].apply(extract_and_clean)
df[['location', 'area', 'benefit']] = location_fields


In [8]:
def extract_classification(row):
    """Split one raw classification cell into classification and sub-classification.

    The cell is expected to contain a ``classification:`` label, optionally
    followed by a ``subClassification:`` label. Values that are repeated
    twice back-to-back (e.g. ``"SalesSales"``) are collapsed to a single copy.

    Parameters
    ----------
    row : str
        Raw cell text. Non-string values (e.g. NaN for an empty cell) are
        treated as missing.

    Returns
    -------
    pd.Series
        Two values ``[classification, sub_classification]``. Both are
        ``pd.NA`` when the input is not a string or lacks the
        ``classification:`` label; the second is ``pd.NA`` when there is no
        (or an empty) ``subClassification:`` section.
    """
    def deduplicate(s):
        # Collapse "XX" to "X", but only when the two halves really match.
        # Blindly keeping the first half would truncate any value that is
        # not repeated.
        middle = len(s) // 2
        first_half = s[:middle].strip()
        second_half = s[middle:].strip()
        return first_half if first_half == second_half else s

    # Empty cells (NaN/None) and unlabelled text carry no classification
    if not isinstance(row, str) or "classification:" not in row:
        return pd.Series([pd.NA, pd.NA])

    parts = row.split("subClassification:")
    classification_part = parts[0]
    # None (not pd.NA) marks a missing section: pd.NA raises TypeError when
    # used in a boolean test.
    subClassification_part = parts[1] if len(parts) > 1 else None

    classification = deduplicate(classification_part.replace("classification:", "").strip())
    if subClassification_part:
        subClassification = deduplicate(subClassification_part.strip())
    else:
        subClassification = pd.NA

    return pd.Series([classification, subClassification])

# Parse every raw 'classification' cell, then store the two parsed pieces as
# 'classification' (overwriting the raw text) and 'sub_classification'
classification_fields = df['classification'].apply(extract_classification)
df[['classification', 'sub_classification']] = classification_fields

In [9]:
# Plain-text preview of the first five rows after cleaning
preview_rows = df.head(5)
print(preview_rows)

                               position  \
0                         Administrator   
1                          Receptionist   
2          Prosecutions Support Officer   
3  Early Childhood Centre Administrator   
4        Business Support Administrator   

                                                 url  \
0  https://www.seek.co.nz/job/50582301?type=promo...   
1  https://www.seek.co.nz/job/50620889?type=promo...   
2  https://www.seek.co.nz/job/50622169?type=stand...   
3  https://www.seek.co.nz/job/50639620?type=stand...   
4  https://www.seek.co.nz/job/50622432?type=stand...   

                                    company       location  \
0                                       NaN  Bay of Plenty   
1                      Avenues Orthodontics  Bay of Plenty   
2                        New Zealand Police       Auckland   
3  Kew Pacific Island Early Learning Centre      Southland   
4                                       NaN     Canterbury   

        elapsed_time_since_post