# Data
Combine the per-year CSV files into a single dataset.

In [2]:
# python -m venv Unemployment (Python Virtual Environment)
import os
import pandas as pd


In [24]:
# ================================
# Constants
# ================================
# Province names (upper case) used both to locate the first data line of a
# yearly CSV file and to keep only province rows after parsing.
LIST_OF_PROVINCES = [
    "ACEH",
    "SUMATERA UTARA",
    "SUMATERA BARAT",
    "RIAU",
    "JAMBI",
    "SUMATERA SELATAN",
    "BENGKULU",
    "LAMPUNG",
    "KEP. BANGKA BELITUNG",
    "KEP. RIAU",
    "DKI JAKARTA",
    "JAWA BARAT",
    "JAWA TENGAH",
    "DI YOGYAKARTA",
    "JAWA TIMUR",
    "BANTEN",
    "BALI",
    "NUSA TENGGARA BARAT",
    "NUSA TENGGARA TIMUR",
    "KALIMANTAN BARAT",
    "KALIMANTAN TENGAH",
    "KALIMANTAN SELATAN",
    "KALIMANTAN TIMUR",
    "KALIMANTAN UTARA",
    "SULAWESI UTARA",
    "SULAWESI TENGAH",
    "SULAWESI SELATAN",
    "SULAWESI TENGGARA",
    "GORONTALO",
    "SULAWESI BARAT",
    "MALUKU",
    "MALUKU UTARA",
    "PAPUA BARAT",
    "PAPUA BARAT DAYA",
    "PAPUA",
    "PAPUA SELATAN",
    "PAPUA TENGAH",
    "PAPUA PEGUNUNGAN",
]

In [25]:
# Folder holding the yearly input files; each one is looked up as "<year>.csv".
FOLDER_DATA = "./Data"
# Path of the combined CSV file written by main().
OUTPUT_FILE = "./Results/unemployment.csv"

In [27]:
def _parse_month_value(row, column):
    """Return ``row[column]`` as a float, or ``None`` when it holds no usable number."""
    try:
        raw = row[column]
        if raw == "-":  # "-" is the placeholder the source files use for "no figure"
            return None
        number = float(raw)
    except (KeyError, TypeError, ValueError):
        # Missing column, None, or text that is not a number: treat as "no value".
        return None
    # Empty CSV cells arrive as NaN; they must not poison the sum below.
    return None if pd.isna(number) else number


def month_process(row):
    """Combine a row's "Februari" and "Agustus" figures into one value.

    Parameters
    ----------
    row : mapping or pandas.Series
        Row exposing the keys ``"Februari"`` and ``"Agustus"``. Values may be
        numbers, numeric strings, the placeholder ``"-"``, NaN, or anything
        else that cannot be parsed (all of which count as missing).

    Returns
    -------
    float or str
        * both months usable: their sum, rounded to 2 decimal places;
        * exactly one month usable: that month's value, unrounded;
        * neither usable: the empty string ``""``.

    Notes
    -----
    NOTE(review): the two monthly figures are *added*, not averaged. Confirm
    that a sum is what downstream consumers of the "Month" column expect.
    """
    feb = _parse_month_value(row, "Februari")
    agu = _parse_month_value(row, "Agustus")

    if feb is None and agu is None:
        return ""
    if agu is None:
        return feb
    if feb is None:
        return agu
    return round(feb + agu, 2)

In [35]:
# ================================
# Function: Process a CSV file
# ================================
def proses_file_csv(year):
    """Load ``<FOLDER_DATA>/<year>.csv`` and return its province rows.

    The file is scanned for the first line containing any name from
    ``LIST_OF_PROVINCES``; everything before that line is skipped when the
    file is parsed. Only rows whose first column exactly matches a known
    province are kept.

    Parameters
    ----------
    year : int
        Year used to build the file name and stored in the "Year" column.

    Returns
    -------
    pandas.DataFrame or None
        Columns "Year", "Province", "Month" (from ``month_process``) and
        "Annual" (with "-" replaced by an empty string). ``None`` when the
        file does not exist or no line mentions a province; a message is
        printed in both cases.
    """
    csv_path = os.path.join(FOLDER_DATA, f"{year}.csv")

    if not os.path.exists(csv_path):
        print(f"[!] File {year}.csv not found. Skipping.")
        return None

    with open(csv_path, encoding="utf-8") as handle:
        raw_lines = handle.readlines()

    # Index of the first line that mentions any known province name.
    first_data_row = None
    for position, text in enumerate(raw_lines):
        if any(name in text for name in LIST_OF_PROVINCES):
            first_data_row = position
            break

    if first_data_row is None:
        print(f"[!] The provincial data was not found in {year}.csv. Skipped.")
        return None

    # Parse the file again, this time skipping everything above the data.
    table = pd.read_csv(
        csv_path,
        skiprows=first_data_row,
        names=["Province", "Februari", "Agustus", "Annual"],
        encoding="utf-8",
    )

    def _blank_if_dash(value):
        # A lone "-" (ignoring surrounding whitespace) marks a missing figure.
        return "" if str(value).strip() == "-" else value

    # Keep only rows whose first column is exactly a known province.
    is_province = table["Province"].isin(LIST_OF_PROVINCES)
    province_rows = table.loc[is_province].copy()

    province_rows["Year"] = year
    province_rows["Month"] = province_rows.apply(month_process, axis=1)
    province_rows["Annual"] = province_rows["Annual"].apply(_blank_if_dash)

    output_columns = ["Year", "Province", "Month", "Annual"]
    return province_rows[output_columns]

In [36]:
# ================================
# Main Program
# ================================
def main(first_year=2001, last_year=2025):
    """Merge the yearly province tables into one CSV written to ``OUTPUT_FILE``.

    Each year in the inclusive range is passed to ``proses_file_csv``; years
    for which it returns ``None`` are skipped. The remaining frames are
    concatenated and saved without the index, encoded as UTF-8 with BOM.

    Parameters
    ----------
    first_year : int, optional
        First year to process (inclusive). Defaults to 2001.
    last_year : int, optional
        Last year to process (inclusive). Defaults to 2025.

    Returns
    -------
    None
        Progress and failure messages are printed; nothing is written when no
        year yields data.
    """
    all_data = []

    for year in range(first_year, last_year + 1):
        df = proses_file_csv(year)
        if df is not None:
            all_data.append(df)

    if not all_data:
        print("[!] No data has been successfully processed.")
        return

    # Combine all dataframes.
    final_results = pd.concat(all_data, ignore_index=True)

    # Make sure the output folder exists. A bare file name has an empty
    # dirname, and os.makedirs("") would raise FileNotFoundError.
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save to file
    final_results.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
    print(f"[✓] The combined file has been successfully saved at: {OUTPUT_FILE}")

In [37]:
# ================================
# Execution
# ================================
# Run the whole pipeline when this code is executed as the main module.
if __name__ == "__main__":
    main()

[✓] The combined file has been successfully saved at: ./Results/unemployment.csv


In [39]:
# ================================
# Visualization of the result
# ================================
# Read the combined CSV back from disk and show it as the cell output.
new = pd.read_csv(OUTPUT_FILE)
new

Unnamed: 0,Year,Province,Month,Annual
0,2001,ACEH,,7.71
1,2001,SUMATERA UTARA,,9.09
2,2001,SUMATERA BARAT,,8.74
3,2001,RIAU,,6.43
4,2001,JAMBI,,5.61
...,...,...,...,...
945,2025,PAPUA BARAT DAYA,6.61,
946,2025,PAPUA,6.92,
947,2025,PAPUA SELATAN,4.90,
948,2025,PAPUA TENGAH,3.55,
