In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pyfixest.estimation import fepois
from tqdm import tqdm
import re

import os

# Study window (month-start dates, both ends included when the monthly grid is
# built) and the reference month used as the omitted category / zero point.
PRE_START = '2021-01-01'
EXP_END = '2025-09-01'
REF_PERIOD = '2022-10-01'

# Directory holding the cleaned seniority x DWA parquet files.
FILE_PATH = "./data/seniority_DWA_data_CLEAN/"

# Keep only parquet files, in a stable (sorted) order. os.listdir returns every
# directory entry, including hidden/system ones such as .DS_Store or
# .ipynb_checkpoints, which would make pd.read_parquet fail later on.
files_seniority_DWA = sorted(
    entry for entry in os.listdir(FILE_PATH) if entry.endswith(".parquet")
)

In [2]:
def balance_panel(df, start=None, end=None, ref_period=None):
    """Expand a firm x task x seniority panel to a balanced monthly grid.

    Every distinct (firm_id, dwa_id, seniority) combination found in ``df`` is
    repeated for every month-start date between ``start`` and ``end`` (both
    included). Observed rows are left-joined onto that grid; combinations with
    no observation in a given month get ``FTE = 0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'firm_id', 'dwa_id', 'seniority', 'month'
        (anything ``pd.to_datetime`` accepts) and 'FTE'. The input frame is not
        modified.
    start, end : str or datetime-like, optional
        First and last month of the grid. Default to the notebook-level
        ``PRE_START`` and ``EXP_END`` constants.
    ref_period : str or datetime-like, optional
        Month that receives ``time_j == 0``. Defaults to the notebook-level
        ``REF_PERIOD`` constant so the relative-time index always agrees with
        the reference month used in the regressions.

    Returns
    -------
    pandas.DataFrame
        One row per (firm_id, dwa_id, seniority, month) of the grid, with 'FTE'
        zero-filled and an integer 'time_j' column holding the number of months
        relative to ``ref_period``.

    Notes
    -----
    * Rows of ``df`` whose month falls outside [start, end] do not appear in
      the result (left join on the grid).
    * Only 'FTE' is zero-filled; any other column carried over from ``df`` is
      NaN on the rows that were added.
    * If ``df`` holds several rows for the same key and month, each of them is
      kept, so the result is then larger than the printed target size.
    """
    # Resolve defaults at call time so later edits to the constants are honoured.
    start = PRE_START if start is None else start
    end = EXP_END if end is None else end
    ref_period = REF_PERIOD if ref_period is None else ref_period

    print("--- Starting Panel Balancing ---")

    # Work on a copy so the caller's frame keeps its original 'month' dtype.
    df = df.copy()
    df['month'] = pd.to_datetime(df['month'])

    # Full list of month-start dates in the study window.
    months_df = pd.DataFrame(
        {'month': pd.date_range(start=start, end=end, freq='MS')}
    )

    # Seniority is part of the key so the added zero rows carry the right label.
    unique_keys = df[['firm_id', 'dwa_id', 'seniority']].drop_duplicates()
    print(f"Unique Firm-Task pairs to balance: {len(unique_keys):,}")

    # Target grid: every key combination repeated for every month.
    skeleton = unique_keys.merge(months_df, how='cross')
    print(f"Target balanced size: {len(skeleton):,} rows")

    # Keep the whole grid and attach observed data where it exists.
    balanced_df = skeleton.merge(
        df,
        on=['firm_id', 'dwa_id', 'seniority', 'month'],
        how='left',
    )

    # Grid rows without a matching observation come back as NaN -> 0 FTE.
    balanced_df['FTE'] = balanced_df['FTE'].fillna(0)

    # Months elapsed since the reference period (negative before it).
    ref_dt = pd.to_datetime(ref_period)
    balanced_df['time_j'] = (
        (balanced_df['month'].dt.year - ref_dt.year) * 12
        + (balanced_df['month'].dt.month - ref_dt.month)
    )

    print("--- Balancing Complete ---")
    return balanced_df


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pyfixest.estimation import fepois
from tqdm import tqdm
import re

import os

# Directory that receives one regression-results parquet file per input file.
RESULTS_DIR = "./results"

# Create the results folder on first use. exist_ok=True keeps this safe even if
# the directory appears between the check and the creation.
if not os.path.isdir(RESULTS_DIR):
    os.makedirs(RESULTS_DIR, exist_ok=True)
    print(f"Created results directory: {RESULTS_DIR}")

# Study window (month-start dates, both ends included when the monthly grid is
# built) and the reference month used as the omitted category / zero point.
PRE_START = '2021-01-01'
EXP_END = '2025-09-01'
REF_PERIOD = '2022-10-01'

# Directory holding the cleaned seniority x DWA parquet files.
FILE_PATH = "./data/seniority_DWA_data_CLEAN/"

# Keep only parquet files, in a stable (sorted) order. os.listdir returns every
# directory entry, including hidden/system ones such as .DS_Store or
# .ipynb_checkpoints, which would make pd.read_parquet fail in the loop below.
files_seniority_DWA = sorted(
    entry for entry in os.listdir(FILE_PATH) if entry.endswith(".parquet")
)

# NOTE: balance_panel() is defined in an earlier cell (In [2]); run that cell before this one.

# %%
# Matches the "[T.<YYYY-MM-DD>]" suffix of a month-dummy coefficient name and
# captures the date; the "base='...'" part of the name is not matched.
# Compiled once here because it does not depend on the file being processed.
date_pattern = re.compile(r"\[T\.(\d{4}-\d{2}-\d{2})\]")

for file_path in files_seniority_DWA:

    company_DWA_df = pd.read_parquet(os.path.join(FILE_PATH, file_path))
    print(f"Processing file: {file_path}")

    # ==========================================
    # APPLY THE BALANCING
    # ==========================================

    company_DWA_df = balance_panel(company_DWA_df)
    print("-" * 40)
    print(f"Final Dataset Size: {len(company_DWA_df):,} rows")
    print(f"Final Unique Companies: {company_DWA_df['firm_id'].nunique():,}")
    print(f"Final Unique Tasks: {company_DWA_df['dwa_id'].nunique():,}")

    # ==========================================
    # REGRESSION PER TASK
    # ==========================================

    # ---------------------------------------------------------
    # 1. SETUP
    # ---------------------------------------------------------

    # All tasks are used. The balanced frame is a fresh object returned by
    # balance_panel, so the feature columns are added to it directly instead of
    # filtering on "every dwa_id" and copying the whole frame a second time.
    df_subset = company_DWA_df

    # Feature Engineering
    df_subset['month'] = pd.to_datetime(df_subset['month'])
    # String version of the month, used as the categorical time variable.
    df_subset['month_str'] = df_subset['month'].dt.strftime('%Y-%m-%d')
    # Total FTE of the firm (within seniority) in that month, across all tasks.
    df_subset['H_total'] = df_subset.groupby(['firm_id', 'seniority', 'month'])['FTE'].transform('sum')
    # Control: log(1 + FTE the firm has in all *other* tasks that month).
    df_subset['Z_control'] = np.log(df_subset['H_total'] - df_subset['FTE'] + 1)

    # ---------------------------------------------------------
    # 2. ROBUST EXTRACTION LOOP
    # ---------------------------------------------------------

    results_list = []
    errors = []

    print("Starting regression loop...")
    grouped_iterator = df_subset.groupby(['dwa_id', 'seniority'])

    for (dwa_id, seniority_level), subset in tqdm(grouped_iterator):
        try:
            # Skip groups that cannot be estimated: no FTE at all, or fewer
            # than two firms.
            if subset['FTE'].sum() == 0 or subset['firm_id'].nunique() < 2:
                continue

            # Poisson (PPML) event-study with firm fixed effects and
            # heteroskedasticity-robust standard errors.
            model = fepois(
                fml = f"FTE ~ Z_control + i(month_str, ref='{REF_PERIOD}') | firm_id",
                data = subset,
                vcov = "hetero"
            )

            coefs = model.coef()
            se = model.se()

            # Rows are collected per group and only committed once the whole
            # group has been extracted, so a failure never leaves partial
            # results behind for that group.
            group_rows = []
            for name, beta_value in coefs.items():
                match = date_pattern.search(name)
                if not match:
                    # Not a month dummy (e.g. Z_control).
                    continue
                group_rows.append({
                    'dwa_id': dwa_id,
                    'seniority': seniority_level,
                    'month': match.group(1),
                    'beta': beta_value,
                    'se': se.get(name, np.nan)
                })

            # The reference month is the omitted category; add it explicitly
            # with beta = 0 so the series has no gap at that date.
            group_rows.append({
                'dwa_id': dwa_id,
                'seniority': seniority_level,
                'month': REF_PERIOD,
                'beta': 0.0,
                'se': 0.0
            })

            results_list.extend(group_rows)

        except Exception as e:
            # Best effort: one failing group must not abort the whole file,
            # but the failure is recorded and reported below.
            errors.append((dwa_id, seniority_level, str(e)))

    # ---------------------------------------------------------
    # 3. SAVE & INSPECT
    # ---------------------------------------------------------
    ts_results = pd.DataFrame(results_list)

    # Always surface failures, even when other groups produced results.
    if errors:
        print(f"WARNING: {len(errors)} regression(s) failed and were skipped. First error: {errors[0]}")

    if ts_results.empty:
        print("No results found. Check errors.")
    else:
        # Convert month to datetime for sorting
        ts_results['month'] = pd.to_datetime(ts_results['month'])
        ts_results = ts_results.sort_values(['dwa_id', 'seniority', 'month'])

        print("-" * 30)
        print(f"Extraction Complete. Rows: {len(ts_results)}")

        # file_path already ends in ".parquet", so the output keeps that suffix.
        output_path = os.path.join(RESULTS_DIR, f"regression_results_{file_path}")
        print(f"Saving results to Parquet: {output_path}")
        ts_results.to_parquet(output_path, index=False)

        print("\n" + "-"*30 + "\n")

Created results directory: ./results
Processing file: seniority_1_data.parquet
--- Starting Panel Balancing ---
Unique Firm-Task pairs to balance: 1,645,523
Target balanced size: 93,794,811 rows
--- Balancing Complete ---
----------------------------------------
Final Dataset Size: 93,794,811 rows
Final Unique Companies: 3,922
Final Unique Tasks: 954
Starting regression loop...


100%|██████████| 954/954 [07:59<00:00,  1.99it/s]


------------------------------
Extraction Complete. Rows: 54378
Saving results to Parquet: ./results/regression_results_seniority_1_data.parquet

------------------------------

Processing file: seniority_2_data.parquet
--- Starting Panel Balancing ---
Unique Firm-Task pairs to balance: 1,710,958
Target balanced size: 97,524,606 rows
--- Balancing Complete ---
----------------------------------------
Final Dataset Size: 97,524,606 rows
Final Unique Companies: 3,920
Final Unique Tasks: 995
Starting regression loop...


100%|██████████| 995/995 [08:26<00:00,  1.97it/s]


------------------------------
Extraction Complete. Rows: 56715
Saving results to Parquet: ./results/regression_results_seniority_2_data.parquet

------------------------------

Processing file: seniority_3_data.parquet
--- Starting Panel Balancing ---
Unique Firm-Task pairs to balance: 1,304,275
Target balanced size: 74,343,675 rows
--- Balancing Complete ---
----------------------------------------
Final Dataset Size: 74,343,675 rows
Final Unique Companies: 3,841
Final Unique Tasks: 863
Starting regression loop...


100%|██████████| 863/863 [07:07<00:00,  2.02it/s]


------------------------------
Extraction Complete. Rows: 49191
Saving results to Parquet: ./results/regression_results_seniority_3_data.parquet

------------------------------

Processing file: seniority_4_data.parquet
--- Starting Panel Balancing ---
Unique Firm-Task pairs to balance: 1,230,921
Target balanced size: 70,162,497 rows
--- Balancing Complete ---
----------------------------------------
Final Dataset Size: 70,162,497 rows
Final Unique Companies: 3,827
Final Unique Tasks: 807
Starting regression loop...


100%|██████████| 807/807 [05:47<00:00,  2.32it/s]


------------------------------
Extraction Complete. Rows: 45999
Saving results to Parquet: ./results/regression_results_seniority_4_data.parquet

------------------------------

Processing file: seniority_5_6_7_data.parquet
--- Starting Panel Balancing ---
Unique Firm-Task pairs to balance: 1,203,695
Target balanced size: 68,610,615 rows
--- Balancing Complete ---
----------------------------------------
Final Dataset Size: 68,610,615 rows
Final Unique Companies: 3,908
Final Unique Tasks: 683
Starting regression loop...


100%|██████████| 683/683 [05:33<00:00,  2.05it/s]

------------------------------
Extraction Complete. Rows: 38931
Saving results to Parquet: ./results/regression_results_seniority_5_6_7_data.parquet

------------------------------




