In [1]:
import pandas as pd
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ============ Setup logging ============
# Basic INFO-level logging so the long-running steps below report progress.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Directory containing the processed parquet files.
data_dir = Path("./data/company_data_processed")
# Directory listings come back in filesystem order, which is not guaranteed to
# be stable. Sorting makes the processing order -- and therefore the row order
# of every file written later -- reproducible across runs and machines.
parquet_files_processed_RAW = sorted(data_dir.glob("*.parquet"))
print(f"Using {len(parquet_files_processed_RAW)} parquet files for processing.")

# Constants
# Each inner list is one seniority group; raw levels 5, 6 and 7 are pooled.
SENIORITY_GROUPS = [[1], [2], [3], [4], [5, 6, 7]]
# Destination of the per-group RAW files; created up front so saving cannot
# fail on a missing directory.
OUTPUT_DIR = Path("./data/seniority_DWA_data_RAW")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Required columns for consistency
COLS = ["firm_id", "dwa_id", "month", "seniority", "FTE"]

Using 4092 parquet files for processing.


# Merging datasets on Seniority Levels

In [2]:
def _merge_seniority_levels(group_df, merged_level):
    """Collapse a pooled seniority group into one row per (firm_id, dwa_id, month).

    FTE is summed over the pooled levels and the ``seniority`` column is set
    to ``merged_level``.
    """
    key_cols = ["firm_id", "dwa_id", "month"]
    # Drop incomplete rows BEFORE summing: sum() skips NaN, so a cell whose FTE
    # values are all missing would otherwise come out as a genuine-looking 0.0
    # and survive the later dropna(), unlike in the single-level groups.
    complete = group_df.dropna(subset=key_cols + ["FTE"])
    merged = complete.groupby(key_cols)["FTE"].sum().reset_index()
    merged["seniority"] = merged_level
    # Same column order as the other groups' required columns.
    return merged[key_cols + ["seniority", "FTE"]]


def process_seniority_data(file_list):
    """Split the input parquet files by seniority group and save one file per group.

    Every file in ``file_list`` is read once; its rows are distributed to the
    groups listed in ``SENIORITY_GROUPS``. Each group is then concatenated,
    stripped of rows containing missing values, and written to
    ``OUTPUT_DIR / "seniority_<levels>_data_RAW.parquet"``.

    Groups made of several raw levels (e.g. ``[5, 6, 7]``) are aggregated to
    one row per (firm_id, dwa_id, month) with FTE summed, and labelled with
    the first level of the group (5 for ``[5, 6, 7]``).

    Parameters
    ----------
    file_list : iterable of path-like
        Parquet files to read. Each must contain a ``seniority`` column.

    Returns
    -------
    None
        Results are written to disk. A file that cannot be read or processed
        is logged and skipped (best effort), so check the log for errors.

    Notes
    -----
    All selected rows are held in memory until the save step.
    """
    # 1. One list of partial frames per seniority group.
    buckets = {tuple(level): [] for level in SENIORITY_GROUPS}

    logger.info(f"Starting processing of {len(file_list)} files...")

    # 2. Read every file exactly once and route its rows to the buckets.
    for file_path in file_list:
        try:
            df = pd.read_parquet(file_path)

            for level in SENIORITY_GROUPS:
                subset = df[df["seniority"].isin(level)]
                if not subset.empty:
                    buckets[tuple(level)].append(subset)

        except Exception as e:
            # Best effort: one bad file must not abort the whole run.
            logger.error(f"Failed to process {file_path}: {e}")

    # 3. Concatenate, (optionally) aggregate, clean and save each group.
    for level, frames in buckets.items():
        group_name = '_'.join(map(str, level))
        logger.info(f"Saving data for seniority level: {group_name}")

        if frames:
            full_df = pd.concat(frames, ignore_index=True)
        else:
            # Keep the schema even when no rows were found for this group.
            full_df = pd.DataFrame(columns=COLS)

        # Pooled groups are collapsed to a single seniority value; detecting
        # them by size keeps this in step with SENIORITY_GROUPS if it changes.
        if len(level) > 1:
            full_df = _merge_seniority_levels(full_df, level[0])

        # Drop NAs
        full_df = full_df.dropna()

        # Save to Parquet
        output_path = OUTPUT_DIR / f"seniority_{group_name}_data_RAW.parquet"
        full_df.to_parquet(
            output_path,
            index=False,
            compression="snappy"
        )

# Run the split over every processed input file; this writes one
# "seniority_<levels>_data_RAW.parquet" file per seniority group into OUTPUT_DIR.
process_seniority_data(parquet_files_processed_RAW)

INFO:__main__:Starting processing of 4092 files...
INFO:__main__:Saving data for seniority level: 1
INFO:__main__:Saving data for seniority level: 2
INFO:__main__:Saving data for seniority level: 3
INFO:__main__:Saving data for seniority level: 4
INFO:__main__:Saving data for seniority level: 5_6_7


# Filtering Process

In [3]:
# Point the working file list at the per-seniority RAW parquet files.
# Note: this rebinds both data_dir and parquet_files_processed_RAW.
data_dir = Path("./data") / "seniority_DWA_data_RAW"
parquet_files_processed_RAW = [path for path in data_dir.glob("*.parquet")]
print(f"Using {len(parquet_files_processed_RAW)} parquet files for processing.")

Using 5 parquet files for processing.


In [4]:
import pandas as pd
import logging
from pathlib import Path

# ---------------------------------------------------------
# 0. CONFIGURATION
# ---------------------------------------------------------
# logging.basicConfig() does nothing once the root logger already has handlers,
# and an earlier cell has already configured it -- so without force=True the
# timestamped format below is silently ignored. force=True (Python 3.8+)
# removes the existing root handlers before applying this configuration.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)

# NOTE: rebinds the notebook-wide OUTPUT_DIR; from here on results go to the
# CLEAN directory.
OUTPUT_DIR = Path("./data/seniority_DWA_data_CLEAN")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Groups
# Each inner list is one seniority group; raw levels 5, 6 and 7 are pooled.
SENIORITY_GROUPS = [[1], [2], [3], [4], [5, 6, 7]]

# Filter Parameters
PRE_START = '2021-01-01'  # first month of the pre-treatment window (inclusive)
PRE_END = '2022-10-01'    # last month of the pre-treatment window (inclusive)
TAU_FIRMS = 100           # Minimum unique firms
TAU_MASS_PCT = 0.01       # Minimum mass share, in percent (0.01 means 0.01 %)

# Map raw integer to group tuple key, e.g. 6 -> (5, 6, 7).
# A comprehension keeps the loop variables out of the notebook namespace.
SENIORITY_MAP = {
    raw_val: tuple(group)
    for group in SENIORITY_GROUPS
    for raw_val in group
}

def get_stats_structure():
    """Return a fresh, empty statistics accumulator for every seniority group.

    The result maps each group of ``SENIORITY_GROUPS`` (as a tuple) to a dict
    with two empty dicts, ``'fte_sums'`` and ``'firm_sets'``.
    """
    structure = {}
    for levels in SENIORITY_GROUPS:
        structure[tuple(levels)] = dict(fte_sums={}, firm_sets={})
    return structure

# ---------------------------------------------------------
# 1. PASS 1: INDEPENDENT VALIDITY CHECK
# ---------------------------------------------------------
def get_valid_ids_per_group(file_list, tau_firms=TAU_FIRMS, tau_mass_pct=TAU_MASS_PCT):
    """Pass 1: find, for each seniority group, the tasks that pass both filters.

    Only rows whose ``month`` lies in the pre-treatment window
    [``PRE_START``, ``PRE_END``] (both ends inclusive) are used. Within each
    seniority group a task (``dwa_id``) is valid when

    * it appears at ``tau_firms`` or more distinct firms, AND
    * its share of the group's total FTE, in percent, is ``tau_mass_pct`` or more.

    Parameters
    ----------
    file_list : iterable of path-like
        Parquet files with columns firm_id, dwa_id, month, seniority, FTE.
    tau_firms : int
        Minimum number of distinct firms (inclusive).
    tau_mass_pct : float
        Minimum FTE share in percent (inclusive); 0.01 means 0.01 %.

    Returns
    -------
    dict[tuple, set]
        Maps every group of ``SENIORITY_GROUPS`` (as a tuple) to the set of
        valid ``dwa_id`` values; the set is empty when nothing qualifies.

    Notes
    -----
    A file that fails to load or process is logged as a warning and skipped,
    so the statistics then cover only the remaining files.
    """
    logger.info("--- PASS 1: Calculating Independent Filters per Group ---")

    group_stats = get_stats_structure()

    # A. ACCUMULATE STATS
    for file_path in file_list:
        try:
            # Load only needed columns
            df = pd.read_parquet(file_path, columns=["firm_id", "dwa_id", "month", "seniority", "FTE"])
            df['month'] = pd.to_datetime(df['month'])

            # Filter Pre-treatment
            pre = df[(df['month'] >= PRE_START) & (df['month'] <= PRE_END)]
            if pre.empty:
                continue

            # assign() builds a new frame, so the filtered slice is never
            # modified in place (no chained-assignment ambiguity).
            pre = pre.assign(group_key=pre['seniority'].map(SENIORITY_MAP))

            # Aggregate stats per group; seniority values missing from
            # SENIORITY_MAP become NaN keys and are left out by groupby.
            for group_key, subset in pre.groupby('group_key'):
                stats = group_stats.get(group_key)
                if stats is None:
                    continue

                # FTE Sums: running total per task across all files.
                fte_sums = stats['fte_sums']
                for dwa, fte in subset.groupby('dwa_id')['FTE'].sum().items():
                    fte_sums[dwa] = fte_sums.get(dwa, 0) + fte

                # Firm Counts: sets, so a firm seen in several files counts once.
                firm_sets = stats['firm_sets']
                for dwa, firms in subset.groupby('dwa_id')['firm_id'].unique().items():
                    firm_sets.setdefault(dwa, set()).update(firms)

        except Exception as e:
            logger.warning(f"Pass 1: Skipping {file_path}: {e}")

    # B. CALCULATE VALID LISTS
    valid_ids_dict = {}

    for group in SENIORITY_GROUPS:
        key = tuple(group)
        stats = group_stats[key]

        total_group_fte = sum(stats['fte_sums'].values())
        valid_set = set()

        for dwa, task_fte in stats['fte_sums'].items():
            unique_firms = len(stats['firm_sets'].get(dwa, set()))
            # Share in percent; guard against an empty / zero-FTE group.
            mass_share = (task_fte / total_group_fte * 100) if total_group_fte > 0 else 0

            # APPLY CRITERIA (both thresholds are inclusive)
            if unique_firms >= tau_firms and mass_share >= tau_mass_pct:
                valid_set.add(dwa)

        valid_ids_dict[key] = valid_set
        # The message states ">=" so the log matches the test applied above.
        logger.info(f"Group {key}: {len(valid_set)} valid tasks identified (Share >= {tau_mass_pct}% & Firms >= {tau_firms})")
    return valid_ids_dict

# ---------------------------------------------------------
# 2. PASS 2: GROUP-SPECIFIC FILTERING
# ---------------------------------------------------------
def process_with_group_masks(file_list, valid_ids_dict):
    """Pass 2: keep only each group's valid tasks and save one file per group.

    For every file, rows are split by seniority group and, inside each group,
    only rows whose ``dwa_id`` is in that group's entry of ``valid_ids_dict``
    are kept (a group missing from the dict keeps nothing). The kept rows of a
    group are concatenated, rows with missing values are dropped, and the
    result is written to ``OUTPUT_DIR / "seniority_<levels>_data.parquet"``.
    Groups with no kept rows are not written; a warning is logged instead.

    Parameters
    ----------
    file_list : iterable of path-like
        Parquet files containing at least ``seniority`` and ``dwa_id``.
    valid_ids_dict : dict[tuple, set]
        Valid ``dwa_id`` values per seniority group tuple.

    Returns
    -------
    None
        A file that raises while being processed is logged and skipped.
    """
    logger.info("--- PASS 2: Applying Group-Specific Masks & Saving ---")

    group_keys = [tuple(levels) for levels in SENIORITY_GROUPS]
    # Filtered partial frames, collected per group until the save step.
    kept_frames = {key: [] for key in group_keys}

    for file_path in file_list:
        try:
            raw = pd.read_parquet(file_path)

            for levels, key in zip(SENIORITY_GROUPS, group_keys):
                # Rows of this seniority group only.
                in_group = raw[raw["seniority"].isin(levels)]
                if in_group.empty:
                    continue

                # Group-specific whitelist; an unknown group drops everything.
                allowed_ids = valid_ids_dict.get(key, set())
                survivors = in_group[in_group['dwa_id'].isin(allowed_ids)]

                if not survivors.empty:
                    kept_frames[key].append(survivors)

        except Exception as e:
            logger.error(f"Pass 2: Error processing {file_path}: {e}")

    # Write one parquet file per non-empty group.
    for key, frames in kept_frames.items():
        group_name = '_'.join(str(lvl) for lvl in key)

        if not frames:
            logger.warning(f"No data saved for {group_name} (Bucket empty after filtering)")
            continue

        combined = pd.concat(frames, ignore_index=True).dropna()

        out_path = OUTPUT_DIR / f"seniority_{group_name}_data.parquet"
        combined.to_parquet(out_path, index=False, compression="snappy")
        logger.info(f"Saved: {out_path} (Rows: {len(combined)})")



In [5]:
# ---------------------------------------------------------
# EXECUTION
# Pass 1: compute the set of valid task ids for every seniority group.
valid_map = get_valid_ids_per_group(parquet_files_processed_RAW)


print("-" * 40)
print("VALID TASKS PER SENIORITY LEVEL")
# get_valid_ids_per_group keeps tasks with `unique_firms >= tau_firms` and
# `mass_share >= tau_mass_pct`, i.e. the thresholds are inclusive -- report
# them as ">=" rather than ">".
print(f"Criteria: >={TAU_FIRMS} firms AND >={TAU_MASS_PCT}% mass (Pre-treatment)")
print("-" * 40)

# Accumulator for the union of valid tasks across all groups.
total_unique_tasks = set()

INFO:__main__:--- PASS 1: Calculating Independent Filters per Group ---
INFO:__main__:Group (1,): 954 valid tasks identified (Share > 0.01% & Firms > 100)
INFO:__main__:Group (2,): 995 valid tasks identified (Share > 0.01% & Firms > 100)
INFO:__main__:Group (3,): 863 valid tasks identified (Share > 0.01% & Firms > 100)
INFO:__main__:Group (4,): 807 valid tasks identified (Share > 0.01% & Firms > 100)
INFO:__main__:Group (5, 6, 7): 683 valid tasks identified (Share > 0.01% & Firms > 100)


----------------------------------------
VALID TASKS PER SENIORITY LEVEL
Criteria: >100 firms AND >0.01% mass (Pre-treatment)
----------------------------------------


In [7]:
# Rebuild the union inside this cell so the reported total always reflects the
# current valid_map, however often the cell is re-run and whatever was left in
# the variable by earlier executions.
total_unique_tasks = set()

# Iterate through the map to print counts
for group, valid_set in valid_map.items():
    # Format the tuple key (e.g., (5, 6, 7) -> "5_6_7") for cleaner display
    group_name = '_'.join(map(str, group))

    count = len(valid_set)
    total_unique_tasks.update(valid_set)

    print(f"Seniority Level {group_name:<7} : {count:>6,} valid tasks")

print("-" * 40)
print(f"Total Unique Tasks (Union)   : {len(total_unique_tasks):,}")
print("-" * 40)

# Pass 2: apply each group's valid-task set and write the filtered files.
process_with_group_masks(parquet_files_processed_RAW, valid_map)

INFO:__main__:--- PASS 2: Applying Group-Specific Masks & Saving ---


Seniority Level 1       :    954 valid tasks
Seniority Level 2       :    995 valid tasks
Seniority Level 3       :    863 valid tasks
Seniority Level 4       :    807 valid tasks
Seniority Level 5_6_7   :    683 valid tasks
----------------------------------------
Total Unique Tasks (Union)   : 1,244
----------------------------------------


INFO:__main__:Saved: data/seniority_DWA_data_CLEAN/seniority_1_data.parquet (Rows: 77852578)
INFO:__main__:Saved: data/seniority_DWA_data_CLEAN/seniority_2_data.parquet (Rows: 82441791)
INFO:__main__:Saved: data/seniority_DWA_data_CLEAN/seniority_3_data.parquet (Rows: 59299324)
INFO:__main__:Saved: data/seniority_DWA_data_CLEAN/seniority_4_data.parquet (Rows: 57444498)
INFO:__main__:Saved: data/seniority_DWA_data_CLEAN/seniority_5_6_7_data.parquet (Rows: 58256159)
