In [20]:
import pandas as pd
from pathlib import Path

from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd
import logging

# ============ Setup logging ============
log_dir = Path("./logs")
log_dir.mkdir(exist_ok=True)
log_file = log_dir / "processing.log"

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(threadName)s] %(levelname)s: %(message)s",
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler(),  # Also print to console
    ]
)
logger = logging.getLogger(__name__)

# Formating company data for regression

**Company DF**

- rcid               → firm_id
- user_id            → worker_id
- onet_code          → SOC occupation
- seniority          → seniority group
- startdate/enddate  → spells
- weight             → Revelio weight

**Needed table**

Job → DWA probability



## Expanding to get the timeseries

In [21]:
def expand_months(row):
    months = pd.date_range(row["startdate"], row["enddate"], freq="MS")
    return pd.DataFrame({
        "firm_id":   int(row["rcid"]),
        "onet_code": row["onet_code"],
        "seniority": int(row["seniority"]),
        "user_id":   int(row["user_id"]),
        "weight":    float(row["weight"]),
        "month":     months,
    })

START_DATE = pd.to_datetime("2021-01-01")

## Merging with task -> Job

In [22]:
# ============ Load processed/failed tracking ============
processed_file = Path("./logs/processed.txt")
failed_file = Path("./logs/failed.txt")

processed = set()
failed = set()

if processed_file.exists():
    with open(processed_file) as f:
        processed = set(line.strip() for line in f if line.strip())

if failed_file.exists():
    with open(failed_file) as f:
        failed = set(line.strip() for line in f if line.strip())

logger.info(f"Starting run: {len(processed)} already processed, {len(failed)} known failures")

# ============ Process function ============
def process_company_data(file_path, task_matrix):
    file_stem = file_path.stem
    try:
        df = pd.read_parquet(file_path, engine='pyarrow')
        df['startdate'] = pd.to_datetime(df['startdate'])
        df['enddate'] = pd.to_datetime(df['enddate']).fillna(pd.Timestamp.now())
        
        #Remove rows where both start and end date are before START_DATE
        df = df[~((df['startdate'] < START_DATE) & (df['enddate'] < START_DATE))]
        #Set start date to START_DATE if it is before and the end date is after
        df.loc[(df['startdate'] < START_DATE), 'startdate'] = START_DATE
        
        #Expand the data to monthly panel
        expanded = pd.concat(df.apply(expand_months, axis=1).tolist(), ignore_index=True)
        expanded = expanded.merge(task_matrix, left_on='onet_code', right_on='job_profile_id')
        
        panel = expanded.groupby(
            ['firm_id', 'dwa_id', 'month', 'seniority'],
            as_index=False
        )['FTE'].sum()
        
        output_dir = Path("./data/company_data_processed")
        output_dir.mkdir(parents=True, exist_ok=True)
        output_file = output_dir / f"processed_{file_stem}.parquet"
        panel.to_parquet(output_file, index=False, compression='snappy')
        
        logger.info(f"Successfully processed {file_stem}")
        return (file_stem, "success")
    
    except Exception as e:
        logger.error(f"Failed to process {file_stem}: {str(e)}", exc_info=True)
        return (file_stem, "failed")

# ============ Main processing loop ============
task_matrix = pd.read_csv('./data/job_DWA_FTE_distribution_30_0.csv')
task_matrix = task_matrix.dropna()
task_matrix = task_matrix[["job_profile_id", "dwa_id", "FTE"]]

data_dir = Path("./data/company_data")
parquet_files = list(data_dir.glob("*.parquet"))

# Filter to only unprocessed files (skip already done and known failures)
files_to_process = [
    f for f in parquet_files 
    if f.stem not in processed and f.stem not in failed
]

logger.info(f"Processing {len(files_to_process)} files (skipped {len(processed) + len(failed)} already done)")

# Process in parallel
futures_map = {}
with ThreadPoolExecutor(max_workers=4) as executor:
    for file_path in files_to_process[:5]:
        future = executor.submit(process_company_data, file_path, task_matrix)
        futures_map[future] = file_path.stem

    # Collect results as they complete
    for future in as_completed(futures_map):
        file_stem, status = future.result()
        
        if status == "success":
            processed.add(file_stem)
            with open(processed_file, "a") as f:
                f.write(f"{file_stem}\n")
        else:
            failed.add(file_stem)
            with open(failed_file, "a") as f:
                f.write(f"{file_stem}\n")

logger.info(f"Run complete: {len(processed)} successful, {len(failed)} failed")


2025-12-01 22:40:44,120 [MainThread] INFO: Starting run: 0 already processed, 0 known failures
2025-12-01 22:40:44,165 [MainThread] INFO: Processing 4092 files (skipped 0 already done)
2025-12-01 22:40:44,767 [ThreadPoolExecutor-18_2] INFO: Successfully processed 349767
2025-12-01 22:40:44,779 [ThreadPoolExecutor-18_3] INFO: Successfully processed 530946
2025-12-01 22:40:47,673 [ThreadPoolExecutor-18_1] INFO: Successfully processed 346494
2025-12-01 22:40:49,449 [ThreadPoolExecutor-18_0] INFO: Successfully processed 778963
2025-12-01 22:40:49,512 [ThreadPoolExecutor-18_2] INFO: Successfully processed 22144010
2025-12-01 22:40:49,527 [MainThread] INFO: Run complete: 5 successful, 0 failed


# Merging on one seniority level

In [23]:
data_dir = Path("./data/company_data_processed")
parquet_files_processed = list(data_dir.glob("*.parquet"))

In [24]:
parquet_files_processed[0]

PosixPath('data/company_data_processed/processed_22144010.parquet')

In [30]:
SENIORITY = 1

# Use list + pd.concat instead of deprecated DataFrame.append
frames = []
for file_path in parquet_files_processed:
    tmp = pd.read_parquet(file_path)
    tmp = tmp[tmp["seniority"] == SENIORITY]
    if not tmp.empty:
        frames.append(tmp)

if frames:
    full_df = pd.concat(frames, ignore_index=True)
else:
    full_df = pd.DataFrame(columns=["firm_id", "dwa_id", "month", "seniority", "FTE"])

full_df.reset_index(drop=True, inplace=True)

In [31]:
full_df

Unnamed: 0,firm_id,dwa_id,month,seniority,FTE
0,22144010,4.A.1.a.1.I01.D01,2021-01-01,1,0.432142
1,22144010,4.A.1.a.1.I01.D01,2021-02-01,1,0.432142
2,22144010,4.A.1.a.1.I01.D01,2021-03-01,1,0.576189
3,22144010,4.A.1.a.1.I01.D01,2021-04-01,1,0.576189
4,22144010,4.A.1.a.1.I01.D01,2021-05-01,1,0.576189
...,...,...,...,...,...
93599,530946,4.A.4.c.3.I07.D04,2025-08-01,1,0.072606
93600,530946,4.A.4.c.3.I07.D04,2025-09-01,1,0.072606
93601,530946,4.A.4.c.3.I07.D04,2025-10-01,1,0.072606
93602,530946,4.A.4.c.3.I07.D04,2025-11-01,1,0.072606
