In [12]:
import pandas as pd
import numpy as np

In [13]:
# 1. Load Data
# Read the five raw CSV inputs in a fixed order: the train/test base tables
# first, then the three side tables (sector revenue, environmental
# activities, SDGs) that the feature-engineering cells below consume.
print("Loading raw datasets...")
train_df, test_df, rev_df, env_df, sdg_df = map(
    pd.read_csv,
    (
        '../data/train.csv',
        '../data/test.csv',
        '../data/revenue_distribution_by_sector.csv',
        '../data/environmental_activities.csv',
        '../data/sustainable_development_goals.csv',
    ),
)

Loading raw datasets...


In [14]:
# 2. Feature Engineering: Sector Revenue
# Reshape long -> wide: one row per entity_id, one 'sector_pct_<code>' column
# per nace_level_1_code. Repeated (entity, sector) rows are summed and
# entity/sector combinations with no rows are filled with 0.
print("Pivoting Sector Data...")
sector_pivot = pd.pivot_table(
    rev_df,
    values='revenue_pct',
    index='entity_id',
    columns='nace_level_1_code',
    aggfunc='sum',
    fill_value=0,
).add_prefix('sector_pct_')

Pivoting Sector Data...


In [15]:
# 3. Feature Engineering: Environmental Activities
# Collapse the activity rows to one row per entity_id:
#   net_env_adjustment - sum of env_score_adjustment over the entity's rows
#   activity_count     - number of non-null activity_code values
print("Aggregating Environmental Activities...")
env_features = env_df.groupby('entity_id').agg(
    net_env_adjustment=pd.NamedAgg(column='env_score_adjustment', aggfunc='sum'),
    activity_count=pd.NamedAgg(column='activity_code', aggfunc='count'),
)

Aggregating Environmental Activities...


In [16]:
# 4. Feature Engineering: SDGs
# Build one 0/1 indicator column ('sdg_<id>') per SDG, indexed by entity_id.
# pd.crosstab returns occurrence counts, so the table is capped at 1: a
# repeated (entity_id, sdg_id) row must still yield a flag, not a count.
print("Processing SDG Data...")
sdg_pivot = (
    pd.crosstab(sdg_df['entity_id'], sdg_df['sdg_id'])
    .clip(upper=1)
    .add_prefix('sdg_')
)

Processing SDG Data...


In [17]:
# 5. Merge All Features
def merge_features(base_df, sector=None, env=None, sdg=None):
    """Left-join the engineered feature tables onto ``base_df`` by entity_id.

    Parameters
    ----------
    base_df : pandas.DataFrame
        Base table with an ``entity_id`` column. It is not modified.
    sector, env, sdg : pandas.DataFrame, optional
        Feature tables keyed by ``entity_id``. When omitted, the
        notebook-level ``sector_pivot``, ``env_features`` and ``sdg_pivot``
        are looked up at call time, so ``merge_features(df)`` behaves as
        before.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``base_df`` plus every feature column.
        Feature values for entities absent from a feature table are 0.

    Raises
    ------
    pandas.errors.MergeError
        If a feature table holds more than one row for the same entity_id.
    """
    feature_tables = [
        sector_pivot if sector is None else sector,
        env_features if env is None else env,
        sdg_pivot if sdg is None else sdg,
    ]

    df = base_df
    fill_cols = []
    for table in feature_tables:
        # many_to_one: each feature table must be unique per entity_id,
        # otherwise the left join would silently duplicate base rows.
        df = df.merge(table, on='entity_id', how='left', validate='many_to_one')
        fill_cols.extend(table.columns)

    # Entities missing from a side table have no recorded impact/activity,
    # so their feature values are 0. Only feature columns are filled; NaNs
    # that were already present in base_df are left alone.
    df[fill_cols] = df[fill_cols].fillna(0)
    return df

# Apply the same feature merge to both splits so train and test end up with
# an identical set of engineered columns.
print("Merging features...")
train_processed, test_processed = (
    merge_features(split) for split in (train_df, test_df)
)

Merging features...


In [11]:
# 6. Save Processed Data for Modeling Notebook
# Both frames are written as CSV, without the DataFrame index, so the next
# notebook can pick them up from the data folder.
print("Saving processed datasets...")
for frame, destination in (
    (train_processed, '../data/train_processed.csv'),
    (test_processed, '../data/test_processed.csv'),
):
    frame.to_csv(destination, index=False)
print("Success! Processed files saved to /data folder.")

Saving processed datasets...
Success! Processed files saved to /data folder.
