In [1]:
"""
Create Target Variable Script
==============================
Creates a risk label target variable for your dataset based on 
available features or columns.

Part of: Policy Risk Inference from Simulated Reports
Author: William V. Fullerton
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

sns.set_style("whitegrid")


def explore_potential_targets(df):
    """Report columns whose names suggest they could serve as a target.

    A column is a candidate when its name, compared case-insensitively,
    contains one of: 'risk', 'label', 'target', 'class', 'severity'.
    For every candidate the first ten unique values and the full value
    counts are printed.

    Args:
        df: DataFrame to inspect. It is not modified.

    Returns:
        List of candidate column labels, in the order they appear in ``df``.
    """
    print("="*70)
    print("EXPLORING POTENTIAL TARGET VARIABLES")
    print("="*70)

    print(f"\nDataset has {len(df)} rows and {len(df.columns)} columns")
    print(f"\nAll columns: {list(df.columns)}")

    # Look for obvious target candidates. Column labels are not guaranteed to
    # be strings (e.g. integer headers), so coerce before lower-casing.
    keywords = ('risk', 'label', 'target', 'class', 'severity')
    potential_targets = [
        col for col in df.columns
        if any(keyword in str(col).lower() for keyword in keywords)
    ]

    if potential_targets:
        print(f"\n✓ Found potential target columns:")
        for col in potential_targets:
            print(f"  - {col}")
            print(f"    Unique values: {df[col].unique()[:10]}")
            print(f"    Value counts:\n{df[col].value_counts()}\n")
    else:
        print("\n⚠ No obvious target column found")
        print("\nWe'll need to create one based on your features or metadata")

    return potential_targets


def create_target_from_risk_features(df, threshold_percentile=90):
    """
    Create a binary risk label from the numeric ``risk_*`` feature columns.

    Strategy: each risk feature is scaled by its own maximum, the scaled
    features are averaged into ``composite_risk_score``, and rows at or above
    the requested percentile of that score are labeled "high risk" (1).

    Args:
        df: DataFrame containing the risk features. It is modified in place:
            ``composite_risk_score`` and ``risk_label`` are added.
        threshold_percentile: Percentile (0-100) of the composite score used
            as the cut-off for the high-risk class.

    Returns:
        The same ``df`` object. If no usable risk feature exists, ``df`` is
        returned unchanged (no columns are added).
    """
    print("\n" + "="*70)
    print("OPTION 1: CREATE TARGET FROM RISK FEATURES")
    print("="*70)

    # Get risk feature columns. Generated label/score columns also carry the
    # 'risk_' prefix; they are excluded so that re-running on an already
    # labeled frame does not feed old labels back into the score. Non-numeric
    # columns cannot be normalized and are skipped as well.
    risk_features = [
        col for col in df.columns
        if isinstance(col, str)
        and col.startswith('risk_')
        and 'label' not in col.lower()
        and col != 'risk_score'
        and pd.api.types.is_numeric_dtype(df[col])
    ]

    if not risk_features:
        print("ERROR: No risk features found")
        return df

    print(f"\nUsing risk features: {risk_features}")

    # Normalize each feature by its maximum. Only the risk columns are
    # copied; the rest of the frame (text, metadata) is left alone.
    normalized = df[risk_features].astype(float)
    for feature in risk_features:
        max_val = df[feature].max()
        if max_val > 0:
            normalized[feature] = normalized[feature] / max_val
        else:
            # All-zero (or non-positive / all-NaN) feature contributes 0.
            normalized[feature] = 0.0

    # Calculate average normalized risk score
    scores = normalized.mean(axis=1)
    df['composite_risk_score'] = scores

    # Create binary label based on threshold
    threshold = scores.quantile(threshold_percentile / 100)
    if threshold_percentile > 0 and threshold <= scores.min():
        # The percentile collapsed onto the lowest score (many tied rows, e.g.
        # all risk features are zero). ">=" would flag every row as high risk,
        # so only rows strictly above the baseline are flagged.
        print("\nWarning: threshold equals the minimum score; "
              "labeling only rows strictly above it as high risk")
        df['risk_label'] = (scores > threshold).astype(int)
    else:
        df['risk_label'] = (scores >= threshold).astype(int)

    print(f"\nComposite risk score statistics:")
    print(f"  Min:  {df['composite_risk_score'].min():.4f}")
    print(f"  Mean: {df['composite_risk_score'].mean():.4f}")
    print(f"  Max:  {df['composite_risk_score'].max():.4f}")
    print(f"  Threshold ({threshold_percentile}th percentile): {threshold:.4f}")

    print(f"\nRisk label distribution:")
    print(df['risk_label'].value_counts())
    print(f"\n  Low Risk (0):  {(df['risk_label']==0).sum()} ({100*(df['risk_label']==0).sum()/len(df):.1f}%)")
    print(f"  High Risk (1): {(df['risk_label']==1).sum()} ({100*(df['risk_label']==1).sum()/len(df):.1f}%)")

    return df


def create_target_from_sentiment(df):
    """
    Create a risk label from the ``sentiment`` column, if available.

    Strategy: Negative sentiment = higher risk (label 1); neutral and
    positive sentiment = label 0. Matching ignores letter case and
    surrounding whitespace. Any other value is left unmapped (NaN) and
    counted in a printed warning.

    Args:
        df: DataFrame to label. It is modified in place:
            ``risk_label_from_sentiment`` is added.

    Returns:
        The same ``df`` object. If there is no ``sentiment`` column, ``df``
        is returned unchanged.
    """
    print("\n" + "="*70)
    print("OPTION 2: CREATE TARGET FROM SENTIMENT")
    print("="*70)

    if 'sentiment' not in df.columns:
        print("ERROR: No 'sentiment' column found")
        return df

    print(f"\nSentiment values: {df['sentiment'].unique()}")

    # Map sentiment to risk
    # Assuming negative sentiment indicates higher risk
    sentiment_map = {
        'negative': 1,
        'neutral': 0,
        'positive': 0
    }

    # Normalize string values so 'Negative' or ' neutral ' are not silently
    # dropped; non-string values (NaN, numbers) pass through untouched and
    # therefore stay unmapped exactly as before.
    cleaned = df['sentiment'].map(
        lambda value: value.strip().lower() if isinstance(value, str) else value
    )
    df['risk_label_from_sentiment'] = cleaned.map(sentiment_map)

    unmapped = df['risk_label_from_sentiment'].isna()
    if unmapped.any():
        print(f"Warning: {unmapped.sum()} unmapped sentiment values")

    print(f"\nRisk label from sentiment distribution:")
    print(df['risk_label_from_sentiment'].value_counts())

    return df


def create_target_from_load_factor(df, threshold=0.8):
    """
    Derive a binary risk label from the ``load_factor`` column.

    Strategy: rows whose load factor is at or above ``threshold`` are
    treated as higher operational risk (1); all other rows get 0.

    The frame is modified in place (``risk_label_from_load`` is added) and
    returned. When ``load_factor`` is absent an error line is printed and the
    frame is returned untouched.
    """
    rule = "=" * 70
    print("\n" + rule)
    print("OPTION 3: CREATE TARGET FROM LOAD FACTOR")
    print(rule)

    # Guard clause: nothing to do without the source column.
    has_load = 'load_factor' in df.columns
    if not has_load:
        print("ERROR: No 'load_factor' column found")
        return df

    load = df['load_factor']

    # Summary statistics, one line each (captions padded to align values).
    print("\nLoad factor statistics:")
    summary = (
        ("Min: ", load.min()),
        ("Mean:", load.mean()),
        ("Max: ", load.max()),
    )
    for caption, value in summary:
        print(f"  {caption} {value:.4f}")

    # Flag rows at or above the cut-off.
    at_or_above = load.ge(threshold)
    df['risk_label_from_load'] = at_or_above.astype(int)

    print(f"\nUsing threshold: {threshold}")
    print("Risk label from load factor distribution:")
    print(df['risk_label_from_load'].value_counts())

    return df


def create_synthetic_target(df, high_risk_pct=2):
    """
    Create synthetic risk labels for demonstration.

    Strategy: sum the numeric ``risk_*`` feature columns into a score, add
    Gaussian noise (10% of the score's standard deviation), and label the top
    ``high_risk_pct`` percent of rows as high risk (1). When no usable risk
    feature exists, or the summed score has no variation, labels are drawn
    at random with probability ``high_risk_pct / 100`` of being 1.

    Randomness comes from a private generator seeded with 42, so results are
    reproducible and the global NumPy random state is left untouched.

    Args:
        df: DataFrame to label. It is modified in place:
            ``risk_label_synthetic`` is always added, and ``risk_score`` is
            added when the score-based path is used.
        high_risk_pct: Approximate percentage (0-100) of rows to label 1.

    Returns:
        The same ``df`` object.
    """
    print("\n" + "="*70)
    print("OPTION 4: CREATE SYNTHETIC TARGET (DEMONSTRATION)")
    print("="*70)

    print(f"\nCreating synthetic labels with ~{high_risk_pct}% high-risk cases")

    # Local generator: same stream as np.random.seed(42) on the legacy global
    # generator, without overwriting the caller's global random state.
    rng = np.random.RandomState(42)

    # Use risk features if available. Columns generated by the labeling
    # functions also start with 'risk_' (risk_label, risk_label_from_*,
    # risk_score); including them would leak other labels into this one.
    risk_features = [
        col for col in df.columns
        if isinstance(col, str)
        and col.startswith('risk_')
        and 'label' not in col.lower()
        and col != 'risk_score'
        and pd.api.types.is_numeric_dtype(df[col])
    ]

    score = None
    if risk_features:
        base_score = df[risk_features].sum(axis=1)
        spread = base_score.std()
        # A constant score (or a single row, where std is NaN) cannot rank
        # rows: thresholding it would label every row the same way.
        if np.isfinite(spread) and spread > 0:
            # Add some randomness
            noise = rng.normal(0, spread * 0.1, size=len(df))
            score = base_score + noise

    if score is not None:
        df['risk_score'] = score

        # Assign labels based on score percentile
        threshold = score.quantile(1 - high_risk_pct/100)
        df['risk_label_synthetic'] = (score >= threshold).astype(int)
    else:
        # Pure random if there is no informative feature
        df['risk_label_synthetic'] = rng.choice(
            [0, 1],
            size=len(df),
            p=[1 - high_risk_pct/100, high_risk_pct/100]
        )

    print(f"\nSynthetic risk label distribution:")
    print(df['risk_label_synthetic'].value_counts())

    return df


def visualize_target_options(df, output_dir='figures'):
    """Visualize different target variable options.

    Two kinds of figures are written to ``output_dir`` as PNG files:

    * ``risk_features_by_<label>.png`` - for every fully populated label
      column, box plots of up to six risk features grouped by that label.
    * ``label_distributions_comparison.png`` - side-by-side class-count bar
      charts, produced only when more than one label column exists.

    Args:
        df: DataFrame holding the label columns (any column whose name
            contains 'label', case-insensitively) and ``risk_*`` features.
        output_dir: Directory for the figures; created if it does not exist.

    Returns:
        None.
    """
    os.makedirs(output_dir, exist_ok=True)

    print("\n" + "="*70)
    print("CREATING VISUALIZATIONS")
    print("="*70)

    # Find all label columns (str() guards against non-string column labels)
    label_cols = [col for col in df.columns if 'label' in str(col).lower()]

    if not label_cols:
        print("No label columns to visualize")
        return

    # Risk features: 'risk_' prefixed columns that are not labels themselves
    risk_features = [
        col for col in df.columns
        if str(col).startswith('risk_') and 'label' not in str(col).lower()
    ]

    if risk_features:
        # Plot risk features by different label definitions
        for label_col in label_cols:
            # Skip labels with missing values (e.g. unmapped sentiments)
            if not df[label_col].notna().all():
                continue

            fig, axes = plt.subplots(2, 3, figsize=(15, 10))
            axes = axes.flatten()
            shown = risk_features[:len(axes)]

            for ax, feature in zip(axes, shown):
                df.boxplot(column=feature, by=label_col, ax=ax)
                ax.set_title(f'{feature}')
                ax.set_xlabel(f'{label_col}')

            # Hide panels that have no feature to show instead of leaving
            # empty axes in the saved figure.
            for ax in axes[len(shown):]:
                ax.set_visible(False)

            fig.suptitle(f'Risk Features by {label_col}', y=1.02)
            fig.tight_layout()
            filename = os.path.join(output_dir, f'risk_features_by_{label_col}.png')
            fig.savefig(filename, dpi=300, bbox_inches='tight')
            print(f"  Saved: {filename}")
            plt.close(fig)

    # Distribution comparison. With two or more columns plt.subplots returns
    # an array of axes, so no single-axes special case is needed here.
    if len(label_cols) > 1:
        fig, axes = plt.subplots(1, len(label_cols), figsize=(5*len(label_cols), 4))

        for ax, label_col in zip(axes, label_cols):
            counts = df[label_col].value_counts()
            ax.bar(counts.index, counts.values)
            ax.set_title(f'{label_col}')
            ax.set_xlabel('Class')
            ax.set_ylabel('Count')

        fig.tight_layout()
        filename = os.path.join(output_dir, 'label_distributions_comparison.png')
        fig.savefig(filename, dpi=300, bbox_inches='tight')
        print(f"  Saved: {filename}")
        plt.close(fig)


def main(data_path='data/processed/reports_with_features.csv',
         output_path='data/processed/reports_with_features_and_labels.csv'):
    """Main execution function.

    Loads the processed feature file, generates every available target
    variable option, writes the visualizations, prints a recommendation and
    saves the labeled dataset.

    Args:
        data_path: CSV file with the processed reports and features.
        output_path: Destination CSV for the dataset with label columns.
            Its parent directory is created if needed.

    Returns:
        The saved DataFrame, or None when ``data_path`` does not exist.
    """
    print("="*70)
    print("CREATE TARGET VARIABLE FOR RISK CLASSIFICATION")
    print("="*70)

    # Load your processed data
    if not os.path.exists(data_path):
        print(f"\nERROR: File not found: {data_path}")
        print("Please run script 02 first to create this file")
        return

    print(f"\nLoading data from: {data_path}")
    df = pd.read_csv(data_path)
    print(f"Loaded {len(df)} rows")

    # Explore existing columns (called for its printed report)
    explore_potential_targets(df)

    # Try different methods to create target
    print("\n" + "="*70)
    print("GENERATING TARGET VARIABLE OPTIONS")
    print("="*70)

    # Option 1: From risk features (RECOMMENDED)
    df = create_target_from_risk_features(df, threshold_percentile=90)

    # Option 2: From sentiment (if available)
    if 'sentiment' in df.columns:
        df = create_target_from_sentiment(df)

    # Option 3: From load factor (if available)
    if 'load_factor' in df.columns:
        df = create_target_from_load_factor(df, threshold=0.8)

    # Option 4: Synthetic (fallback)
    df = create_synthetic_target(df, high_risk_pct=2)

    # Visualize options
    visualize_target_options(df)

    # RECOMMENDATION
    print("\n" + "="*70)
    print("RECOMMENDATION")
    print("="*70)

    # Option 1 adds nothing when the data has no risk features, so only
    # recommend 'risk_label' when it actually exists.
    if 'risk_label' in df.columns:
        print("\nBased on your data, I recommend using: 'risk_label'")
        print("(Created from composite risk features)")
        print("\nThis approach:")
        print("  ✓ Uses your actual risk features")
        print("  ✓ Creates realistic separation between classes")
        print("  ✓ Maintains interpretability")
    else:
        print("\nNo risk features were found, so 'risk_label' was not created.")
        print("The only label always available is: 'risk_label_synthetic'")
        print("(Demonstration labels only - not derived from real risk signals)")

    # Drop intermediate helper columns ('*_norm', 'risk_score'). Every label
    # column is kept so the preferred one can be chosen or renamed later.
    columns_to_keep = [
        col for col in df.columns
        if not (str(col).endswith('_norm') or col == 'risk_score')
    ]
    df_final = df[columns_to_keep].copy()

    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df_final.to_csv(output_path, index=False)
    print(f"\n✓ Saved dataset with labels to: {output_path}")

    print("\n" + "="*70)
    print("NEXT STEPS")
    print("="*70)
    print("\n1. Review the generated labels")
    print("2. If satisfied, update script 03 to use this file:")
    print(f"   data_path = '{output_path}'")
    print("3. Or manually rename the column you want to use as 'risk_label'")

    return df_final


if __name__ == "__main__":
    # Run the pipeline only when this file is executed directly. The result
    # (the labeled DataFrame, or None if the input file is missing) is bound
    # to a module-level `df` - presumably so it stays available for
    # inspection in the interactive session.
    df = main()

CREATE TARGET VARIABLE FOR RISK CLASSIFICATION

Loading data from: data/processed/reports_with_features.csv
Loaded 3000 rows
EXPLORING POTENTIAL TARGET VARIABLES

Dataset has 3000 rows and 19 columns

All columns: ['id', 'timestamp', 'style', 'topic', 'sentiment', 'load_factor', 'agents', 'capacity', 'text', 'style_id', 'topic_id', 'sentiment_id', 'cleaned_text', 'risk_high_severity_count', 'risk_violation_count', 'risk_financial_count', 'risk_temporal_count', 'risk_density', 'text_length']

✓ Found potential target columns:
  - risk_high_severity_count
    Unique values: [0]
    Value counts:
risk_high_severity_count
0    3000
Name: count, dtype: int64

  - risk_violation_count
    Unique values: [0]
    Value counts:
risk_violation_count
0    3000
Name: count, dtype: int64

  - risk_financial_count
    Unique values: [0]
    Value counts:
risk_financial_count
0    3000
Name: count, dtype: int64

  - risk_temporal_count
    Unique values: [0]
    Value counts:
risk_temporal_count
0   