In [1]:
"""
Risk Inference Model Script
============================
Builds transparent NLP classification pipeline, trains baseline models,
and evaluates performance with precision, recall, and F1 scores.

Part of: Policy Risk Inference from Simulated Reports
Author: William V. Fullerton
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, 
    precision_recall_fscore_support, roc_auc_score, roc_curve
)

from pathlib import Path

DATA_PATH = Path("data/processed/reports_with_features_and_labels.csv")

print(f"\nLoading data from: {DATA_PATH.resolve()}")
df = pd.read_csv(DATA_PATH)


# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)


class RiskInferenceModel:
    """Risk classification model for policy reports."""
    
    def __init__(self, df):
        """Initialize model with dataframe."""
        self.df = df.copy()
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.models = {}
        self.results = {}
        print(f"Initialized with {len(self.df)} reports")
        
    def prepare_features(self, target_col='risk_label', test_size=0.2, random_state=42):
        """Prepare feature matrix and target vector."""
        print("\nPreparing features for modeling...")
        
        # Select feature columns (risk lexical features)
        risk_feature_cols = [col for col in self.df.columns if col.startswith('risk_')]
        
        if not risk_feature_cols:
            print("Warning: No risk features found. Creating dummy features.")
            # Create some dummy features for demonstration
            self.df['risk_count'] = np.random.poisson(2, size=len(self.df))
            self.df['risk_density'] = np.random.uniform(0, 0.1, size=len(self.df))
            risk_feature_cols = ['risk_count', 'risk_density']
        
        print(f"Using {len(risk_feature_cols)} risk features:")
        print(f"  {risk_feature_cols}")
        
        # Prepare feature matrix
        X = self.df[risk_feature_cols].values
        
        # Prepare target
        if target_col not in self.df.columns:
            print(f"Warning: {target_col} not found.")
            print(f"Available columns: {list(self.df.columns)}")
            # Try to find a label column
            label_cols = [col for col in self.df.columns if 'label' in col.lower() or 'target' in col.lower()]
            if label_cols:
                target_col = label_cols[0]
                print(f"Using column: '{target_col}' instead")
                y = self.df[target_col].values
            else:
                print("Creating imbalanced dummy labels (98% low-risk, 2% high-risk)")
                y = np.random.choice([0, 1], size=len(self.df), p=[0.98, 0.02])
        else:
            y = self.df[target_col].values
        
        print(f"\nTarget distribution:")
        unique, counts = np.unique(y, return_counts=True)
        for label, count in zip(unique, counts):
            print(f"  Class {label}: {count} ({100*count/len(y):.1f}%)")
        
        # Train-test split
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )
        
        print(f"\nTrain set: {len(self.X_train)} samples")
        print(f"Test set: {len(self.X_test)} samples")
        
        return self
    
    def train_logistic_regression(self, C=1.0, class_weight='balanced'):
        """Train baseline logistic regression model."""
        print("\n" + "="*60)
        print("Training Logistic Regression (Baseline)")
        print("="*60)
        
        # Initialize model
        model = LogisticRegression(
            C=C,
            class_weight=class_weight,
            random_state=42,
            max_iter=1000
        )
        
        # Train
        model.fit(self.X_train, self.y_train)
        
        # Predict
        y_pred_train = model.predict(self.X_train)
        y_pred_test = model.predict(self.X_test)
        y_proba_test = model.predict_proba(self.X_test)[:, 1]
        
        # Store model and results
        self.models['logistic_regression'] = model
        self.results['logistic_regression'] = {
            'y_pred_train': y_pred_train,
            'y_pred_test': y_pred_test,
            'y_proba_test': y_proba_test,
            'model': model
        }
        
        print("Model trained successfully.")
        return self
    
    def train_random_forest(self, n_estimators=100, class_weight='balanced'):
        """Train random forest classifier."""
        print("\n" + "="*60)
        print("Training Random Forest Classifier")
        print("="*60)
        
        # Initialize model
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            class_weight=class_weight,
            random_state=42,
            max_depth=10
        )
        
        # Train
        model.fit(self.X_train, self.y_train)
        
        # Predict
        y_pred_train = model.predict(self.X_train)
        y_pred_test = model.predict(self.X_test)
        y_proba_test = model.predict_proba(self.X_test)[:, 1]
        
        # Store model and results
        self.models['random_forest'] = model
        self.results['random_forest'] = {
            'y_pred_train': y_pred_train,
            'y_pred_test': y_pred_test,
            'y_proba_test': y_proba_test,
            'model': model
        }
        
        print("Model trained successfully.")
        return self
    
    def evaluate_model(self, model_name='logistic_regression'):
        """Comprehensive model evaluation."""
        print("\n" + "="*60)
        print(f"EVALUATION: {model_name.upper()}")
        print("="*60)
        
        results = self.results[model_name]
        
        # Classification report
        print("\nTest Set Classification Report:")
        print(classification_report(
            self.y_test, 
            results['y_pred_test'],
            target_names=['Low Risk (0)', 'High Risk (1)']
        ))
        
        # Confusion matrix
        cm = confusion_matrix(self.y_test, results['y_pred_test'])
        print("\nConfusion Matrix:")
        print(cm)
        
        # Calculate metrics
        precision, recall, f1, support = precision_recall_fscore_support(
            self.y_test, results['y_pred_test'], average=None
        )
        
        # Overall accuracy
        accuracy = np.mean(results['y_pred_test'] == self.y_test)
        
        print(f"\nDetailed Metrics:")
        print(f"  Overall Accuracy: {accuracy:.3f}")
        print(f"\n  Class 0 (Low Risk):")
        print(f"    Precision: {precision[0]:.3f}")
        print(f"    Recall: {recall[0]:.3f}")
        print(f"    F1-Score: {f1[0]:.3f}")
        
        if len(precision) > 1:
            print(f"\n  Class 1 (High Risk):")
            print(f"    Precision: {precision[1]:.3f}")
            print(f"    Recall: {recall[1]:.3f}")
            print(f"    F1-Score: {f1[1]:.3f}")
        
        # ROC-AUC if binary classification
        if len(np.unique(self.y_test)) == 2:
            auc = roc_auc_score(self.y_test, results['y_proba_test'])
            print(f"\n  ROC-AUC Score: {auc:.3f}")
        
        # Policy-relevant interpretation
        print("\n" + "-"*60)
        print("POLICY INTERPRETATION")
        print("-"*60)
        
        tn, fp, fn, tp = cm.ravel() if cm.size == 4 else (cm[0,0], 0, 0, 0)
        
        print(f"True Negatives (Correct low-risk): {tn}")
        print(f"False Positives (False alarms): {fp}")
        print(f"False Negatives (Missed risks): {fn}")
        print(f"True Positives (Caught risks): {tp}")
        
        if fn > 0:
            print(f"\n⚠ Note: {fn} high-risk cases were missed (false negatives)")
            print("  This is the critical policy challenge with imbalanced data.")
        
        return self
    
    def visualize_results(self, output_dir='figures'):
        """Create evaluation visualizations."""
        os.makedirs(output_dir, exist_ok=True)
        
        print("\nGenerating evaluation visualizations...")
        
        for model_name, results in self.results.items():
            # 1. Confusion Matrix
            cm = confusion_matrix(self.y_test, results['y_pred_test'])
            
            plt.figure(figsize=(8, 6))
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                       xticklabels=['Low Risk', 'High Risk'],
                       yticklabels=['Low Risk', 'High Risk'])
            plt.title(f'Confusion Matrix: {model_name.replace("_", " ").title()}')
            plt.ylabel('True Label')
            plt.xlabel('Predicted Label')
            plt.tight_layout()
            plt.savefig(f'{output_dir}/confusion_matrix_{model_name}.png', 
                       dpi=300, bbox_inches='tight')
            print(f"  Saved: {output_dir}/confusion_matrix_{model_name}.png")
            plt.close()
            
            # 2. ROC Curve
            if len(np.unique(self.y_test)) == 2:
                fpr, tpr, thresholds = roc_curve(self.y_test, results['y_proba_test'])
                auc = roc_auc_score(self.y_test, results['y_proba_test'])
                
                plt.figure(figsize=(8, 6))
                plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.3f})', linewidth=2)
                plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
                plt.xlabel('False Positive Rate')
                plt.ylabel('True Positive Rate (Recall)')
                plt.title(f'ROC Curve: {model_name.replace("_", " ").title()}')
                plt.legend()
                plt.grid(alpha=0.3)
                plt.tight_layout()
                plt.savefig(f'{output_dir}/roc_curve_{model_name}.png', 
                           dpi=300, bbox_inches='tight')
                print(f"  Saved: {output_dir}/roc_curve_{model_name}.png")
                plt.close()
            
            # 3. Prediction distribution
            plt.figure(figsize=(10, 6))
            plt.hist(results['y_proba_test'][self.y_test == 0], 
                    bins=30, alpha=0.6, label='True Low Risk', color='blue')
            plt.hist(results['y_proba_test'][self.y_test == 1], 
                    bins=30, alpha=0.6, label='True High Risk', color='red')
            plt.xlabel('Predicted Risk Probability')
            plt.ylabel('Frequency')
            plt.title(f'Prediction Distribution: {model_name.replace("_", " ").title()}')
            plt.legend()
            plt.tight_layout()
            plt.savefig(f'{output_dir}/prediction_distribution_{model_name}.png', 
                       dpi=300, bbox_inches='tight')
            print(f"  Saved: {output_dir}/prediction_distribution_{model_name}.png")
            plt.close()
        
        return self
    
    def save_models(self, output_dir='models'):
        """Save trained models to disk."""
        os.makedirs(output_dir, exist_ok=True)
        
        print("\nSaving trained models...")
        for model_name, model in self.models.items():
            filepath = f'{output_dir}/{model_name}.pkl'
            with open(filepath, 'wb') as f:
                pickle.dump(model, f)
            print(f"  Saved: {filepath}")
        
        return self


def main():
    """Execute full risk inference pipeline."""
    print("=" * 70)
    print("RISK INFERENCE MODEL PIPELINE")
    print("=" * 70)
    
    # LOAD YOUR DATA HERE
    # This should be the output from script 02 (with risk features)
    data_path = 'data/processed/reports_with_features.csv'
    
    print(f"\nLoading data from: {data_path}")
    df = pd.read_csv(data_path)
    print(f"Loaded {len(df)} rows")
    
    # Initialize model with dataframe
    risk_model = RiskInferenceModel(df)
    
    # Prepare features
    risk_model.prepare_features()
    
    # Train models
    risk_model.train_logistic_regression()
    risk_model.train_random_forest()
    
    # Evaluate models
    risk_model.evaluate_model('logistic_regression')
    risk_model.evaluate_model('random_forest')
    
    # Generate visualizations
    risk_model.visualize_results()
    
    # Save models
    risk_model.save_models()
    
    print("\n" + "=" * 70)
    print("MODELING COMPLETE")
    print("=" * 70)
    print("\nKey Findings:")
    print("  - High overall accuracy due to class imbalance")
    print("  - Low recall for minority risk class (expected)")
    print("  - Results demonstrate policy-relevant tradeoffs")
    print("\nKey Outputs:")
    print("  - Trained models: models/")
    print("  - Evaluation plots: figures/")
    print("\nNext Step: Run 04_policy_comparison.py")


if __name__ == "__main__":
    main()


Loading data from: C:\Users\rfull\Building Data Together Weeklies\Autonomous Infrastructure Risk\data\processed\reports_with_features_and_labels.csv
RISK INFERENCE MODEL PIPELINE

Loading data from: data/processed/reports_with_features.csv
Loaded 3000 rows
Initialized with 3000 reports

Preparing features for modeling...
Using 5 risk features:
  ['risk_high_severity_count', 'risk_violation_count', 'risk_financial_count', 'risk_temporal_count', 'risk_density']
Available columns: ['id', 'timestamp', 'style', 'topic', 'sentiment', 'load_factor', 'agents', 'capacity', 'text', 'style_id', 'topic_id', 'sentiment_id', 'cleaned_text', 'risk_high_severity_count', 'risk_violation_count', 'risk_financial_count', 'risk_temporal_count', 'risk_density', 'text_length']
Creating imbalanced dummy labels (98% low-risk, 2% high-risk)

Target distribution:
  Class 0: 2947 (98.2%)
  Class 1: 53 (1.8%)

Train set: 2400 samples
Test set: 600 samples

Training Logistic Regression (Baseline)
Model trained suc

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  Saved: figures/confusion_matrix_logistic_regression.png
  Saved: figures/roc_curve_logistic_regression.png
  Saved: figures/prediction_distribution_logistic_regression.png
  Saved: figures/confusion_matrix_random_forest.png
  Saved: figures/roc_curve_random_forest.png
  Saved: figures/prediction_distribution_random_forest.png

Saving trained models...
  Saved: models/logistic_regression.pkl
  Saved: models/random_forest.pkl

MODELING COMPLETE

Key Findings:
  - High overall accuracy due to class imbalance
  - Low recall for minority risk class (expected)
  - Results demonstrate policy-relevant tradeoffs

Key Outputs:
  - Trained models: models/
  - Evaluation plots: figures/

Next Step: Run 04_policy_comparison.py
