In [2]:
"""
Complete Model Fix - Enhanced Feature Engineering
==================================================
Creates meaningful features from text and builds effective risk models.

Part of: Policy Risk Inference from Simulated Reports
Author: William V. Fullerton
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from imblearn.over_sampling import SMOTE

# Global plotting defaults shared by every figure this script produces:
# seaborn's "whitegrid" theme and a 12x6 inch default figure size.
sns.set_style(style="whitegrid")
plt.rcParams.update({'figure.figsize': (12, 6)})


class EnhancedFeatureEngineer:
    """Create model features and a risk target from a reports dataframe.

    The engineer works on a private copy of the dataframe given to the
    constructor. Each ``create_*`` method adds columns to ``self.df`` and
    prints a progress summary to stdout.
    """

    # Ordinal encoding for the ``sentiment`` column; unmapped values become 0.
    _SENTIMENT_CODES = {'positive': 1, 'neutral': 0, 'negative': -1}

    def __init__(self, df):
        """Store a copy of *df* so the caller's dataframe is never mutated."""
        self.df = df.copy()
        print(f"Initialized with {len(self.df)} reports")

    def _attach_columns(self, new_columns):
        """Append *new_columns* to ``self.df``, replacing same-named columns.

        Replacing (instead of blindly concatenating) keeps column names
        unique when a method is called twice or when the input already
        contains a column with one of the generated names.
        """
        stale = self.df.columns.intersection(new_columns.columns)
        base = self.df.drop(columns=stale) if len(stale) else self.df
        self.df = pd.concat([base, new_columns], axis=1)

    def _assign_synthetic_labels(self):
        """Fill ``risk_label`` with reproducible random labels (85% / 15%).

        A private ``RandomState(42)`` yields the same draw as seeding the
        global generator with 42, without disturbing the global RNG state.
        """
        rng = np.random.RandomState(42)
        self.df['risk_label'] = rng.choice(
            [0, 1], size=len(self.df), p=[0.85, 0.15]
        )

    def analyze_text_content(self):
        """Pick the text column, print samples, and add length features.

        Adds ``text_length_words`` and ``text_length_chars`` to ``self.df``.
        Missing text is treated as an empty string (0 words, 0 chars), the
        same convention ``create_tfidf_features`` uses.

        Returns:
            The name of the text column used (``'text'`` preferred over
            ``'cleaned_text'``), or ``None`` when neither column exists.
        """
        print("\n" + "="*70)
        print("ANALYZING TEXT CONTENT")
        print("="*70)

        # Informational only: list every column whose name mentions "text".
        text_cols = [col for col in self.df.columns if 'text' in col.lower()]
        print(f"\nText columns found: {text_cols}")

        text_col = next(
            (name for name in ('text', 'cleaned_text') if name in self.df.columns),
            None,
        )
        if text_col is None:
            print("ERROR: No text column found!")
            return None

        print(f"Using column: '{text_col}'")

        texts = self.df[text_col].fillna('').astype(str)

        print("\nSample texts:")
        for number, sample in enumerate(texts.head(3), start=1):
            print(f"\n  Report {number}: {sample[:200]}...")

        self.df['text_length_words'] = texts.str.split().str.len()
        self.df['text_length_chars'] = texts.str.len()

        print(f"\nText length statistics:")
        print(f"  Mean words: {self.df['text_length_words'].mean():.1f}")
        print(f"  Mean chars: {self.df['text_length_chars'].mean():.1f}")
        print(f"  Min words: {self.df['text_length_words'].min()}")
        print(f"  Max words: {self.df['text_length_words'].max()}")

        return text_col

    def create_tfidf_features(self, text_col, max_features=50):
        """Add TF-IDF columns (unigrams and bigrams) built from *text_col*.

        Args:
            text_col: Name of the column holding the report text.
            max_features: Upper bound on the vocabulary size.

        Returns:
            The list of created column names, each prefixed with ``tfidf_``.
        """
        print("\n" + "="*70)
        print("CREATING TF-IDF FEATURES")
        print("="*70)

        print(f"\nExtracting top {max_features} TF-IDF features...")

        vectorizer = TfidfVectorizer(
            max_features=max_features,
            min_df=2,
            max_df=0.9,
            ngram_range=(1, 2),  # Include bigrams
            stop_words='english'
        )

        texts = self.df[text_col].fillna('').astype(str)
        tfidf_matrix = vectorizer.fit_transform(texts)

        feature_names = [f'tfidf_{name}' for name in vectorizer.get_feature_names_out()]
        tfidf_df = pd.DataFrame(
            tfidf_matrix.toarray(),
            columns=feature_names,
            index=self.df.index
        )

        self._attach_columns(tfidf_df)

        print(f"Created {len(feature_names)} TF-IDF features")
        print(f"Sample features: {feature_names[:5]}")

        return feature_names

    def create_metadata_features(self):
        """Derive numeric features from the metadata columns that exist.

        Handles ``sentiment``, ``load_factor``, ``agents``/``capacity``,
        ``topic`` and ``style``; absent columns are skipped silently.

        Returns:
            The list of feature column names to feed to a model.
        """
        print("\n" + "="*70)
        print("CREATING METADATA FEATURES")
        print("="*70)

        new_features = []

        if 'sentiment' in self.df.columns:
            # Plain assignment instead of chained ``fillna(inplace=True)``:
            # the chained form operates on a temporary and stops working
            # under pandas copy-on-write.
            self.df['sentiment_encoded'] = (
                self.df['sentiment'].map(self._SENTIMENT_CODES).fillna(0)
            )
            new_features.append('sentiment_encoded')
            print("  ✓ Created sentiment_encoded")

        if 'load_factor' in self.df.columns:
            median_load = self.df['load_factor'].median()
            self.df['load_factor_high'] = (self.df['load_factor'] > median_load).astype(int)
            new_features.append('load_factor')
            new_features.append('load_factor_high')
            print("  ✓ Using load_factor features")

        if 'agents' in self.df.columns and 'capacity' in self.df.columns:
            # +1 in the denominator avoids division by zero for capacity == 0.
            self.df['utilization'] = self.df['agents'] / (self.df['capacity'] + 1)
            self.df['is_overcapacity'] = (self.df['agents'] > self.df['capacity']).astype(int)
            new_features.extend(['agents', 'capacity', 'utilization', 'is_overcapacity'])
            print("  ✓ Created capacity utilization features")

        # One-hot encode the categorical columns.
        for column in ('topic', 'style'):
            if column not in self.df.columns:
                continue
            dummies = pd.get_dummies(self.df[column], prefix=column)
            self._attach_columns(dummies)
            new_features.extend(dummies.columns.tolist())
            print(f"  ✓ Created {len(dummies.columns)} {column} features")

        print(f"\nTotal metadata features: {len(new_features)}")
        return new_features

    def create_intelligent_target(self):
        """Build ``risk_score`` / ``risk_label`` from the available signals.

        A report is labelled high risk (1) when at least two of these hold:
        negative sentiment, load factor above its 85th percentile, agents
        exceeding capacity, word count above its 90th percentile. When no
        signal is available, or every report lands in the same class,
        reproducible random labels are generated instead.

        Returns:
            ``self.df`` with the target column(s) added.
        """
        print("\n" + "="*70)
        print("CREATING INTELLIGENT TARGET VARIABLE")
        print("="*70)

        risk_signals = []

        # Signal 1: Negative sentiment
        if 'sentiment' in self.df.columns:
            risk_signals.append(self.df['sentiment'] == 'negative')
            print("  ✓ Using negative sentiment as risk signal")

        # Signal 2: High load factor
        if 'load_factor' in self.df.columns:
            threshold = self.df['load_factor'].quantile(0.85)
            risk_signals.append(self.df['load_factor'] > threshold)
            print(f"  ✓ Using high load factor (>{threshold:.2f}) as risk signal")

        # Signal 3: Overcapacity
        if 'agents' in self.df.columns and 'capacity' in self.df.columns:
            risk_signals.append(self.df['agents'] > self.df['capacity'])
            print("  ✓ Using overcapacity as risk signal")

        # Signal 4: Long text (might indicate complex issues)
        if 'text_length_words' in self.df.columns:
            threshold = self.df['text_length_words'].quantile(0.90)
            risk_signals.append(self.df['text_length_words'] > threshold)
            print(f"  ✓ Using long reports (>{threshold:.0f} words) as risk signal")

        if not risk_signals:
            print("\n  ⚠ No risk signals available, creating synthetic labels")
            self._assign_synthetic_labels()
            return self.df

        # Summing boolean Series counts how many signals fire per report.
        risk_score = sum(risk_signals)
        self.df['risk_score'] = risk_score
        self.df['risk_label'] = (risk_score >= 2).astype(int)

        print(f"\nRisk score distribution:")
        print(self.df['risk_score'].value_counts().sort_index())

        print(f"\nRisk label distribution:")
        print(self.df['risk_label'].value_counts())
        total = len(self.df)
        high_risk_count = int(self.df['risk_label'].sum())
        pct_high_risk = 100 * high_risk_count / total if total else 0.0
        print(f"  High-risk: {pct_high_risk:.1f}%")

        # Integer comparison avoids relying on exact float equality.
        if high_risk_count in (0, total):
            print("\n  ⚠ WARNING: All samples are one class!")
            print("  Creating balanced synthetic labels...")
            self._assign_synthetic_labels()
            print(f"\n  New distribution:")
            print(self.df['risk_label'].value_counts())

        return self.df


class ImprovedRiskModel:
    """Train, compare, plot and persist binary risk classifiers.

    Typical call order: ``prepare_data`` -> ``train_all_models`` ->
    ``evaluate_all_models`` -> ``visualize_results`` / ``save_best_model``.
    The target is expected to be binary with labels 0 (low risk) and
    1 (high risk).
    """

    def __init__(self, df, feature_cols):
        """Keep a copy of *df* and remember which columns are features."""
        self.df = df.copy()
        self.feature_cols = feature_cols
        self.models = {}             # key -> {'model', 'scaler', 'description', ...}
        self.best_model_name = None  # set by evaluate_all_models()
        print(f"\nInitialized with {len(df)} samples and {len(feature_cols)} features")

    def prepare_data(self, target_col='risk_label', test_size=0.2):
        """Split into stratified train/test sets and standardize features.

        Missing feature values are replaced by 0. The scaler is fitted on
        the training split only.

        Args:
            target_col: Name of the label column in ``self.df``.
            test_size: Fraction of rows held out for testing.

        Returns:
            ``True`` on success; ``None`` when the target column is missing
            or contains a single class (callers test the result for
            truthiness).
        """
        print("\n" + "="*70)
        print("PREPARING DATA")
        print("="*70)

        if target_col not in self.df.columns:
            print(f"ERROR: Target column '{target_col}' not found")
            return None

        X = self.df[self.feature_cols].fillna(0)
        y = self.df[target_col].values

        unique, counts = np.unique(y, return_counts=True)
        print(f"\nTarget distribution:")
        for label, count in zip(unique, counts):
            print(f"  Class {label}: {count} ({100*count/len(y):.1f}%)")

        if len(unique) == 1:
            print("\n  ⚠ CRITICAL: Only one class present!")
            print("  Cannot train a classifier with one class.")
            return None

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        print(f"\nTrain set: {len(self.X_train)} samples")
        print(f"Test set:  {len(self.X_test)} samples")
        print(f"Features:  {len(self.feature_cols)}")

        self.scaler = StandardScaler()
        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)

        return True

    def _register(self, key, model, description):
        """Record a fitted *model* under *key* and report success."""
        self.models[key] = {
            'model': model,
            'scaler': self.scaler,
            'description': description
        }
        print("✓ Trained successfully")

    def train_all_models(self):
        """Fit four classifiers, each with its own class-imbalance remedy.

        Requires a successful ``prepare_data`` call. A failure of the SMOTE
        variant is reported and skipped; the other models are still trained.
        """
        print("\n" + "="*70)
        print("TRAINING MODELS")
        print("="*70)

        # Model 1: Logistic Regression with SMOTE
        print("\n1. Logistic Regression + SMOTE")
        print("-" * 40)
        try:
            # SMOTE interpolates between a minority sample and its nearest
            # minority neighbours, so k_neighbors must stay below the size
            # of the smallest class, whichever label that is.
            minority_count = int(np.unique(self.y_train, return_counts=True)[1].min())
            if minority_count < 2:
                raise ValueError(
                    "SMOTE needs at least 2 minority-class samples, "
                    f"got {minority_count}"
                )
            smote = SMOTE(random_state=42, k_neighbors=min(5, minority_count - 1))
            X_train_smote, y_train_smote = smote.fit_resample(self.X_train_scaled, self.y_train)

            lr_model = LogisticRegression(max_iter=1000, random_state=42)
            lr_model.fit(X_train_smote, y_train_smote)
            self._register('logistic_smote', lr_model, 'Logistic Regression + SMOTE')
        except Exception as e:
            # Deliberate best-effort: oversampling is optional.
            print(f"✗ Failed: {e}")

        # Model 2: Logistic Regression with Class Weights
        print("\n2. Logistic Regression + Class Weights")
        print("-" * 40)
        lr_weighted = LogisticRegression(
            class_weight='balanced',
            max_iter=1000,
            random_state=42
        )
        lr_weighted.fit(self.X_train_scaled, self.y_train)
        self._register('logistic_weighted', lr_weighted,
                       'Logistic Regression + Balanced Weights')

        # Model 3: Random Forest with Class Weights
        print("\n3. Random Forest + Class Weights")
        print("-" * 40)
        rf_model = RandomForestClassifier(
            n_estimators=100,
            class_weight='balanced',
            max_depth=10,
            random_state=42
        )
        rf_model.fit(self.X_train_scaled, self.y_train)
        self._register('random_forest', rf_model, 'Random Forest + Balanced Weights')

        # Model 4: Gradient Boosting
        print("\n4. Gradient Boosting")
        print("-" * 40)
        # GradientBoostingClassifier has no class_weight option, so positive
        # samples are up-weighted by the negative/positive count ratio.
        class_counts = np.bincount(self.y_train)
        sample_weights = np.ones(len(self.y_train))
        sample_weights[self.y_train == 1] = class_counts[0] / class_counts[1]

        gb_model = GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            random_state=42
        )
        gb_model.fit(self.X_train_scaled, self.y_train, sample_weight=sample_weights)
        self._register('gradient_boosting', gb_model, 'Gradient Boosting + Sample Weights')

        print(f"\n✓ Trained {len(self.models)} models successfully")

    def evaluate_all_models(self):
        """Score every trained model on the test split and pick the best.

        Stores ``y_pred`` / ``y_proba`` in each model's entry and sets
        ``self.best_model_name`` to the model with the highest F1 score
        (first one wins on ties).

        Returns:
            A dataframe with one row per model and the columns ``Model``,
            ``Accuracy``, ``Precision``, ``Recall``, ``F1``, ``AUC``,
            ``FP`` and ``FN``.

        Raises:
            ValueError: If no model has been trained yet.
        """
        if not self.models:
            raise ValueError("No trained models to evaluate; call train_all_models() first")

        print("\n" + "="*70)
        print("MODEL EVALUATION")
        print("="*70)

        results = []
        model_names = []  # kept parallel to `results` for the best-model lookup

        for model_name, model_dict in self.models.items():
            print(f"\n{model_dict['description']}")
            print("-" * 40)

            model = model_dict['model']

            y_pred = model.predict(self.X_test_scaled)
            y_proba = model.predict_proba(self.X_test_scaled)[:, 1]

            # labels=[0, 1] guarantees a 2x2 matrix even when the test split
            # or the predictions contain a single class.
            cm = confusion_matrix(self.y_test, y_pred, labels=[0, 1])
            print("\nConfusion Matrix:")
            print(cm)
            tn, fp, fn, tp = cm.ravel()

            accuracy = (tp + tn) / len(self.y_test)
            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

            try:
                auc = roc_auc_score(self.y_test, y_proba)
            except ValueError:
                # Undefined when y_test holds one class; report chance level.
                auc = 0.5

            print(f"\nMetrics:")
            print(f"  Accuracy:  {accuracy:.3f}")
            print(f"  Precision: {precision:.3f}")
            print(f"  Recall:    {recall:.3f}")
            print(f"  F1-Score:  {f1:.3f}")
            print(f"  ROC-AUC:   {auc:.3f}")

            results.append({
                'Model': model_dict['description'],
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall,
                'F1': f1,
                'AUC': auc,
                'FP': fp,
                'FN': fn
            })
            model_names.append(model_name)

            model_dict['y_pred'] = y_pred
            model_dict['y_proba'] = y_proba

        results_df = pd.DataFrame(results)
        print("\n" + "="*70)
        print("MODEL COMPARISON")
        print("="*70)
        print("\n" + results_df.to_string(index=False))

        # Select best model (by F1 score)
        best_pos = int(results_df['F1'].to_numpy().argmax())
        self.best_model_name = model_names[best_pos]
        print(f"\n✓ Best model: {self.models[self.best_model_name]['description']}")
        print(f"  F1-Score: {results_df.iloc[best_pos]['F1']:.3f}")

        return results_df

    def _best_model_entry(self):
        """Return the registry entry of the best model.

        Raises:
            KeyError: If ``evaluate_all_models`` has not selected one yet.
        """
        if self.best_model_name is None:
            raise KeyError("no best model selected; call evaluate_all_models() first")
        return self.models[self.best_model_name]

    def _plot_top_features(self, scores, xlabel, output_dir, top_n=20):
        """Save a horizontal bar chart of the *top_n* largest *scores*."""
        top = np.argsort(scores)[-top_n:]

        plt.figure(figsize=(10, 8))
        plt.barh(range(len(top)), scores[top])
        plt.yticks(range(len(top)), [self.feature_cols[i] for i in top])
        plt.xlabel(xlabel)
        plt.title('Top 20 Most Important Features')
        plt.tight_layout()
        plt.savefig(f'{output_dir}/feature_importance.png', dpi=300)
        print(f"  ✓ Saved feature importance")
        plt.close()

    def visualize_results(self, output_dir='figures'):
        """Save confusion-matrix, ROC and feature-importance plots.

        Plots describe the best model chosen by ``evaluate_all_models``.
        The feature-importance chart is produced only for models exposing
        ``feature_importances_`` or ``coef_``.

        Args:
            output_dir: Directory for the PNG files; created if missing.
        """
        os.makedirs(output_dir, exist_ok=True)

        print("\n" + "="*70)
        print("GENERATING VISUALIZATIONS")
        print("="*70)

        best_model_dict = self._best_model_entry()

        # 1. Confusion Matrix (forced to 2x2 so it matches the tick labels)
        cm = confusion_matrix(self.y_test, best_model_dict['y_pred'], labels=[0, 1])
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=['Low Risk', 'High Risk'],
                   yticklabels=['Low Risk', 'High Risk'])
        plt.title(f'Confusion Matrix: {best_model_dict["description"]}')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        plt.savefig(f'{output_dir}/best_model_confusion_matrix.png', dpi=300)
        print(f"  ✓ Saved confusion matrix")
        plt.close()

        # 2. ROC Curve
        fpr, tpr, _ = roc_curve(self.y_test, best_model_dict['y_proba'])
        auc = roc_auc_score(self.y_test, best_model_dict['y_proba'])

        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, linewidth=2, label=f'ROC (AUC = {auc:.3f})')
        plt.plot([0, 1], [0, 1], 'k--', label='Random')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve: {best_model_dict["description"]}')
        plt.legend()
        plt.grid(alpha=0.3)
        plt.tight_layout()
        plt.savefig(f'{output_dir}/best_model_roc_curve.png', dpi=300)
        print(f"  ✓ Saved ROC curve")
        plt.close()

        # 3. Feature Importance (if available)
        fitted = best_model_dict['model']
        if hasattr(fitted, 'feature_importances_'):
            self._plot_top_features(fitted.feature_importances_,
                                    'Feature Importance', output_dir)
        elif hasattr(fitted, 'coef_'):
            # Linear models: rank features by absolute coefficient.
            self._plot_top_features(np.abs(fitted.coef_[0]),
                                    '|Coefficient|', output_dir)

    def save_best_model(self, output_dir='models'):
        """Pickle the best model with its scaler and feature list.

        Writes ``best_risk_model.pkl`` into *output_dir* (created if
        missing). The pickle holds a dict with the keys ``model``,
        ``scaler``, ``feature_cols`` and ``description``.
        """
        os.makedirs(output_dir, exist_ok=True)

        best_model_dict = self._best_model_entry()

        model_package = {
            'model': best_model_dict['model'],
            'scaler': best_model_dict['scaler'],
            'feature_cols': self.feature_cols,
            'description': best_model_dict['description']
        }

        filepath = f'{output_dir}/best_risk_model.pkl'
        with open(filepath, 'wb') as f:
            pickle.dump(model_package, f)

        print(f"\n✓ Saved best model to: {filepath}")


def main():
    """Run the pipeline end to end: load, engineer, train, evaluate, save.

    Reads ``data/processed/reports_with_features_and_labels.csv``; when the
    file is absent an error is printed and the function returns early.
    Otherwise it writes the best model, the model comparison table, the
    enhanced dataset and the figures to their respective folders.

    NOTE(review): the feature list includes columns derived from the same
    signals that define ``risk_label`` (sentiment, load factor, capacity,
    text length), so the reported scores are likely optimistic.
    """
    rule = "=" * 70

    print(rule)
    print("COMPLETE MODEL FIX - ENHANCED PIPELINE")
    print(rule)

    # --- Load -----------------------------------------------------------
    data_path = 'data/processed/reports_with_features_and_labels.csv'
    if not os.path.exists(data_path):
        print(f"\nERROR: File not found: {data_path}")
        print("Please run scripts 00 and 02 first")
        return

    print(f"\nLoading data from: {data_path}")
    reports = pd.read_csv(data_path)

    # --- Feature engineering --------------------------------------------
    engineer = EnhancedFeatureEngineer(reports)
    text_col = engineer.analyze_text_content()
    # TF-IDF is only possible when a text column was found.
    tfidf_features = (
        engineer.create_tfidf_features(text_col, max_features=50)
        if text_col
        else []
    )
    metadata_features = engineer.create_metadata_features()
    df_enhanced = engineer.create_intelligent_target()

    length_features = [
        name
        for name in ('text_length_words', 'text_length_chars')
        if name in df_enhanced.columns
    ]
    all_features = tfidf_features + metadata_features + length_features

    print(f"\nTotal features created: {len(all_features)}")

    # --- Modelling ------------------------------------------------------
    model_builder = ImprovedRiskModel(df_enhanced, all_features)
    if not model_builder.prepare_data():
        print("\nERROR: Could not prepare data for modeling")
        return

    model_builder.train_all_models()
    results_df = model_builder.evaluate_all_models()
    model_builder.visualize_results()
    model_builder.save_best_model()

    # --- Persist results ------------------------------------------------
    os.makedirs('reports', exist_ok=True)
    results_df.to_csv('reports/final_model_comparison.csv', index=False)
    df_enhanced.to_csv('data/processed/reports_final_with_all_features.csv', index=False)

    print("\n" + rule)
    print("COMPLETE MODEL PIPELINE FINISHED")
    print(rule)
    for line in (
        "\nKey Outputs:",
        "  - Best model: models/best_risk_model.pkl",
        "  - Model comparison: reports/final_model_comparison.csv",
        "  - Enhanced dataset: data/processed/reports_final_with_all_features.csv",
        "  - Visualizations: figures/",
        "\n✓ Project complete and ready for presentation!",
    ):
        print(line)


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()

COMPLETE MODEL FIX - ENHANCED PIPELINE

Loading data from: data/processed/reports_with_features_and_labels.csv
Initialized with 3000 reports

ANALYZING TEXT CONTENT

Text columns found: ['text', 'cleaned_text', 'text_length']
Using column: 'text'

Sample texts:

  Report 1: an analysis was conducted to determine that infrastructure load may increase rapidly, impacting risk exposure. current load was 0.35 with 124 active agents against a capacity of 112. conditions remain...

  Report 2: an analysis was conducted to determine that traffic flow appears to decrease rapidly, impacting risk exposure. current load is 0.25 with 318 active agents against a capacity of 179....

  Report 3: this report outlines that traffic flow appears to stabilize slightly, impacting system performance. current load is 0.42 with 152 active agents against a capacity of 137. this raises concerns about fu...

Text length statistics:
  Mean words: 29.2
  Mean chars: 201.0
  Min words: 23
  Max words: 37

CREATING 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  self.df['sentiment_encoded'].fillna(0, inplace=True)


✓ Trained successfully

2. Logistic Regression + Class Weights
----------------------------------------
✓ Trained successfully

3. Random Forest + Class Weights
----------------------------------------
✓ Trained successfully

4. Gradient Boosting
----------------------------------------
✓ Trained successfully

✓ Trained 4 models successfully

MODEL EVALUATION

Logistic Regression + SMOTE
----------------------------------------

Confusion Matrix:
[[450  58]
 [ 11  81]]

Metrics:
  Accuracy:  0.885
  Precision: 0.583
  Recall:    0.880
  F1-Score:  0.701
  ROC-AUC:   0.964

Logistic Regression + Balanced Weights
----------------------------------------

Confusion Matrix:
[[440  68]
 [  9  83]]

Metrics:
  Accuracy:  0.872
  Precision: 0.550
  Recall:    0.902
  F1-Score:  0.683
  ROC-AUC:   0.963

Random Forest + Balanced Weights
----------------------------------------

Confusion Matrix:
[[508   0]
 [  6  86]]

Metrics:
  Accuracy:  0.990
  Precision: 1.000
  Recall:    0.935
  F1-Scor