In [11]:
# Import necessary Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import re
import os
from pathlib import Path

# Location of the processed dataset, relative to the notebook's working directory.
DATA_PATH = Path("data/processed_dataset.csv")

# Echo the fully resolved location so it is obvious which file is being read.
print(f"\nLoading data from: {DATA_PATH.resolve()}")
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Global plot appearance: seaborn grid style plus a default figure size.
sns.set_style("whitegrid")
plt.rc('figure', figsize=(12, 6))


Loading data from: C:\Users\rfull\Building Data Together Weeklies\Autonomous Infrastructure Risk\data\processed_dataset.csv


In [17]:



class TextPreprocessor:
    """Handle text cleaning and lexical risk-feature extraction.

    All methods are static and keep no state, so they can be called on the
    class or on an instance (e.g. passed to ``Series.apply``).
    """

    # Risk vocabulary grouped by category. Each category produces one
    # ``risk_<category>_count`` feature, in this order.
    RISK_KEYWORDS = {
        'high_severity': ['critical', 'severe', 'urgent', 'emergency', 'dangerous'],
        'violation': ['violation', 'breach', 'non-compliant', 'unauthorized'],
        'financial': ['loss', 'deficit', 'overrun', 'penalty', 'fine'],
        'temporal': ['delayed', 'overdue', 'late', 'missed', 'behind']
    }

    # One compiled pattern per category.
    #   * ``\b`` anchors stop keywords matching inside unrelated words
    #     ('late' in 'related', 'fine' in 'defined', 'loss' in 'glossary').
    #   * A simple optional plural suffix keeps 'violations' / 'losses' counted.
    #   * Hyphens inside a keyword are optional, because clean_text() strips
    #     them ('non-compliant' becomes 'noncompliant' after cleaning).
    _RISK_PATTERNS = {
        category: re.compile(
            r'\b(?:'
            + '|'.join(
                '-?'.join(re.escape(part) for part in keyword.split('-'))
                for keyword in keywords
            )
            + r')(?:es|s)?\b'
        )
        for category, keywords in RISK_KEYWORDS.items()
    }

    @staticmethod
    def _to_text(text):
        """Return ``text`` as a ``str``; missing values become ``""``."""
        if isinstance(text, str):
            return text
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ""
        # Numbers and other objects are stringified instead of crashing on .lower()
        return str(text)

    @staticmethod
    def clean_text(text):
        """Clean and normalize report text.

        Lowercases the text, removes every character except ``a-z``, digits,
        whitespace and ``. , ! ?``, then collapses runs of whitespace.

        Args:
            text: Raw report text. ``None``/NaN yields ``""``; non-string
                values are converted with ``str()``.

        Returns:
            The cleaned string.
        """
        text = TextPreprocessor._to_text(text).lower()

        # Remove special characters but keep spaces and basic punctuation
        text = re.sub(r'[^a-z0-9\s\.\,\!\?]', '', text)

        # Remove extra whitespace
        return re.sub(r'\s+', ' ', text).strip()

    @staticmethod
    def extract_risk_lexicon_features(text):
        """Extract custom lexical risk markers from text.

        Keywords are matched case-insensitively as whole words (with an
        optional plural ``s``/``es``), not as raw substrings.

        Args:
            text: Raw or cleaned report text. ``None``/NaN is treated as ``""``.

        Returns:
            dict with one ``risk_<category>_count`` entry per category in
            ``RISK_KEYWORDS`` plus ``risk_density``: total keyword hits divided
            by the whitespace-separated word count (denominator is at least 1).
        """
        text = TextPreprocessor._to_text(text)
        text_lower = text.lower()

        features = {
            f'risk_{category}_count': len(pattern.findall(text_lower))
            for category, pattern in TextPreprocessor._RISK_PATTERNS.items()
        }

        # Total risk keyword density; max(..., 1) avoids dividing by zero on empty text
        total_words = len(text.split())
        total_risk_words = sum(features.values())
        features['risk_density'] = total_risk_words / max(total_words, 1)

        return features


class LanguageAnalyzer:
    """Perform comprehensive language analysis on report corpus.

    Typical use is a chained pipeline::

        analyzer = LanguageAnalyzer(df)
        analyzer.preprocess_reports().extract_tfidf_features()
        analyzer.analyze_language_by_risk()
        analyzer.visualize_language_patterns()

    The analyzer works on a private copy of the dataframe passed in; the
    caller's frame is never modified.
    """

    def __init__(self, df, random_state=42):
        """Initialize analyzer with dataframe.

        Args:
            df: Report dataframe; it is copied.
            random_state: Seed used only when placeholder risk labels have to
                be generated because no label column exists.
        """
        self.df = df.copy()
        self.tfidf_vectorizer = None
        self.tfidf_matrix = None
        self.random_state = random_state
        # Names of the lexical feature columns added by preprocess_reports()
        self.risk_feature_columns = []
        print(f"Initialized with {len(self.df)} reports")
        print(f"Columns: {list(self.df.columns)}")

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _is_lexical_feature(self, col):
        """Return True if ``col`` is a derived lexical feature, not a label."""
        if col in getattr(self, 'risk_feature_columns', ()):
            return True
        # Name-based fallback for frames reloaded from a previously saved CSV
        name = str(col)
        return name.startswith('risk_') and (
            name.endswith('_count') or name == 'risk_density'
        )

    def _resolve_risk_column(self, risk_column, verbose=True):
        """Return the name of the label column to group by.

        If ``risk_column`` is missing, fall back to the first column whose
        name contains 'risk' or 'label' and that is not one of the derived
        lexical features. If none exists, create seeded placeholder labels
        under ``risk_column``.
        """
        if risk_column in self.df.columns:
            return risk_column

        if verbose:
            print(f"Warning: {risk_column} not found.")
            print(f"Available columns: {list(self.df.columns)}")

        # Derived risk_* feature columns must not be mistaken for the label
        label_cols = [
            col for col in self.df.columns
            if ('risk' in str(col).lower() or 'label' in str(col).lower())
            and not self._is_lexical_feature(col)
        ]
        if label_cols:
            risk_column = label_cols[0]
            if verbose:
                print(f"Using column: '{risk_column}' instead")
        else:
            if verbose:
                print("Creating dummy labels for demonstration...")
            # Seeded so that re-running the notebook gives the same placeholders
            rng = np.random.default_rng(getattr(self, 'random_state', None))
            self.df[risk_column] = rng.choice(
                [0, 1], size=len(self.df), p=[0.98, 0.02]
            )
        return risk_column

    def _risk_feature_columns(self, risk_column):
        """List numeric ``risk_*`` columns, excluding the label column itself."""
        return [
            col for col in self.df.columns
            if str(col).startswith('risk_')
            and col != risk_column
            and pd.api.types.is_numeric_dtype(self.df[col])
        ]

    # ------------------------------------------------------------------
    # Pipeline steps
    # ------------------------------------------------------------------
    def preprocess_reports(self, text_column='report_text'):
        """Clean and normalize all report texts.

        Adds a ``cleaned_text`` column and one column per lexical risk
        feature. Safe to call more than once: existing feature columns are
        overwritten rather than duplicated.

        Args:
            text_column: Column holding the raw text. If absent, the first
                column whose name contains 'text' or 'report' is used.

        Returns:
            self, for chaining. If no text column can be found the frame is
            left unchanged.
        """
        print("\nPreprocessing report texts...")
        preprocessor = TextPreprocessor()

        # Check if text column exists
        if text_column not in self.df.columns:
            print(f"Warning: '{text_column}' column not found.")
            print(f"Available columns: {list(self.df.columns)}")
            # Try to find a text-like column (never our own derived column)
            text_cols = [
                col for col in self.df.columns
                if col != 'cleaned_text'
                and ('text' in str(col).lower() or 'report' in str(col).lower())
            ]
            if text_cols:
                text_column = text_cols[0]
                print(f"Using column: '{text_column}' instead")
            else:
                print("Error: Cannot find text column")
                return self

        # Clean text
        self.df['cleaned_text'] = self.df[text_column].apply(
            preprocessor.clean_text
        )

        # Extract lexical risk features
        print("Extracting lexical risk features...")
        risk_features = self.df['cleaned_text'].apply(
            preprocessor.extract_risk_lexicon_features
        )
        risk_df = pd.DataFrame(risk_features.tolist())

        # Assign positionally: risk_df has a fresh RangeIndex, so an
        # index-aligned concat would misalign rows when self.df has a
        # non-default index, and would duplicate columns on a second call.
        for col in risk_df.columns:
            self.df[col] = risk_df[col].to_numpy()
        self.risk_feature_columns = list(risk_df.columns)

        print(f"Added {len(risk_df.columns)} risk feature columns")
        return self

    def extract_tfidf_features(self, max_features=100):
        """Extract TF-IDF features from cleaned text.

        Args:
            max_features: Maximum vocabulary size kept by the vectorizer.

        Returns:
            self, for chaining. The fitted vectorizer and sparse matrix are
            stored on ``tfidf_vectorizer`` / ``tfidf_matrix``.

        Raises:
            KeyError: If ``cleaned_text`` is missing (preprocess_reports()
                has not run successfully).
        """
        print(f"\nExtracting TF-IDF features (top {max_features})...")

        if 'cleaned_text' not in self.df.columns:
            raise KeyError(
                "'cleaned_text' column not found; run preprocess_reports() first"
            )

        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=max_features,
            min_df=2,
            max_df=0.8,
            stop_words='english'
        )

        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(
            self.df['cleaned_text']
        )

        print(f"TF-IDF matrix shape: {self.tfidf_matrix.shape}")

        # Get feature names
        feature_names = self.tfidf_vectorizer.get_feature_names_out()
        print(f"Sample features: {list(feature_names[:10])}")

        return self

    def analyze_language_by_risk(self, risk_column='risk_label'):
        """Analyze language patterns grouped by risk level.

        Args:
            risk_column: Label column to group by; see
                ``_resolve_risk_column`` for the fallback rules.

        Returns:
            DataFrame of mean lexical feature values per label value, or
            ``None`` when no ``risk_*`` feature columns exist.
        """
        print(f"\nAnalyzing language patterns by {risk_column}...")

        risk_column = self._resolve_risk_column(risk_column)

        # Risk distribution
        risk_dist = self.df[risk_column].value_counts()
        print(f"\nRisk label distribution:")
        print(risk_dist)

        # Compare lexical features by risk group (label column excluded)
        risk_features = self._risk_feature_columns(risk_column)

        if risk_features:
            print(f"\nMean risk features by group:")
            feature_comparison = self.df.groupby(risk_column)[risk_features].mean()
            print(feature_comparison)
        else:
            print("No risk features found to compare")
            feature_comparison = None

        return feature_comparison

    def visualize_language_patterns(self, risk_column='risk_label', output_dir='figures'):
        """Create visualizations of language patterns.

        Writes up to three PNG files into ``output_dir``: box plots of the
        first four lexical features by class, a text-length histogram by
        class, and (if TF-IDF has been fitted) the top TF-IDF terms. Also adds
        a ``text_length`` column to the analyzer's dataframe.

        Args:
            risk_column: Label column; same fallback rules as
                ``analyze_language_by_risk``.
            output_dir: Directory for the figures; created if needed.

        Returns:
            self, for chaining.

        Raises:
            KeyError: If ``cleaned_text`` is missing.
        """
        os.makedirs(output_dir, exist_ok=True)

        print(f"\nGenerating visualizations...")

        if 'cleaned_text' not in self.df.columns:
            raise KeyError(
                "'cleaned_text' column not found; run preprocess_reports() first"
            )

        # Make sure risk column exists
        risk_column = self._resolve_risk_column(risk_column, verbose=False)

        # 1. Risk feature distributions by class (first four features only)
        risk_features = self._risk_feature_columns(risk_column)

        if risk_features:
            fig, axes = plt.subplots(2, 2, figsize=(14, 10))
            axes = axes.flatten()
            plotted = risk_features[:len(axes)]

            for ax, feature in zip(axes, plotted):
                self.df.boxplot(column=feature, by=risk_column, ax=ax)
                ax.set_title(f'{feature} by Risk Level')
                ax.set_xlabel('Risk Label')
                ax.set_ylabel(feature)

            # Hide panels that have nothing to show
            for ax in axes[len(plotted):]:
                ax.set_visible(False)

            # Set after the loop: DataFrame.boxplot(by=...) writes its own suptitle
            fig.suptitle('Lexical Risk Features by Risk Class', y=1.02)
            fig.tight_layout()
            fig.savefig(f'{output_dir}/risk_features_by_class.png', dpi=300, bbox_inches='tight')
            print(f"Saved: {output_dir}/risk_features_by_class.png")
            plt.close(fig)

        # 2. Text length distribution
        self.df['text_length'] = self.df['cleaned_text'].str.split().str.len()

        # Shared bin edges so the per-class histograms are directly comparable
        bin_edges = np.histogram_bin_edges(
            self.df['text_length'].dropna().to_numpy(), bins=30
        )

        fig, ax = plt.subplots(figsize=(10, 6))
        for label in self.df[risk_column].dropna().unique():
            subset = self.df.loc[self.df[risk_column] == label, 'text_length']
            ax.hist(subset, alpha=0.6, bins=bin_edges, label=f'Risk={label}')

        ax.set_xlabel('Text Length (words)')
        ax.set_ylabel('Frequency')
        ax.set_title('Report Length Distribution by Risk Class')
        ax.legend()
        fig.savefig(f'{output_dir}/text_length_distribution.png', dpi=300, bbox_inches='tight')
        print(f"Saved: {output_dir}/text_length_distribution.png")
        plt.close(fig)

        # 3. Top TF-IDF terms
        if self.tfidf_matrix is not None:
            feature_names = self.tfidf_vectorizer.get_feature_names_out()
            mean_tfidf = np.asarray(self.tfidf_matrix.mean(axis=0)).flatten()
            top_indices = mean_tfidf.argsort()[-20:][::-1]
            # The vocabulary can hold fewer than 20 terms on small corpora
            n_top = len(top_indices)

            fig, ax = plt.subplots(figsize=(10, 8))
            ax.barh(range(n_top), mean_tfidf[top_indices])
            ax.set_yticks(range(n_top))
            ax.set_yticklabels([feature_names[i] for i in top_indices])
            ax.set_xlabel('Mean TF-IDF Score')
            ax.set_title(f'Top {n_top} TF-IDF Terms Across Corpus')
            fig.tight_layout()
            fig.savefig(f'{output_dir}/top_tfidf_terms.png', dpi=300, bbox_inches='tight')
            print(f"Saved: {output_dir}/top_tfidf_terms.png")
            plt.close(fig)

        return self

    def get_processed_dataframe(self):
        """Return the processed dataframe with all features."""
        return self.df


def main(data_path='data/processed_dataset.csv',
         output_path='data/processed/reports_with_features.csv',
         figures_dir='figures'):
    """Execute full language analysis pipeline.

    Loads the dataset, runs preprocessing, TF-IDF extraction, the per-risk
    comparison and the visualizations, then writes the enriched dataframe
    to CSV.

    Args:
        data_path: CSV file to read the reports from.
        output_path: CSV file the dataframe with added features is written
            to; its parent directory is created if needed.
        figures_dir: Directory the figures are saved into.

    Returns:
        The processed dataframe including all feature columns.
    """
    print("=" * 70)
    print("REPORT LANGUAGE ANALYSIS PIPELINE")
    print("=" * 70)

    print(f"\nLoading data from: {data_path}")
    df = pd.read_csv(data_path)
    print(f"Loaded {len(df)} rows")

    # Initialize analyzer with the loaded dataframe
    analyzer = LanguageAnalyzer(df)

    # Run analysis pipeline
    analyzer.preprocess_reports()
    analyzer.extract_tfidf_features(max_features=100)

    # Analyze by risk group (the comparison table is printed by the analyzer)
    analyzer.analyze_language_by_risk()

    # Generate visualizations
    analyzer.visualize_language_patterns(output_dir=figures_dir)

    # Get processed dataframe
    processed_df = analyzer.get_processed_dataframe()

    # Save results; derive the directory from output_path so the two cannot
    # drift apart. dirname is '' for a bare filename, which makedirs rejects.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    processed_df.to_csv(output_path, index=False)
    print(f"\nSaved processed data with features to: {output_path}")

    print("\n" + "=" * 70)
    print("ANALYSIS COMPLETE")
    print("=" * 70)
    print("\nKey Outputs:")
    print(f"  - Processed features: {output_path}")
    print(f"  - Visualizations: {figures_dir}/")
    print("\nNext Step: Run 03_risk_inference_model.py")

    return processed_df


# Run the pipeline when this cell/script is executed directly (not on import).
# The result is kept in a variable so later notebook cells can inspect it
# instead of the returned dataframe being discarded.
if __name__ == "__main__":
    processed_df = main()

REPORT LANGUAGE ANALYSIS PIPELINE

Loading data from: data/processed_dataset.csv
Loaded 3000 rows
Initialized with 3000 reports
Columns: ['id', 'timestamp', 'style', 'topic', 'sentiment', 'load_factor', 'agents', 'capacity', 'text', 'style_id', 'topic_id', 'sentiment_id']

Preprocessing report texts...
Available columns: ['id', 'timestamp', 'style', 'topic', 'sentiment', 'load_factor', 'agents', 'capacity', 'text', 'style_id', 'topic_id', 'sentiment_id']
Using column: 'text' instead
Extracting lexical risk features...
Added 5 risk feature columns

Extracting TF-IDF features (top 100)...
TF-IDF matrix shape: (3000, 100)
Sample features: ['09', '11', '12', '13', '14', '15', '16', '17', '18', '19']

Analyzing language patterns by risk_label...
Available columns: ['id', 'timestamp', 'style', 'topic', 'sentiment', 'load_factor', 'agents', 'capacity', 'text', 'style_id', 'topic_id', 'sentiment_id', 'cleaned_text', 'risk_high_severity_count', 'risk_violation_count', 'risk_financial_count', 'r