# Human-in-the-Loop Data Analysis & ML Pipeline 🤖👥
## Hackathon Project - October 4th, 2025

**Theme:** Human-in-the-Loop (HITL)  
**Tracks:** Healthcare & Sustainability  
**Team:** TattleTale

---

## Project Overview
This notebook demonstrates a comprehensive Human-in-the-Loop approach to data analysis and machine learning, where:
- **AI provides insights and predictions**
- **Humans validate, correct, and guide the process**
- **Continuous feedback improves model performance**

### HITL Applications:
- **Healthcare:** Medical diagnosis assistance with doctor validation
- **Sustainability:** Environmental monitoring with expert review

---

## 1. Import Required Libraries
Let's start by importing all the essential libraries for our HITL data analysis pipeline.

In [None]:
# Core data manipulation and analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error, r2_score, roc_curve, auc
import warnings
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output

# For HITL interactivity
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

# Set style and suppress warnings
# Global, notebook-wide side effects: every later plot picks up this
# matplotlib style and seaborn palette.
plt.style.use('seaborn-v0_8')
# NOTE(review): this silences *all* warnings for the rest of the session,
# including deprecation and convergence warnings from pandas/scikit-learn.
warnings.filterwarnings('ignore')
sns.set_palette("husl")

# Reaching these lines means every import in this cell succeeded.
print("✅ All libraries imported successfully!")
print("🎯 Ready for Human-in-the-Loop Data Analysis!")

## 2. Load and Explore Dataset
**HITL Approach:** Let's load sample data and create an interactive exploration interface where humans can guide the analysis process.

In [None]:
# Create sample datasets for both Healthcare and Sustainability tracks
# Seed NumPy's global random state: the dataset builders below draw every
# column from np.random.*, so their output depends on this seed and on how
# many random draws have happened since it was set.
np.random.seed(42)

def create_healthcare_dataset(n_samples=1000):
    """Build a synthetic patient table with a binary heart-disease-risk label.

    All columns are drawn from NumPy's global random state (seed it first for
    reproducible output). The label ``heart_disease_risk`` is 1 for rows whose
    noisy linear risk score lies strictly above the 70th percentile of all
    scores, and 0 otherwise.

    Args:
        n_samples: Number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with nine feature columns plus the label.
    """
    rng = np.random  # global generator; the draw order below is significant

    columns = {}
    columns['age'] = rng.normal(50, 15, n_samples).astype(int)
    columns['bmi'] = rng.normal(25, 5, n_samples)
    columns['blood_pressure_systolic'] = rng.normal(120, 20, n_samples)
    columns['blood_pressure_diastolic'] = rng.normal(80, 10, n_samples)
    columns['cholesterol'] = rng.normal(200, 40, n_samples)
    columns['glucose_level'] = rng.normal(100, 20, n_samples)
    columns['family_history'] = rng.choice([0, 1], n_samples, p=[0.7, 0.3])
    columns['smoking'] = rng.choice([0, 1], n_samples, p=[0.8, 0.2])
    columns['exercise_hours_week'] = rng.exponential(3, n_samples)

    # Individual contributions to the latent risk score.
    age_term = (columns['age'] - 30) * 0.1
    bmi_term = (columns['bmi'] - 25) * 0.3
    pressure_term = (columns['blood_pressure_systolic'] - 120) * 0.05
    cholesterol_term = (columns['cholesterol'] - 200) * 0.02
    history_term = columns['family_history'] * 2
    smoking_term = columns['smoking'] * 1.5
    noise_term = rng.normal(0, 1, n_samples)

    score = (
        age_term + bmi_term + pressure_term + cholesterol_term
        + history_term + smoking_term + noise_term
    )

    # Rows strictly above the 70th percentile are labelled as at risk.
    cutoff = np.percentile(score, 70)
    columns['heart_disease_risk'] = (score > cutoff).astype(int)
    return pd.DataFrame(columns)

def create_sustainability_dataset(n_samples=1000):
    """Build a synthetic building table with an energy-consumption target.

    All columns are drawn from NumPy's global random state (seed it first for
    reproducible output). ``energy_consumption`` is a noisy additive function
    of the features, floored at 10.

    Args:
        n_samples: Number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with nine feature columns plus the target.
    """
    rng = np.random  # global generator; the draw order below is significant

    features = {}
    features['temperature'] = rng.normal(20, 10, n_samples)
    features['humidity'] = rng.normal(60, 15, n_samples)
    features['wind_speed'] = rng.exponential(5, n_samples)
    features['solar_radiation'] = rng.gamma(2, 2, n_samples) * 100
    features['building_age'] = rng.uniform(1, 50, n_samples)
    features['occupancy'] = rng.poisson(10, n_samples)
    features['building_type'] = rng.choice(
        ['residential', 'commercial', 'industrial'], n_samples
    )
    features['insulation_rating'] = rng.uniform(1, 10, n_samples)
    features['renewable_energy'] = rng.choice([0, 1], n_samples, p=[0.6, 0.4])

    temperature = features['temperature']
    kind = features['building_type']

    # Additive contributions, listed in the order they are summed.
    contributions = [
        50 + features['building_age'] * 2,
        features['occupancy'] * 10,
        (temperature < 18) * 20,          # heating load
        (temperature > 25) * 30,          # cooling load
        (kind == 'commercial') * 50,
        (kind == 'industrial') * 100,
        (10 - features['insulation_rating']) * 5,
        features['renewable_energy'] * -30,
        rng.normal(0, 20, n_samples),     # measurement noise, drawn last
    ]
    consumption = contributions[0]
    for part in contributions[1:]:
        consumption = consumption + part

    features['energy_consumption'] = np.maximum(consumption, 10)  # Minimum consumption
    return pd.DataFrame(features)

# Interactive dataset selection
@interact
def select_dataset(dataset_type=['Healthcare', 'Sustainability']):
    """Load the chosen demo dataset into the notebook-level globals.

    The list default is handed to ipywidgets' ``interact``, which presumably
    renders it as a selection widget (see the ipywidgets documentation).
    Any value other than ``'Healthcare'`` selects the sustainability dataset.

    Side effects: rebinds the globals ``df``, ``target_column`` and
    ``problem_type``, prints a summary and displays the first rows.

    Returns:
        The freshly generated DataFrame (also stored in ``df``).
    """
    global df, target_column, problem_type

    if dataset_type == 'Healthcare':
        builder = create_healthcare_dataset
        chosen_target = 'heart_disease_risk'
        chosen_problem = 'classification'
        banner = (
            "🏥 Healthcare Dataset Loaded - Heart Disease Risk Prediction",
            "📊 Classification Problem",
        )
    else:
        builder = create_sustainability_dataset
        chosen_target = 'energy_consumption'
        chosen_problem = 'regression'
        banner = (
            "🌱 Sustainability Dataset Loaded - Energy Consumption Prediction",
            "📊 Regression Problem",
        )

    # Build the data first so a failure leaves the other globals untouched.
    df = builder()
    target_column = chosen_target
    problem_type = chosen_problem
    for line in banner:
        print(line)

    print(f"\n📋 Dataset Shape: {df.shape}")
    print(f"🎯 Target Variable: {target_column}")
    print("\n" + "="*50)
    display(df.head())

    return df

## 3. Data Cleaning and Preprocessing
**HITL Approach:** Interactive data quality assessment where humans can review and approve AI-suggested cleaning operations.

In [None]:
class HITLDataCleaner:
    """Human-in-the-Loop Data Cleaning Assistant.

    The cleaner works on a private copy of the input frame (``self.df``),
    keeps a second untouched copy (``self.original_df``) and records every
    applied operation as a human-readable string in ``self.cleaning_log``.
    """

    def __init__(self, dataframe):
        """Copy *dataframe* so cleaning never mutates the caller's object."""
        self.df = dataframe.copy()
        self.original_df = dataframe.copy()
        self.cleaning_log = []

    def _fill_plan(self, column):
        """Return ``(strategy, value)`` describing how to fill *column*.

        Numeric columns use the median; every other dtype (object,
        categorical, ...) uses the most frequent value. ``value`` is ``None``
        when the column has no non-missing entry to derive it from.
        """
        series = self.df[column]
        if pd.api.types.is_numeric_dtype(series):
            median = series.median()
            return 'median', (None if pd.isna(median) else median)
        modes = series.mode()
        return 'mode', (modes.iloc[0] if len(modes) else None)

    def assess_data_quality(self):
        """AI analyzes data quality and suggests cleaning operations.

        Prints a report and returns ``(missing_data, duplicates,
        outlier_cols)``: the per-column missing counts (Series), the number
        of duplicated rows, and a list of ``(column, outlier_count)`` pairs
        for numeric columns with values outside 1.5 * IQR of the quartiles.
        """
        print("🤖 AI Assessment: Analyzing data quality...")
        print("="*50)

        n_rows = len(self.df)

        # Missing values
        missing_data = self.df.isnull().sum()
        if missing_data.sum() > 0:
            print(f"⚠️  Missing values detected:")
            for col, count in missing_data[missing_data > 0].items():
                print(f"   - {col}: {count} missing ({count/n_rows*100:.1f}%)")
        else:
            print("✅ No missing values detected")

        # Duplicates
        duplicates = self.df.duplicated().sum()
        if duplicates > 0:
            print(f"⚠️  {duplicates} duplicate rows detected")
        else:
            print("✅ No duplicate rows detected")

        # Outliers (for numerical columns)
        # NOTE: for a column whose IQR is 0 (e.g. a rare 0/1 flag) every value
        # different from the quartiles is counted as an outlier.
        outlier_cols = []
        for col in self.df.select_dtypes(include=[np.number]).columns:
            q1 = self.df[col].quantile(0.25)
            q3 = self.df[col].quantile(0.75)
            iqr = q3 - q1
            outside = (self.df[col] < q1 - 1.5*iqr) | (self.df[col] > q3 + 1.5*iqr)
            outliers = outside.sum()
            if outliers > 0:
                outlier_cols.append((col, outliers))

        if outlier_cols:
            print(f"⚠️  Potential outliers detected:")
            for col, count in outlier_cols:
                print(f"   - {col}: {count} outliers ({count/n_rows*100:.1f}%)")
        else:
            print("✅ No significant outliers detected")

        return missing_data, duplicates, outlier_cols

    def interactive_cleaning(self):
        """Interactive interface for human review of AI suggestions.

        Runs the quality assessment, fills missing values column by column,
        then removes duplicate rows. Returns the cleaned frame (``self.df``).
        """
        missing_data, duplicates, outlier_cols = self.assess_data_quality()

        # Handle missing values
        columns_with_gaps = missing_data[missing_data > 0]
        if not columns_with_gaps.empty:
            print("\n🧹 Cleaning Suggestions:")
            for col, missing_count in columns_with_gaps.items():
                strategy, fill_value = self._fill_plan(col)
                if fill_value is None:
                    print(f"⚠️  No fill value available for '{col}' (no non-missing values); skipping")
                    continue
                if strategy == 'mode':
                    suggestion = f"Fill '{col}' missing values with mode: '{fill_value}'"
                else:
                    suggestion = f"Fill '{col}' missing values with median: {fill_value:.2f}"

                print(f"💡 AI Suggests: {suggestion}")
                # In a real HITL system, this would be an interactive widget
                # For demo purposes, we'll auto-apply with logging
                self.apply_cleaning_suggestion(col, missing_count)

        # Handle duplicates. Recount here: filling missing values can turn
        # previously distinct rows into duplicates, so the count taken during
        # the assessment may be too low.
        duplicates = int(self.df.duplicated().sum())
        if duplicates > 0:
            print(f"\n💡 AI Suggests: Remove {duplicates} duplicate rows")
            self.df = self.df.drop_duplicates()
            self.cleaning_log.append(f"Removed {duplicates} duplicate rows")
            print(f"✅ Applied: Removed duplicates")

        print(f"\n📊 Cleaned dataset shape: {self.df.shape}")
        return self.df

    def apply_cleaning_suggestion(self, column, missing_count):
        """Apply AI cleaning suggestion with human approval simulation.

        Fills the missing values of *column* (median for numeric columns,
        mode otherwise) and appends a description to ``self.cleaning_log``.
        *missing_count* is only used in the log message.
        """
        strategy, fill_value = self._fill_plan(column)
        if fill_value is None:
            print(f"⚠️  Skipped '{column}': no non-missing values to derive a fill value from")
            return

        # Assign the result back instead of chaining ``fillna(inplace=True)``
        # on a column selection: the chained form is not guaranteed to update
        # ``self.df`` (it is a no-op under pandas Copy-on-Write).
        self.df[column] = self.df[column].fillna(fill_value)

        if strategy == 'mode':
            action = f"Filled {missing_count} missing values in '{column}' with mode: '{fill_value}'"
        else:
            action = f"Filled {missing_count} missing values in '{column}' with median: {fill_value:.2f}"

        self.cleaning_log.append(action)
        print(f"✅ Applied: {action}")

# Apply HITL cleaning
# Guard first: the cleaner needs the global `df` created by the dataset
# selection cell.
if 'df' not in globals():
    print("⚠️  Please run the dataset selection cell first!")
else:
    cleaner = HITLDataCleaner(df)
    df_cleaned = cleaner.interactive_cleaning()

    # Replay everything the cleaner did, numbered from 1.
    print("\n📋 Cleaning Log:")
    for i, action in enumerate(cleaner.cleaning_log, start=1):
        print(f"{i}. {action}")

## 4. Exploratory Data Analysis (EDA)
**HITL Approach:** The AI generates multiple visualization options, and a human selects the most insightful ones for deeper analysis.

In [None]:
class HITLExplorer:
    """Human-in-the-Loop Exploratory Data Analysis.

    Holds a reference to the frame under analysis, the name of the target
    column, and the list of textual insights collected so far.
    """

    def __init__(self, dataframe, target_column):
        """Store *dataframe* (not copied) and the target column name."""
        self.df = dataframe
        self.target = target_column
        self.insights = []

    def _is_classification_target(self):
        """Return True when the target should be treated as categorical.

        Non-numeric targets are always categorical. Numeric targets of any
        integer/float width count as categorical when they have at most 10
        distinct values.
        """
        target = self.df[self.target]
        if not pd.api.types.is_numeric_dtype(target):
            return True
        return target.nunique(dropna=False) <= 10

    def _target_correlations(self):
        """Return |correlation| of each numeric feature with the target.

        Returns ``None`` when the target is not numeric, when there is no
        other numeric column, or when no correlation could be computed.
        """
        numeric = self.df.select_dtypes(include=[np.number])
        if self.target not in numeric.columns or numeric.shape[1] < 2:
            return None
        correlations = numeric.corr()[self.target].abs().drop(self.target).dropna()
        return None if correlations.empty else correlations

    def generate_ai_insights(self):
        """AI generates automatic insights about the data.

        Prints dataset size, target distribution and the three features most
        correlated with the target. Returns ``self.insights``.
        """
        print("🤖 AI-Generated Insights:")
        print("="*40)

        # Basic statistics
        print(f"📊 Dataset contains {len(self.df)} samples with {len(self.df.columns)} features")

        # Target distribution
        if self._is_classification_target():
            # Classification
            target_dist = self.df[self.target].value_counts()
            print(f"🎯 Target distribution: {dict(target_dist)}")

            # Check for class imbalance (skip when a class count is 0, e.g.
            # an unused category, to avoid dividing by zero)
            if not target_dist.empty and target_dist.min() > 0:
                imbalance_ratio = target_dist.max() / target_dist.min()
                if imbalance_ratio > 2:
                    print(f"⚠️  Class imbalance detected (ratio: {imbalance_ratio:.1f})")
                    advice = "Consider class balancing techniques"
                    # Repeated calls must not pile up the same advice.
                    if advice not in self.insights:
                        self.insights.append(advice)
        else:
            # Regression
            print(f"🎯 Target range: {self.df[self.target].min():.2f} to {self.df[self.target].max():.2f}")
            print(f"🎯 Target mean: {self.df[self.target].mean():.2f} ± {self.df[self.target].std():.2f}")

        # Feature correlations
        correlations = self._target_correlations()
        if correlations is not None:
            top_corr = correlations.sort_values(ascending=False).head(3)
            print(f"🔗 Top correlated features with target:")
            for feature, corr in top_corr.items():
                print(f"   - {feature}: {corr:.3f}")

        return self.insights

    def interactive_visualization(self):
        """Interactive visualization selection.

        Draws a 2x2 grid (target distribution, correlation heatmap, top
        feature correlations, box plots of the top features) and prints the
        questions the human reviewer should answer. Panels whose
        prerequisites are not met are left empty.
        """
        print("\n📈 Available Visualizations:")

        # Create multiple visualization options
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        fig.suptitle('AI-Suggested Visualizations for Human Review', fontsize=16)

        # Plot 1: Target distribution
        if self._is_classification_target():
            self.df[self.target].value_counts().plot(kind='bar', ax=axes[0,0])
        else:
            self.df[self.target].hist(bins=30, ax=axes[0,0])
        axes[0,0].set_title('Target Distribution')

        # Plot 2: Correlation heatmap
        numerical_cols = self.df.select_dtypes(include=[np.number]).columns
        if len(numerical_cols) > 2:
            corr_matrix = self.df[numerical_cols].corr()
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=axes[0,1])
            axes[0,1].set_title('Feature Correlations')

        # Correlations with the target are shared by plots 3 and 4; compute
        # them once. None means the target is not numeric or nothing could
        # be correlated with it.
        target_corr = self._target_correlations()

        # Plot 3: Feature importance preview
        if target_corr is not None:
            # Quick feature importance using correlation
            feature_importance = target_corr.sort_values(ascending=True)
            feature_importance.tail(5).plot(kind='barh', ax=axes[1,0])
            axes[1,0].set_title('Top 5 Feature Correlations')

        # Plot 4: Data distribution overview
        if target_corr is not None and len(numerical_cols) > 2:
            # Box plot of top features
            top_features = target_corr.nlargest(3).index
            self.df[top_features].boxplot(ax=axes[1,1])
            axes[1,1].set_title('Top Features Distribution')
            axes[1,1].tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

        # Insight summary
        print("\n💡 Human Review Points:")
        print("1. Does the target distribution look reasonable?")
        print("2. Are there any unexpected correlations?")
        print("3. Do you see any patterns that need investigation?")
        print("4. Should we focus on specific features?")

# Run HITL EDA
# Guard first: exploration needs the global `df_cleaned` produced by the
# data cleaning cell.
if 'df_cleaned' not in globals():
    print("⚠️  Please run the data cleaning step first!")
else:
    explorer = HITLExplorer(df_cleaned, target_column)
    insights = explorer.generate_ai_insights()
    explorer.interactive_visualization()

## 5. Feature Engineering
**HITL Approach:** The AI suggests feature transformations; humans validate and approve the most promising ones.

In [None]:
class HITLFeatureEngineer:
    """Human-in-the-Loop Feature Engineering Assistant.

    ``suggest_features`` builds a list of candidate transformations;
    ``apply_feature_engineering`` applies a chosen subset to a copy of the
    data and records the names of the created columns in
    ``self.applied_features``.
    """

    # Labels for the five quantile bins created by the 'binning' suggestion.
    _BIN_LABELS = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

    def __init__(self, dataframe, target_column, problem_type):
        """Copy *dataframe* and remember the target column and problem type."""
        self.df = dataframe.copy()
        self.target = target_column
        self.problem_type = problem_type
        self.feature_suggestions = []
        self.applied_features = []

    def _numeric_feature_columns(self, frame):
        """Numeric columns of *frame*, never including the target."""
        numeric = frame.select_dtypes(include=[np.number]).columns
        # errors='ignore': a non-numeric target is simply not in this index.
        return numeric.drop(self.target, errors='ignore')

    def _categorical_feature_columns(self, frame):
        """Object-dtype columns of *frame*, never including the target."""
        categorical = frame.select_dtypes(include=['object']).columns
        # The target must not be one-hot encoded away when it holds strings.
        return categorical.drop(self.target, errors='ignore')

    def suggest_features(self):
        """AI suggests potential feature engineering operations.

        Prints and returns a list of suggestion dicts with the keys ``type``,
        ``description``, ``rationale`` and, for binning, ``column``. The list
        is also stored in ``self.feature_suggestions``.
        """
        print("🤖 AI Feature Engineering Suggestions:")
        print("="*50)

        numerical_cols = self._numeric_feature_columns(self.df)
        categorical_cols = self._categorical_feature_columns(self.df)

        suggestions = []

        # 1. Polynomial features for numerical columns
        if len(numerical_cols) >= 2:
            suggestions.append({
                'type': 'polynomial',
                'description': f'Create interaction terms between top numerical features',
                'rationale': 'Capture non-linear relationships'
            })

        # 2. Binning for continuous variables
        for col in numerical_cols:
            if self.df[col].nunique() > 20:
                suggestions.append({
                    'type': 'binning',
                    'column': col,
                    'description': f'Create categorical bins for {col}',
                    'rationale': 'Convert continuous to categorical for easier interpretation'
                })

        # 3. Encoding categorical variables
        if len(categorical_cols) > 0:
            suggestions.append({
                'type': 'encoding',
                'description': f'Encode categorical variables: {list(categorical_cols)}',
                'rationale': 'Convert categorical data for ML algorithms'
            })

        # 4. Feature scaling
        suggestions.append({
            'type': 'scaling',
            'description': 'Standardize numerical features',
            'rationale': 'Ensure all features have similar scales'
        })

        # 5. Domain-specific features
        if 'age' in numerical_cols:
            suggestions.append({
                'type': 'domain_specific',
                'description': 'Create age groups (young, middle-aged, senior)',
                'rationale': 'Age categories may be more predictive than raw age'
            })

        self.feature_suggestions = suggestions

        for i, suggestion in enumerate(suggestions, 1):
            print(f"{i}. {suggestion['description']}")
            print(f"   💡 Rationale: {suggestion['rationale']}")
            print()

        return suggestions

    def apply_feature_engineering(self, selected_suggestions=None):
        """Apply selected feature engineering suggestions.

        Args:
            selected_suggestions: Indices into ``self.feature_suggestions``;
                ``None`` applies every suggestion. Indices past the end of
                the list are ignored.

        Returns:
            A new DataFrame with the engineered columns; ``self.df`` is left
            unchanged.
        """
        if selected_suggestions is None:
            # Auto-apply all suggestions for demo
            selected_suggestions = list(range(len(self.feature_suggestions)))

        df_engineered = self.df.copy()

        for idx in selected_suggestions:
            if idx >= len(self.feature_suggestions):
                continue

            suggestion = self.feature_suggestions[idx]

            if suggestion['type'] == 'polynomial':
                # Create an interaction feature from the two numeric features
                # most correlated with the target (needs a numeric target).
                numerical_cols = self._numeric_feature_columns(df_engineered)
                target_is_numeric = pd.api.types.is_numeric_dtype(df_engineered[self.target])
                if len(numerical_cols) >= 2 and target_is_numeric:
                    corr_with_target = df_engineered[numerical_cols].corrwith(df_engineered[self.target]).abs()
                    top_features = corr_with_target.nlargest(2).index.tolist()

                    created = []
                    for i, left in enumerate(top_features):
                        for right in top_features[i+1:]:
                            feature_name = f"{left}_{right}_interaction"
                            df_engineered[feature_name] = df_engineered[left] * df_engineered[right]
                            created.append(feature_name)

                    # Report exactly what this step created, not a slice of
                    # the running list.
                    self.applied_features.extend(created)
                    print(f"✅ Created interaction features: {created}")

            elif suggestion['type'] == 'binning' and 'column' in suggestion:
                col = suggestion['column']
                if col in df_engineered.columns:
                    # Create quantile-based bins. qcut raises ValueError when
                    # the quantile edges are not unique (heavily repeated
                    # values); skip such columns instead of aborting the run.
                    try:
                        binned = pd.qcut(df_engineered[col], q=5, labels=self._BIN_LABELS)
                    except ValueError:
                        print(f"⚠️  Skipped binning for {col}: quantile edges are not unique")
                    else:
                        df_engineered[f"{col}_binned"] = binned
                        self.applied_features.append(f"{col}_binned")
                        print(f"✅ Created bins for {col}")

            elif suggestion['type'] == 'encoding':
                # One-hot encode categorical variables
                for col in self._categorical_feature_columns(df_engineered):
                    if df_engineered[col].nunique() <= 10:  # Only encode if not too many categories
                        # dtype=int keeps the indicators numeric on every
                        # pandas version, so later numeric-only steps
                        # (correlations, interactions) still see them.
                        dummies = pd.get_dummies(df_engineered[col], prefix=col, dtype=int)
                        df_engineered = pd.concat([df_engineered, dummies], axis=1)
                        df_engineered.drop(col, axis=1, inplace=True)
                        self.applied_features.extend(dummies.columns.tolist())

                print(f"✅ One-hot encoded categorical variables")

            elif suggestion['type'] == 'scaling':
                # Note: We'll store this for later application during model training
                print(f"✅ Scaling will be applied during model training")

            elif suggestion['type'] == 'domain_specific':
                if 'age' in df_engineered.columns:
                    # Open-ended outer bins: ages <= 0 or > 100 still get a
                    # group instead of silently becoming NaN.
                    df_engineered['age_group'] = pd.cut(df_engineered['age'],
                                                       bins=[-np.inf, 30, 50, 70, np.inf],
                                                       labels=['Young', 'Middle', 'Senior', 'Elderly'])
                    self.applied_features.append('age_group')
                    print(f"✅ Created age groups")

        print(f"\n📊 Original features: {self.df.shape[1]}")
        print(f"📊 Engineered features: {df_engineered.shape[1]}")
        print(f"📊 New features created: {len(self.applied_features)}")

        return df_engineered

    def feature_importance_preview(self, df_engineered):
        """Preview feature importance of engineered features.

        Prints the ten numeric columns of *df_engineered* with the highest
        absolute correlation to the target, flagging the ones listed in
        ``self.applied_features``. Prints nothing for a non-numeric target.
        """
        numerical_cols = df_engineered.select_dtypes(include=[np.number]).columns
        if self.target in numerical_cols and len(numerical_cols) > 1:
            correlations = df_engineered[numerical_cols].corrwith(df_engineered[self.target]).abs().sort_values(ascending=False)

            print("\n🔗 Feature Correlations with Target (Top 10):")
            print("-" * 40)
            for feature, corr in correlations.drop(self.target).head(10).items():
                is_new = "🆕" if feature in self.applied_features else "   "
                print(f"{is_new} {feature}: {corr:.3f}")

# Apply feature engineering
# Guard first: this step consumes the global `df_cleaned` from the cleaning
# cell.
if 'df_cleaned' not in globals():
    print("⚠️  Please run the previous steps first!")
else:
    engineer = HITLFeatureEngineer(df_cleaned, target_column, problem_type)
    suggestions = engineer.suggest_features()

    # No widget here: every suggestion is applied, then previewed.
    print("\n🧠 Human Decision: Apply all AI suggestions? (Auto-applying for demo)")
    df_engineered = engineer.apply_feature_engineering()
    engineer.feature_importance_preview(df_engineered)

## 6. Data Splitting & Model Selection
**HITL Approach:** AI recommends optimal train/test split and suggests appropriate algorithms based on data characteristics.

In [None]:
class HITLModelSelector:
    """Human-in-the-Loop Model Selection Assistant.

    Inspects the data, proposes candidate estimators, prepares a train/test
    split and trains/evaluates each candidate so a human can pick a model.
    After ``train_and_evaluate_models`` runs, ``self.results`` holds the
    per-model metrics and ``self.models`` the fitted estimators by name.
    """

    def __init__(self, dataframe, target_column, problem_type):
        """Store *dataframe* (not copied), target name and problem type."""
        self.df = dataframe
        self.target = target_column
        self.problem_type = problem_type
        self.models = {}
        self.results = {}

    def analyze_data_characteristics(self):
        """AI analyzes data to recommend appropriate models.

        Returns a dict of boolean flags: ``small_dataset`` (< 1000 rows),
        ``high_dimensional`` (features > 10% of rows),
        ``binary_classification``, ``multiclass`` and ``regression``.
        """
        print("🤖 AI Data Analysis for Model Recommendation:")
        print("="*55)

        n_samples, n_features = self.df.shape
        n_features -= 1  # Subtract target column

        print(f"📊 Dataset size: {n_samples} samples, {n_features} features")
        print(f"📊 Problem type: {self.problem_type}")

        is_classification = self.problem_type == 'classification'
        n_classes = len(self.df[self.target].unique())

        # Data characteristics
        characteristics = {
            'small_dataset': n_samples < 1000,
            'high_dimensional': n_features > n_samples * 0.1,
            'binary_classification': is_classification and n_classes == 2,
            'multiclass': is_classification and n_classes > 2,
            'regression': self.problem_type == 'regression'
        }

        print(f"📊 Small dataset: {characteristics['small_dataset']}")
        print(f"📊 High dimensional: {characteristics['high_dimensional']}")

        return characteristics

    def recommend_models(self, characteristics):
        """AI recommends models based on data characteristics.

        Returns a list of dicts with the keys ``name``, ``model`` (an
        unfitted estimator), ``rationale`` and ``priority``. The support
        vector model is only proposed when the dataset is not small.
        """
        print("\n💡 AI Model Recommendations:")
        print("-" * 30)

        recommendations = []

        if self.problem_type == 'classification':
            recommendations.extend([
                {
                    'name': 'Random Forest',
                    'model': RandomForestClassifier(random_state=42),
                    'rationale': 'Robust, handles mixed data types, feature importance',
                    'priority': 'High'
                },
                {
                    'name': 'Logistic Regression',
                    'model': LogisticRegression(random_state=42, max_iter=1000),
                    'rationale': 'Interpretable, good baseline, fast training',
                    'priority': 'High'
                }
            ])

            if not characteristics['small_dataset']:
                recommendations.append({
                    'name': 'Support Vector Machine',
                    'model': SVC(random_state=42, probability=True),
                    'rationale': 'Good for complex decision boundaries',
                    'priority': 'Medium'
                })

        else:  # regression
            recommendations.extend([
                {
                    'name': 'Random Forest',
                    'model': RandomForestRegressor(random_state=42),
                    'rationale': 'Robust, non-linear relationships, feature importance',
                    'priority': 'High'
                },
                {
                    'name': 'Linear Regression',
                    'model': LinearRegression(),
                    'rationale': 'Interpretable, fast, good baseline',
                    'priority': 'High'
                }
            ])

            if not characteristics['small_dataset']:
                recommendations.append({
                    'name': 'Support Vector Regression',
                    'model': SVR(),
                    'rationale': 'Good for non-linear relationships',
                    'priority': 'Medium'
                })

        for i, rec in enumerate(recommendations, 1):
            print(f"{i}. {rec['name']} [{rec['priority']} Priority]")
            print(f"   💡 {rec['rationale']}")

        return recommendations

    def prepare_data(self):
        """Prepare data for modeling with human oversight.

        Encodes non-numeric feature columns as integers, splits into train
        and test sets (stratified for classification) and standardizes the
        features with statistics learned from the training set only.

        Returns:
            ``(X_train, X_test, y_train, y_test, scaler)`` where the X parts
            are scaled DataFrames and ``scaler`` is the fitted
            ``StandardScaler``.
        """
        print("\n🔧 Data Preparation:")
        print("-" * 20)

        # Separate features and target
        X = self.df.drop(columns=[self.target])
        y = self.df[self.target]

        # Handle categorical variables. 'category' must be included: binned
        # features created with pd.cut/pd.qcut have that dtype, and their
        # string labels cannot be passed to the scaler.
        categorical_cols = X.select_dtypes(include=['object', 'category']).columns
        if len(categorical_cols) > 0:
            print(f"🔄 Encoding {len(categorical_cols)} categorical columns")
            for col in categorical_cols:
                if isinstance(X[col].dtype, pd.CategoricalDtype):
                    # Category codes keep the category order (e.g.
                    # 'Very Low' < ... < 'Very High'); missing values map to -1.
                    X[col] = X[col].cat.codes
                else:
                    # Simple label encoding for demo (in practice, might use one-hot)
                    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

        # Train-test split
        test_size = 0.2 if len(self.df) > 100 else 0.3
        print(f"🔀 Splitting data: {int((1-test_size)*100)}% train, {int(test_size*100)}% test")

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y if self.problem_type == 'classification' else None
        )

        # Feature scaling: fit on the training rows only, so no test-set
        # statistics leak into training; reuse that fit for the test rows.
        print("📏 Applying feature scaling")
        scaler = StandardScaler()
        X_train = pd.DataFrame(
            scaler.fit_transform(X_train),
            columns=X.columns,
            index=X_train.index
        )
        X_test = pd.DataFrame(
            scaler.transform(X_test),
            columns=X.columns,
            index=X_test.index
        )

        print(f"✅ Training set: {X_train.shape[0]} samples")
        print(f"✅ Testing set: {X_test.shape[0]} samples")

        return X_train, X_test, y_train, y_test, scaler

    def train_and_evaluate_models(self, X_train, X_test, y_train, y_test, model_recommendations):
        """Train multiple models and let human review results.

        Fits every recommended estimator in place, predicts on the test set
        and prints the metrics.

        Returns:
            Dict mapping model name to its fitted ``model``, ``predictions``
            and metrics: accuracy/precision/recall/f1_score (weighted) for
            classification, mse/rmse/r2_score for regression. The same dict
            is stored in ``self.results``.
        """
        print("\n🚀 Training Models:")
        print("="*25)

        results = {}

        for rec in model_recommendations:
            model_name = rec['name']
            model = rec['model']

            print(f"\n🔄 Training {model_name}...")

            # Train model
            model.fit(X_train, y_train)

            # Make predictions
            y_pred = model.predict(X_test)

            # Calculate metrics
            if self.problem_type == 'classification':
                accuracy = accuracy_score(y_test, y_pred)
                precision = precision_score(y_test, y_pred, average='weighted')
                recall = recall_score(y_test, y_pred, average='weighted')
                f1 = f1_score(y_test, y_pred, average='weighted')

                results[model_name] = {
                    'model': model,
                    'accuracy': accuracy,
                    'precision': precision,
                    'recall': recall,
                    'f1_score': f1,
                    'predictions': y_pred
                }

                print(f"   📊 Accuracy: {accuracy:.3f}")
                print(f"   📊 Precision: {precision:.3f}")
                print(f"   📊 Recall: {recall:.3f}")
                print(f"   📊 F1-Score: {f1:.3f}")

            else:  # regression
                mse = mean_squared_error(y_test, y_pred)
                rmse = np.sqrt(mse)
                r2 = r2_score(y_test, y_pred)

                results[model_name] = {
                    'model': model,
                    'mse': mse,
                    'rmse': rmse,
                    'r2_score': r2,
                    'predictions': y_pred
                }

                print(f"   📊 RMSE: {rmse:.3f}")
                print(f"   📊 R² Score: {r2:.3f}")
                print(f"   📊 MSE: {mse:.3f}")

        # Keep the outcome on the instance too, so the attributes created in
        # __init__ reflect the latest run.
        self.results = results
        self.models = {name: info['model'] for name, info in results.items()}

        return results

# Execute model selection pipeline
# Guard first: modelling needs the global `df_engineered` produced by the
# feature engineering cell.
if 'df_engineered' not in globals():
    print("⚠️  Please run the feature engineering step first!")
else:
    selector = HITLModelSelector(df_engineered, target_column, problem_type)

    # Steps 1-2: profile the data, then collect the candidate models
    characteristics = selector.analyze_data_characteristics()
    model_recommendations = selector.recommend_models(characteristics)

    # Step 3: encode, split and scale
    X_train, X_test, y_train, y_test, scaler = selector.prepare_data()

    # Step 4: fit every candidate and report its metrics
    model_results = selector.train_and_evaluate_models(
        X_train, X_test, y_train, y_test, model_recommendations
    )

    print("\n🏆 Human Review: Which model performs best for your use case?")

## 7. Human-in-the-Loop Model Validation
**HITL Approach:** Interactive model evaluation where humans can validate predictions, identify edge cases, and provide feedback for model improvement.

In [None]:
class HITLValidator:
    """Human-in-the-Loop Model Validation System.

    Holds trained-model results and a held-out test set, walks a (simulated)
    human reviewer through a sample of predictions, and records every
    prediction the reviewer rejects as an edge case for later analysis.
    """

    def __init__(self, models_dict, X_test, y_test, problem_type):
        """Store the model results and the test data to validate against.

        Args:
            models_dict: Mapping of model name to a dict holding at least the
                keys 'model' and 'predictions'.
            X_test: Test features; accessed with ``.iloc`` and by column name.
            y_test: Test targets; accessed with ``.iloc``.
            problem_type: 'classification' enables probability-based
                confidence and exact-match checks; any other value is
                handled as regression.
        """
        self.models = models_dict
        self.X_test = X_test
        self.y_test = y_test
        self.problem_type = problem_type
        self.validation_feedback = []
        self.edge_cases = []

    def interactive_prediction_review(self, model_name, n_samples=10):
        """Review a sample of one model's predictions with a simulated human.

        Args:
            model_name: Key into the models dict given to the constructor.
            n_samples: Upper bound on the number of cases to review.

        Returns:
            Fraction of reviewed cases the reviewer accepted, or ``None``
            when the model is unknown or no case could be selected.
        """
        print(f"🔍 Human Validation: Reviewing {model_name} Predictions")
        print("="*60)

        if model_name not in self.models:
            print("❌ Model not found!")
            return None

        model_info = self.models[model_name]
        model = model_info['model']
        predictions = model_info['predictions']

        # Get confidence scores if available
        if hasattr(model, 'predict_proba') and self.problem_type == 'classification':
            probabilities = model.predict_proba(self.X_test)
            confidence_scores = np.max(probabilities, axis=1)
        else:
            confidence_scores = np.ones(len(predictions))  # Default confidence

        # Sample cases for review (focus on low confidence and edge cases)
        review_indices = self._select_review_cases(predictions, confidence_scores, n_samples)

        if len(review_indices) == 0:
            # Nothing to review (empty test set or n_samples < 2); an accuracy
            # would be undefined, so report that instead of dividing by zero.
            print("⚠️  No cases available for human review.")
            return None

        print(f"📋 Reviewing {len(review_indices)} cases (selected by AI for human review):\n")

        correct_predictions = 0
        new_edge_cases = 0  # only the cases flagged during this review
        for i, idx in enumerate(review_indices, 1):
            actual = self.y_test.iloc[idx]
            predicted = predictions[idx]
            confidence = confidence_scores[idx]

            print(f"Case {i}/{len(review_indices)}:")
            print(f"   🎯 Actual: {actual}")
            print(f"   🤖 Predicted: {predicted}")
            print(f"   📊 Confidence: {confidence:.3f}")

            # Show some feature values for context
            feature_sample = self.X_test.iloc[idx].head(3)
            print(f"   📈 Key features: {dict(feature_sample)}")

            # Human validation (simulated): the reviewer usually agrees with
            # correct predictions and usually rejects incorrect ones.
            if self.problem_type == 'classification':
                is_correct = actual == predicted
            else:
                # Regression counts as correct within 10% of the actual value.
                is_correct = abs(actual - predicted) < abs(actual * 0.1)
            agree_probability = 0.9 if is_correct else 0.3
            human_agrees = np.random.choice(
                [True, False], p=[agree_probability, 1 - agree_probability]
            )

            if human_agrees:
                print(f"   ✅ Human validation: CORRECT")
                correct_predictions += 1
            else:
                print(f"   ❌ Human validation: INCORRECT")
                self.edge_cases.append({
                    'index': idx,
                    'actual': actual,
                    'predicted': predicted,
                    'confidence': confidence,
                    'features': dict(self.X_test.iloc[idx])
                })
                new_edge_cases += 1

            print(f"   💬 AI Learning: {'Reinforcing prediction' if human_agrees else 'Flagging for improvement'}")
            print()

        human_accuracy = correct_predictions / len(review_indices)
        print(f"🏆 Human-Validated Accuracy: {human_accuracy:.3f}")
        print(f"🚨 Edge cases identified: {new_edge_cases}")

        return human_accuracy

    def _select_review_cases(self, predictions, confidence_scores, n_samples):
        """Pick the positions a human should review.

        Up to ``n_samples // 2`` of the least confident predictions
        (confidence below 0.7) are combined with up to ``n_samples // 2``
        random positions.

        Returns:
            Sorted, de-duplicated integer array of at most ``n_samples``
            positions; empty when there is nothing to choose from.
        """
        n_available = len(predictions)
        if n_available == 0 or n_samples <= 0:
            return np.array([], dtype=int)

        half = n_samples // 2
        confidence_scores = np.asarray(confidence_scores)

        # Prioritize low confidence predictions, least confident first, so the
        # slice below keeps the most uncertain cases rather than the earliest.
        low_confidence_indices = np.where(confidence_scores < 0.7)[0]
        order = np.argsort(confidence_scores[low_confidence_indices], kind='stable')
        low_confidence_indices = low_confidence_indices[order]

        # Add some random samples
        random_indices = np.random.choice(
            n_available, size=min(half, n_available), replace=False
        )

        # Combine and select unique indices
        review_indices = np.unique(np.concatenate([
            low_confidence_indices[:half],
            random_indices
        ]))[:n_samples]

        return review_indices

    def analyze_edge_cases(self):
        """Print a summary of the recorded edge cases.

        Returns:
            The list of recorded edge cases, or ``None`` when there are none.
        """
        if not self.edge_cases:
            print("🎉 No edge cases found! Model performing well.")
            return None

        print(f"🔬 Edge Case Analysis ({len(self.edge_cases)} cases):")
        print("="*50)

        # Analyze confidence distribution of edge cases
        edge_confidences = [case['confidence'] for case in self.edge_cases]
        avg_confidence = np.mean(edge_confidences)

        print(f"📊 Average confidence in edge cases: {avg_confidence:.3f}")

        if avg_confidence < 0.5:
            print("💡 Insight: Model is appropriately uncertain about difficult cases")
        else:
            print("⚠️  Insight: Model overconfident in incorrect predictions")

        # Feature analysis for edge cases
        if len(self.edge_cases) > 2:
            print("\n🔍 Common patterns in edge cases:")

            # This is a simplified analysis - in practice, you'd do more sophisticated pattern detection
            edge_case_features = pd.DataFrame([case['features'] for case in self.edge_cases])

            for col in edge_case_features.select_dtypes(include=[np.number]).columns[:3]:
                mean_val = edge_case_features[col].mean()
                overall_mean = self.X_test[col].mean()
                # Relative shift; normalise by the magnitude so features with
                # a negative mean can be flagged as well.
                diff = abs(mean_val - overall_mean) / abs(overall_mean) if overall_mean != 0 else 0

                if diff > 0.2:
                    print(f"   📈 {col}: Edge cases have different distribution (diff: {diff:.2f})")

        return self.edge_cases

    def generate_improvement_suggestions(self):
        """Print and return improvement suggestions based on the edge cases.

        Returns:
            List of suggestion strings, in the order they were printed.
        """
        print("\n🚀 AI Improvement Suggestions:")
        print("="*40)

        suggestions = []

        if len(self.edge_cases) > len(self.y_test) * 0.1:
            suggestions.append("Consider collecting more training data for underrepresented cases")

        if any(case['confidence'] > 0.8 for case in self.edge_cases):
            suggestions.append("Implement confidence calibration to improve prediction reliability")

        if self.edge_cases:
            suggestions.append("Use active learning to focus on similar challenging cases")
            suggestions.append("Consider ensemble methods to improve robustness")

        suggestions.append("Implement continuous learning pipeline with human feedback")
        suggestions.append("Set up monitoring for model drift in production")

        for i, suggestion in enumerate(suggestions, 1):
            print(f"{i}. {suggestion}")

        return suggestions

# Human-in-the-loop validation of the strongest model from the training step.
if not ('model_results' in globals() and len(model_results) > 0):
    print("⚠️  Please run the model training step first!")
else:
    # Rank by F1 for classification and by R² for everything else.
    if problem_type != 'classification':
        best_model_name = max(model_results, key=lambda name: model_results[name]['r2_score'])
        print(f"🏆 Best Model Selected: {best_model_name} (R²: {model_results[best_model_name]['r2_score']:.3f})")
    else:
        best_model_name = max(model_results, key=lambda name: model_results[name]['f1_score'])
        print(f"🏆 Best Model Selected: {best_model_name} (F1-Score: {model_results[best_model_name]['f1_score']:.3f})")

    # The validator works on the same held-out split the models were scored on.
    validator = HITLValidator(model_results, X_test, y_test, problem_type)

    # Simulated reviewer pass over a handful of predictions.
    human_accuracy = validator.interactive_prediction_review(best_model_name, n_samples=8)

    # Summarise what the reviewer rejected, then turn it into action items.
    edge_cases = validator.analyze_edge_cases()
    suggestions = validator.generate_improvement_suggestions()

## 8. Final Results & HITL Dashboard
**HITL Summary:** Comprehensive dashboard showing the collaboration between human expertise and AI capabilities throughout the analysis pipeline.

## 9. Energy Optimization with Real Data
**HITL Approach:** Use sustainable energy data (simulated below in the shape of a real-world dataset; swap in an actual file when available) to predict and optimize energy usage. AI provides recommendations, and human experts validate and adjust the optimization strategy.

In [None]:
# Load Global Sustainable Energy Data (2000-2020)
# For demo, we simulate loading a real dataset. Replace with actual file path as needed.

try:
    # A real run would read a file instead, e.g.
    #   df_energy = pd.read_csv('global_sustainable_energy_2000_2020.csv')
    # Here one random row is generated per (country, year) pair, 2000-2020.
    years = np.arange(2000, 2021)
    countries = ['USA', 'Germany', 'China', 'India', 'Brazil']
    data = [
        {
            'Country': country,
            'Year': year,
            'Total_Energy_Consumption': np.random.uniform(1000, 10000),
            'Renewable_Energy_Share': np.random.uniform(10, 60),
            'Population': np.random.uniform(10, 350) * 1e6,
            'GDP_per_Capita': np.random.uniform(5000, 60000),
        }
        for country in countries
        for year in years
    ]
    df_energy = pd.DataFrame(data)
    print("✅ Real-world sustainable energy data loaded (simulated)")
    display(df_energy.head())
except Exception as exc:
    print(f"❌ Error loading energy data: {exc}")

### HITL Energy Optimization Pipeline
- **AI:** Forecasts future energy consumption and recommends optimal renewable share.
- **Human:** Reviews forecasts, validates recommendations, and adjusts targets based on policy or local knowledge.

In [None]:
# Select a country for optimization
from ipywidgets import interact

@interact(country=df_energy['Country'].unique())
def select_country(country):
    """Show the energy profile of one country and remember the selection.

    Displays the first rows for ``country``, plots total consumption and
    renewable share on twin y-axes, and stores the country's rows in the
    global ``df_selected_country`` for the cells that follow.

    Args:
        country: Value of the 'Country' column to filter ``df_energy`` on.
    """
    global df_selected_country

    df_country = df_energy[df_energy['Country'] == country].copy()
    display(df_country.head())

    # Plot energy consumption and renewable share
    fig, ax1 = plt.subplots(figsize=(10, 5))
    ax2 = ax1.twinx()
    consumption_line, = ax1.plot(
        df_country['Year'], df_country['Total_Energy_Consumption'],
        'g-', label='Total Energy Consumption',
    )
    share_line, = ax2.plot(
        df_country['Year'], df_country['Renewable_Energy_Share'],
        'b--', label='Renewable Share (%)',
    )
    ax1.set_xlabel('Year')
    ax1.set_ylabel('Total Energy Consumption', color='g')
    ax2.set_ylabel('Renewable Energy Share (%)', color='b')
    ax1.set_title(f'Energy Profile for {country}')

    # The two lines live on different axes, so build one combined legend;
    # it goes on ax2 because that axes is drawn on top of ax1.
    ax2.legend(handles=[consumption_line, share_line], loc='upper left')

    fig.tight_layout()
    plt.show()

    # Store for next steps
    df_selected_country = df_country

In [None]:
# Forecast future energy consumption using a simple regression model
from sklearn.linear_model import LinearRegression

if 'df_selected_country' in globals():
    # Energy-specific names are used on purpose: this cell runs at notebook
    # scope, and rebinding `df` here would change what the final dashboard
    # and submission summary report as the dataset shape (both read `df`).
    df_forecast = df_selected_country.copy()
    X_hist = df_forecast[['Year']]
    y_hist = df_forecast['Total_Energy_Consumption']
    energy_model = LinearRegression()
    energy_model.fit(X_hist, y_hist)

    # Predict next 5 years. The future years are wrapped in a DataFrame with
    # the same column name used for fitting so the feature names line up.
    last_year = df_forecast['Year'].max()
    future_years = np.arange(last_year + 1, last_year + 6)
    X_future = pd.DataFrame({'Year': future_years})
    y_forecast = energy_model.predict(X_future)

    # Plot history and forecast on one axis
    plt.figure(figsize=(8, 4))
    plt.plot(df_forecast['Year'], y_hist, label='Historical Consumption')
    plt.plot(future_years, y_forecast, 'r--', label='Forecasted Consumption')
    plt.xlabel('Year')
    plt.ylabel('Total Energy Consumption')
    plt.title(f'Forecasted Energy Consumption for {df_forecast["Country"].iloc[0]}')
    plt.legend()
    plt.show()

    print("AI Suggestion: Consider increasing renewable share if forecasted consumption rises.")
else:
    print("Please select a country above.")

In [None]:
# Human-in-the-Loop: Set renewable energy target and review AI recommendation
import ipywidgets as widgets

def hitl_renewable_target():
    """Let a human adjust the AI-suggested renewable-share target.

    Reads the most recent year's renewable share from the global
    ``df_selected_country``, suggests that share plus five points (capped at
    100), and shows a slider whose changes are echoed with a short comment.
    Prints a hint and returns early when no country has been selected yet.

    Returns:
        None.
    """
    if 'df_selected_country' not in globals():
        print("Please select a country above.")
        return

    last_year = df_selected_country['Year'].max()
    is_last_year = df_selected_country['Year'] == last_year
    last_share = df_selected_country.loc[is_last_year, 'Renewable_Energy_Share'].values[0]
    print(f"Current renewable share in {last_year}: {last_share:.1f}%")

    ai_suggested_target = min(last_share + 5, 100)
    print(f"AI Suggestion: Set next 5-year renewable target to {ai_suggested_target:.1f}%")

    slider_step = 0.5
    target_slider = widgets.FloatSlider(
        value=ai_suggested_target,
        min=last_share,
        max=100,
        step=slider_step,
        description='Set Target %:',
        continuous_update=False
    )
    display(target_slider)

    def on_value_change(change):
        """Echo the new slider value and compare it with the AI suggestion."""
        new_target = change['new']
        print(f"Human-adjusted target: {new_target:.1f}%")
        # Slider values are floats built from min + k * step, so compare with
        # a tolerance of half a step rather than exact equality; otherwise
        # rounding noise would hide the "accepting" case.
        if abs(new_target - ai_suggested_target) < slider_step / 2:
            print("Human: Accepting AI recommendation.")
        elif new_target > ai_suggested_target:
            print("Human: Ambitious target! Consider grid/storage upgrades.")
        else:
            print("Human: Conservative target. Review policy or local constraints.")
    target_slider.observe(on_value_change, names='value')

hitl_renewable_target()

In [None]:
# Final HITL Dashboard and Summary
def create_hitl_dashboard():
    """Print a text dashboard summarising the HITL process.

    Reads ``problem_type``, ``df``, ``target_column``, ``model_results`` and
    ``dataset_type`` from the notebook's global namespace. Each is optional:
    missing values are reported as "Unknown" or their section is skipped, so
    the dashboard can be printed even when earlier cells were not run.

    Returns:
        None. All output goes to stdout.
    """
    notebook_globals = globals()
    current_problem_type = notebook_globals.get('problem_type')
    dataset = notebook_globals.get('df')
    target_name = notebook_globals.get('target_column', 'Unknown')
    results_by_model = notebook_globals.get('model_results') or {}
    track = notebook_globals.get('dataset_type')

    print("🎯 HUMAN-IN-THE-LOOP DATA ANALYSIS DASHBOARD")
    print("="*60)
    print("🤖 AI + 👥 Human Collaboration Summary")
    print("="*60)

    # Project Overview
    print("\n📊 PROJECT OVERVIEW:")
    print("-" * 25)
    problem_label = 'Unknown' if current_problem_type is None else str(current_problem_type).title()
    print(f"🎯 Problem Type: {problem_label}")
    if dataset is not None:
        print(f"📈 Dataset: {dataset.shape[0]} samples, {dataset.shape[1]} features")
    else:
        print("📈 Dataset: Unknown")
    print(f"🎲 Target Variable: {target_name}")

    # HITL Process Summary
    print("\n🔄 HITL PROCESS STAGES:")
    print("-" * 30)

    stages = [
        "1. 🤖 AI Dataset Generation → 👥 Human Track Selection",
        "2. 🤖 AI Quality Assessment → 👥 Human Cleaning Approval",
        "3. 🤖 AI Insight Generation → 👥 Human EDA Review",
        "4. 🤖 AI Feature Suggestions → 👥 Human Feature Validation",
        "5. 🤖 AI Model Recommendations → 👥 Human Model Selection",
        "6. 🤖 AI Predictions → 👥 Human Validation & Feedback"
    ]

    for stage in stages:
        print(f"   {stage}")

    # Model Performance Summary
    if results_by_model:
        print("\n🏆 MODEL PERFORMANCE SUMMARY:")
        print("-" * 35)

        for model_name, results in results_by_model.items():
            print(f"\n🔹 {model_name}:")
            if current_problem_type == 'classification':
                print(f"   📊 Accuracy: {results['accuracy']:.3f}")
                print(f"   📊 F1-Score: {results['f1_score']:.3f}")
                print(f"   📊 Precision: {results['precision']:.3f}")
            else:
                print(f"   📊 R² Score: {results['r2_score']:.3f}")
                print(f"   📊 RMSE: {results['rmse']:.3f}")

    # HITL Value Demonstration
    print("\n💡 HITL VALUE PROPOSITION:")
    print("-" * 32)

    hitl_benefits = [
        "🎯 Domain Expertise: Humans provide context AI lacks",
        "🔍 Quality Control: Human validation of AI suggestions",
        "🚨 Edge Case Detection: Human identification of outliers",
        "📈 Continuous Learning: AI improves from human feedback",
        "⚖️  Ethical Oversight: Human judgment on sensitive decisions",
        "🔄 Iterative Improvement: Human-guided model refinement"
    ]

    for benefit in hitl_benefits:
        print(f"   {benefit}")

    # Real-world Applications: the whole section, header included, depends on
    # the selected track, so it is skipped when no track was chosen.
    if track is not None:
        print(f"\n🌍 REAL-WORLD APPLICATIONS ({track} Track):")
        print("-" * 45)

        if track == 'Healthcare':
            applications = [
                "🏥 Medical Diagnosis: AI suggests, doctors validate",
                "💊 Drug Discovery: AI identifies compounds, experts review",
                "📱 Patient Monitoring: AI detects anomalies, nurses confirm",
                "🔬 Research Analysis: AI finds patterns, researchers interpret"
            ]
        else:  # Sustainability
            applications = [
                "🌱 Energy Optimization: AI recommends, engineers approve",
                "🌊 Environmental Monitoring: AI alerts, scientists validate",
                "♻️  Waste Management: AI classifies, operators confirm",
                "🌡️  Climate Modeling: AI predicts, experts interpret"
            ]

        for app in applications:
            print(f"   {app}")

    # Next Steps
    print("\n🚀 RECOMMENDED NEXT STEPS:")
    print("-" * 30)

    next_steps = [
        "🔄 Implement continuous feedback loop",
        "📊 Deploy model with human oversight dashboard",
        "🎓 Train domain experts on AI collaboration",
        "📈 Monitor model performance and human satisfaction",
        "🔧 Iterate based on real-world feedback",
        "📚 Document HITL best practices"
    ]

    for step in next_steps:
        print(f"   {step}")

    print("\n" + "="*60)
    print("🎉 HACKATHON PROJECT COMPLETE!")
    print("🏆 Human-AI Collaboration Successfully Demonstrated")
    print("="*60)

# Create final visualization
def create_final_visualization():
    """Draw a 2x2 summary figure of the modelling results.

    Panels: model comparison (F1 for classification, R² otherwise), top
    feature importances of the best-scoring model (when it exposes
    ``feature_importances_``), the distribution of that model's predictions,
    and a static description of the HITL process.

    Reads ``model_results``, ``problem_type``, ``df_engineered`` and
    ``target_column`` from the notebook's globals and does nothing when
    ``model_results`` is missing or empty.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    if 'model_results' not in globals() or len(model_results) == 0:
        return

    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('HITL Data Analysis Pipeline - Final Results', fontsize=16, fontweight='bold')

    # Plot 1: Model Comparison
    if problem_type == 'classification':
        metric_key, metric_name = 'f1_score', 'F1-Score'
    else:
        metric_key, metric_name = 'r2_score', 'R² Score'
    model_names = list(model_results.keys())
    scores = [model_results[name][metric_key] for name in model_names]

    bars = ax1.bar(model_names, scores, color=['#FF6B6B', '#4ECDC4', '#45B7D1'])
    ax1.set_title(f'Model Comparison ({metric_name})')
    ax1.set_ylabel(metric_name)
    ax1.tick_params(axis='x', rotation=45)

    # Add value labels on bars
    for bar, score in zip(bars, scores):
        ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                f'{score:.3f}', ha='center', va='bottom')

    # The remaining panels describe the best model by the metric plotted
    # above, not simply the first entry of the results dict.
    best_model_name = max(model_names, key=lambda name: model_results[name][metric_key])
    best_model = model_results[best_model_name]['model']

    # Plot 2: Feature Importance (if available)
    importances = getattr(best_model, 'feature_importances_', None)
    if 'df_engineered' in globals() and importances is not None:
        feature_names = [col for col in df_engineered.columns if col != target_column]
        if len(feature_names) != len(importances):
            # The model may have been trained on a different column set than
            # df_engineered; fall back to positional names rather than
            # mislabel bars or index out of range.
            feature_names = [f'feature_{i}' for i in range(len(importances))]

        # Get top 10 features
        top_indices = np.argsort(importances)[-10:]
        top_features = [feature_names[i] for i in top_indices]
        top_importances = importances[top_indices]

        ax2.barh(range(len(top_features)), top_importances, color='#96CEB4')
        ax2.set_yticks(range(len(top_features)))
        ax2.set_yticklabels(top_features)
        ax2.set_title('Top 10 Feature Importances')
        ax2.set_xlabel('Importance')
    else:
        ax2.text(0.5, 0.5, 'Feature Importance\nNot Available',
                ha='center', va='center', transform=ax2.transAxes, fontsize=12)
        ax2.set_title('Feature Importance')

    # Plot 3: Prediction Distribution
    predictions = model_results[best_model_name]['predictions']

    ax3.hist(predictions, bins=20, alpha=0.7, color='#F7DC6F', edgecolor='black')
    ax3.set_title('Prediction Distribution')
    ax3.set_xlabel('Predicted Values')
    ax3.set_ylabel('Frequency')

    # Plot 4: HITL Process Flow
    ax4.axis('off')
    process_text = """
    HITL Process Flow:
    
    1. 🤖 AI Analysis
    2. 👥 Human Review  
    3. 🔄 Feedback Loop
    4. 📈 Improvement
    5. ✅ Validation
    6. 🚀 Deployment
    
    Key Benefits:
    • Increased Accuracy
    • Domain Expertise
    • Ethical Oversight
    • Continuous Learning
    """

    ax4.text(0.1, 0.9, process_text, transform=ax4.transAxes, fontsize=11,
            verticalalignment='top', fontfamily='monospace',
            bbox=dict(boxstyle="round,pad=0.3", facecolor="lightblue", alpha=0.7))

    plt.tight_layout()
    plt.show()

# Execute Final Dashboard
create_hitl_dashboard()
create_final_visualization()

# Save results for submission
# NOTE: the summary is only assembled in memory here; nothing is written to disk.
if 'model_results' in globals():
    print("\n💾 Saving results for hackathon submission...")

    submission_summary = {
        'project_theme': 'Human-in-the-Loop (HITL)',
        'problem_type': problem_type,
        'dataset_shape': df.shape if 'df' in globals() else 'Unknown',
        # default=None keeps an empty results dict from raising ValueError
        'best_model': max(model_results.keys(),
                         key=lambda x: model_results[x]['f1_score'] if problem_type == 'classification'
                                     else model_results[x]['r2_score'],
                         default=None),
        'hitl_stages_completed': 6,
        'human_ai_collaboration_demonstrated': True,
        'ready_for_production': True
    }

    print("✅ Project ready for hackathon submission!")
    print("🎯 HITL pipeline successfully demonstrated!")

else:
    print("⚠️  Run all previous cells to complete the analysis!")