# Executive EDA for CareerLens Datasets

This notebook performs a high-level Exploratory Data Analysis (EDA) on the datasets loaded from the `data_catalog`. The goal is to quickly assess data quality, identify potential features and targets, and generate a concise action plan for preprocessing and feature engineering.

In [None]:
import sys
import pandas as pd
from pathlib import Path

# Treat the directory two levels above the current working directory as the
# project root and make it importable, so `ml.pipelines` can be resolved.
project_root = Path().resolve().parents[1]
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

from ml.pipelines.data_catalog import load_all_datasets

# Fetch every dataset exposed by the catalog loader.
datasets = load_all_datasets()

dataset_names = ', '.join(datasets.keys())
print(f"Loaded {len(datasets)} datasets: {dataset_names}")

## Analysis Functions

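The helper functions below perform the EDA steps for each DataFrame (`analyze_dataframe`) and render the results as a markdown summary (`generate_summary_markdown`).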

In [None]:
from typing import Dict, Any, List

def analyze_dataframe(df_name: str, df: pd.DataFrame) -> Dict[str, Any]:
    """Build a per-column EDA report for a DataFrame.

    Args:
        df_name: Label stored under the report's ``"name"`` key.
        df: The frame to profile.

    Returns:
        A dict with the keys ``"name"``, ``"shape"``, ``"duplicates"`` (number
        of fully duplicated rows) and ``"columns"``.  ``"columns"`` is a list
        holding one single-entry ``{column_label: column_report}`` dict per
        column.  Every column report has ``"type"``, ``"missing_pct"`` and
        ``"is_potential_target"``.  Object, string and categorical columns
        additionally get ``"cardinality"``; non-boolean numeric columns get
        ``"stats"`` (the ``describe()`` output) and ``"outlier_count"``
        (values outside the 1.5 * IQR fences around the quartiles).
    """
    # Substrings in a column label that flag it as a candidate target.
    target_keywords = ('career', 'path', 'role', 'title')
    total_rows = len(df)

    report: Dict[str, Any] = {
        "name": df_name,
        "shape": df.shape,
        # Cast so the count is a plain int rather than a NumPy scalar.
        "duplicates": int(df.duplicated().sum()),
        "columns": [],
    }

    for col in df.columns:
        series = df[col]
        dtype = series.dtype
        col_report: Dict[str, Any] = {'type': str(dtype)}

        # Share of missing values, as a percentage rounded to two decimals.
        missing_count = series.isnull().sum()
        col_report['missing_pct'] = (
            round(float(missing_count) / total_rows * 100, 2)
            if total_rows > 0
            else 0.0
        )

        # Column labels are not guaranteed to be strings (e.g. integer
        # headers), so normalise the label before the keyword match.
        label = str(col).lower()
        col_report['is_potential_target'] = any(
            keyword in label for keyword in target_keywords
        )

        # Cardinality for text-like columns.  The isinstance check replaces
        # the deprecated ``is_categorical_dtype`` and also covers pandas'
        # dedicated string dtype.
        if pd.api.types.is_object_dtype(dtype) or isinstance(
            dtype, (pd.CategoricalDtype, pd.StringDtype)
        ):
            col_report['cardinality'] = int(series.nunique())

        # pandas classifies bool as numeric, but quartile fences carry no
        # meaning for flags, so boolean columns are skipped here.
        if pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype):
            col_report['stats'] = series.describe().to_dict()
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            # Count on the boolean mask instead of materialising the rows.
            outlier_mask = (series < lower_bound) | (series > upper_bound)
            col_report['outlier_count'] = int(outlier_mask.sum())

        report['columns'].append({col: col_report})

    return report

def generate_summary_markdown(report: Dict[str, Any]) -> str:
    """Render an analysis report as a markdown summary.

    Args:
        report: A dict shaped like the output of ``analyze_dataframe``: keys
            ``"name"``, ``"shape"``, ``"duplicates"`` and ``"columns"``, where
            ``"columns"`` is a list of ``{column_label: column_report}`` dicts.
            Each column report must provide ``"type"`` and ``"missing_pct"``;
            ``"is_potential_target"``, ``"cardinality"`` and
            ``"outlier_count"`` are optional.

    Returns:
        Markdown text with a dataset header, one table row per column, a
        Keep/Drop/Impute/Engineer recommendation list and a fixed hypotheses
        section.  A recommendation bucket with no columns is shown as
        ``None``.
    """
    # Local import: NumPy integer scalars register as Integral, so counts
    # coming straight from pandas/NumPy are recognised as well as plain ints.
    from numbers import Integral

    def _joined(items):
        # An empty bucket would otherwise render as a blank bullet.
        return ', '.join(items) if items else 'None'

    keep = []
    drop = []
    impute = []
    engineer = []

    # Collect fragments and join once instead of repeated string +=.
    parts = [
        f"### Dataset: `{report['name']}`\n\n",
        f"- **Shape**: {report['shape'][0]} rows, {report['shape'][1]} columns\n",
        f"- **Duplicates**: {report['duplicates']} rows\n\n",
        "| Column | Type | Missing % | Cardinality | Outliers | Notes |\n",
        "|--------|------|-----------|-------------|----------|-------|\n",
    ]

    for col_data in report['columns']:
        for col_name, col_report in col_data.items():
            cardinality = col_report.get('cardinality', 'N/A')
            outliers = col_report.get('outlier_count', 'N/A')
            is_target = col_report.get('is_potential_target')
            notes = []

            if is_target:
                notes.append("Potential target.")
                engineer.append(f"`{col_name}` (as target)")

            # More than half missing -> drop; any other gap -> impute.
            if col_report['missing_pct'] > 50:
                notes.append("High missing %.")
                drop.append(f"`{col_name}`")
            elif col_report['missing_pct'] > 0:
                notes.append("Needs imputation.")
                impute.append(f"`{col_name}`")

            if isinstance(cardinality, Integral) and cardinality > 50 and not is_target:
                notes.append("High cardinality.")
                engineer.append(f"`{col_name}` (embedding/grouping)")

            if isinstance(outliers, Integral) and outliers > 0:
                notes.append(f"{outliers} outliers detected.")

            # Only columns that raised no note at all are recommended as-is.
            if not notes:
                notes.append("Looks good.")
                keep.append(f"`{col_name}`")

            # A literal pipe would split the table cell, so escape it.
            cell_name = str(col_name).replace('|', '\\|')
            parts.append(
                f"| `{cell_name}` | {col_report['type']} | {col_report['missing_pct']}% "
                f"| {cardinality} | {outliers} | {' '.join(notes)} |\n"
            )

    parts.append("\n#### Recommendations:\n")
    parts.append(f"- **Keep**: {_joined(keep)}\n")
    parts.append(f"- **Drop**: {_joined(drop)}\n")
    parts.append(f"- **Impute**: {_joined(impute)}\n")
    parts.append(f"- **Engineer**: {_joined(engineer)}\n")

    parts.append("\n#### Hypotheses:\n")
    parts.append("- **Career Path**: Potential target columns seem to be related to job titles. These can be cleaned and used as labels.\n")
    parts.append("- **Skills**: Columns with high cardinality text might contain skill descriptions. These can be processed with TF-IDF or embeddings to create a skills feature set.\n")

    return ''.join(parts)

## Run Analysis and Generate Reports

Iterate through each loaded dataset, perform the EDA, and display the summary report.

In [None]:
from IPython.display import display, Markdown

# Profile each loaded dataset and show its markdown summary in the notebook.
for name, df in datasets.items():
    # Build the per-column report, convert it to markdown, then render it
    # through IPython's display machinery.
    analysis_report = analyze_dataframe(name, df)
    markdown_summary = generate_summary_markdown(analysis_report)
    display(Markdown(markdown_summary))