# Portfolio-Quality Visualizations

This notebook creates publication-ready visualizations for use in a portfolio and in presentations.

In [None]:
import sys
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix

sys.path.append('..')
from src.models.data_loader import DataLoader

# Set style for professional plots
sns.set_style("whitegrid")
plt.rcParams['figure.dpi'] = 300
plt.rcParams['font.size'] = 11
plt.rcParams['font.family'] = 'sans-serif'

# Color palette shared by every figure in this notebook
COLORS = {
    'primary': '#2E86AB',
    'secondary': '#A23B72',
    'accent': '#F18F01',
    'success': '#06A77D',
    'warning': '#F77F00',
    'error': '#D62828'
}

# Every figure cell below saves into ../results/portfolio/. savefig does not
# create missing directories, so make sure the folder exists before any cell
# tries to write to it (otherwise a fresh checkout fails on the first save).
PORTFOLIO_DIR = Path('../results/portfolio')
PORTFOLIO_DIR.mkdir(parents=True, exist_ok=True)

# Load data: load_data() is unpacked into the three frames used by later cells
loader = DataLoader()
train_df, val_df, test_df = loader.load_data()

# Label names iterated by the model-loading cells; each is also used to build
# the model file name '<smell>_voting.pkl'.
code_smells = [
    'has_long_method',
    'has_high_complexity',
    'has_too_many_params',
    'has_deep_nesting',
    'has_no_docstring'
]

## Visualization 1: Project Overview Infographic

In [None]:
# Each card is (heading, headline value, COLORS key, caption rows), where a
# caption row is (y position, text, font size, italic?). The cards are laid out
# row by row on the 2x3 grid, in the order listed here.
overview_cards = [
    ('Dataset', '1,624', 'primary', [
        (0.55, 'Python Functions', 12, False),
        (0.4, '10 GitHub Repos', 12, False),
        (0.3, 'Clean & Labeled', 10, True),
    ]),
    ('Features', '37', 'accent', [
        (0.55, 'Engineered Features', 12, False),
        (0.4, '5 Categories', 12, False),
        (0.3, 'Complexity, Structure, Style', 10, True),
    ]),
    ('Models Trained', '35+', 'success', [
        (0.55, 'ML Experiments', 12, False),
        (0.4, 'XGBoost, RF, Ensemble', 12, False),
        (0.3, 'Tracked in MLflow', 10, True),
    ]),
    ('Performance', '96.15%', 'success', [
        (0.55, 'F1 Score (Ensemble)', 12, False),
        (0.4, '+0.82% vs Baseline', 12, False),
        (0.3, 'Ready for CodeBERT', 10, True),
    ]),
    ('Code Smells', '5', 'warning', [
        (0.55, 'Types Detected', 12, False),
        (0.4, 'Long Method, High Complexity,', 10, False),
        (0.3, 'Too Many Params, Deep Nesting,', 10, False),
        (0.2, 'No Docstring', 10, False),
    ]),
    ('Timeline', '7 Days', 'primary', [
        (0.55, 'Week 1-2 Complete', 12, False),
        (0.4, '56 Total Days Planned', 12, False),
        (0.3, 'Next: CodeBERT Training', 10, True),
    ]),
]

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('CodeGuard Project: AI-Powered Code Quality Analysis', 
             fontsize=20, fontweight='bold', y=0.98)

# axes.flat walks the grid row-major, matching the card order above.
for ax, (heading, headline, color_key, captions) in zip(axes.flat, overview_cards):
    ax.axis('off')
    ax.text(0.5, 0.9, heading, fontsize=16, ha='center', fontweight='bold')
    ax.text(0.5, 0.7, headline, fontsize=32, ha='center',
            color=COLORS[color_key], fontweight='bold')
    for y_pos, caption, font_size, italic in captions:
        if italic:
            ax.text(0.5, y_pos, caption, fontsize=font_size, ha='center', style='italic')
        else:
            ax.text(0.5, y_pos, caption, fontsize=font_size, ha='center')

plt.tight_layout()
plt.savefig('../results/portfolio/01_project_overview.png', bbox_inches='tight', dpi=300)
print("‚úÖ Saved: 01_project_overview.png")
plt.show()

## Visualization 2: Model Performance Comparison

In [None]:
# Load results
baseline_df = pd.read_csv('../results/baseline_results.csv')
ensemble_df = pd.read_csv('../results/ensemble_results.csv')

smells = baseline_df['smell'].tolist()
baseline_f1 = baseline_df['val_f1'].tolist()
ensemble_f1 = ensemble_df[ensemble_df['ensemble_type'] == 'voting']['val_f1'].tolist()

# The two bar series are paired purely by position, so they must have the same
# length; fail with a readable message instead of a matplotlib shape error.
# NOTE(review): this also assumes the voting rows are in the same smell order
# as baseline_results.csv — confirm against how ensemble_results.csv is written.
if len(ensemble_f1) != len(smells):
    raise ValueError(
        "Expected one voting-ensemble row per baseline smell: "
        f"{len(smells)} baseline rows vs {len(ensemble_f1)} voting rows"
    )

fig, ax = plt.subplots(figsize=(14, 8))

x = np.arange(len(smells))
width = 0.35

bars1 = ax.barh(x - width/2, baseline_f1, width, label='Baseline XGBoost', 
                color=COLORS['primary'], alpha=0.8)
bars2 = ax.barh(x + width/2, ensemble_f1, width, label='Optimized Ensemble', 
                color=COLORS['success'], alpha=0.8)

# Add value labels
for i, (b1, b2) in enumerate(zip(baseline_f1, ensemble_f1)):
    ax.text(b1 + 0.01, i - width/2, f'{b1:.2%}', va='center', fontsize=10)
    ax.text(b2 + 0.01, i + width/2, f'{b2:.2%}', va='center', fontsize=10, fontweight='bold')
    
    # Show relative improvement; skipped when the baseline is 0 to avoid
    # dividing by zero. The '+' format flag emits the correct sign, so a
    # regression reads '-0.5%' rather than '+-0.5%'.
    if b1 > 0:
        improvement = ((b2 - b1) / b1) * 100
        ax.text(0.85, i, f'{improvement:+.1f}%', va='center', fontsize=9, 
                color=COLORS['accent'], fontweight='bold')

ax.set_xlabel('F1 Score', fontsize=14)
ax.set_title('Model Performance: Baseline vs Optimized Ensemble', 
             fontsize=16, fontweight='bold', pad=20)
ax.set_yticks(x)
ax.set_yticklabels([s.replace('has_', '').replace('_', ' ').title() for s in smells])
ax.legend(loc='lower right', fontsize=12)
ax.set_xlim(0, 1)
ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig('../results/portfolio/02_model_performance.png', bbox_inches='tight', dpi=300)
print("‚úÖ Saved: 02_model_performance.png")
plt.show()

## Visualization 3: Optimization Journey

In [None]:
# Hard-coded average F1 per optimization step, in the order the steps are plotted.
optimization_data = {
    'Method': ['Baseline\nXGBoost', 'XGBoost\n+ SMOTE', 'XGBoost\nTuned', 
               'Random\nForest', 'Voting\nEnsemble'],
    'F1_Score': [0.9533, 0.9515, 0.9515, 0.9515, 0.9600],
    'Day': [5, 6, 6, 6, 6]
}

method_labels = optimization_data['Method']
x = range(len(method_labels))
y = optimization_data['F1_Score']
baseline_score = y[0]

fig, ax = plt.subplots(figsize=(12, 7))

# Line with markers, plus a shaded area underneath it
ax.plot(x, y, marker='o', linewidth=3, markersize=12, 
        color=COLORS['primary'], markerfacecolor=COLORS['accent'], markeredgewidth=2)
ax.fill_between(x, y, alpha=0.2, color=COLORS['primary'])

# Annotate every point with its score; every point after the first also gets
# its relative change versus the first (baseline) entry.
for position, score in enumerate(y):
    ax.text(position, score + 0.002, f'{score:.2%}', ha='center', fontsize=11, fontweight='bold')

    if position == 0:
        continue
    improvement = ((score - baseline_score) / baseline_score) * 100
    ax.text(position, score - 0.005, f'{improvement:+.2f}%', ha='center', fontsize=9, 
            color=COLORS['success'], style='italic')

ax.set_xticks(x)
ax.set_xticklabels(method_labels, fontsize=12)
ax.set_ylabel('Average F1 Score', fontsize=14)
ax.set_title('Baseline Optimization Journey (Days 5-6)', fontsize=16, fontweight='bold', pad=20)
# Zoomed y-range so the small differences between methods are visible
ax.set_ylim(0.94, 0.97)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('../results/portfolio/03_optimization_journey.png', bbox_inches='tight', dpi=300)
print("‚úÖ Saved: 03_optimization_journey.png")
plt.show()

## Visualization 4: Feature Importance Heatmap

In [None]:
# Get feature importance from all models.
# feature_importance_data holds each smell's top-10 records and decides which
# features get a heatmap row; all_importance_records holds every feature's
# importance per smell and supplies the cell values.
feature_importance_data = []
all_importance_records = []

for smell in code_smells:
    try:
        # joblib.load unpickles the file, so only load model files you trust.
        model = joblib.load(f'../models/ensemble/{smell}_voting.pkl')
        
        # Get XGBoost component (first estimator in voting ensemble)
        # NOTE(review): assumes estimators_[0] is the XGBoost member — confirm
        # against the code that builds the ensemble.
        xgb_model = model.estimators_[0]
        
        feature_names = loader.get_feature_names()
        importances = xgb_model.feature_importances_
        
        smell_label = smell.replace('has_', '').replace('_', ' ').title()
        
        # Top 10 features (argsort is ascending, so take the last ten)
        top_indices = np.argsort(importances)[-10:]
        
        for idx in top_indices:
            feature_importance_data.append({
                'smell': smell_label,
                'feature': feature_names[idx].replace('_', ' ').title(),
                'importance': importances[idx]
            })
        
        all_importance_records.extend(
            {
                'smell': smell_label,
                'feature': name.replace('_', ' ').title(),
                'importance': value
            }
            for name, value in zip(feature_names, importances)
        )
    except Exception as exc:
        # Best effort: skip a smell whose model cannot be used, but say so
        # instead of silently dropping its column from the heatmap.
        print(f"Skipping {smell}: {type(exc).__name__}: {exc}")

if not feature_importance_data:
    raise RuntimeError(
        "No feature importances could be collected; check that the "
        "../models/ensemble/<smell>_voting.pkl files exist and load correctly."
    )

importance_df = pd.DataFrame(feature_importance_data)
all_importance_df = pd.DataFrame(all_importance_records)

# Pivot for heatmap. Rows are the features that made any smell's top 10; the
# cells use each smell's real importance for that feature, so a feature that
# merely missed one smell's top 10 is not shown as 0. fill_value=0 now only
# applies where a model reported no value for a feature at all.
heatmap_rows = all_importance_df[all_importance_df['feature'].isin(importance_df['feature'])]
heatmap_data = heatmap_rows.pivot_table(
    index='feature', 
    columns='smell', 
    values='importance', 
    fill_value=0
)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(heatmap_data, annot=True, fmt='.3f', cmap='YlOrRd', 
            cbar_kws={'label': 'Feature Importance'}, ax=ax)
ax.set_title('Feature Importance Heatmap Across All Code Smells', 
             fontsize=16, fontweight='bold', pad=20)
ax.set_xlabel('Code Smell', fontsize=14)
ax.set_ylabel('Feature', fontsize=14)

plt.tight_layout()
plt.savefig('../results/portfolio/04_feature_importance_heatmap.png', bbox_inches='tight', dpi=300)
print("‚úÖ Saved: 04_feature_importance_heatmap.png")
plt.show()

## Visualization 5: Confusion Matrices Grid

In [None]:
# One confusion matrix per code smell on a 2x3 grid, filled row by row.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Confusion Matrices: Optimized Ensemble Models', 
             fontsize=18, fontweight='bold')

for idx, smell in enumerate(code_smells):
    row = idx // 3
    col = idx % 3
    ax = axes[row, col]
    
    failure_message = None
    try:
        # Load model and data
        # joblib.load unpickles the file, so only load model files you trust.
        model = joblib.load(f'../models/ensemble/{smell}_voting.pkl')
        # NOTE(review): this takes the 1st and 4th return values as the
        # validation features/labels. If prepare_data_for_smell returns
        # (X_train, X_val, X_test, y_train, y_val, y_test), these are the
        # TRAIN split — confirm the return order in DataLoader.
        X_val, _, _, y_val, _, _ = loader.prepare_data_for_smell(
            train_df, val_df, test_df, smell
        )
        
        y_pred = model.predict(X_val)
        cm = confusion_matrix(y_val, y_pred)
        
        # Plot
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                    cbar=False, square=True)
        
        smell_title = smell.replace('has_', '').replace('_', ' ').title()
        ax.set_title(smell_title, fontsize=14, fontweight='bold')
        ax.set_xlabel('Predicted', fontsize=12)
        ax.set_ylabel('Actual', fontsize=12)
        ax.set_xticklabels(['Clean', 'Issue'])
        ax.set_yticklabels(['Clean', 'Issue'])
    except FileNotFoundError:
        failure_message = 'Model not found'
    except Exception as exc:
        # Anything else (bad data shapes, prediction errors, ...) is a real
        # problem, not a missing model: report it rather than mislabel it.
        failure_message = f'Could not evaluate model\n({type(exc).__name__})'
        print(f"{smell}: {type(exc).__name__}: {exc}")
    
    if failure_message is not None:
        # Drop anything partially drawn, then centre the message in axes
        # coordinates so its position does not depend on the data limits.
        ax.clear()
        ax.text(0.5, 0.5, failure_message, ha='center', va='center',
                transform=ax.transAxes)
        ax.axis('off')

# Hide every panel that has no smell assigned to it
for unused_ax in axes.flat[len(code_smells):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.savefig('../results/portfolio/05_confusion_matrices.png', bbox_inches='tight', dpi=300)
print("‚úÖ Saved: 05_confusion_matrices.png")
plt.show()

## Visualization 6: Data Pipeline Overview

In [None]:
# Draw the pipeline as a row of labelled boxes joined by arrows
fig, ax = plt.subplots(figsize=(16, 8))
ax.axis('off')

# (box heading, headline value, caption) for each stage, left to right
stage_specs = [
    ('Data\nCollection', '3,647', 'Functions'),
    ('Automated\nLabeling', '1,624', 'Clean'),
    ('Feature\nEngineering', '37', 'Features'),
    ('Train/Val/Test\nSplit', '70/15/15', 'Split'),
    ('Model\nTraining', '35+', 'Experiments'),
    ('Best\nModel', '96.15%', 'F1 Score'),
]

# Every stage sits on the same horizontal line (y = 0.7)
stages = [
    {'name': name, 'value': value, 'desc': desc, 'y': 0.7}
    for name, value, desc in stage_specs
]

x_positions = np.linspace(0.1, 0.9, len(stages))
last_index = len(stages) - 1

for i, stage in enumerate(stages):
    x = x_positions[i]
    stage_y = stage['y']
    is_last = not (i < last_index)

    # The final stage is highlighted in the success color
    box_color = COLORS['success'] if is_last else COLORS['primary']

    # Box centered on (x, stage_y)
    ax.add_patch(
        plt.Rectangle((x-0.06, stage_y-0.15), 0.12, 0.3, 
                      facecolor=box_color, alpha=0.3, edgecolor=box_color, linewidth=2)
    )

    # Heading above, headline value in the middle, caption below
    ax.text(x, stage_y+0.1, stage['name'], ha='center', va='center', 
            fontsize=11, fontweight='bold')
    ax.text(x, stage_y, stage['value'], ha='center', va='center', 
            fontsize=16, fontweight='bold', color=box_color)
    ax.text(x, stage_y-0.08, stage['desc'], ha='center', va='center', 
            fontsize=9, style='italic')

    # Arrow from this box to the next one (none after the last box)
    if not is_last:
        ax.annotate('', xy=(x_positions[i+1]-0.07, stage_y), 
                    xytext=(x+0.07, stage_y),
                    arrowprops=dict(arrowstyle='->', lw=2, color='gray'))

ax.set_xlim(0, 1)
ax.set_ylim(0.4, 1)
ax.set_title('CodeGuard ML Pipeline: From Data to Deployment-Ready Models', 
            fontsize=18, fontweight='bold', pad=20)

plt.tight_layout()
plt.savefig('../results/portfolio/06_data_pipeline.png', bbox_inches='tight', dpi=300)
print("‚úÖ Saved: 06_data_pipeline.png")
plt.show()

## Summary

In [None]:
# Final status report: the same lines as before, emitted with a single print.
summary_lines = (
    "\n" + "="*70,
    "‚úÖ All portfolio visualizations created!",
    "="*70,
    "\nüìÅ Saved to: results/portfolio/",
    "\nGenerated Visualizations:",
    "  1. Project Overview Infographic",
    "  2. Model Performance Comparison",
    "  3. Optimization Journey",
    "  4. Feature Importance Heatmap",
    "  5. Confusion Matrices Grid",
    "  6. Data Pipeline Overview",
    "\nüé® All images are 300 DPI, publication-quality",
    "üìä Ready for portfolio, presentations, and resume",
)
print("\n".join(summary_lines))