In [None]:
# Import required libraries
import warnings
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
warnings.filterwarnings('ignore')

# Set up plotting style for the matplotlib/seaborn figures in this notebook
plt.style.use('default')
sns.set_palette("husl")

# Define paths
# Directory holding the processed parquet files, relative to the notebook's
# working directory. `Path` comes from pathlib (imported in the imports cell).
PROCESSED_DIR = Path("../data/processed")

print(f"Processed data directory: {PROCESSED_DIR.absolute()}")
# Surface a missing directory here instead of as an empty `data` dict later on.
if not PROCESSED_DIR.is_dir():
    print("Warning: processed data directory does not exist; no datasets will be loaded")

## 1. Load Processed Data

In [None]:
# Load processed datasets
def load_processed_data(processed_dir=None):
    """Load every readable parquet dataset from the processed-data directory.

    Args:
        processed_dir: Directory to scan. Defaults to the notebook-level
            ``PROCESSED_DIR`` when omitted, which keeps the original
            zero-argument call working.

    Returns:
        dict mapping dataset name to DataFrame. ``integrated_dataset.parquet``
        is stored under the key ``'integrated'``; every other ``*.parquet``
        file is stored under its file stem with ``'_processed'`` removed.
        Files that fail to load are reported with ``print`` and skipped.
    """
    base_dir = Path(processed_dir) if processed_dir is not None else PROCESSED_DIR
    datasets = {}

    # Try to load integrated dataset
    integrated_file = base_dir / "integrated_dataset.parquet"
    if integrated_file.exists():
        try:
            datasets['integrated'] = pd.read_parquet(integrated_file)
            print(f"Loaded integrated dataset: {len(datasets['integrated'])} records")
        except Exception as e:
            print(f"Error loading integrated dataset: {e}")

    # Try to load individual processed datasets (sorted for a stable key order)
    for file_path in sorted(base_dir.glob("*.parquet")):
        # Match on the file stem only: testing the full path would skip every
        # file whenever a parent directory name happens to contain "integrated".
        if 'integrated' in file_path.stem:
            continue
        name = file_path.stem.replace('_processed', '')
        try:
            datasets[name] = pd.read_parquet(file_path)
            print(f"Loaded {name} dataset: {len(datasets[name])} records")
        except Exception as e:
            print(f"Error loading {file_path.name}: {e}")

    return datasets

# Load data
data = load_processed_data()
print(f"\nLoaded {len(data)} datasets: {list(data.keys())}")

# Display sample of each dataset
for name, df in data.items():
    print(f"\n{name.upper()} DATASET:")
    display(df.head())
    print(f"Columns: {df.columns.tolist()}")
    # Not every dataset is guaranteed to carry a 'date' column; indexing it
    # unconditionally would abort the preview loop with a KeyError.
    if 'date' in df.columns:
        print(f"Date range: {df['date'].min()} to {df['date'].max()}")
    else:
        print("Date range: n/a (no 'date' column)")

## 2. Weather Data Visualizations

In [None]:
# Weather data visualizations
if 'weather' not in data:
    print("Weather data not available for visualization")
else:
    weather_df = data['weather']

    # Line chart of temperature against date, one colored series per city
    px.line(
        weather_df,
        x='date',
        y='temperature',
        color='city',
        title='Temperature Trends by City',
        labels={'temperature': 'Temperature (°C)', 'date': 'Date'},
    ).show()

    # Temperature histogram colored by city, with a box-plot marginal
    px.histogram(
        weather_df,
        x='temperature',
        color='city',
        title='Temperature Distribution by City',
        marginal='box',
    ).show()

    # Row count for every (city, weather_main) pair, drawn as bars per city
    condition_counts = (
        weather_df
        .groupby(['city', 'weather_main'])
        .size()
        .reset_index(name='count')
    )
    px.bar(
        condition_counts,
        x='city',
        y='count',
        color='weather_main',
        title='Weather Conditions by City',
        labels={'count': 'Frequency', 'weather_main': 'Weather Condition'},
    ).show()

    # Scatter of temperature against humidity, colored by city
    px.scatter(
        weather_df,
        x='temperature',
        y='humidity',
        color='city',
        title='Temperature vs Humidity',
        labels={'temperature': 'Temperature (°C)', 'humidity': 'Humidity (%)'},
    ).show()

In [None]:
# Weather dashboard prototype
if 'weather' in data:
    weather_df = data['weather']

    # 2x2 grid of panels; each panel is declared without a secondary y-axis
    panel_titles = ('Temperature Trends', 'Humidity Distribution',
                    'Weather Conditions', 'Temperature vs Humidity')
    panel_specs = [[{"secondary_y": False} for _ in range(2)] for _ in range(2)]
    fig = make_subplots(rows=2, cols=2, subplot_titles=panel_titles, specs=panel_specs)

    # Top-left panel: one temperature line per distinct city value
    for city in weather_df['city'].unique():
        city_rows = weather_df[weather_df['city'] == city]
        temperature_line = go.Scatter(
            x=city_rows['date'],
            y=city_rows['temperature'],
            name=f'{city} Temp',
        )
        fig.add_trace(temperature_line, row=1, col=1)

    # Top-right panel: humidity histogram over all rows
    humidity_hist = go.Histogram(x=weather_df['humidity'], name='Humidity')
    fig.add_trace(humidity_hist, row=1, col=2)

    # Bottom-left panel: occurrences of each weather_main value
    condition_totals = weather_df['weather_main'].value_counts()
    condition_bars = go.Bar(
        x=condition_totals.index,
        y=condition_totals.values,
        name='Weather Conditions',
    )
    fig.add_trace(condition_bars, row=2, col=1)

    # Bottom-right panel: temperature/humidity point cloud
    temp_humidity_points = go.Scatter(
        x=weather_df['temperature'],
        y=weather_df['humidity'],
        mode='markers',
        name='Temp vs Humidity',
    )
    fig.add_trace(temp_humidity_points, row=2, col=2)

    fig.update_layout(height=800, title_text="Weather Dashboard Prototype")
    fig.show()

## 3. Retail Data Visualizations

In [None]:
# Retail data visualizations
if 'retail' not in data:
    print("Retail data not available for visualization")
else:
    retail_df = data['retail']

    # Only numeric columns are charted below
    numeric_cols = retail_df.select_dtypes(include=['number']).columns
    print(f"Numeric columns available: {numeric_cols.tolist()}")

    # Time series of the first numeric column, when a date axis is present
    if len(numeric_cols) > 0 and 'date' in retail_df.columns:
        first_metric = numeric_cols[0]
        px.line(
            retail_df,
            x='date',
            y=first_metric,
            title=f'{first_metric} Over Time',
        ).show()

    # Histogram (with box-plot marginal) for up to the first three numeric columns
    for metric in numeric_cols[:3]:
        px.histogram(
            retail_df,
            x=metric,
            title=f'{metric} Distribution',
            marginal='box',
        ).show()

    # Pairwise correlation heatmap needs at least two numeric columns
    if len(numeric_cols) > 1:
        corr_matrix = retail_df[numeric_cols].corr()
        px.imshow(
            corr_matrix,
            title='Correlation Matrix',
            labels=dict(color="Correlation"),
        ).show()

## 4. Headlines Data Visualizations

In [None]:
# Headlines data visualizations
if 'headlines' in data:
    headlines_df = data['headlines']

    # Headlines over time: number of rows per distinct 'date' value
    if 'date' in headlines_df.columns:
        daily_counts = headlines_df.groupby('date').size().reset_index(name='count')
        fig = px.line(daily_counts, x='date', y='count',
                      title='Headlines Volume Over Time',
                      labels={'count': 'Number of Headlines'})
        fig.show()

    # Source distribution (ten most frequent sources)
    if 'source' in headlines_df.columns:
        # Build a tidy two-column frame so the axis labels are keyed by real
        # column names. Passing index/values arrays made the labels depend on
        # whether the pandas version names the value_counts() index.
        source_counts = (
            headlines_df['source']
            .value_counts()
            .head(10)
            .rename_axis('source')
            .reset_index(name='count')
        )
        fig = px.bar(source_counts, x='source', y='count',
                     title='Top News Sources',
                     labels={'source': 'Source', 'count': 'Number of Headlines'})
        fig.show()

    # Title length distribution
    if 'title_length' in headlines_df.columns:
        fig = px.histogram(headlines_df, x='title_length',
                          title='Headline Title Length Distribution',
                          labels={'title_length': 'Title Length (characters)'})
        fig.show()

else:
    print("Headlines data not available for visualization")

## 5. Integrated Data Visualizations

In [None]:
# Integrated data visualizations
if 'integrated' in data:
    integrated_df = data['integrated']

    print(f"Integrated dataset columns: {integrated_df.columns.tolist()}")

    # Candidate columns are picked by substring match on the column name;
    # only the first match of each kind is plotted.
    temp_cols = [col for col in integrated_df.columns if 'temp' in col.lower()]
    sales_cols = [col for col in integrated_df.columns if any(term in col.lower() for term in ['sales', 'revenue', 'amount'])]

    # Temperature vs Sales (if both exist)
    # NOTE: trendline='ols' needs the optional statsmodels package at runtime.
    if temp_cols and sales_cols:
        fig = px.scatter(integrated_df, x=temp_cols[0], y=sales_cols[0],
                        title=f'{sales_cols[0]} vs {temp_cols[0]}',
                        trendline='ols')
        fig.show()

    # Multi-variable time series. Requires a 'date' column and at least one
    # series; otherwise the cell would raise KeyError or render an empty figure.
    if 'date' in integrated_df.columns and (temp_cols or sales_cols):
        fig = make_subplots(specs=[[{"secondary_y": True}]])

        # Add temperature on the primary (left) axis
        if temp_cols:
            fig.add_trace(
                go.Scatter(x=integrated_df['date'], y=integrated_df[temp_cols[0]],
                          name=temp_cols[0], line=dict(color='red')),
                secondary_y=False
            )

        # Add sales/revenue on the secondary (right) axis
        if sales_cols:
            fig.add_trace(
                go.Scatter(x=integrated_df['date'], y=integrated_df[sales_cols[0]],
                          name=sales_cols[0], line=dict(color='blue')),
                secondary_y=True
            )

        fig.update_layout(title_text="Temperature vs Sales Over Time")
        fig.update_xaxes(title_text="Date")
        fig.update_yaxes(title_text="Temperature (°C)", secondary_y=False)
        fig.update_yaxes(title_text="Sales", secondary_y=True)
        fig.show()
    else:
        print("Skipping time series: needs a 'date' column and a temperature or sales column")

    # Correlation analysis across all numeric columns
    numeric_cols = integrated_df.select_dtypes(include=['number']).columns
    if len(numeric_cols) > 1:
        corr_matrix = integrated_df[numeric_cols].corr()
        fig = px.imshow(corr_matrix, title='Integrated Data Correlation Matrix',
                       labels=dict(color="Correlation"))
        fig.show()

else:
    print("Integrated data not available for visualization")

## 6. Dashboard Layout Prototypes

In [None]:
# KPI Cards Prototype
def create_kpi_cards(data_dict):
    """Summarize each dataset into a small dict of KPI values.

    Args:
        data_dict: Mapping of dataset name to pandas DataFrame.

    Returns:
        dict keyed by dataset name. Each value holds:
            total_records: number of rows.
            date_range: "<min> to <max>" of the 'date' column, or "n/a" when
                the dataset has no 'date' column.
            numeric_columns: count of numeric-dtype columns.
            missing_values: total number of null cells, as a plain int.
    """
    kpis = {}

    for name, df in data_dict.items():
        # Datasets without a 'date' column used to raise KeyError here.
        if 'date' in df.columns:
            date_range = f"{df['date'].min()} to {df['date'].max()}"
        else:
            date_range = "n/a"

        kpis[name] = {
            'total_records': len(df),
            'date_range': date_range,
            'numeric_columns': len(df.select_dtypes(include=['number']).columns),
            # int() drops the numpy scalar type so the value prints and
            # serializes like an ordinary Python integer.
            'missing_values': int(df.isnull().sum().sum()),
        }

    return kpis

# Build the KPI summary for every loaded dataset
kpi_data = create_kpi_cards(data)

# Print each dataset's KPIs as indented "key: value" lines under a heading
for dataset in kpi_data:
    print(f"\n{dataset.upper()} KPIs:")
    for key, value in kpi_data[dataset].items():
        print(f"  {key}: {value}")

In [None]:
# Streamlit-style dashboard layout prototype
def create_dashboard_layout(datasets=None):
    """Prototype dashboard layout with matplotlib/seaborn.

    Draws a 3x2 grid: row 1 shows two KPI text tiles (total records, dataset
    count), row 2 shows weather panels, row 3 shows headlines panels. Any panel
    whose dataset or column is missing gets a centered notice instead of being
    left blank.

    Args:
        datasets: Mapping of dataset name to DataFrame. Defaults to the
            notebook-level ``data`` dict when omitted, so the original
            zero-argument call keeps working.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    if datasets is None:
        datasets = data  # notebook-level dict produced by the load cell

    if not datasets:
        print("No data available for dashboard")
        return

    def _placeholder(ax, message):
        """Write a centered notice on an axis that has nothing to plot."""
        ax.text(0.5, 0.5, message, transform=ax.transAxes, ha='center', va='center')

    fig, axes = plt.subplots(3, 2, figsize=(15, 12))
    fig.suptitle('ETL Dashboard Prototype', fontsize=16)

    # Row 1: KPIs
    axes[0,0].text(0.5, 0.5, f"Total Records\n{sum(len(df) for df in datasets.values())}",
                   transform=axes[0,0].transAxes, ha='center', va='center', fontsize=14)
    axes[0,0].set_title('Total Records')
    axes[0,0].axis('off')

    axes[0,1].text(0.5, 0.5, f"Datasets\n{len(datasets)}",
                   transform=axes[0,1].transAxes, ha='center', va='center', fontsize=14)
    axes[0,1].set_title('Active Datasets')
    axes[0,1].axis('off')

    # Row 2: Weather data
    if 'weather' in datasets:
        weather_df = datasets['weather']
        if 'temperature' in weather_df.columns:
            weather_df['temperature'].hist(ax=axes[1,0], bins=20, alpha=0.7)
            axes[1,0].set_title('Temperature Distribution')
            axes[1,0].set_xlabel('Temperature (°C)')
        else:
            _placeholder(axes[1,0], "No 'temperature'\ncolumn in weather data")

        if 'weather_main' in weather_df.columns:
            weather_counts = weather_df['weather_main'].value_counts()
            weather_counts.plot(kind='bar', ax=axes[1,1])
            axes[1,1].set_title('Weather Conditions')
            axes[1,1].tick_params(axis='x', rotation=45)
        else:
            _placeholder(axes[1,1], "No 'weather_main'\ncolumn in weather data")
    else:
        _placeholder(axes[1,0], 'Weather data\nnot available')
        _placeholder(axes[1,1], 'Weather data\nnot available')

    # Row 3: Headlines data
    if 'headlines' in datasets:
        headlines_df = datasets['headlines']
        if 'source' in headlines_df.columns:
            source_counts = headlines_df['source'].value_counts().head(5)
            source_counts.plot(kind='bar', ax=axes[2,0])
            axes[2,0].set_title('Top News Sources')
            axes[2,0].tick_params(axis='x', rotation=45)
        else:
            _placeholder(axes[2,0], "No 'source'\ncolumn in headlines data")

        if 'date' in headlines_df.columns:
            daily_counts = headlines_df.groupby('date').size()
            daily_counts.plot(ax=axes[2,1])
            axes[2,1].set_title('Headlines Over Time')
            axes[2,1].tick_params(axis='x', rotation=45)
        else:
            _placeholder(axes[2,1], "No 'date'\ncolumn in headlines data")
    else:
        _placeholder(axes[2,0], 'Headlines data\nnot available')
        _placeholder(axes[2,1], 'Headlines data\nnot available')

    plt.tight_layout()
    plt.show()

# Create dashboard layout
create_dashboard_layout()

## 7. Interactive Visualizations

In [None]:
# Interactive correlation explorer
def create_correlation_explorer(df, title):
    """Show a Plotly heatmap of pairwise correlations between numeric columns.

    Args:
        df: DataFrame to analyze.
        title: Label used in the heatmap title and in the skip message.

    Returns:
        None. Prints a message and returns early when fewer than two numeric
        columns exist; otherwise displays the figure.
    """
    numeric_cols = df.select_dtypes(include=['number']).columns

    if len(numeric_cols) >= 2:
        # Color scale is pinned to [-1, 1], the full range of a correlation
        heatmap = px.imshow(
            df[numeric_cols].corr(),
            title=f'{title} - Correlation Matrix',
            labels=dict(color="Correlation"),
            zmin=-1,
            zmax=1,
        )
        heatmap.show()
    else:
        print(f"Not enough numeric columns in {title} for correlation analysis")

# Render one correlation heatmap per loaded dataset, titled with its name
for dataset_name in data:
    create_correlation_explorer(data[dataset_name], dataset_name.title())

In [None]:
# Time series explorer
def create_time_series_explorer(df, name):
    """Show every numeric column of a dataset as a stacked time-series panel.

    Args:
        df: DataFrame to plot; its 'date' column is used as the x-axis.
        name: Dataset name used in the figure title and in skip messages.

    Returns:
        None. Prints a message and returns early when the 'date' column or
        numeric columns are missing; otherwise displays the figure.
    """
    if 'date' not in df:
        print(f"No date column in {name} dataset")
        return

    numeric_cols = df.select_dtypes(include=['number']).columns
    panel_count = len(numeric_cols)

    if panel_count == 0:
        print(f"No numeric columns in {name} dataset")
        return

    # One row per numeric column, all rows sharing the same x-axis
    panel_titles = [f'{col} Over Time' for col in numeric_cols]
    fig = make_subplots(rows=panel_count, cols=1,
                        subplot_titles=panel_titles,
                        shared_xaxes=True)

    # Subplot rows are 1-based
    for row_number, col in enumerate(numeric_cols, start=1):
        series_trace = go.Scatter(x=df['date'], y=df[col], name=col)
        fig.add_trace(series_trace, row=row_number, col=1)

    # Figure height grows by 300 px for each panel
    fig.update_layout(height=300*panel_count, title_text=f"{name.title()} Time Series")
    fig.show()

# Render the stacked time-series panels for each loaded dataset
for dataset_name in data:
    create_time_series_explorer(data[dataset_name], dataset_name)

## 8. Export Visualization Code

In [None]:
# Export visualization functions for dashboard
# The string below is written verbatim to a standalone module. The generated
# create_integrated_dashboard() uses trendline='ols', which needs the optional
# statsmodels package wherever that module is run.
visualization_code = '''
# Visualization functions for Streamlit dashboard
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def create_weather_dashboard(weather_df):
    """Create weather dashboard visualizations"""
    col1, col2 = st.columns(2)
    
    with col1:
        # Temperature trends
        fig = px.line(weather_df, x='date', y='temperature', color='city',
                      title='Temperature Trends')
        st.plotly_chart(fig)
    
    with col2:
        # Weather conditions
        weather_counts = weather_df['weather_main'].value_counts()
        fig = px.bar(x=weather_counts.index, y=weather_counts.values,
                     title='Weather Conditions')
        st.plotly_chart(fig)

def create_headlines_dashboard(headlines_df):
    """Create headlines dashboard visualizations"""
    col1, col2 = st.columns(2)
    
    with col1:
        # Headlines over time
        daily_counts = headlines_df.groupby('date').size().reset_index(name='count')
        fig = px.line(daily_counts, x='date', y='count',
                      title='Headlines Volume')
        st.plotly_chart(fig)
    
    with col2:
        # Top sources
        source_counts = headlines_df['source'].value_counts().head(10)
        fig = px.bar(x=source_counts.index, y=source_counts.values,
                     title='Top News Sources')
        st.plotly_chart(fig)

def create_integrated_dashboard(integrated_df):
    """Create integrated data dashboard visualizations"""
    # Temperature vs Sales correlation
    temp_cols = [col for col in integrated_df.columns if 'temp' in col.lower()]
    sales_cols = [col for col in integrated_df.columns if any(term in col.lower() for term in ['sales', 'revenue', 'amount'])]
    
    if temp_cols and sales_cols:
        fig = px.scatter(integrated_df, x=temp_cols[0], y=sales_cols[0],
                        title=f'{sales_cols[0]} vs {temp_cols[0]}',
                        trendline='ols')
        st.plotly_chart(fig)
    
    # Correlation matrix
    numeric_cols = integrated_df.select_dtypes(include=['number']).columns
    if len(numeric_cols) > 1:
        corr_matrix = integrated_df[numeric_cols].corr()
        fig = px.imshow(corr_matrix, title='Correlation Matrix')
        st.plotly_chart(fig)
'''

# Save visualization code
viz_file = Path("../scripts/visualization_functions.py")
# Create the target folder on first run; open() alone raises FileNotFoundError
# when ../scripts does not exist yet.
viz_file.parent.mkdir(parents=True, exist_ok=True)
# Explicit encoding keeps the written file independent of the platform default.
with open(viz_file, 'w', encoding='utf-8') as f:
    f.write(visualization_code)

print(f"Visualization functions exported to: {viz_file.absolute()}")

## 9. Summary and Recommendations

In [None]:
# Banner for the closing report
print("VISUALIZATION PROTOTYPING SUMMARY")
print("=" * 40)

# Per-dataset row and column counts
print(f"Datasets analyzed: {len(data)}")
for dataset_name, frame in data.items():
    print(f"  - {dataset_name}: {frame.shape[0]} records, {frame.shape[1]} columns")

# Sections whose entries are printed as check-marked lines
checked_sections = {
    "VISUALIZATION TYPES TESTED:": [
        "Line charts (time series)",
        "Bar charts (categorical data)",
        "Histograms (distributions)",
        "Scatter plots (correlations)",
        "Heatmaps (correlation matrices)",
        "Interactive Plotly charts",
        "Multi-panel dashboards",
    ],
    "DASHBOARD COMPONENTS PROTOTYPED:": [
        "KPI summary cards",
        "Weather monitoring section",
        "Retail analytics section",
        "News headlines section",
        "Integrated analysis section",
    ],
}
for heading, entries in checked_sections.items():
    print(f"\n{heading}")
    for entry in entries:
        print(f"✓ {entry}")

# Sections whose entries are printed as a 1-based numbered list
numbered_sections = {
    "RECOMMENDATIONS FOR PRODUCTION:": [
        "Use Plotly for interactive visualizations in Streamlit",
        "Implement caching for expensive computations",
        "Add date range filters to all time series charts",
        "Include data quality indicators on dashboard",
        "Add export functionality for charts",
        "Consider using themes/colors consistent with company branding",
        "Add tooltips and hover information to charts",
        "Implement responsive design for different screen sizes",
    ],
    "NEXT STEPS:": [
        "Integrate approved visualizations into dashboard.py",
        "Add user interaction features (filters, drill-downs)",
        "Test dashboard performance with large datasets",
        "Add automated chart updates when data refreshes",
        "Document visualization standards and guidelines",
    ],
}
for heading, entries in numbered_sections.items():
    print(f"\n{heading}")
    for position, entry in enumerate(entries, start=1):
        print(f"{position}. {entry}")