# 📊 Weekly Data Quality Report Generator

## 🎯 Overview
This notebook automates the generation of weekly data quality reports for financial transaction data. It provides comprehensive analysis, visualizations, and actionable insights for data governance teams.

**Features:**
- 📈 Automated weekly data quality scoring
- 📊 Interactive visualizations and dashboards
- 📋 Executive summary reports
- 📄 Multi-format report generation (Excel, HTML, PDF)
- 🚨 Alert system for quality issues
- 📈 Trend analysis over time

**Author:** Your Name  
**Date:** July 2025  
**Version:** 1.0  

## 📅 Report Schedule
- **Frequency:** Weekly (every Monday)
- **Coverage:** The previous full calendar week (Monday through Sunday) of transaction data
- **Distribution:** Data governance team, management, stakeholders

In [None]:
# 📚 Import Required Libraries and Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.offline as pyo
import json
import os
import sys
from datetime import datetime, timedelta, date
import warnings
warnings.filterwarnings('ignore')

# Add parent directory to path
sys.path.append('../')
from src.data_quality_framework import DataQualityFramework
from src.report_generator import DataQualityReportGenerator
from src.utils import load_config

# Set up plotting configuration: matplotlib style, seaborn palette, and
# pandas display options so wide frames are not truncated in cell output.
plt.style.use('seaborn-v0_8')
sns.set_palette("Set2")
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Initialize plotly so figures render inline in the notebook
pyo.init_notebook_mode(connected=True)

print("✅ Libraries imported successfully!")
print(f"📅 Report Generation Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Announce the window the report is actually built for: the most recent
# completed Monday-to-Sunday week (the configuration cell computes the same
# range). A rolling "today minus 7 days to today" span covers 8 calendar days
# and would not match the period shown in the generated reports.
_today = date.today()
_last_week_monday = _today - timedelta(days=_today.weekday() + 7)
_last_week_sunday = _last_week_monday + timedelta(days=6)
print(f"📊 Weekly Report Period: {_last_week_monday} to {_last_week_sunday}")

In [None]:
# ⚙️ Load Configuration and Initialize Framework
print("⚙️ Loading Configuration...")

# Core objects used by every later cell: the validation engine and the
# report writer.
framework = DataQualityFramework()
report_generator = DataQualityReportGenerator()

# Read the JSON settings file
config = load_config("../config/data_quality_config.json")
print("✅ Configuration loaded successfully")

# Reporting window: Monday through Sunday of the week before the current one.
# weekday() is 0 on Monday, so subtracting weekday() + 7 days lands on the
# previous week's Monday.
today = date.today()
days_back_to_last_week_monday = today.weekday() + 7
week_start = today - timedelta(days=days_back_to_last_week_monday)
week_end = week_start + timedelta(days=6)

print(f"📅 Report Period: {week_start} to {week_end}")
print(f"📊 Data Quality Thresholds:")
# Each threshold is looked up in the config and falls back to a built-in
# default when the key (or the whole section) is absent.
for label, setting, fallback in (
    ("Critical", "critical_pass_rate", 0.95),
    ("Warning", "warning_pass_rate", 0.90),
):
    rate = config.get('data_quality_thresholds', {}).get(setting, fallback)
    print(f"   • {label}: ≥{rate*100:.0f}%")

# Make sure the output folders exist before anything is written to them
for out_dir in ("../data/reports", "../data/failed_records"):
    os.makedirs(out_dir, exist_ok=True)

print("📁 Output directories ready")

In [None]:
# 📂 Load Transaction Data and Execute Validation
print("📂 Loading Weekly Transaction Data...")

# Sample extract; the whole file is read with no date filtering applied
data_file = "../data/sample_transactions.csv"

# Guard clause: stop immediately when the extract is missing
if not os.path.exists(data_file):
    print("❌ Sample data not found. Please run the main data quality script first.")
    raise FileNotFoundError("Sample data file not found")

df = pd.read_csv(data_file)
print(f"✅ Loaded {len(df):,} transactions")

# Production note: restrict the frame to the report week, e.g.
#   df['timestamp'] = pd.to_datetime(df['timestamp'])
#   df = df[(df['timestamp'].dt.date >= week_start) & (df['timestamp'].dt.date <= week_end)]

# Quick profile of the loaded frame
row_count = len(df)
print(f"\n📊 Data Overview:")
print(f"   • Records: {row_count:,}")

# min/max are taken on the column exactly as read from the CSV
earliest = df['timestamp'].min()
latest = df['timestamp'].max()
print(f"   • Date Range: {earliest} to {latest}")

column_names = list(df.columns)
print(f"   • Columns: {column_names}")

total_value = df['amount'].sum()
print(f"   • Total Transaction Value: ${total_value:,.2f}")

# Run every validation check against the loaded frame
print(f"\n🔍 Executing Data Quality Validation...")
validation_results = framework.run_all_validations(df)

print(f"✅ Validation completed!")
headline_rate = validation_results.get('overall_pass_rate', 0)
print(f"📈 Overall Pass Rate: {headline_rate:.2%}")
headline_status = framework.summary_stats.get('quality_status', 'UNKNOWN')
print(f"🏆 Quality Status: {headline_status}")

In [None]:
# 📊 Create Interactive Data Quality Dashboard
#
# Builds a 2x2 plotly figure from the validation summary:
#   (1,1) pass rate per validation check, colour-coded by threshold
#   (1,2) share of failed records per check (only drawn when failures exist)
#   (2,1) gauge of the overall pass rate against the 95% target
#   (2,2) a placeholder trend line built from synthetic data
# and then displays the summary table underneath.
print("📊 Creating Interactive Data Quality Dashboard...")

# Prepare data for visualization.
# 'Pass_Rate' is treated as text with a trailing '%'; a numeric copy is added
# for plotting and threshold comparisons.
summary_df = framework.get_validation_summary()
summary_df['Pass_Rate_Numeric'] = summary_df['Pass_Rate'].str.rstrip('%').astype(float)

# Create comprehensive dashboard
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=('Pass Rates by Validation Check', 'Failed Records Distribution',
                   'Quality Score Gauge', 'Trend Analysis'),
    specs=[[{"type": "bar"}, {"type": "pie"}],
           [{"type": "indicator"}, {"type": "scatter"}]]
)

# 1. Pass Rates Bar Chart
# green at >= 95%, orange at >= 90%, red below that
colors = ['green' if rate >= 95 else 'orange' if rate >= 90 else 'red'
          for rate in summary_df['Pass_Rate_Numeric']]

fig.add_trace(
    go.Bar(
        x=summary_df['Validation_Check'],
        y=summary_df['Pass_Rate_Numeric'],
        name='Pass Rate (%)',
        marker_color=colors,
        text=summary_df['Pass_Rate'],
        textposition='outside'
    ),
    row=1, col=1
)

# 2. Failed Records Pie Chart
# Skipped entirely when no check has failures, leaving that panel empty.
failed_data = summary_df[summary_df['Failed_Count'] > 0]
if not failed_data.empty:
    fig.add_trace(
        go.Pie(
            labels=failed_data['Validation_Check'],
            values=failed_data['Failed_Count'],
            name="Failed Records"
        ),
        row=1, col=2
    )

# 3. Overall Quality Score Gauge
# overall_pass_rate is a fraction, so scale it to a percentage for display.
overall_score = validation_results.get('overall_pass_rate', 0) * 100
fig.add_trace(
    go.Indicator(
        mode = "gauge+number+delta",
        value = overall_score,
        domain = {'x': [0, 1], 'y': [0, 1]},
        title = {'text': "Quality Score"},
        delta = {'reference': 95},
        gauge = {
            'axis': {'range': [None, 100]},
            'bar': {'color': "darkblue"},
            'steps': [
                {'range': [0, 90], 'color': "lightgray"},
                {'range': [90, 95], 'color': "yellow"},
                {'range': [95, 100], 'color': "lightgreen"}
            ],
            'threshold': {
                'line': {'color': "red", 'width': 4},
                'thickness': 0.75,
                'value': 95
            }
        }
    ),
    row=2, col=1
)

# 4. Mock Trend Analysis (would be real historical data in production)
# NOTE: these points are random draws centred on this week's score, and no
# seed is set in this cell, so the line changes on every run. It is a layout
# placeholder, not a measurement.
dates = pd.date_range(start=week_start - timedelta(days=21), end=week_end, freq='D')
mock_scores = np.random.normal(overall_score, 5, len(dates))
mock_scores = np.clip(mock_scores, 80, 100)  # Keep realistic range

fig.add_trace(
    go.Scatter(
        x=dates,
        y=mock_scores,
        mode='lines+markers',
        name='Daily Quality Score',
        line=dict(color='blue', width=2)
    ),
    row=2, col=2
)

# Update layout
fig.update_layout(
    height=800,
    title_text=f"📊 Weekly Data Quality Dashboard - {week_start} to {week_end}",
    title_x=0.5,
    showlegend=False
)

# Update axes
fig.update_xaxes(title_text="Validation Checks", row=1, col=1)
# The bar labels are drawn outside (above) the bars. With the axis capped at
# exactly 100 the labels of checks at or near 100% were cut off at the top of
# the panel, so leave headroom above 100.
fig.update_yaxes(title_text="Pass Rate (%)", range=[0, 110], row=1, col=1)
fig.update_xaxes(title_text="Date", row=2, col=2)
fig.update_yaxes(title_text="Quality Score", range=[80, 100], row=2, col=2)

fig.show()

# Display summary table
print(f"\n📋 Weekly Data Quality Summary:")
display(summary_df[['Validation_Check', 'Total_Records', 'Passed_Count', 'Failed_Count', 'Pass_Rate', 'Status']])

In [None]:
# 📋 Generate Executive Summary and Key Insights
#
# Collects the headline figures from the validation run, prints them, derives
# one insight per validation check (plus volume and currency notes), and
# classifies the overall risk level.
print("📋 Generating Executive Summary...")

# Headline figures; each falls back to a default when the key is absent
total_records = validation_results.get('total_input_records', len(df))
passed_records = validation_results.get('total_passed_records', 0)
failed_records = validation_results.get('total_failed_records', 0)
overall_pass_rate = validation_results.get('overall_pass_rate', 0)
quality_status = framework.summary_stats.get('quality_status', 'UNKNOWN')

# Assemble the summary entry by entry; insertion order is the display order
executive_summary = {}
executive_summary['📅 Report Period'] = (
    f"{week_start.strftime('%B %d, %Y')} - {week_end.strftime('%B %d, %Y')}"
)
executive_summary['📊 Total Records Processed'] = f"{total_records:,}"
executive_summary['✅ Records Passed'] = f"{passed_records:,}"
executive_summary['❌ Records Failed'] = f"{failed_records:,}"
executive_summary['📈 Overall Pass Rate'] = f"{overall_pass_rate:.2%}"
executive_summary['🏆 Quality Status'] = quality_status
executive_summary['💰 Total Transaction Value'] = f"${df['amount'].sum():,.2f}"
unique_currencies = df['currency'].nunique()
executive_summary['🌍 Currencies Processed'] = f"{unique_currencies} unique currencies"
executive_summary['🏦 Accounts Involved'] = f"{df['account_id'].nunique():,} unique accounts"

print("🎯 EXECUTIVE SUMMARY")
print("=" * 50)
for label, figure in executive_summary.items():
    print(f"{label}: {figure}")

# Findings and follow-up actions accumulated below
insights = []
recommendations = []

# One insight per validation check; checks under 95% also get a recommendation
for _, row in summary_df.iterrows():
    check_name = row['Validation_Check']
    pass_rate = row['Pass_Rate_Numeric']
    failed_count = row['Failed_Count']
    failure_pct = 100 - pass_rate

    if pass_rate < 90:
        insights.append(f"🔴 {check_name} shows critical issues with {failure_pct:.1f}% failure rate")
        recommendations.append(f"URGENT: Address {check_name.lower()} issues immediately")
        continue

    if pass_rate < 95:
        insights.append(f"🟡 {check_name} needs attention with {failure_pct:.1f}% failure rate")
        recommendations.append(f"Monitor {check_name.lower()} closely and implement improvements")
        continue

    insights.append(f"🟢 {check_name} performing excellently with {pass_rate:.1f}% pass rate")

# Data volume note: more than 1000 records counts as high volume
volume_note = (
    f"📈 High data volume processed: {total_records:,} records"
    if total_records > 1000
    else f"📊 Standard data volume: {total_records:,} records"
)
insights.append(volume_note)

# Currency note: only raised when more than 10 distinct currencies appear
if unique_currencies > 10:
    insights.append(f"🌍 High currency diversity: {unique_currencies} different currencies")
    recommendations.append("Review currency validation rules for completeness")

print(f"\n💡 KEY INSIGHTS:")
print("-" * 30)
for position, insight in enumerate(insights, 1):
    print(f"{position}. {insight}")

print(f"\n🎯 RECOMMENDATIONS:")
print("-" * 30)
for position, rec in enumerate(recommendations, 1):
    print(f"{position}. {rec}")

# Quality assessment based on this week's overall pass rate
print(f"\n📈 QUALITY TRENDS:")
print("-" * 30)

# The trend wording and the risk level share the same two cut-offs
# (95% and 90%), so classify once and look both messages up.
if overall_pass_rate >= 0.95:
    risk_level = "LOW"
elif overall_pass_rate >= 0.90:
    risk_level = "MEDIUM"
else:
    risk_level = "HIGH"

trend_by_risk = {
    "LOW": "🟢 Quality trend: EXCELLENT - Maintain current standards",
    "MEDIUM": "🟡 Quality trend: WARNING - Implement improvement plan",
    "HIGH": "🔴 Quality trend: CRITICAL - Immediate action required",
}
print(trend_by_risk[risk_level])

# Risk assessment
print(f"⚠️ Risk Level: {risk_level}")

# Only the two elevated levels carry a follow-up message
follow_up_by_risk = {
    "HIGH": "🚨 ALERT: Data quality issues may impact business operations",
    "MEDIUM": "⚠️ CAUTION: Monitor data quality closely to prevent issues",
}
follow_up = follow_up_by_risk.get(risk_level)
if follow_up is not None:
    print(follow_up)

In [None]:
# 🔍 Failed Records Analysis and Pattern Detection
#
# Walks the per-check frames of failed rows held by the framework, prints
# counts and simple patterns for each, charts the failure counts, and ends
# with a per-check breakdown of the quality score.
print("🔍 Analyzing Failed Records Patterns...")

# Ask the framework to persist its failed records
framework.save_failed_records()

# check name -> {'count', 'percentage', 'patterns'} for checks with failures
failed_analysis = {}
total_failed_records = 0

print(f"\n📊 FAILED RECORDS ANALYSIS:")
print("=" * 40)

# `or {}` keeps the loop safe when the framework holds no failed records
for check_name, failed_df in (framework.failed_records or {}).items():
    failure_count = len(failed_df)
    if failure_count == 0:
        continue

    total_failed_records += failure_count

    # Share of all processed records; guarded so a zero total cannot raise
    failure_share = failure_count / total_records * 100 if total_records else 0.0

    print(f"\n🔍 {check_name.replace('_', ' ').title()}:")
    print(f"   • Failed Records: {failure_count:,}")
    print(f"   • Percentage of Total: {failure_share:.2f}%")

    # Pattern analysis: only three checks have a dedicated breakdown, and
    # each is skipped when the column it needs is missing from the frame.
    patterns = {}

    if check_name == 'currency_codes' and 'currency' in failed_df.columns:
        invalid_currencies = failed_df['currency'].value_counts().head(5)
        patterns['Top Invalid Currencies'] = invalid_currencies.to_dict()

    elif check_name == 'amount_range' and 'amount' in failed_df.columns:
        patterns['Amount Statistics'] = {
            'Min': failed_df['amount'].min(),
            'Max': failed_df['amount'].max(),
            'Mean': failed_df['amount'].mean()
        }

    elif check_name == 'duplicate_transactions' and 'transaction_id' in failed_df.columns:
        duplicate_counts = failed_df['transaction_id'].value_counts()
        patterns['Most Duplicated IDs'] = duplicate_counts.head(3).to_dict()

    # Display patterns
    if patterns:
        print(f"   📈 Patterns Detected:")
        for pattern_name, pattern_data in patterns.items():
            print(f"      • {pattern_name}: {pattern_data}")

    failed_analysis[check_name] = {
        'count': failure_count,
        'percentage': failure_share,
        'patterns': patterns
    }

if total_failed_records == 0:
    # Covers both "no entries at all" and "entries exist but every frame is
    # empty"; previously the second case printed nothing under the header.
    print("🎉 No failed records found! All data passed validation.")
else:
    # Create failed records visualization
    print(f"\n📊 Creating Failed Records Visualization...")

    # Prepare data for visualization
    check_names = list(failed_analysis.keys())
    failure_counts = [failed_analysis[check]['count'] for check in check_names]

    # Create bar chart of failures
    fig_failures = go.Figure()

    fig_failures.add_trace(go.Bar(
        x=[name.replace('_', ' ').title() for name in check_names],
        y=failure_counts,
        marker_color='red',
        text=failure_counts,
        textposition='outside'
    ))

    fig_failures.update_layout(
        title="Failed Records by Validation Check",
        xaxis_title="Validation Check",
        yaxis_title="Number of Failed Records",
        height=400
    )

    fig_failures.show()

    # Time-based analysis (if timestamp available)
    # NOTE(review): the figures below are computed over every row of `df`,
    # not only the failed rows, so they describe transaction volume rather
    # than failure timing. The helper columns are added to `df` itself.
    if 'timestamp' in df.columns:
        print(f"\n⏰ Time-based Failure Analysis:")

        # Convert timestamp to datetime
        df['timestamp_dt'] = pd.to_datetime(df['timestamp'])
        df['hour'] = df['timestamp_dt'].dt.hour
        df['day_of_week'] = df['timestamp_dt'].dt.day_name()

        # Row counts by hour of day
        hourly_analysis = df.groupby('hour').size()
        print(f"   • Peak transaction hour: {hourly_analysis.idxmax()}:00 ({hourly_analysis.max()} transactions)")

        # Row counts by day of week
        daily_analysis = df.groupby('day_of_week').size()
        print(f"   • Busiest day: {daily_analysis.idxmax()} ({daily_analysis.max()} transactions)")

print(f"\n💾 Failed Records Storage:")
print(f"   • Location: ../data/failed_records/")
print(f"   • Total Failed Records: {total_failed_records:,}")
print(f"   • Files Generated: {len(framework.failed_records) if framework.failed_records else 0}")

# Data quality score calculation details: each check's shortfall from 100%
# is listed individually; the final line is the overall pass rate.
print(f"\n🏆 QUALITY SCORE BREAKDOWN:")
print("-" * 35)
print(f"Base Score: 100%")
for check_name, result in framework.validation_results.items():
    deduction = (1 - result['pass_rate']) * 100
    print(f"{check_name.replace('_', ' ').title()}: -{deduction:.1f}% (Pass Rate: {result['pass_rate']:.1%})")

print(f"Final Score: {overall_pass_rate:.1%}")

In [None]:
# 📄 Generate and Export Weekly Reports
#
# Writes the stakeholder-facing outputs for the report week:
#   - whatever the report generator produces (best effort)
#   - a plain-text executive summary
#   - a self-contained HTML dashboard
#   - a CSV copy of the validation summary with report metadata
# Each output is wrapped in its own try/except so one failure does not stop
# the remaining files from being written.
print("📄 Generating Automated Weekly Reports...")

# Generate comprehensive reports using the report generator
try:
    report_files = report_generator.generate_weekly_report(
        validation_summary=summary_df,
        overall_stats=framework.summary_stats,
        failed_records=framework.failed_records
    )

    print("✅ Reports generated successfully!")

except Exception as e:
    print(f"⚠️ Report generation failed: {e}")
    report_files = {}

# Create additional formatted reports for stakeholders
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # not used further in this cell
week_str = f"{week_start.strftime('%Y%m%d')}_{week_end.strftime('%Y%m%d')}"

# 1. Executive Summary Report (Text format)
exec_report_file = f"../data/reports/Executive_Summary_{week_str}.txt"
try:
    # The summary keys and insights contain emoji. Without an explicit
    # encoding the platform default is used, and on non-UTF-8 locales the
    # write fails, so pin UTF-8 (the HTML dashboard below does the same).
    with open(exec_report_file, 'w', encoding='utf-8') as f:
        f.write("=" * 60 + "\n")
        f.write("WEEKLY DATA QUALITY REPORT - EXECUTIVE SUMMARY\n")
        f.write("=" * 60 + "\n\n")

        for key, value in executive_summary.items():
            f.write(f"{key}: {value}\n")

        f.write(f"\nKEY INSIGHTS:\n")
        f.write("-" * 20 + "\n")
        for i, insight in enumerate(insights, 1):
            f.write(f"{i}. {insight}\n")

        f.write(f"\nRECOMMENDATIONS:\n")
        f.write("-" * 20 + "\n")
        for i, rec in enumerate(recommendations, 1):
            f.write(f"{i}. {rec}\n")

        f.write(f"\nQUALITY STATUS: {quality_status}\n")
        f.write(f"RISK LEVEL: {risk_level}\n")

        f.write("\n" + "=" * 60 + "\n")
        f.write(f"Report generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write("Data Quality Framework v1.0\n")

    print(f"✅ Executive summary saved: {exec_report_file}")

except Exception as e:
    print(f"⚠️ Could not save executive summary: {e}")

# 2. Create Stakeholder Dashboard (HTML)
dashboard_file = f"../data/reports/Stakeholder_Dashboard_{week_str}.html"
try:
    # CSS accent shared by the score card and the risk card, so the score
    # card no longer shows the green "excellent" accent when risk is elevated.
    risk_class = 'critical' if risk_level == 'HIGH' else 'warning' if risk_level == 'MEDIUM' else 'excellent'

    # Doubled braces in the <style> section are literal braces in the f-string.
    # The charset declaration matches the UTF-8 encoding used when the file
    # is written, so browsers decode the emoji correctly.
    dashboard_html = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>Weekly Data Quality Dashboard</title>
        <style>
            body {{ font-family: 'Arial', sans-serif; margin: 20px; background-color: #f5f5f5; }}
            .container {{ max-width: 1200px; margin: 0 auto; background: white; padding: 20px; border-radius: 10px; box-shadow: 0 0 10px rgba(0,0,0,0.1); }}
            .header {{ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 10px; margin-bottom: 20px; }}
            .metric-card {{ background: #f8f9fa; border-left: 4px solid #007bff; padding: 15px; margin: 10px 0; border-radius: 5px; }}
            .excellent {{ border-left-color: #28a745; }}
            .warning {{ border-left-color: #ffc107; }}
            .critical {{ border-left-color: #dc3545; }}
            .grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px; margin: 20px 0; }}
            table {{ width: 100%; border-collapse: collapse; margin: 20px 0; }}
            th, td {{ border: 1px solid #ddd; padding: 12px; text-align: left; }}
            th {{ background-color: #f2f2f2; font-weight: bold; }}
            .status-excellent {{ background-color: #d4edda; color: #155724; }}
            .status-warning {{ background-color: #fff3cd; color: #856404; }}
            .status-critical {{ background-color: #f8d7da; color: #721c24; }}
        </style>
    </head>
    <body>
        <div class="container">
            <div class="header">
                <h1>📊 Weekly Data Quality Dashboard</h1>
                <h3>Report Period: {week_start.strftime('%B %d, %Y')} - {week_end.strftime('%B %d, %Y')}</h3>
                <p>Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
            </div>
            
            <div class="grid">
                <div class="metric-card {risk_class}">
                    <h3>📈 Overall Quality Score</h3>
                    <h2>{overall_pass_rate:.1%}</h2>
                    <p>Status: {quality_status}</p>
                </div>
                <div class="metric-card">
                    <h3>📊 Records Processed</h3>
                    <h2>{total_records:,}</h2>
                    <p>✅ Passed: {passed_records:,} | ❌ Failed: {failed_records:,}</p>
                </div>
                <div class="metric-card">
                    <h3>💰 Transaction Value</h3>
                    <h2>${df['amount'].sum():,.2f}</h2>
                    <p>Average: ${df['amount'].mean():.2f}</p>
                </div>
                <div class="metric-card {risk_class}">
                    <h3>⚠️ Risk Level</h3>
                    <h2>{risk_level}</h2>
                    <p>Quality threshold monitoring</p>
                </div>
            </div>
            
            <h2>📋 Detailed Validation Results</h2>
            <table>
                <tr>
                    <th>Validation Check</th>
                    <th>Total Records</th>
                    <th>Passed</th>
                    <th>Failed</th>
                    <th>Pass Rate</th>
                    <th>Status</th>
                </tr>
    """

    # One table row per validation check; the cell colour is chosen from the
    # marker found in the Status text.
    for _, row in summary_df.iterrows():
        status_class = 'excellent' if '✅' in str(row['Status']) else 'warning' if '⚠️' in str(row['Status']) else 'critical'
        dashboard_html += f"""
                <tr>
                    <td>{row['Validation_Check']}</td>
                    <td>{row['Total_Records']:,}</td>
                    <td>{row['Passed_Count']:,}</td>
                    <td>{row['Failed_Count']:,}</td>
                    <td>{row['Pass_Rate']}</td>
                    <td class="status-{status_class}">{row['Status']}</td>
                </tr>
        """

    dashboard_html += f"""
            </table>
            
            <h2>💡 Key Recommendations</h2>
            <ul>
    """

    for rec in recommendations:
        dashboard_html += f"<li>{rec}</li>"

    dashboard_html += f"""
            </ul>
            
            <div style="margin-top: 40px; padding: 20px; background-color: #e9ecef; border-radius: 5px;">
                <p><strong>Next Review:</strong> {(week_end + timedelta(days=7)).strftime('%B %d, %Y')}</p>
                <p><strong>Contact:</strong> Data Quality Team</p>
                <p><strong>Framework:</strong> Data Quality Framework v1.0</p>
            </div>
        </div>
    </body>
    </html>
    """

    with open(dashboard_file, 'w', encoding='utf-8') as f:
        f.write(dashboard_html)

    print(f"✅ Stakeholder dashboard saved: {dashboard_file}")

except Exception as e:
    print(f"⚠️ Could not create dashboard: {e}")

# 3. Generate CSV summary for data analysis
csv_summary_file = f"../data/reports/Weekly_Summary_{week_str}.csv"
try:
    # Copy the summary and stamp every row with the report-level metadata
    summary_export = summary_df.copy()
    summary_export['Report_Week'] = f"{week_start} to {week_end}"
    summary_export['Generated_At'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    summary_export['Overall_Pass_Rate'] = f"{overall_pass_rate:.2%}"
    summary_export['Quality_Status'] = quality_status
    summary_export['Risk_Level'] = risk_level

    summary_export.to_csv(csv_summary_file, index=False)
    print(f"✅ CSV summary saved: {csv_summary_file}")

except Exception as e:
    print(f"⚠️ Could not save CSV summary: {e}")

print(f"\n📁 All Reports Saved To: ../data/reports/")
print(f"📄 Report Formats: Excel, HTML, CSV, TXT")
print(f"📊 Dashboard Available: {dashboard_file}")

# Summary of generated files
print(f"\n📋 GENERATED FILES SUMMARY:")
print("-" * 35)
print(f"1. Executive Summary: Executive_Summary_{week_str}.txt")
print(f"2. Stakeholder Dashboard: Stakeholder_Dashboard_{week_str}.html")
print(f"3. CSV Summary: Weekly_Summary_{week_str}.csv")
if report_files:
    # Continue the numbering after the three fixed entries above instead of
    # labelling every generator output "4."
    for number, (file_type, file_path) in enumerate(report_files.items(), start=4):
        print(f"{number}. {file_type}: {os.path.basename(file_path) if file_path else 'Not generated'}")

print(f"\n🎯 NEXT STEPS:")
print("1. Review failed records in ../data/failed_records/")
print("2. Share dashboard with stakeholders")
print("3. Implement recommended actions")
print("4. Schedule next week's report generation")
print("5. Monitor quality trends")

print(f"\n✅ Weekly Report Generation Completed Successfully! ✅")