# Report Playbook

This notebook loads the cleaned master dataset, validates and cleans it, and saves the charts and processed CSV used to generate a report with Quarto.

In [ ]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load the data
# Paths are relative to the working directory the notebook is run from.
data_path = 'data/cleaned_master.csv'
output_dir = 'reports'

# Columns that must be present before any processing is attempted.
# NOTE(review): 'overall_scres' looks like a misspelling of 'overall_scores';
# it is kept verbatim because it has to match the CSV header — confirm.
required_columns = ['submitdate', 'reportsent', 'name', 'function', 'company_name', 'email_address', 'version', 'sector', 'subsector', 'size_number_of_employees', 'value_strategy', 'overall_scres']


def find_missing_columns(frame, required):
    """Return the names in *required* that are not columns of *frame*.

    The order of *required* is preserved; an empty list means every
    required column is present.
    """
    return [col for col in required if col not in frame.columns]


def impute_numeric_columns(frame):
    """Fill gaps in the two numeric columns of *frame* and return it.

    'size_number_of_employees' is filled with its median and
    'overall_scres' with its mean. The columns are reassigned explicitly
    instead of calling ``fillna(inplace=True)`` on a column selection,
    which is chained assignment and does not reliably update the parent
    DataFrame (it is a no-op under pandas Copy-on-Write).
    """
    frame['size_number_of_employees'] = frame['size_number_of_employees'].fillna(
        frame['size_number_of_employees'].median()
    )
    frame['overall_scres'] = frame['overall_scres'].fillna(
        frame['overall_scres'].mean()
    )
    return frame


# Any load failure is reported and leaves df as None so the rest of the
# cell is skipped instead of raising.
try:
    df = pd.read_csv(data_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: File not found at {data_path}. Please ensure the file exists.")
    df = None
except pd.errors.EmptyDataError:
    print(f"Error: The CSV file at {data_path} is empty.")
    df = None
except pd.errors.ParserError:
    print(f"Error: Could not parse the CSV file at {data_path}. Check the file format.")
    df = None

if df is not None:
    # Check for required columns
    missing_columns = find_missing_columns(df, required_columns)
    if missing_columns:
        print(f"Error: The following required columns are missing: {missing_columns}")
        df = None
    else:
        # Data Type Conversion: unparseable entries become NaN and are
        # imputed below.
        df['size_number_of_employees'] = pd.to_numeric(df['size_number_of_employees'], errors='coerce')
        df['overall_scres'] = pd.to_numeric(df['overall_scres'], errors='coerce')

        # Missing Value Handling: report the per-column NaN counts before
        # anything is filled in.
        print("\nMissing Values:")
        print(df.isnull().sum())

        # Median for employee counts, mean for overall scores.
        impute_numeric_columns(df)

        # savefig/to_csv do not create missing directories, so make sure
        # the output folder exists before writing into it.
        os.makedirs(output_dir, exist_ok=True)

        # Generate Visualizations and save to files
        plt.figure(figsize=(10, 6))
        sns.histplot(df['overall_scres'], kde=True)
        plt.title('Distribution of Overall Scores')
        plt.xlabel('Overall Score')
        plt.ylabel('Frequency')
        plt.savefig('reports/overall_score_distribution.png')
        plt.close()

        plt.figure(figsize=(10, 6))
        sns.barplot(x='sector', y='overall_scres', data=df)
        plt.title('Overall Score by Sector')
        plt.xlabel('Sector')
        plt.ylabel('Overall Score')
        # Rotate the sector labels and re-fit the layout before saving.
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        plt.savefig('reports/sector_score_distribution.png')
        plt.close()

        # Save the processed DataFrame to a CSV file
        df.to_csv('reports/processed_data.csv', index=False)

        print("Data processing and visualization complete. Processed data saved to reports/processed_data.csv")
else:
    print("Data loading failed. Report generation aborted.")


## Next Steps

Use the processed data in `reports/processed_data.csv`, together with the charts saved in `reports/`, to generate a Quarto report.