In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Load the three country datasets from their CSV files (read in the order listed).
df_benin, df_sierraleone, df_togo = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/benin-malanville.csv',
        '../data/sierraleone-bumbuna.csv',
        '../data/togo-dapaong_qc.csv',
    )
)


def inspect_data(df):
    """
    Print a quick quality overview of a dataset.

    Four sections are written to standard output, in order: the first
    rows, the column/dtype summary, the per-column count of missing
    values, and the descriptive statistics.

    Parameters:
    df (pd.DataFrame): The DataFrame to inspect.

    Returns:
    None
    """
    print("DataFrame Head:")
    print(df.head())
    print("\nDataFrame Info:")
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() -- that would emit a stray "None" line.
    df.info()
    print("\nMissing Values:")
    print(df.isnull().sum())
    print("\nDescriptive Statistics:")
    print(df.describe())

def clean_data(df, lower_quantile=0.01, upper_quantile=0.99):
    """
    Clean the dataset by imputing missing values and capping outliers.

    Only numeric columns are processed. For each of them, missing values
    are replaced with the column mean and the filled values are then
    capped to the [lower_quantile, upper_quantile] quantile range.
    Non-numeric columns, and numeric columns that contain no observed
    values at all, are left untouched.

    Note: the DataFrame is modified in place and the same object is
    returned; pass ``df.copy()`` to keep the original intact.

    Parameters:
    df (pd.DataFrame): The DataFrame to clean.
    lower_quantile (float): Quantile used as the lower cap (default 0.01).
    upper_quantile (float): Quantile used as the upper cap (default 0.99).

    Returns:
    pd.DataFrame: The cleaned DataFrame (the same object as ``df``).

    Raises:
    ValueError: If lower_quantile is greater than upper_quantile.
    """
    if lower_quantile > upper_quantile:
        raise ValueError("lower_quantile must not be greater than upper_quantile")

    # Restrict the work to numeric columns: taking the mean of the whole
    # frame fails when it also holds text columns (e.g. timestamps).
    for column in df.select_dtypes(include=[np.number]).columns:
        values = df[column]

        # Nothing to impute from or cap against in an all-missing column.
        if not values.notna().any():
            continue

        # Handling missing values: fill with the column mean.
        values = values.fillna(values.mean())

        # Handling outliers: cap to the requested quantile range.
        lower_bound = values.quantile(lower_quantile)
        upper_bound = values.quantile(upper_quantile)
        df[column] = values.clip(lower=lower_bound, upper=upper_bound)

    return df

def compare_before_after(df_before, df_after, columns, dataset_name):
    """
    Show how cleaning changed the selected columns.

    First prints the descriptive statistics of the selected columns for
    the raw and the cleaned frame, then draws, for every column, one figure
    with two side-by-side histograms (raw on the left, cleaned on the right).

    Parameters:
    df_before (pd.DataFrame): The DataFrame before cleaning.
    df_after (pd.DataFrame): The DataFrame after cleaning.
    columns (list): List of columns to compare.
    dataset_name (str): The name of the dataset (for plot titles).

    Returns:
    None
    """
    # Summary statistics, raw frame first and cleaned frame second.
    summaries = (
        (f"\nDescriptive Statistics Before Cleaning ({dataset_name}):", df_before),
        (f"\nDescriptive Statistics After Cleaning ({dataset_name}):", df_after),
    )
    for heading, frame in summaries:
        print(heading)
        print(frame[columns].describe())

    # One figure per column: left panel is the raw data, right panel the cleaned data.
    for column in columns:
        plt.figure(figsize=(14, 6))

        panels = (
            (1, df_before, f'{column} Distribution Before Cleaning - {dataset_name}'),
            (2, df_after, f'{column} Distribution After Cleaning - {dataset_name}'),
        )
        for position, frame, panel_title in panels:
            plt.subplot(1, 2, position)
            sns.histplot(frame[column], bins=30, kde=True)
            plt.title(panel_title)

        plt.show()

def analyze_data(df_before, dataset_name):
    """
    Run the full before/after cleaning report for one dataset.

    A copy of the input is cleaned, both versions are inspected, and the
    irradiance/module columns are compared statistically and visually.

    Parameters:
    df_before (pd.DataFrame): The DataFrame before cleaning.
    dataset_name (str): The name of the dataset (for plot titles).

    Returns:
    None
    """
    # Clean a copy so the raw frame stays available for the comparison.
    cleaned = clean_data(df_before.copy())

    # Inspect the raw frame first, then the cleaned one.
    inspections = (
        (f"\nInspection Before Cleaning - {dataset_name} Dataset:", df_before),
        (f"\nInspection After Cleaning - {dataset_name} Dataset:", cleaned),
    )
    for banner, frame in inspections:
        print(banner)
        inspect_data(frame)

    # Statistical and visual comparison of the columns of interest.
    target_columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB']
    compare_before_after(df_before, cleaned, target_columns, dataset_name)

def main():
    """Run the cleaning-impact analysis for each loaded dataset, in order."""
    labelled_frames = (
        ("Benin", df_benin),
        ("Sierra Leone", df_sierraleone),
        ("Togo", df_togo),
    )

    for label, frame in labelled_frames:
        analyze_data(frame, label)




# Impact of data cleaning

In [2]:
# Entry point: run the analysis for every dataset, but only when this code is
# executed as the main module (not when it is imported).
if __name__ == "__main__":
    main()