# Import Important Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import numpy as np
import os


# Data Loading and Inspection

In [None]:
# Load each site's CSV file into its own DataFrame
# (paths are relative to the notebook's working directory)
df_benin = pd.read_csv('../data/benin-malanville.csv')
df_sierraleone = pd.read_csv('../data/sierraleone-bumbuna.csv')
df_togo = pd.read_csv('../data/togo-dapaong_qc.csv')

# Function to perform EDA on a single dataframe
def inspect_dataframe(df, name):
    """
    Print a basic exploratory overview of a single DataFrame.

    The report contains, in order: the first five rows, the structural
    info, summary statistics, per-column missing-value counts, per-column
    unique-value counts and the number of duplicated rows.

    Parameters:
    df (pd.DataFrame): The DataFrame to inspect.
    name (str): Label used in the printed headings.

    Returns:
    None
    """
    print(f"--- Inspection for {name} ---\n")

    # Print the first few rows of the dataframe
    print("First 5 rows:")
    print(df.head())
    print("\n")

    # Print the basic info about the dataframe.
    # df.info() writes its report to stdout itself and returns None, so it
    # must not be wrapped in print() (that would emit a stray "None" line).
    print("DataFrame Info:")
    df.info()
    print("\n")

    # Print the summary statistics for numerical columns
    print("Summary Statistics:")
    print(df.describe())
    print("\n")

    # Print the number of missing values in each column
    print("Missing Values:")
    print(df.isnull().sum())
    print("\n")

    # Print the number of unique values for each column
    print("Unique Values in Each Column:")
    print(df.nunique())
    print("\n")

    # Check for duplicate rows
    print(f"Number of duplicate rows in {name}: {df.duplicated().sum()}")
    print("\n" + "-"*50 + "\n")

In [None]:
# Print the inspection report for the Benin DataFrame
inspect_dataframe(df_benin, "df_benin")

In [None]:
# Print the inspection report for the Sierra Leone DataFrame
inspect_dataframe(df_sierraleone, "df_sierraleone")

In [None]:
# Print the inspection report for the Togo DataFrame
inspect_dataframe(df_togo, "df_togo")

# Data quality checks

In [None]:

# Define a function for data quality checks
def data_quality_checks(df, name):
    """
    Print a data-quality report for one DataFrame.

    The report lists columns with missing values, the duplicate-row count,
    the column dtypes, the distinct values of every object column, an
    IQR-based outlier count for every float64/int64 column, the number of
    distinct values per object column, and a warning for every
    float64/int64 column whose minimum is negative.

    Parameters:
    df (pd.DataFrame): The DataFrame to check.
    name (str): Label used in the printed headings.

    Returns:
    None
    """
    # Column groups reused by several sections of the report
    text_columns = df.select_dtypes(include='object').columns
    number_columns = df.select_dtypes(include=['float64', 'int64']).columns

    print(f"--- Data Quality Checks for {name} ---\n")

    # Only columns that actually contain nulls are listed
    print("Missing Values:")
    null_counts = df.isnull().sum()
    print(null_counts[null_counts > 0])
    print("\n")

    # Fully duplicated rows
    duplicate_count = df.duplicated().sum()
    print(f"Number of duplicate rows in {name}: {duplicate_count}")
    print("\n")

    # Column dtypes, shown for manual review
    print("Data Types:")
    print(df.dtypes)
    print("\n")

    # Distinct values of object columns, to spot inconsistent entries
    print("Inconsistent Data Checks:")
    for text_column in text_columns:
        print(f"Unique values in column '{text_column}':")
        print(df[text_column].unique())
        print("\n")

    # Values beyond 1.5 * IQR from the quartiles count as outliers
    print("Outliers Check:")
    for number_column in number_columns:
        values = df[number_column]
        lower_quartile = values.quantile(0.25)
        upper_quartile = values.quantile(0.75)
        spread = upper_quartile - lower_quartile
        too_low = values < (lower_quartile - 1.5 * spread)
        too_high = values > (upper_quartile + 1.5 * spread)
        outlier_count = int((too_low | too_high).sum())
        print(f"Outliers in column '{number_column}': {outlier_count}")
    print("\n")

    # Cardinality of each object column
    print("Unique Values in Categorical Columns:")
    for text_column in text_columns:
        distinct_count = df[text_column].nunique()
        print(f"Unique values in column '{text_column}': {distinct_count}")
    print("\n")

    # Range check: flag numeric columns that contain negative values
    for number_column in number_columns:
        has_negative = df[number_column].min() < 0
        if has_negative:
            print(f"Warning: Negative values detected in '{number_column}'")

    print("\n" + "-"*50 + "\n")

In [None]:
# Print the data-quality report for the Benin DataFrame
data_quality_checks(df_benin, "df_benin")

In [None]:
# Print the data-quality report for the Sierra Leone DataFrame
data_quality_checks(df_sierraleone, "df_sierraleone")

In [None]:
# Print the data-quality report for the Togo DataFrame
data_quality_checks(df_togo, "df_togo")

# Box Plotting

In [None]:
def create_boxplot(df, columns, title):
    """
    Draw one box per selected column on a single figure and show it.

    Parameters:
    df (pd.DataFrame): Source data.
    columns (list): Names of the columns to draw, one box each.
    title (str): Text placed above the plot.

    Returns:
    None
    """
    # A fresh 10x6 figure keeps this plot separate from earlier ones
    plt.figure(figsize=(10, 6))

    # Passing a column subset makes seaborn draw one box per column
    selected = df[columns]
    sns.boxplot(data=selected)

    # Axis labelling and title
    plt.xlabel('Variables')
    plt.ylabel('Values')
    plt.title(title)

    plt.show()


In [None]:
# Box plot of the GHI, DNI, DHI, ModA and ModB columns of df_benin
create_boxplot(
    df_benin, 
    ['GHI', 'DNI', 'DHI', 'ModA', 'ModB'], 
    'GHI, DNI, DHI, ModA, ModB Distribution for Benin'
)


In [None]:
# Box plot of the GHI, DNI, DHI, ModA and ModB columns of df_sierraleone
# (the title previously said "Benin" although Sierra Leone data is plotted)
create_boxplot(
    df_sierraleone,
    ['GHI', 'DNI', 'DHI', 'ModA', 'ModB'],
    'GHI, DNI, DHI, ModA, ModB Distribution for Sierra Leone'
)

In [None]:
# Box plot of the GHI, DNI, DHI, ModA and ModB columns of df_togo
# (the title previously said "Benin" although Togo data is plotted)
create_boxplot(
    df_togo,
    ['GHI', 'DNI', 'DHI', 'ModA', 'ModB'],
    'GHI, DNI, DHI, ModA, ModB Distribution for Togo'
)

# Time Series Analysis

In [None]:


def load_data(filepath):
    """
    Read a CSV file into a DataFrame.

    Parameters:
    filepath (str): Location of the CSV file; passed straight to
        pandas.read_csv.

    Returns:
    pd.DataFrame: The parsed contents of the file.
    """
    return pd.read_csv(filepath)

def preprocess_data(df, date_column):
    """
    Preprocess the DataFrame for time series analysis.

    The date column is parsed to datetime and becomes the index. The
    DataFrame is modified in place and also returned, so both
    ``df = preprocess_data(df, col)`` and a bare call work.

    Calling the function again on an already preprocessed frame (for
    example when a notebook cell is re-run) is a no-op apart from making
    sure the index is datetime-typed.

    Parameters:
    df (pd.DataFrame): The DataFrame to preprocess (mutated in place).
    date_column (str): The name of the column containing date information.

    Returns:
    pd.DataFrame: The same DataFrame object, now with a datetime index.

    Raises:
    KeyError: If date_column is neither a column nor the name of the index.
    """
    if date_column in df.columns:
        df[date_column] = pd.to_datetime(df[date_column])  # Convert to datetime
        df.set_index(date_column, inplace=True)  # Set date column as index
    elif df.index.name == date_column:
        # Already indexed by the date column: only normalise the index type
        df.index = pd.to_datetime(df.index)
    else:
        raise KeyError(date_column)
    return df

def plot_time_series(df, columns, title):
    """
    Draw the requested columns as lines against the DataFrame index.

    Parameters:
    df (pd.DataFrame): Source data; its index supplies the x-axis values.
    columns (list): Names of the columns to draw, one line each.
    title (str): Text placed above the plot.

    Returns:
    None
    """
    plt.figure(figsize=(12, 6))

    # One labelled line per column so the legend can tell them apart
    for series_name in columns:
        plt.plot(df.index, df[series_name], label=series_name)

    plt.xlabel('Date')
    plt.ylabel('Values')
    plt.title(title)
    plt.legend()
    plt.show()

def decompose_time_series(df, column, model='additive', period=12):
    """
    Run statsmodels' seasonal_decompose on one column and plot the result.

    Parameters:
    df (pd.DataFrame): Source data.
    column (str): Name of the column to decompose.
    model (str): Decomposition model passed to seasonal_decompose
        ('additive' or 'multiplicative').
    period (int): Seasonal period passed to seasonal_decompose.

    Returns:
    None
    """
    series = df[column]
    result = seasonal_decompose(series, model=model, period=period)

    # The result object draws its own multi-panel figure
    result.plot()
    plt.show()

def plot_acf_pacf(df, column, lags=30):
    """
    Show the ACF and PACF of one column stacked in a single figure.

    Parameters:
    df (pd.DataFrame): Source data.
    column (str): Name of the column to analyse.
    lags (int): Number of lags drawn in each panel.

    Returns:
    None
    """
    # Two rows, one column: ACF on top, PACF underneath
    fig, (acf_axis, pacf_axis) = plt.subplots(2, 1, figsize=(12, 8))

    series = df[column]

    plot_acf(series, lags=lags, ax=acf_axis)
    acf_axis.set_title('Autocorrelation Function (ACF)')

    plot_pacf(series, lags=lags, ax=pacf_axis)
    pacf_axis.set_title('Partial Autocorrelation Function (PACF)')

    plt.tight_layout()
    plt.show()


In [None]:
# Time series analysis for Benin.
# NOTE(review): the date column is assumed to be named 'Timestamp' — confirm
# against the CSV header. The previous code passed 'GHI', which turned the
# GHI readings into the index and made df['GHI'] unavailable to the plots.
df_benin = preprocess_data(df_benin, 'Timestamp')

plot_time_series(
    df_benin,
    ['GHI', 'DNI', 'DHI', 'ModA', 'ModB'],
    'Time Series Plot for GHI, DNI, DHI, ModA, ModB in Benin'
)

decompose_time_series(df_benin, 'GHI', model='additive', period=12)

plot_acf_pacf(df_benin, 'GHI', lags=30)


In [None]:
# Time series analysis for Sierra Leone.
# NOTE(review): the date column is assumed to be named 'Timestamp' — confirm
# against the CSV header. The previous code passed 'GHI' and stored the
# result in df_benin, which replaced the Benin data for every later cell.
df_sierraleone = preprocess_data(df_sierraleone, 'Timestamp')

plot_time_series(
    df_sierraleone,
    ['GHI', 'DNI', 'DHI', 'ModA', 'ModB'],
    'Time Series Plot for GHI, DNI, DHI, ModA, ModB in Sierra Leone'
)

decompose_time_series(df_sierraleone, 'GHI', model='additive', period=12)

plot_acf_pacf(df_sierraleone, 'GHI', lags=30)

In [None]:
# Time series analysis for Togo.
# NOTE(review): the date column is assumed to be named 'Timestamp' — confirm
# against the CSV header. The previous code passed 'GHI' and stored the
# result in df_benin, which replaced the Benin data for every later cell.
df_togo = preprocess_data(df_togo, 'Timestamp')

plot_time_series(
    df_togo,
    ['GHI', 'DNI', 'DHI', 'ModA', 'ModB'],
    'Time Series Plot for GHI, DNI, DHI, ModA, ModB in Togo'
)

decompose_time_series(df_togo, 'GHI', model='additive', period=12)

plot_acf_pacf(df_togo, 'GHI', lags=30)

# Correlation Analysis

In [None]:

def plot_correlation_matrix(df, dataset_name):
    """
    Calculate and plot the correlation matrix for a given dataset.

    Only numeric columns take part in the correlation. Text columns are
    left out because DataFrame.corr() raises on them in recent pandas
    versions instead of silently dropping them.

    Parameters:
    df (pd.DataFrame): The DataFrame to analyze.
    dataset_name (str): The name of the dataset (for plot titles).

    Returns:
    None
    """
    # Calculate correlation matrix on the numeric columns only
    numeric_df = df.select_dtypes(include='number')
    correlation_matrix = numeric_df.corr()

    # Plot the correlation matrix as an annotated heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title(f'Correlation Matrix - {dataset_name}')
    plt.show()

def main():
    """Plot the correlation matrix of each country's DataFrame in turn."""
    labelled_frames = (
        ("Benin", df_benin),
        ("Sierra Leone", df_sierraleone),
        ("Togo", df_togo),
    )

    # Announce each dataset, then draw its heatmap
    for label, frame in labelled_frames:
        print(f"\nCorrelation Analysis for {label} Dataset:")
        plot_correlation_matrix(frame, label)


In [None]:
# Run the most recently defined main(); in file order that is the
# correlation analysis defined just above
if __name__ == "__main__":
    main()

# Modular wind analysis function

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def plot_wind_data(df, dataset_name):
    """
    Scatter the 'WS' column against the 'WD' column on a polar axis.

    'WD' is converted from degrees to radians and used as the angle, 'WS'
    is used both as the radius and as the colour of each point.

    Parameters:
    df (pd.DataFrame): Source data; must contain 'WS' and 'WD' columns.
    dataset_name (str): Name shown in the plot title.

    Returns:
    None
    """
    speed = df['WS']
    direction_deg = df['WD']

    # Polar axes expect the angle in radians
    direction_rad = np.deg2rad(direction_deg)

    plt.figure(figsize=(10, 8))
    polar_axis = plt.subplot(111, polar=True)

    # Radius and colour both encode the 'WS' value
    points = polar_axis.scatter(
        direction_rad,
        speed,
        c=speed,
        cmap='viridis',
        alpha=0.75,
        edgecolors='w',
    )

    colour_bar = plt.colorbar(points)
    colour_bar.set_label('Wind Speed (m/s)')

    # Angles increase clockwise, with zero at the top of the plot
    polar_axis.set_theta_direction(-1)
    polar_axis.set_theta_offset(np.pi / 2.0)
    # Radial tick labels are drawn along the zero-degree line
    polar_axis.set_rlabel_position(0)
    plt.title(f'Wind Speed and Direction Distribution in {dataset_name}')

    plt.show()

def main():
    """Draw the wind plot for every country's DataFrame that has 'WS' and 'WD'."""
    labelled_frames = (
        ("Benin", df_benin),
        ("Sierra Leone", df_sierraleone),
        ("Togo", df_togo),
    )

    for label, frame in labelled_frames:
        # Skip datasets that lack either wind column
        if 'WS' not in frame.columns or 'WD' not in frame.columns:
            print(f"\nMissing wind data columns in {label} dataset.")
            continue

        print(f"\nWind Analysis for {label} Dataset:")
        plot_wind_data(frame, label)




In [None]:
# Run the most recently defined main(); in file order that is the
# wind analysis defined just above
if __name__ == "__main__":
    main()

# Modular Temperature Analysis function

In [None]:
def plot_temperature_distribution(df, dataset_name):
    """
    Show a 30-bin histogram with KDE curve of the 'Temperature' column.

    When the DataFrame has no 'Temperature' column a message is printed
    and nothing is plotted.

    Parameters:
    df (pd.DataFrame): Source data.
    dataset_name (str): Name shown in the plot title and in the message.

    Returns:
    None
    """
    has_temperature = 'Temperature' in df.columns
    if not has_temperature:
        print(f"Temperature data is missing in {dataset_name} dataset.")
        return

    readings = df['Temperature']

    plt.figure(figsize=(10, 6))
    sns.histplot(readings, bins=30, kde=True, color='skyblue')
    plt.xlabel('Temperature (°C)')
    plt.ylabel('Frequency')
    plt.title(f'Temperature Distribution in {dataset_name}')
    plt.show()

def plot_temperature_trends(df, dataset_name):
    """
    Plot the temperature trends over time for a given dataset.

    When the 'Temperature' or 'Date' column is missing a message is
    printed and nothing is plotted. The input DataFrame is not modified:
    the dates are parsed into a separate Series instead of overwriting
    the caller's 'Date' column.

    Parameters:
    df (pd.DataFrame): The DataFrame containing 'Temperature' and 'Date' columns.
    dataset_name (str): The name of the dataset (for plot title).

    Returns:
    None
    """
    # Check if necessary columns exist
    if 'Temperature' not in df.columns or 'Date' not in df.columns:
        print(f"Temperature or Date data is missing in {dataset_name} dataset.")
        return

    # Prepare the temperature and date data without touching the caller's frame
    date = pd.to_datetime(df['Date'])
    temperature = df['Temperature']

    # Plot temperature trends
    plt.figure(figsize=(12, 6))
    plt.plot(date, temperature, color='coral')
    plt.title(f'Temperature Trends Over Time in {dataset_name}')
    plt.xlabel('Date')
    plt.ylabel('Temperature (°C)')
    plt.grid(True)
    plt.show()

def main():
    """Plot the temperature distribution and trend for each country's DataFrame."""
    labelled_frames = (
        ("Benin", df_benin),
        ("Sierra Leone", df_sierraleone),
        ("Togo", df_togo),
    )

    for label, frame in labelled_frames:
        print(f"\nTemperature Analysis for {label} Dataset:")

        # Distribution first, then the trend over time
        plot_temperature_distribution(frame, label)
        plot_temperature_trends(frame, label)




In [None]:
# Run the most recently defined main(); in file order that is the
# temperature analysis defined just above
if __name__ == "__main__":
    main()

# Histogram Analysis 

In [None]:
def plot_temperature_histogram(df, dataset_name):
    """
    Show a plain 30-bin histogram (no KDE) of the 'Temperature' column.

    When the DataFrame has no 'Temperature' column a message is printed
    and nothing is plotted.

    Parameters:
    df (pd.DataFrame): Source data.
    dataset_name (str): Name shown in the plot title and in the message.

    Returns:
    None
    """
    has_temperature = 'Temperature' in df.columns
    if not has_temperature:
        print(f"Temperature data is missing in {dataset_name} dataset.")
        return

    readings = df['Temperature']

    plt.figure(figsize=(10, 6))
    sns.histplot(readings, bins=30, kde=False, color='skyblue')
    plt.xlabel('Temperature (°C)')
    plt.ylabel('Frequency')
    plt.title(f'Temperature Distribution Histogram in {dataset_name}')
    plt.grid(True)
    plt.show()

def main():
    """Plot the temperature histogram for each country's DataFrame."""
    labelled_frames = (
        ("Benin", df_benin),
        ("Sierra Leone", df_sierraleone),
        ("Togo", df_togo),
    )

    # Announce each dataset, then draw its histogram
    for label, frame in labelled_frames:
        print(f"\nHistogram for {label} Dataset:")
        plot_temperature_histogram(frame, label)




In [None]:
# Run the most recently defined main(); in file order that is the
# histogram analysis defined just above
if __name__ == "__main__":
    main()

# Z-Score Analysis Code

In [None]:

def calculate_z_scores(df, column_name):
    """
    Standardise one column: subtract its mean and divide by its standard
    deviation (pandas' default Series.std()).

    Parameters:
    df (pd.DataFrame): Source data.
    column_name (str): Name of the column to standardise.

    Returns:
    pd.Series: The Z-score of every value in the column.
    """
    values = df[column_name]
    centred = values - values.mean()
    return centred / values.std()

def plot_z_score_distribution(df, dataset_name):
    """
    Show a 30-bin histogram with KDE curve of the Z-scores of 'Temperature'.

    When the DataFrame has no 'Temperature' column a message is printed
    and nothing is plotted.

    Parameters:
    df (pd.DataFrame): Source data.
    dataset_name (str): Name shown in the plot title and in the message.

    Returns:
    None
    """
    has_temperature = 'Temperature' in df.columns
    if not has_temperature:
        print(f"Temperature data is missing in {dataset_name} dataset.")
        return

    # Standardise the column before plotting
    standardised = calculate_z_scores(df, 'Temperature')

    plt.figure(figsize=(10, 6))
    sns.histplot(standardised, bins=30, kde=True, color='salmon')
    plt.xlabel('Z-Score')
    plt.ylabel('Frequency')
    plt.title(f'Z-Score Distribution of Temperature in {dataset_name}')
    plt.grid(True)
    plt.show()

def main():
    """Plot the temperature Z-score distribution for each country's DataFrame."""
    labelled_frames = (
        ("Benin", df_benin),
        ("Sierra Leone", df_sierraleone),
        ("Togo", df_togo),
    )

    # Announce each dataset, then draw its Z-score histogram
    for label, frame in labelled_frames:
        print(f"\nZ-Score Analysis for {label} Dataset:")
        plot_z_score_distribution(frame, label)


In [None]:
# Run the most recently defined main(); in file order that is the
# Z-score analysis defined just above
if __name__ == "__main__":
    main()

# Bubble Charts

In [None]:

def plot_bubble_chart(df, x_col, y_col, size_col, dataset_name):
    """
    Plot a bubble chart of y_col against x_col, sized and coloured by size_col.

    Bubble areas are the min-max normalised size_col values scaled to
    0-1000. The same column also drives the bubble colour, so the colour
    bar labelled with size_col describes the plotted data. When any of the
    three columns is missing a message is printed and nothing is plotted.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    x_col (str): The column name for the x-axis (e.g., Date).
    y_col (str): The column name for the y-axis (e.g., Temperature).
    size_col (str): The column name for the bubble size (e.g., Frequency).
    dataset_name (str): The name of the dataset (for plot title).

    Returns:
    None
    """
    # Check if necessary columns exist
    if any(col not in df.columns for col in (x_col, y_col, size_col)):
        print(f"One or more columns are missing in {dataset_name} dataset.")
        return

    # Prepare data for plotting
    x_data = df[x_col]
    y_data = df[y_col]
    size_data = df[size_col]

    # Normalize size data for better visualization. A constant (or all-NaN)
    # column has no spread to normalise, which would divide by zero and give
    # NaN sizes, so fall back to one uniform mid-range bubble size.
    size_min = size_data.min()
    size_range = size_data.max() - size_min
    if size_range > 0:
        bubble_sizes = (size_data - size_min) / size_range * 1000
    else:
        bubble_sizes = 500

    # Create bubble chart. `c` must be supplied for `cmap` to take effect;
    # without it matplotlib ignores the colormap and the colour bar is
    # unrelated to the data.
    plt.figure(figsize=(12, 8))
    bubbles = plt.scatter(
        x_data,
        y_data,
        s=bubble_sizes,
        c=size_data,
        cmap='viridis',
        alpha=0.6,
        edgecolors='w',
    )
    plt.title(f'Bubble Chart of {y_col} vs {x_col} in {dataset_name}')
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.colorbar(bubbles, label=size_col)
    plt.grid(True)
    plt.show()

def main():
    """Draw the Date / Temperature / Frequency bubble chart for each country's DataFrame."""
    labelled_frames = (
        ("Benin", df_benin),
        ("Sierra Leone", df_sierraleone),
        ("Togo", df_togo),
    )

    for label, frame in labelled_frames:
        print(f"\nBubble Chart for {label} Dataset:")

        # Same three columns for every dataset
        plot_bubble_chart(
            frame,
            x_col='Date',
            y_col='Temperature',
            size_col='Frequency',
            dataset_name=label,
        )


In [None]:
# Run the most recently defined main(); in file order that is the
# bubble chart analysis defined just above
if __name__ == "__main__":
    main()