# In [None]:
import sys, os
sys.path.insert(0, os.path.abspath('..'))  # Ensure project root is in path

from scripts.data_loading import load_data
from scripts.data_summary import summarize_statistics, report_missing_values
from scripts.outlier_detection import compute_z_scores, plot_zscore_distribution, plot_boxplots
from scripts.data_cleaning import handle_negative_irradiance, clean_outliers_and_missing
from scripts.eda_plots import (
    plot_time_series, plot_monthly_avg, plot_cleaning_impact,
    plot_correlation_heatmap, plot_scatter_plots, plot_wind_rose,
    plot_histograms, plot_temperature_analysis, plot_bubble_chart
)

import pandas as pd

# Ensure the 'figures' directory exists (no error if it is already there).
os.makedirs(name="figures", exist_ok=True)

# Load the raw dataset from the CSV at `filepath`.
filepath = "../data/sierraleone-bumbuna.csv"
df = load_data(filepath)

# Columns passed to the correlation heatmap further down.
corr_columns = [
    "GHI", "DNI", "DHI",
    "TModA", "TModB",
    "WS", "Tamb",
]

# Column pairs passed to the scatter-plot helper further down.
scatter_pairs = [
    ("GHI", "DNI"),
    ("DNI", "DHI"),
    ("ModA", "ModB"),
    ("WS", "GHI"),
    ("Tamb", "GHI"),
    ("RH", "Tamb"),
]

# Report summary statistics and missing values for the raw data.
summarize_statistics(df)
report_missing_values(df)

# Outlier detection: compute z-scores for the selected columns and plot
# their distribution and boxplots.
columns_to_check = ["GHI", "DNI", "DHI", "ModA", "ModB", "WS", "WSgust"]
df_z = compute_z_scores(df, columns_to_check)
plot_zscore_distribution(df_z, columns_to_check)
plot_boxplots(df_z, columns_to_check)

# Cleaning
irradiance_columns = ["GHI", "DNI", "DHI"]
# Keep `df` corrected as before, in case later cells read it.
df = handle_negative_irradiance(df, irradiance_columns)
# The frame that is actually cleaned and exported is `df_z`, which was built
# before the correction above. Apply the same correction to it so that it
# reaches `df_clean` and the exported CSV instead of being silently dropped.
# NOTE(review): assumes `df_z` still carries the raw irradiance columns under
# their original names (the EDA plots read them from `df_clean`) -- confirm
# against compute_z_scores.
df_z = handle_negative_irradiance(df_z, irradiance_columns)
df_clean = clean_outliers_and_missing(df_z, columns_to_check)

# Persist the cleaned data next to the raw file, without the row index.
df_clean.to_csv("../data/sierraleone-bumbuna_clean.csv", index=False)

# Parse the "Timestamp" column as datetimes and make it the index (in place),
# which the time-of-day filter below requires.
df_clean["Timestamp"] = pd.to_datetime(df_clean["Timestamp"])
df_clean.set_index(keys="Timestamp", inplace=True)

# Keep only rows whose time of day lies between 06:00 and 18:00.
df_daytime = df_clean.between_time(start_time="06:00", end_time="18:00")

# EDA plots, in the same order as before.
plot_time_series(df_daytime)

for frame_only_plot in (plot_monthly_avg, plot_cleaning_impact):
    frame_only_plot(df_clean)

plot_correlation_heatmap(df_clean, corr_columns)
plot_scatter_plots(df_clean, scatter_pairs)

for frame_only_plot in (
    plot_wind_rose,
    plot_histograms,
    plot_temperature_analysis,
):
    frame_only_plot(df_clean)

# Kept as a bare final call so the cell's last expression is unchanged.
plot_bubble_chart(df_daytime)