# Exploratory Data Analysis of Solar Data

This notebook performs:

1. **Summary statistics & missing-value report**  
2. **Outlier detection & basic cleaning**  
3. **Time-series analysis** of irradiance & temperature  
4. **Cleaning impact** on module readings  
5. **Correlation & relationship analysis**  
6. **Wind and distribution analysis**  
7. **Histograms**  
8. **Bubble chart**
9. **Export cleaned data**

## Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import linregress

# Read the raw measurements, parsing the Timestamp column as datetimes,
# then preview the first rows.
raw_csv_path = "../data/benin-malanville.csv"
df = pd.read_csv(raw_csv_path, parse_dates=["Timestamp"])
df.head()

## Summary Statistics & Missing-Value Report

- `df.describe()` on numeric columns  
- Count nulls (`df.isna().sum()`)  
- List columns with > 5% missing


In [None]:
# Summary statistics table produced by pandas.
display(df.describe())

# Per-column null counts, and the same counts as a percentage of all rows.
null = df.isna().sum()
missing = null.div(len(df)).mul(100)

print("Null counts per column:", null, sep="\n")

# Only columns whose missing share exceeds 5 percent.
print("\nColumns with >5% nulls:", missing.loc[missing.gt(5)], sep="\n")


## Outlier Detection & Basic Cleaning

- Computation of Z-scores for key columns  
- Outlier flagging for |Z| > 3  
- Imputation of missing values in key columns with median  
- Exporting the cleaned DataFrame


In [None]:
# Columns that get a Z-score and an outlier flag.
cols = ["GHI", "DNI", "DHI", "ModA", "ModB", "WS", "WSgust"]

# Z-scores & flagging: population standard deviation (ddof=0), flag |Z| > 3.
# Rows where the value is missing get a NaN Z-score and are therefore not flagged.
for c in cols:
    df[f"{c}_z"] = (df[c] - df[c].mean()) / df[c].std(ddof=0)
    df[f"{c}_outlier"] = df[f"{c}_z"].abs() > 3

# True when at least one of the key columns is flagged on that row.
df["any_outlier"] = df[[f"{c}_outlier" for c in cols]].any(axis=1)

# Median imputation.
# Assign the filled column back instead of calling fillna(inplace=True) on
# df[c]: the in-place form acts on an intermediate object, is deprecated, and
# leaves df untouched when pandas Copy-on-Write is enabled.
for c in cols + ["Tamb", "RH", "WD", "BP"]:
    if c in df:
        df[c] = df[c].fillna(df[c].median())

# Save cleaned (the Z-score and outlier-flag columns are exported as well).
df.to_csv("../data/benin-malanville_clean.csv", index=False)
print("Cleaned data exported to data/benin-malanville_clean.csv")

# Reload the exported file so later cells work from what was written to disk.
df_clean = pd.read_csv("../data/benin-malanville_clean.csv", parse_dates=["Timestamp"])
df_clean.head()


## Time-Series Plots

Plot of GHI, DNI, DHI & Tamb vs. Timestamp  

In [None]:
# Line plots: one figure per variable against time.
# Both axes come from df_clean so x and y are guaranteed to be the same rows
# (the original took the x-axis from df and the y-axis from df_clean).
for c in ["GHI", "DNI", "DHI", "Tamb"]:
    plt.figure()
    plt.plot(df_clean["Timestamp"], df_clean[c])
    plt.title(f"{c} over time")
    plt.xlabel("Timestamp")
    plt.ylabel(c)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## Anomaly check

In [None]:

# Average of each variable per calendar month (1-12), one bar chart each.
# Grouping by month-of-year yields at most one bar per x position; resampling
# by month end yields one bin per year-month, so a record spanning more than
# twelve months would draw two bars on top of each other at the same month.
month_of_year = df_clean["Timestamp"].dt.month

for c in ["GHI", "DNI", "DHI", "Tamb"]:
    monthly = df_clean.groupby(month_of_year)[c].mean()
    plt.figure()
    plt.bar(monthly.index, monthly.values)
    plt.title(f"Average Monthly {c}")
    plt.xlabel("Month")
    plt.ylabel(c)
    plt.xticks(range(1, 13))
    plt.show()

## Impact of Cleaning

In [None]:
# Compare mean module readings before and after cleaning.
# `df` has already been median-imputed in place, so it holds the same values
# as `df_clean`; re-read the raw file to get a genuine "Before" group.
df_raw = pd.read_csv("../data/benin-malanville.csv", parse_dates=["Timestamp"])
df_raw["CleanFlag"] = "Before"
df_clean["CleanFlag"] = "After"

# Stack both versions; columns present only in df_clean become NaN for the
# raw rows, which does not affect the ModA/ModB means below.
df_combined_benin = pd.concat([df_raw, df_clean], ignore_index=True)

# Group by flag and compute means (NaNs are skipped by mean()).
avg = df_combined_benin.groupby("CleanFlag")[["ModA", "ModB"]].mean()

# Plot
ax = avg.plot(kind="bar", rot=0)
ax.set_title("Average Module Readings: Before vs. After Cleaning")
ax.set_xlabel("Cleaning Status")
ax.set_ylabel("Mean Module Reading")
plt.tight_layout()
plt.show()
print(avg)

## Correlation & Scatter Plots

In [None]:
# Pairwise correlation between irradiance and module-temperature columns.
vars_corr = ["GHI", "DNI", "DHI", "TModA", "TModB"]
corr = df_clean[vars_corr].corr()

# Heatmap of the correlation matrix, with variable names on both axes.
plt.figure()
plt.imshow(corr, aspect="auto")
plt.colorbar()
plt.xticks(range(len(vars_corr)), vars_corr, rotation=45)
plt.yticks(range(len(vars_corr)), vars_corr)
plt.title("Correlation matrix")
plt.tight_layout()
plt.show()

# Scatter pairs.
# Check and plot from the same frame (df_clean); the original guarded on
# df_clean/df but then plotted from df.
for x, y in [("WS", "GHI"), ("WD", "GHI"), ("RH", "Tamb"), ("RH", "GHI")]:
    if x in df_clean and y in df_clean:
        plt.figure()
        plt.scatter(df_clean[x], df_clean[y])
        plt.title(f"{x} vs. {y}")
        plt.xlabel(x)
        plt.ylabel(y)
        plt.tight_layout()
        plt.show()


## Wind and distribution analysis

In [None]:
# Wind rose: mean wind speed per 30-degree direction sector.
if all(c in df_clean for c in ("WS", "WD")):
    bins = np.arange(0, 361, 30)
    labels = bins[:-1] + 15  # sector mid-points: 15, 45, ..., 345

    # Wrap directions into [0, 360) so a reading of exactly 360 lands in the
    # first sector instead of falling outside the half-open bins.
    # Read WD from df_clean so the bins align with the frame being grouped.
    wind_dir = df_clean["WD"] % 360
    df_clean["WD_bin"] = pd.cut(wind_dir, bins, right=False, labels=labels)

    # observed=False keeps all 12 sectors (empty ones yield NaN) and avoids
    # relying on the deprecated implicit default for categorical groupers.
    wind = df_clean.groupby("WD_bin", observed=False)["WS"].mean()
    angles = np.deg2rad(wind.index.astype(float))

    ax = plt.subplot(projection="polar")
    ax.bar(angles, wind.values, width=np.deg2rad(30), align="center")
    # Zero at the top ("N") with angles increasing clockwise.
    ax.set_theta_zero_location("N")
    ax.set_theta_direction(-1)
    plt.title("Wind Rose (mean WS by direction)")
    plt.tight_layout()
    plt.show()
else:
    print("WS or WD not in DataFrame.")


## Temperature analysis

1. Plot of **RH vs. Tamb** with a regression line  
2. Plot of **RH vs. GHI** with a regression line  
3. Computation and display of the slope, intercept, and R² for each model

In [None]:
# Define the two relationships to explore: (predictor, response).
pairs = [("RH", "Tamb"), ("RH", "GHI")]

for x_col, y_col in pairs:
    # Keep only rows where both variables are present.
    mask = df_clean[x_col].notna() & df_clean[y_col].notna()
    x = df_clean.loc[mask, x_col]
    y = df_clean.loc[mask, y_col]

    # Fit an ordinary least-squares line y = intercept + slope * x.
    res = linregress(x, y)

    # A straight line is fully determined by two points, so draw it between
    # the extremes of x instead of through every (unsorted) sample.
    x_line = np.array([x.min(), x.max()])
    line = res.intercept + res.slope * x_line

    # Plot; the line gets a contrasting colour so it stays visible on top of
    # the scatter, which would otherwise share the same default colour.
    plt.figure()
    plt.scatter(x, y, alpha=0.5)
    plt.plot(x_line, line, color="red", label=(
        f"y = {res.slope:.2f}x + {res.intercept:.2f}\n"
        f"R² = {res.rvalue**2:.2f}"
    ))
    plt.title(f"{y_col} vs. {x_col} with Regression Line")
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.legend()
    plt.tight_layout()
    plt.show()

    # Print stats
    print(f"Regression for {y_col} ~ {x_col}:")
    print(f"  • Slope     = {res.slope:.4f}")
    print(f"  • Intercept = {res.intercept:.4f}")
    print(f"  • R²        = {res.rvalue**2:.4f}\n")

## Histograms & Bubble Chart

- Histograms of GHI and WS  
- Bubble: GHI vs. Tamb (bubble size = RH)


In [None]:
# Histograms of the cleaned values.
# Check and plot from the same frame (df_clean); the original guarded on
# df_clean but drew the histogram from df.
for c in ["GHI", "WS"]:
    if c in df_clean:
        plt.figure()
        plt.hist(df_clean[c].dropna())
        plt.title(f"Histogram of {c}")
        plt.xlabel(c)
        plt.ylabel("Frequency")
        plt.tight_layout()
        plt.show()

# Bubble chart: GHI against Tamb, marker area taken from RH.
if all(c in df_clean for c in ("GHI", "Tamb", "RH")):
    plt.figure()
    plt.scatter(df_clean["GHI"], df_clean["Tamb"], s=df_clean["RH"])
    plt.title("GHI vs. Tamb (size=RH)")
    plt.xlabel("GHI")
    plt.ylabel("Tamb")
    plt.tight_layout()
    plt.show()
else:
    print("GHI, Tamb, or RH missing.")


## Correlation Coefficients

In [None]:
# Correlation of RH with Tamb and with GHI, printed for each column that exists.
if "RH" in df_clean:
    humidity = df_clean["RH"]
    for other in ("Tamb", "GHI"):
        if other in df_clean:
            print(f"RH vs. {other} correlation:", humidity.corr(df_clean[other]))