In [1]:
# 1. Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
import plotly.express as px


# 2. Load Dataset
df = pd.read_csv("../data/benin.csv", parse_dates=['Timestamp'], encoding='ISO-8859-1')

# Quick view
df.head()


  df = pd.read_csv("../data/benin.csv", parse_dates=['Timestamp'], encoding='ISO-8859-1')
  df = pd.read_csv("../data/benin.csv", parse_dates=['Timestamp'], encoding='ISO-8859-1')


Unnamed: 0,Timestamp,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,WD,WDstdev,BP,Cleaning,Precipitation,TModA,TModB,Comments
0,yyyy-mm-dd hh:mm,W/m²,W/m²,W/m²,W/m²,W/m²,°C,%,m/s,m/s,m/s,°N (to east),°,hPa,1 or 0,mm/min,°C,°C,
1,2021-08-09 00:01,-1.2,-0.2,-1.1,0,0,26.2,93.4,0,0.4,0.1,122.1,0,998,0,0,26.3,26.2,
2,2021-08-09 00:02,-1.1,-0.2,-1.1,0,0,26.2,93.6,0,0,0,0,0,998,0,0,26.3,26.2,
3,2021-08-09 00:03,-1.1,-0.2,-1.1,0,0,26.2,93.7,0.3,1.1,0.5,124.6,1.5,997,0,0,26.4,26.2,
4,2021-08-09 00:04,-1.1,-0.1,-1,0,0,26.2,93.3,0.2,0.7,0.4,120.3,1.3,997,0,0,26.4,26.3,


In [2]:
# Summary statistics
df.describe()

# Missing values count
missing = df.isna().sum()
missing[missing > 0]


Comments    525601
dtype: int64

In [3]:
outlier_cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Convert to numeric (strings -> NaN)
for col in outlier_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop rows with NaN in those columns (optional but safer)
df = df.dropna(subset=outlier_cols)

# Calculate z-scores and remove outliers
z_scores = df[outlier_cols].apply(zscore)
mask = (np.abs(z_scores) > 3).any(axis=1)
df_outliers_removed = df[~mask].copy()

print(f"Removed {mask.sum()} outliers.")


Removed 7740 outliers.


In [None]:
# Plot GHI, DNI, DHI, Tamb over time
plt.figure(figsize=(14, 6))
for col in ['GHI', 'DNI', 'DHI', 'Tamb']:
    plt.plot(df_outliers_removed['Timestamp'], df_outliers_removed[col], label=col)

plt.xlabel("Timestamp")
plt.ylabel("Values")
plt.title("Solar Irradiance & Temperature Over Time - Benin")
plt.legend()
plt.tight_layout()
plt.show()


TypeError: 'value' must be an instance of str or bytes, not a float

In [None]:
# Ensure 'Cleaning' column exists
if 'Cleaning' in df_outliers_removed.columns:
    mod_means = df_outliers_removed.groupby('Cleaning')[['ModA', 'ModB']].mean()

    mod_means.plot(kind='bar', figsize=(8, 4))
    plt.title("ModA & ModB - Cleaning vs No Cleaning")
    plt.ylabel("Average Value")
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()
else:
    print("Column 'Cleaning' not found in dataset.")


In [None]:
corr_cols = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB', 'Tamb', 'RH']
corr_matrix = df_outliers_removed[corr_cols].corr()

plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Heatmap - Radiation, Temperature & Humidity")
plt.tight_layout()
plt.show()


In [None]:
# Plot wind rose using plotly
fig = px.bar_polar(
    df_outliers_removed.dropna(subset=['WD', 'WS']),
    r="WS",
    theta="WD",
    color="WS",
    color_continuous_scale=px.colors.sequential.Viridis,
    title="Wind Rose - Speed vs Direction"
)
fig.show()


In [None]:
df_outliers_removed.to_csv("data/benin_clean.csv", index=False)
print("✅ Cleaned data exported to data/benin_clean.csv")
