In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import zscore 


# Load the raw dataset from the repository's data directory.
raw_csv_path = '../data/benin-malanville.csv'
df = pd.read_csv(raw_csv_path)


## Summary Statistics & Missing-Value Report


In [None]:
# Columns included in the descriptive-statistics table.
numeric_cols = [
    'GHI', 'DNI', 'DHI', 'ModA', 'ModB',
    'WS', 'WSgust', 'Tamb', 'RH', 'WSstdev',
    'WD', 'WDstdev', 'BP', 'Cleaning', 'Precipitation',
    'TModA',
]
# count / mean / std / min / quartiles / max for each listed column.
summary_stats = df[numeric_cols].describe()
print(summary_stats)

                 GHI            DNI            DHI           ModA  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      240.559452     167.187516     115.358961     236.589496   
std       331.131327     261.710501     158.691074     326.894859   
min       -12.900000      -7.800000     -12.600000       0.000000   
25%        -2.000000      -0.500000      -2.100000       0.000000   
50%         1.800000      -0.100000       1.600000       4.500000   
75%       483.400000     314.200000     216.300000     463.700000   
max      1413.000000     952.300000     759.200000    1342.300000   

                ModB             WS         WSgust           Tamb  \
count  525600.000000  525600.000000  525600.000000  525600.000000   
mean      228.883576       2.121113       2.809195      28.179683   
std       316.536515       1.603466       2.029120       5.924297   
min         0.000000       0.000000       0.000000      11.000000   
25%         0.000000       1.0000

In [None]:
# Share of missing entries per column, expressed as a percentage of all rows.
total_rows = len(df)
missing_report = df.isna().sum() / total_rows * 100

# Keep only the columns where more than 5% of the values are missing.
missing_above_5 = missing_report.loc[missing_report > 5]

print("Columns with >5% missing values (%):")
print(missing_above_5)

Columns with >5% missing values (%):
Comments    100.0
dtype: float64


## Outlier Detection & Basic Cleaning

### Outlier detection

In [None]:
cols_to_check = ['GHI', 'DNI', 'DHI']

# Column-wise z-scores; nan_policy='omit' makes zscore skip NaNs in its computation.
z_scores = df[cols_to_check].apply(zscore, nan_policy='omit')
# True wherever a value lies more than 3 standard deviations from its column mean.
outliers_mask = z_scores.abs() > 3
print("=== Outliers per Column (|Z-score| > 3) ===")
for column in cols_to_check:
    flagged = df[column].loc[outliers_mask[column]]

    if flagged.empty:
        print(f"\nColumn: {column} - No outliers detected.")
        continue

    print(f"\nColumn: {column}")
    print("-----------------")
    print(flagged)


=== Outliers per Column (|Z-score| > 3) ===

Column: GHI
-----------------
671       1274.0
674       1349.0
676       1334.0
849       1253.0
850       1324.0
           ...  
517777    1268.0
520659    1280.0
520696    1244.0
522074    1262.0
522075    1289.0
Name: GHI, Length: 89, dtype: float64

Column: DNI - No outliers detected.

Column: DHI
-----------------
670       610.6
671       615.2
672       612.8
673       593.2
674       618.0
          ...  
524896    604.2
524900    591.5
524901    596.1
524916    599.5
524917    593.2
Name: DHI, Length: 3738, dtype: float64


### Missing values

In [None]:
# Count of missing entries in each of the outlier-checked columns.
missing_report = df[cols_to_check].isna().sum()
# Show only the columns that actually contain missing values.
has_missing = missing_report > 0
print("Missing values:\n", missing_report[has_missing])

Missing values:
 Series([], dtype: int64)


### Incorrect entries

In [None]:
# Columns scanned for negative entries.
non_negative_cols = [
    'GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'RH',
    'WS', 'WSgust', 'WSstdev', 'BP', 'Cleaning', 'Precipitation',
]
for col in non_negative_cols:
    # Number of rows in this column whose value is below zero.
    negatives = df[col].lt(0).sum()
    if negatives <= 0:
        continue
    print(f"Warning: {negatives} negative values found in '{col}'")




In [None]:
## Cleaning

In [None]:

## invalid negative values
# Columns in which a negative reading is treated as invalid.
invalid_negative_cols = [
    'GHI', 'DNI', 'DHI'
]
# Replace negatives with NaN.
# `where` keeps entries satisfying the condition and sets the rest to NaN in one
# vectorized pass. It replaces the deprecated `DataFrame.applymap`, which called a
# Python lambda once per cell and injected `None` objects. Existing NaNs compare
# False against 0 and therefore stay NaN, exactly as before.
df[invalid_negative_cols] = df[invalid_negative_cols].where(df[invalid_negative_cols] >= 0)


In [None]:
## fix outliers
# Keep every value that was NOT flagged by the z-score mask; flagged cells become NaN.
keep_mask = ~outliers_mask
df[cols_to_check] = df[cols_to_check].where(keep_mask)

In [None]:
## Dropping the Empty Comments Column
# The missing-value report above shows 'Comments' is 100% missing, so it carries no data.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a second
# time raises KeyError because the column has already been removed.

df.drop(columns=['Comments'], inplace=True, errors='ignore')

## Export the Cleaned DataFrame

In [None]:
# Destination file for the cleaned dataset.
output_path = "../data/benin-malanville_clean.csv"
# index=False: the frame carries only the default positional index, so writing it would
# add an unnamed first column that comes back as "Unnamed: 0" when the CSV is re-read.
df.to_csv(output_path, index=False)

In [None]:
# Read the exported file back in to obtain the cleaned frame from disk.
clean_csv_path = '../data/benin-malanville_clean.csv'
df_clean = pd.read_csv(clean_csv_path)