# In [None]:
import os
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Benin solar data: profile the raw file, drop z-score outliers, export the
# cleaned table, and save a GHI time-series plot plus a correlation heatmap.

# Ensure the output directories exist before anything is written to them.
os.makedirs('outputs', exist_ok=True)
os.makedirs('outputs/plots', exist_ok=True)  # Subfolder for the saved figures

# Load the raw file as tab-separated text.
# NOTE(review): absolute, machine-specific path -- confirm before running elsewhere.
df = pd.read_csv(r'C:/code/solar-challenge-week1/data/benin_raw.csv', sep='\t', engine='python')

# Clean column names (remove surrounding whitespace)
df.columns = [col.strip() for col in df.columns]

# Numeric columns used for outlier detection and the correlation heatmap.
# Defined once so the validation, cleaning and plotting steps cannot drift apart.
sensor_columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Verify required columns exist
expected_columns = ['Timestamp'] + sensor_columns
missing = [col for col in expected_columns if col not in df.columns]

if missing:
    print("Missing columns:", missing)
else:
    # Convert Timestamp to datetime; unparseable values become NaT (errors='coerce').
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')

    # 1. DATA PROFILE
    print("=== MISSING VALUES ===")
    print(df.isna().sum())
    print("\n=== STATS ===")
    print(df.describe())

    # 2. CLEANING
    # Z-scores are computed only on rows where every sensor column is present.
    # That subset can be shorter than df, so the outlier mask must be mapped
    # back to df through index labels: a positional boolean array built from
    # the shorter subset cannot be used to index the full frame.
    complete_rows = df[sensor_columns].dropna()
    z_scores = np.asarray(np.abs(stats.zscore(complete_rows)))
    inlier_mask = (z_scores < 3).all(axis=1)  # every sensor within 3 std devs
    # Rows with a missing sensor value are excluded together with the outliers.
    df_clean = df.loc[complete_rows.index[inlier_mask]]

    # 3. EXPORT CLEAN DATA
    df_clean.to_csv(r'outputs/benin_clean.csv', index=False, encoding='utf-8')

    # 4. PLOTS (saved to outputs/plots/)

    # Time series of GHI over the cleaned rows
    plt.figure(figsize=(12, 6))
    plt.plot(df_clean['Timestamp'], df_clean['GHI'], label='GHI', color='blue')
    plt.xlabel('Timestamp')
    plt.ylabel('GHI')
    plt.title('Benin GHI Trend')
    plt.legend()
    plt.xticks(rotation=45)
    plt.savefig(r'outputs/plots/benin_ghi_trend.png')
    plt.close()  # release the figure so the next plot starts clean

    # Correlation heatmap across the sensor columns
    plt.figure(figsize=(10, 8))
    sns.heatmap(df_clean[sensor_columns].corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.savefig(r'outputs/plots/benin_corr_heatmap.png')
    plt.close()