In [6]:
# Load the raw Togo dataset. The file is tab-separated (sep='\t') even though
# it carries a .csv extension.
# pandas is imported in this cell so that it also runs on a fresh kernel:
# without it, `pd` is only defined once a later cell has been executed, and
# "Restart Kernel -> Run All" fails here with a NameError.
import pandas as pd

df = pd.read_csv(r'C:/code/solar-challenge-week1/data/togo_raw.csv', sep='\t', engine='python')

In [7]:
import os
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# --- Configuration -----------------------------------------------------------
# Every location derives from one base directory, so relocating the project
# means editing a single line. The resulting paths are the same as before.
BASE_DIR = 'C:/code/solar-challenge-week1'
RAW_CSV = f'{BASE_DIR}/data/togo_raw.csv'
OUTPUT_DIR = f'{BASE_DIR}/outputs'
PLOTS_DIR = f'{OUTPUT_DIR}/plots'

# Columns used for outlier detection and for the correlation heatmap.
NUMERIC_COLS = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
# A row is dropped when any of NUMERIC_COLS has |z-score| >= this value.
Z_THRESHOLD = 3

# os.makedirs creates intermediate directories, so this one call creates both
# the outputs folder and its plots subfolder.
os.makedirs(PLOTS_DIR, exist_ok=True)

# --- Load --------------------------------------------------------------------
# The file is tab-separated despite its .csv extension.
df = pd.read_csv(RAW_CSV, sep='\t', engine='python')

# Clean column names (remove surrounding whitespace).
df = df.rename(columns=str.strip)

# Verify required columns exist before touching them.
expected_columns = ['Timestamp'] + NUMERIC_COLS
missing = [col for col in expected_columns if col not in df.columns]

if missing:
    print("Missing columns:", missing)
else:
    # Convert Timestamp to datetime. errors='coerce' turns unparseable values
    # into NaT instead of raising, so report them rather than hiding them.
    df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')
    n_bad_timestamps = int(df['Timestamp'].isna().sum())
    if n_bad_timestamps:
        print(f"Warning: {n_bad_timestamps} Timestamp value(s) are missing or could not be parsed (NaT)")

    # 1. DATA PROFILE
    print("=== MISSING VALUES ===")
    print(df.isna().sum())
    print("\n=== STATS ===")
    print(df.describe())

    # 2. CLEANING
    # Z-scores are computed on the full frame so the result shares df's index.
    # Computing them on a dropna() subset and then indexing df with the result
    # breaks (length / alignment mismatch) as soon as any value is missing.
    # ddof=0 is the population standard deviation, the scipy.stats.zscore default.
    numeric = df[NUMERIC_COLS]
    # A constant column has std == 0; mapping it to NaN yields NaN z-scores,
    # which never compare >= Z_THRESHOLD, so such a column flags no outliers
    # (instead of wiping out every row).
    col_std = numeric.std(ddof=0).replace(0, np.nan)
    z_scores = ((numeric - numeric.mean()) / col_std).abs()

    has_all_values = numeric.notna().all(axis=1)        # incomplete rows are excluded
    is_outlier = (z_scores >= Z_THRESHOLD).any(axis=1)  # any column beyond the threshold
    df_clean = df[has_all_values & ~is_outlier]

    # 3. EXPORT CLEAN DATA
    df_clean.to_csv(f'{OUTPUT_DIR}/togo_clean.csv', index=False, encoding='utf-8')

    # 4. PLOTS (saved to outputs/plots/)
    # Time series of GHI over the cleaned data.
    plt.figure(figsize=(12, 6))
    plt.plot(df_clean['Timestamp'], df_clean['GHI'], label='GHI', color='blue')
    plt.xlabel('Timestamp')
    plt.ylabel('GHI')
    plt.title('Togo GHI Trend')
    plt.legend()
    plt.xticks(rotation=45)
    plt.savefig(f'{PLOTS_DIR}/togo_ghi_trend.png')
    plt.close()  # release the figure; it is only needed on disk

    # Correlation heatmap across the numeric columns of the cleaned data.
    plt.figure(figsize=(10, 8))
    sns.heatmap(df_clean[NUMERIC_COLS].corr(), annot=True, cmap='coolwarm')
    plt.title('Correlation Heatmap')
    plt.savefig(f'{PLOTS_DIR}/togo_corr_heatmap.png')
    plt.close()

=== MISSING VALUES ===
Timestamp             0
GHI                   0
DNI                   0
DHI                   0
ModA                  0
ModB                  0
Tamb                  0
RH                    0
WS                    0
WSgust                0
WSstdev               0
WD                    0
WDstdev               0
BP                    0
Cleaning              0
Precipitation         0
TModA                 0
TModB                 0
Comments         525600
dtype: int64

=== STATS ===
                           Timestamp            GHI            DNI  \
count                         525600  525600.000000  525600.000000   
mean   2022-04-25 12:00:30.000000768     230.555040     151.258469   
min              2021-10-25 00:01:00     -12.700000       0.000000   
25%              2022-01-24 06:00:45      -2.200000       0.000000   
50%              2022-04-25 12:00:30       2.100000       0.000000   
75%              2022-07-25 18:00:15     442.400000     246.400000   
max