In [2]:
import os
import pandas as pd
import numpy as np
from datetime import datetime

In [3]:
# Directory setup
raw_dir = '/workspace/COMP3610-Renewable-Energy-Prediction/data/raw'
processed_dir = '/workspace/COMP3610-Renewable-Energy-Prediction/data/processed'
os.makedirs(processed_dir, exist_ok=True)

In [4]:
def load_country_data(country):
    """Load raw data for a specific country"""
    file_path = f'{raw_dir}/{country}_Power_Generation.csv'
    df = pd.read_csv(file_path)
    print(f"Loaded {country} data with shape: {df.shape}")
    return df

In [5]:
def clean_france_data(df):
    """Clean and process France data"""
    # Convert 'n/e' to NaN for object columns
    object_cols = df.select_dtypes(include='object').columns[4:]  # Skip Area, MTU, DATETIME, YEAR
    df[object_cols] = df[object_cols].replace('n/e', np.nan)
    
    # Convert datetime and set as index for time-based operations
    df['DATETIME'] = pd.to_datetime(df['DATETIME'], format='%d.%m.%Y %H:%M')
    df = df.set_index('DATETIME')
    
    # Convert numeric columns that are stored as objects
    for col in object_cols:
        try:
            df[col] = pd.to_numeric(df[col])
        except:
            df = df.drop(col, axis=1)
    
    # Handle missing values
    # Solar - Actual Aggregated [MW] (68 missing values out of 67831 ~ 0.1%)
    if df['Solar  - Actual Aggregated [MW]'].isnull().sum() > 0:
        # Time-based interpolation for solar (follows daily patterns)
        df['Solar  - Actual Aggregated [MW]'] = df['Solar  - Actual Aggregated [MW]'].interpolate(method='time')
    
    # Wind Onshore - Actual Aggregated [MW] (82 missing values out of 67831 ~ 0.12%)
    if df['Wind Onshore  - Actual Aggregated [MW]'].isnull().sum() > 0:
        # Time-based interpolation for wind (follows weather patterns)
        df['Wind Onshore  - Actual Aggregated [MW]'] = df['Wind Onshore  - Actual Aggregated [MW]'].interpolate(method='time')
    
    # Feature engineering
    df['hour'] = df.index.hour
    df['day_of_week'] = df.index.dayofweek
    df['month'] = df.index.month
    
    # Select and keep only the required columns
    columns_to_keep = [
        "Area", "MTU", "YEAR", 
        "Solar  - Actual Aggregated [MW]", 
        "Wind Onshore  - Actual Aggregated [MW]"
    ]
    
    # Reset index to keep DATETIME as column
    df = df.reset_index()
    df = df[columns_to_keep]
    
    return df



In [6]:
def clean_italy_data(df):
    """Clean and process Italy data"""
    # Convert 'n/e' to NaN
    object_cols = df.select_dtypes(include='object').columns[4:]
    df[object_cols] = df[object_cols].replace('n/e', np.nan)
    
    # Drop Nuclear column (all 'n/e')
    if 'Nuclear  - Actual Aggregated [MW]' in df.columns:
        df = df.drop('Nuclear  - Actual Aggregated [MW]', axis=1)
    
    # Convert datetime
    df['DATETIME'] = pd.to_datetime(df['DATETIME'], format='%d.%m.%Y %H:%M')
    df = df.set_index('DATETIME')
    
    # Handle missing values
    # Solar - Actual Aggregated [MW] (68 missing values out of 67831 ~ 0.1%)
    if df['Solar  - Actual Aggregated [MW]'].isnull().sum() > 0:
        # Time-based interpolation for solar (follows daily patterns)
        df['Solar  - Actual Aggregated [MW]'] = df['Solar  - Actual Aggregated [MW]'].interpolate(method='time')
    
    # Wind Onshore - Actual Aggregated [MW] (82 missing values out of 67831 ~ 0.12%)
    if df['Wind Onshore  - Actual Aggregated [MW]'].isnull().sum() > 0:
        # Time-based interpolation for wind (follows weather patterns)
        df['Wind Onshore  - Actual Aggregated [MW]'] = df['Wind Onshore  - Actual Aggregated [MW]'].interpolate(method='time')
    
    # Fill high missingness columns
    if 'Hydro Pumped Storage  - Actual Consumption [MW]' in df.columns:
        df['Hydro Pumped Storage  - Actual Consumption [MW]'] = df['Hydro Pumped Storage  - Actual Consumption [MW]'].fillna(0)
    
    # Feature engineering
    df['hour'] = df.index.hour
    
    # Select and keep only the required columns
    columns_to_keep = [
        "Area", "MTU", "YEAR", 
        "Solar  - Actual Aggregated [MW]",  
        "Wind Onshore  - Actual Aggregated [MW]"
    ]
    
    # Reset index to keep DATETIME as column
    df = df.reset_index()
    df = df[columns_to_keep]
    
    return df



In [7]:
def clean_spain_data(df):
    """Clean and process Spain data"""
    # Convert datetime
    df['DATETIME'] = pd.to_datetime(df['DATETIME'], format='%d.%m.%Y %H:%M')
    df = df.set_index('DATETIME')
    
    # Handle missing values
    # Solar - Actual Aggregated [MW] (68 missing values out of 67831 ~ 0.1%)
    if df['Solar  - Actual Aggregated [MW]'].isnull().sum() > 0:
        # Time-based interpolation for solar (follows daily patterns)
        df['Solar  - Actual Aggregated [MW]'] = df['Solar  - Actual Aggregated [MW]'].interpolate(method='time')
    
    # Wind Onshore - Actual Aggregated [MW] (82 missing values out of 67831 ~ 0.12%)
    if df['Wind Onshore  - Actual Aggregated [MW]'].isnull().sum() > 0:
        # Time-based interpolation for wind (follows weather patterns)
        df['Wind Onshore  - Actual Aggregated [MW]'] = df['Wind Onshore  - Actual Aggregated [MW]'].interpolate(method='time')
    
    # Feature engineering
    df['hour'] = df.index.hour
    
    # Select and keep only the required columns
    columns_to_keep = [
        "Area", "MTU", "YEAR", 
        "Solar  - Actual Aggregated [MW]", 
        "Wind Onshore  - Actual Aggregated [MW]"
    ]
    
    # Reset index to keep DATETIME as column
    df = df.reset_index()
    df = df[columns_to_keep]
    
    return df



In [8]:
def post_cleaning_processing(df, country):
    """Common post-processing steps for all countries"""
    # Standardize column names
    df.columns = df.columns.str.replace(' - Actual Aggregated \[MW\]', '', regex=True)
    df.columns = df.columns.str.strip()
    
    # Add country identifier
    df['Country'] = country
    
    return df



In [9]:
# Main processing pipeline
countries = ['France', 'Italy', 'Spain']
cleaned_data = {}

for country in countries:
    print(f"\nProcessing {country} data...")
    
    try:
        # Load raw data
        raw_df = load_country_data(country)
        
        # Clean data using the appropriate function
        if country == 'France':
            cleaned_df = clean_france_data(raw_df)
        elif country == 'Italy':
            cleaned_df = clean_italy_data(raw_df)
        elif country == 'Spain':
            cleaned_df = clean_spain_data(raw_df)
        
        # Apply common post-cleaning processing
        cleaned_df = post_cleaning_processing(cleaned_df, country)
        
        # Save to dictionary
        cleaned_data[country] = cleaned_df
        
        # Save to processed folder
        output_path = f'{processed_dir}/{country}_Power_Generation_Cleaned.csv'
        cleaned_df.to_csv(output_path, index=False)
        print(f"Saved cleaned data to {output_path}")
        
    except Exception as e:
        print(f"Error processing {country}: {str(e)}")
        continue




Processing France data...
Loaded France data with shape: (67831, 25)


  df[object_cols] = df[object_cols].replace('n/e', np.nan)


Saved cleaned data to /workspace/COMP3610-Renewable-Energy-Prediction/data/processed/France_Power_Generation_Cleaned.csv

Processing Italy data...
Loaded Italy data with shape: (59070, 25)


  df[object_cols] = df[object_cols].replace('n/e', np.nan)


Saved cleaned data to /workspace/COMP3610-Renewable-Energy-Prediction/data/processed/Italy_Power_Generation_Cleaned.csv

Processing Spain data...
Loaded Spain data with shape: (76969, 25)
Saved cleaned data to /workspace/COMP3610-Renewable-Energy-Prediction/data/processed/Spain_Power_Generation_Cleaned.csv


In [10]:
# Combine all countries if all processed successfully
if len(cleaned_data) == len(countries):
    combined_df = pd.concat(cleaned_data.values())
    combined_output_path = f'{processed_dir}/All_Countries_Power_Generation_Cleaned.csv'
    combined_df.to_csv(combined_output_path, index=False)
    print(f"\nSaved combined cleaned data to {combined_output_path}")
else:
    print("\nWarning: Not all countries processed successfully - skipping combined file")


Saved combined cleaned data to /workspace/COMP3610-Renewable-Energy-Prediction/data/processed/All_Countries_Power_Generation_Cleaned.csv
