In [6]:
# notebooks/data_cleaning_preprocessing.ipynb

import os
import pandas as pd
import numpy as np
from datetime import datetime

# Ensure processed directory exists
processed_dir = '../data/processed'
os.makedirs(processed_dir, exist_ok=True)

## Create a Data Loading Function

In [7]:
def load_country_data(country):
    """Load raw data for a specific country"""
    file_path = f'../data/raw/{country}_Power_Generation.csv'
    df = pd.read_csv(file_path)
    print(f"Loaded {country} data with shape: {df.shape}")
    return df

## Create a Data Cleaning Function

In [8]:
def clean_energy_data(df, country):
    """
    Clean and preprocess energy data for a specific country
    Returns cleaned DataFrame
    """
    # Make a copy to avoid SettingWithCopyWarning
    df_clean = df.copy()
    
    # 1. Select and rename columns
    columns_to_keep = [
        "Area", 
        "MTU", 
        "DATETIME", 
        "YEAR", 
        "Solar  - Actual Aggregated [MW]", 
        "Wind Offshore  - Actual Aggregated [MW]", 
        "Wind Onshore  - Actual Aggregated [MW]"
    ]
    
    # Strip extra spaces from column names
    df_clean.columns = df_clean.columns.str.replace(r'\s+', ' ').str.strip()
    
    # Verify column names
    print("\nColumn names after cleaning:")
    for col in df_clean.columns:
        print(f"'{col}' - length: {len(col)}")
    
    # Keep only specified columns
    df_clean = df_clean[columns_to_keep]
    
    # 2. Convert DATETIME to proper format
    df_clean['DATETIME'] = pd.to_datetime(
        df_clean['DATETIME'], 
        format='%d.%m.%Y %H:%M', 
        errors='coerce'
    )
    print("\nDATETIME conversion samples:")
    print(df_clean['DATETIME'].head())
    
    # 3. Handle missing values with custom fill function
    def fill_with_avg(df, column_name):
        if column_name in df.columns:
            df[column_name] = df[column_name].fillna(
                (df[column_name].shift(1) + df[column_name].shift(-1)) / 2
            )
        else:
            print(f"Column '{column_name}' not found!")
    
    # Apply to all energy columns
    energy_columns = [
        'Solar - Actual Aggregated [MW]',
        'Wind Offshore - Actual Aggregated [MW]',
        'Wind Onshore - Actual Aggregated [MW]'
    ]
    
    for col in energy_columns:
        fill_with_avg(df_clean, col)
    
    # 4. Additional cleaning
    # Add country identifier
    df_clean['country'] = country
    
    # Remove duplicate rows based on DATETIME
    df_clean = df_clean.drop_duplicates(subset=['DATETIME'])
    
    # Sort by datetime
    df_clean = df_clean.sort_values('DATETIME')
    
    # Set DATETIME as index
    df_clean.set_index('DATETIME', inplace=True)
    
    # 5. Final validation
    print("\nMissing values after cleaning:")
    print(df_clean.isnull().sum())
    
    print(f"\nFinal DataFrame shape: {df_clean.shape}")
    return df_clean

## Process Each Country's Data

In [9]:
# List of countries in your dataset
countries = ['France', 'Italy', 'Spain']

# Dictionary to hold all cleaned DataFrames
cleaned_data = {}

for country in countries:
    print(f"\nProcessing {country} data...")
    # Load raw data
    raw_df = load_country_data(country)
    
    # Clean data
    cleaned_df = clean_energy_data(raw_df, country)
    
    # Save to dictionary
    cleaned_data[country] = cleaned_df
    
    # Save to processed folder
    output_path = f'{processed_dir}/{country}_Power_Generation_Cleaned.csv'
    cleaned_df.to_csv(output_path)
    print(f"Saved cleaned data to {output_path}")


Processing France data...
Loaded France data with shape: (67831, 25)

Column names after cleaning:
'Area' - length: 4
'MTU' - length: 3
'DATETIME' - length: 8
'YEAR' - length: 4
'Biomass  - Actual Aggregated [MW]' - length: 33
'Fossil Brown coal/Lignite  - Actual Aggregated [MW]' - length: 51
'Fossil Coal-derived gas  - Actual Aggregated [MW]' - length: 49
'Fossil Gas  - Actual Aggregated [MW]' - length: 36
'Fossil Hard coal  - Actual Aggregated [MW]' - length: 42
'Fossil Oil  - Actual Aggregated [MW]' - length: 36
'Fossil Oil shale  - Actual Aggregated [MW]' - length: 42
'Fossil Peat  - Actual Aggregated [MW]' - length: 37
'Geothermal  - Actual Aggregated [MW]' - length: 36
'Hydro Pumped Storage  - Actual Aggregated [MW]' - length: 46
'Hydro Pumped Storage  - Actual Consumption [MW]' - length: 47
'Hydro Run-of-river and poundage  - Actual Aggregated [MW]' - length: 57
'Hydro Water Reservoir  - Actual Aggregated [MW]' - length: 47
'Marine  - Actual Aggregated [MW]' - length: 32
'Nucle

## Create Combined Dataset

In [10]:
# Combine all country data
combined_df = pd.concat(cleaned_data.values())

# Save combined data
combined_path = f'{processed_dir}/EU_Power_Generation_Combined_Cleaned.csv'
combined_df.to_csv(combined_path)
print(f"Saved combined cleaned data to {combined_path}")

Saved combined cleaned data to ../data/processed/EU_Power_Generation_Combined_Cleaned.csv
