In [1]:
import sys
import os
sys.path.append('../src')

import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Import our utility functions
from opsd_utils import (
    download_opsd_data, 
    extract_opsd_data, 
    load_opsd_data,
    setup_plotting_style
)

# Set up plotting style
setup_plotting_style()

print("Environment setup complete.")


Environment setup complete.


In [2]:
# Download the dataset
zip_path = download_opsd_data(data_dir="../data")

# Extract the dataset
extracted_files = extract_opsd_data(zip_path, data_dir="../data")

print(f"\nExtracted {len(extracted_files)} files:")
for file in extracted_files:
    print(f"  - {file}")


Auto-discovered latest OPSD version: 2020-10-06
Downloading OPSD Time Series data from: https://data.open-power-system-data.org/time_series/opsd-time_series-2020-10-06.zip
Downloaded: ../data/time_series.zip
Extracted: opsd-time_series-2020-10-06/time_series_15min_singleindex.csv
Extracted: opsd-time_series-2020-10-06/time_series_30min_singleindex.csv
Extracted: opsd-time_series-2020-10-06/time_series_60min_singleindex.csv

Extracted 3 files:
  - ../data/opsd-time_series-2020-10-06/time_series_15min_singleindex.csv
  - ../data/opsd-time_series-2020-10-06/time_series_30min_singleindex.csv
  - ../data/opsd-time_series-2020-10-06/time_series_60min_singleindex.csv


In [3]:
# Load the main time series data
df = load_opsd_data(data_dir="../data")

print(f"Dataset shape: {df.shape}")
print(f"Date range: {df.index.min()} to {df.index.max()}")
print(f"Time frequency: {df.index.freq}")

# Display basic info
print("\nDataset info:")
df.info()


Loading main time series file: ../data/opsd-time_series-2020-10-06/time_series_60min_singleindex.csv
Loaded data shape: (50401, 299)
Date range: 2014-12-31 23:00:00+00:00 to 2020-09-30 23:00:00+00:00
Dataset shape: (50401, 299)
Date range: 2014-12-31 23:00:00+00:00 to 2020-09-30 23:00:00+00:00
Time frequency: None

Dataset info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 50401 entries, 2014-12-31 23:00:00+00:00 to 2020-09-30 23:00:00+00:00
Columns: 299 entries, cet_cest_timestamp to UA_load_forecast_entsoe_transparency
dtypes: float64(298), object(1)
memory usage: 115.4+ MB


In [4]:
# Display first few rows
print("First 5 rows:")
df.head()


First 5 rows:


Unnamed: 0_level_0,cet_cest_timestamp,AT_load_actual_entsoe_transparency,AT_load_forecast_entsoe_transparency,AT_price_day_ahead,AT_solar_generation_actual,AT_wind_onshore_generation_actual,BE_load_actual_entsoe_transparency,BE_load_forecast_entsoe_transparency,BE_solar_generation_actual,BE_wind_generation_actual,...,SI_load_actual_entsoe_transparency,SI_load_forecast_entsoe_transparency,SI_solar_generation_actual,SI_wind_onshore_generation_actual,SK_load_actual_entsoe_transparency,SK_load_forecast_entsoe_transparency,SK_solar_generation_actual,SK_wind_onshore_generation_actual,UA_load_actual_entsoe_transparency,UA_load_forecast_entsoe_transparency
utc_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-31 23:00:00+00:00,2015-01-01T00:00:00+0100,,,,,,,,,,...,,,,,,,,,,
2015-01-01 00:00:00+00:00,2015-01-01T01:00:00+0100,5946.0,6701.0,35.0,,69.0,9484.0,9897.0,,,...,,,,,,,,,,
2015-01-01 01:00:00+00:00,2015-01-01T02:00:00+0100,5726.0,6593.0,45.0,,64.0,9152.0,9521.0,,734.81,...,1045.47,816.0,,1.17,2728.0,2860.0,3.8,,,
2015-01-01 02:00:00+00:00,2015-01-01T03:00:00+0100,5347.0,6482.0,41.0,,65.0,8799.0,9135.0,,766.64,...,1004.79,805.0,,1.04,2626.0,2810.0,3.8,,,
2015-01-01 03:00:00+00:00,2015-01-01T04:00:00+0100,5249.0,6454.0,38.0,,64.0,8567.0,8909.0,,733.13,...,983.79,803.0,,1.61,2618.0,2780.0,3.8,,,


In [5]:
# Categorize columns by type
load_cols = [col for col in df.columns if 'load' in col.lower()]
wind_cols = [col for col in df.columns if 'wind' in col.lower()]
solar_cols = [col for col in df.columns if 'solar' in col.lower() or 'pv' in col.lower()]
price_cols = [col for col in df.columns if 'price' in col.lower()]

print(f"Load columns ({len(load_cols)}): {load_cols[:3]}...")
print(f"Wind columns ({len(wind_cols)}): {wind_cols[:3]}...")
print(f"Solar columns ({len(solar_cols)}): {solar_cols[:3]}...")
print(f"Price columns ({len(price_cols)}): {price_cols[:3]}...")


Load columns (114): ['AT_load_actual_entsoe_transparency', 'AT_load_forecast_entsoe_transparency', 'BE_load_actual_entsoe_transparency']...
Wind columns (105): ['AT_wind_onshore_generation_actual', 'BE_wind_generation_actual', 'BE_wind_offshore_generation_actual']...
Solar columns (47): ['AT_solar_generation_actual', 'BE_solar_generation_actual', 'BG_solar_generation_actual']...
Price columns (32): ['AT_price_day_ahead', 'DE_LU_price_day_ahead', 'DK_1_price_day_ahead']...


In [6]:
# Create summary for the analysis report
summary_info = {
    'Dataset Shape': df.shape,
    'Date Range': f"{df.index.min()} to {df.index.max()}",
    'Total Columns': len(df.columns),
    'Load Columns': len(load_cols),
    'Wind Columns': len(wind_cols),
    'Solar Columns': len(solar_cols),
    'Price Columns': len(price_cols)
}

print("Data Organization Summary:")
print("=" * 40)
for key, value in summary_info.items():
    print(f"{key}: {value}")

# Initialize the analysis report
report_path = "../output/reports/analysis_report.md"
Path(report_path).parent.mkdir(parents=True, exist_ok=True)

with open(report_path, 'w') as f:
    f.write("# OPSD Time Series Analysis Report\n\n")
    f.write("## 1. Data Download and Organization\n\n")
    f.write("### Dataset Overview\n\n")
    
    for key, value in summary_info.items():
        f.write(f"- **{key}**: {value}\n")
    
    f.write("\n---\n\n")

print(f"\nSummary saved to: {report_path}")


Data Organization Summary:
Dataset Shape: (50401, 299)
Date Range: 2014-12-31 23:00:00+00:00 to 2020-09-30 23:00:00+00:00
Total Columns: 299
Load Columns: 114
Wind Columns: 105
Solar Columns: 47
Price Columns: 32

Summary saved to: ../output/reports/analysis_report.md
