In [None]:
import pandas as pd
import numpy as np 
import os 

import plotly.express as px
import plotly.graph_objs as go
from statsmodels.tsa.seasonal import seasonal_decompose


import seaborn as sns 
import matplotlib.pyplot as plt 
import glob 


# Show the working directory: the relative data paths below are resolved against it.
print(os.getcwd())

## READING SECTION
# glob.glob returns matches in arbitrary (OS-dependent) order; sort so the
# concatenated frame has a reproducible row order on every machine.
prd_files = sorted(glob.glob('../data/generation/*.csv'))
if not prd_files:
    # Fail early with a readable message instead of pd.concat's
    # "No objects to concatenate".
    raise FileNotFoundError(
        "No CSV files found under '../data/generation/' "
        f"(current working directory: {os.getcwd()})"
    )

# Stack every generation CSV into a single frame with a fresh 0..n-1 index.
prd_in = pd.concat((pd.read_csv(file) for file in prd_files), ignore_index=True)

print(prd_in.shape)
prd_in.head()

In [16]:
# Mapping from the raw CSV column headers to short snake_case names.
# Every target name must be unique: two headers mapped to the same name
# would produce duplicate columns after DataFrame.rename.
col_renaming = {
    'Area': 'area',
    'MTU': 'time',
    'Biomass  - Actual Aggregated [MW]': 'biomass',
    'Energy storage - Actual Aggregated [MW]': 'storage',
    'Fossil Brown coal/Lignite  - Actual Aggregated [MW]': 'brown_coal',
    # NOTE(review): 'coal' is used for coal-derived *gas* — name kept as-is,
    # confirm whether downstream code expects it.
    'Fossil Coal-derived gas  - Actual Aggregated [MW]': 'coal',
    'Fossil Gas  - Actual Aggregated [MW]': 'gas',
    'Fossil Hard coal  - Actual Aggregated [MW]': 'hard_coal',
    'Fossil Oil  - Actual Aggregated [MW]': 'oil',
    # Was also mapped to 'oil', which collided with the 'Fossil Oil' column.
    'Fossil Oil shale  - Actual Aggregated [MW]': 'oil_shale',
    'Fossil Peat  - Actual Aggregated [MW]': 'peat',
    'Geothermal  - Actual Aggregated [MW]': 'geothermal',
    'Hydro Pumped Storage  - Actual Aggregated [MW]': 'hydro_pumped',
    'Hydro Pumped Storage  - Actual Consumption [MW]': 'hydro_pumped_consumption',
    'Hydro Run-of-river and poundage  - Actual Aggregated [MW]': 'hydro_run_of_river',
    'Hydro Water Reservoir  - Actual Aggregated [MW]': 'reservoir',
    'Marine  - Actual Aggregated [MW]': 'marine',
    'Nuclear  - Actual Aggregated [MW]': 'nuclear',
    'Other  - Actual Aggregated [MW]': 'other',
    'Other renewable  - Actual Aggregated [MW]': 'other_renewable',
    'Solar  - Actual Aggregated [MW]': 'solar',
    'Waste  - Actual Aggregated [MW]': 'waste',
    'Wind Offshore  - Actual Aggregated [MW]': 'wind_offshore',
    'Wind Onshore  - Actual Aggregated [MW]': 'wind_onshore',
}

In [None]:
# Column renaming: rename() returns a new frame, so the raw prd_in is left untouched.
prd_work = prd_in.rename(columns=col_renaming)

# Parse the timestamp from the first 16 characters of the 'time' string.
# .str[:16] propagates missing values (-> NaT) instead of raising like a
# Python slice inside .apply would.
prd_work['dt'] = pd.to_datetime(prd_work['time'].str[:16], dayfirst=True)

# Handle placeholder strings: replace them with 0.
# The original list read ['n/e', '' '-']; the missing comma concatenated the
# last two literals into '-', so the empty-string marker was silently skipped.
null_chars = ['n/e', '', '-']
prd_work_char = prd_work.replace(null_chars, 0)

## CET handling
# Keep only the first row for each timestamp.
# NOTE(review): duplicates are keyed on 'dt' alone; if the input files cover
# more than one 'area', rows of the other areas are dropped here — confirm.
cutoff = pd.Timestamp('2024-10-01 00:00:00')
prd_work_ts = (
    prd_work_char
    .drop_duplicates(subset='dt', keep='first')
    # limit time: keep rows strictly before the cutoff
    .loc[lambda frame: frame['dt'] < cutoff]
    # fill small empty gaps with the previous row's value ...
    .ffill()
    # ... and anything still missing (e.g. at the very start) with 0
    .fillna(0)
)
# The chain above is non-mutating on purpose: in-place ffill/fillna on a
# filtered frame can trigger SettingWithCopyWarning and makes the cell
# harder to re-run safely.

In [None]:
# Count of missing values per column in the cleaned frame.
prd_work_ts.isna().sum()

In [None]:
# Non-null value count of every column, grouped by the calendar year of 'dt'.
prd_work_ts.groupby(prd_work_ts['dt'].dt.year).count()

In [20]:
## Persist for later use
# Build the path with os.path.join so it matches the OS separator; the
# hard-coded backslash path only worked on Windows (on POSIX it creates a file
# literally named '..\data\curated_data\generation_clean.csv').
out_path = os.path.join('..', 'data', 'curated_data', 'generation_clean.csv')
# Make sure the target folder exists so a fresh checkout can run the notebook.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
prd_work_ts.to_csv(out_path, index=False)