In [None]:
import pandas as pd
import numpy as np 
import os 

import plotly.express as px
import plotly.graph_objs as go
from statsmodels.tsa.seasonal import seasonal_decompose


import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose 


import seaborn as sns 
import matplotlib.pyplot as plt 
import glob 


# Contenu
Les données sont téléchargées depuis l'ENTSO-E Transparency Platform et représentent respectivement la charge électrique suisse et la production par source.
- Lecture et nettoyage des données brutes
- Points d'attention sur le traitement du temps
- Analyse exploratoire
- Décomposition en tendance / saisonnalité / résidus

### Data Import


In [None]:
# Collect every raw load CSV export. glob returns paths in arbitrary,
# filesystem-dependent order, so sort them: the row order of the concatenated
# frame matters later, when duplicated timestamps are dropped with keep='first'.
load_files = sorted(glob.glob('../data/load/*.csv'))

# Stack all files into a single frame with a fresh 0..n-1 index.
load_in = pd.concat((pd.read_csv(file) for file in load_files), ignore_index=True)

print(load_in.shape)
load_in.head()

In [None]:
# Inspect the last rows of the raw concatenated load data.
load_in.tail()

In [None]:
# Show column dtypes and non-null counts of the raw load data.
load_in.info()

In [None]:
## Work on an independent copy of the raw data, with short column names
load_work = (
    load_in
    .copy(deep=True)
    .set_axis(['time', 'load_forecast', 'load'], axis='columns')
)
## The timestamp column still has to be parsed correctly

def parse_ts( df, column_name ):
    """
    Parse a time-interval column into a single datetime column.

    The interval strings are expected to look like
    '01.01.2019 00:00 - 01.01.2019 01:00'; only the interval start (the text
    before the first '-') is kept.

    Parameters:
    - df: DataFrame containing the interval column. It is not modified.
    - column_name: Name of the column holding the interval strings.

    Returns:
    - A new DataFrame without `column_name` and with an additional 'dt' column
      holding the interval start as a datetime. Values that cannot be parsed
      become NaT.
    """
    # Split at most once and keep the left part only: a value with no '-' or
    # with several of them no longer breaks the parsing, and the unused
    # interval end is never materialised.
    start = df[column_name].str.split('-', n=1).str[0].str.strip()

    # drop() returns a new frame, so the caller's DataFrame is left untouched.
    out = df.drop(columns=[column_name])

    # Dates are written day-first (dd.mm.yyyy); unparseable values become NaT.
    out['dt'] = pd.to_datetime(start, dayfirst=True, errors='coerce')

    return out


## Parse the timestamps, printing the frame shape before and after
print( load_work.shape )
load_work_parsed = parse_ts( load_work , 'time')
print( load_work_parsed.shape)

## Convert both measurement columns to numbers; non-numeric entries become NaN
for numeric_col in ('load', 'load_forecast'):
    load_work_parsed[numeric_col] = pd.to_numeric( load_work_parsed[numeric_col], errors = 'coerce' )



In [None]:
# Check the dtypes and non-null counts after parsing.
load_work_parsed.info()

In [None]:
## Pay attention to the null values: count them per column
load_work_parsed.isnull().sum()

In [None]:
# Visualise where the null values are located (rows x columns)
null_ax = sns.heatmap( load_work_parsed.isnull() )
null_ax.set_title( 'Null Values for Load')

In [None]:
## Strategy: keep complete months only, i.e. everything strictly before October 2024
load_clean = load_work_parsed.loc[ load_work_parsed['dt'] < '2024-10-01 00:00:00' ]

load_clean.tail()

In [None]:
## Time handling
##  - mind the time reference: CET
##  - check for duplicated timestamps
##  - check for missing values

# Count the non-null values per timestamp and list the 20 timestamps
# with the highest 'load' count
(
    load_clean
    .groupby('dt')
    .count()
    .sort_values(by='load', ascending=False)
    .head(20)
)

In [None]:
## Careful here: duplicated timestamps are removed, keeping the first occurrence

print( load_clean.shape )
load_df = load_clean.drop_duplicates( subset = 'dt', keep = 'first' ).copy( deep = True )
print( load_df.shape )

In [None]:
# Rows whose 'load' value is missing
load_df.loc[ load_df['load'].isnull() ]

In [None]:
# Examine a particular case: the first rows after 2023-03-26 00:00
# NOTE(review): presumably chosen because of the switch to summer time on that date — confirm

load_df[load_df.dt>'2023-03-26 00:00:00'].head(10)

In [None]:
# Check the number of timestamps (non-null values per column) for each year
load_df.groupby( load_df.dt.dt.year).count()

In [None]:
## Simply drop the rows containing null values
# In a production setting one should pay close attention to the impact of doing so
#  
load_df = load_df.dropna()

In [None]:
# Re-check the per-year counts now that rows with nulls have been dropped
load_df.groupby( load_df.dt.dt.year).count()

In [None]:
# Look again at the first rows after 2023-03-26 00:00, after dropping nulls
load_df[load_df.dt>'2023-03-26 00:00:00'].head(10)

In [None]:
# Count the remaining null values per column
load_df.isnull().sum()

In [None]:
## Persist for later use
# Build the path with os.path.join instead of hard-coded Windows backslashes,
# so the file lands in ../data/curated_data on any operating system.
load_df.to_csv( os.path.join('..', 'data', 'curated_data', 'load_clean.csv'), index = False )

## Attention 
Ne jamais sous-estimer cette partie : elle peut causer pas mal de problèmes si elle n'est pas faite correctement !

Connaître la matière première est primordial pour effectuer de bonnes analyses.

Ressources d'intérêt :
- https://fr.wikipedia.org/wiki/Temps_universel_coordonn%C3%A9 
- https://fr.wikipedia.org/wiki/Heure_normale_d%27Europe_centrale 
- https://www.forecastclub.blog/2024/02/how-to-handle-time-series-missing-data.html 

## Data Quality and exploration 

In [None]:
# Interactive line chart of the cleaned 'load' column over time.
px.line( data_frame = load_df , x = 'dt' , y = 'load' , title = 'Demande electrique en Suisse [MW]')

In [None]:
#sns.lineplot( data = load_df[:8000] , x = 'dt' , y = 'load' )

In [None]:
import pandas as pd
import plotly.express as px

def plot_timeseries_with_granularity(df_in, ts_column, column, granularity, hue=None):
    """
    Plot the average of a column per time granularity as an interactive Plotly line chart.

    Parameters:
    - df_in: DataFrame containing the data. It is copied, never modified.
    - ts_column: The column containing the timestamp or datetime.
    - column: The column whose mean is plotted on the y-axis.
    - granularity: The x-axis bucket: 'week' (ISO week number), 'dayofweek',
      'month' or 'hour'.
    - hue: Optional; draws one line per distinct value. 'year' and 'quarter'
      are derived from ts_column; any other value must be the name of a
      column of df_in.

    Returns:
    - None. The figure is displayed with fig.show().

    Raises:
    - ValueError: if granularity is not supported, or if hue is neither
      'year', 'quarter' nor the name of an existing column.
    """
    # Work on a copy so the caller's frame is left untouched
    df = df_in.copy( deep = True )
    df[ts_column] = pd.to_datetime(df[ts_column])
    timestamps = df[ts_column].dt

    # One extractor per supported granularity
    extractors = {
        'week': lambda ts: ts.isocalendar().week,
        'dayofweek': lambda ts: ts.dayofweek,
        'month': lambda ts: ts.month,
        'hour': lambda ts: ts.hour,
    }
    if granularity not in extractors:
        raise ValueError("Invalid granularity. Choose from 'week', 'dayofweek', 'month', 'hour'.")
    df['granularity'] = extractors[granularity](timestamps)

    group_keys = ['granularity']
    labels = {'granularity': granularity.capitalize(), column: f'Average {column}'}
    plot_kwargs = {}

    if hue:
        if hue == 'year':
            df['hue'] = timestamps.year
        elif hue == 'quarter':
            df['hue'] = timestamps.quarter
        elif hue in df_in.columns:
            # Any existing column can be used to split the lines
            df['hue'] = df[hue]
        else:
            # Fail early with a clear message instead of a KeyError on the
            # missing 'hue' column in the groupby below
            raise ValueError(
                f"Invalid hue {hue!r}. Choose 'year', 'quarter' or the name of an existing column."
            )
        group_keys.append('hue')
        labels['hue'] = hue.capitalize()
        plot_kwargs['color'] = 'hue'

    # Mean of the column per granularity bucket (and per hue value, if any)
    grouped = df.groupby(group_keys)[column].mean().reset_index()

    fig = px.line(grouped, x='granularity', y=column,
                  labels=labels,
                  title=f'Average {column} by {granularity}',
                  **plot_kwargs)

    # Show the interactive plot
    fig.show()



## EDA Analysis  


In [None]:
# NOTE(review): sns.set_palette configures seaborn, while the chart below is drawn with Plotly (px.line) — confirm this call is needed
sns.set_palette('tab10')
# Average load per week number, one line per year
plot_timeseries_with_granularity( load_df, ts_column = 'dt', column = 'load' , granularity= 'week' , hue = 'year')

In [None]:
# NOTE(review): sns.set_palette configures seaborn, while the chart below is drawn with Plotly (px.line) — confirm this call is needed
sns.set_palette('tab10')
## Mind the dataset: how many weeks are there in 2024?
 
# Average load per day of the week, one line per year
plot_timeseries_with_granularity( load_df, 
                                  ts_column = 'dt', 
                                  column = 'load' ,
                                  granularity= 'dayofweek' , 
                                  hue = 'year')

In [None]:
# NOTE(review): sns.set_palette configures seaborn, while the chart below is drawn with Plotly (px.line) — confirm this call is needed
sns.set_palette('tab10')
# Average load per hour of the day, one line per year
plot_timeseries_with_granularity( load_df, 
                                  ts_column = 'dt', 
                                  column = 'load' , 
                                  granularity= 'hour' , 
                                  hue = 'year')

## Trend, Seasonality, Residual

In [None]:
def simple_seasonal_decompose(df, ts_column, column, model='additive', freq=None):
    """
    Decompose a time series into trend, seasonal and residual components and plot them with Matplotlib.

    Parameters:
    - df: DataFrame containing the data. It is not modified.
    - ts_column: The column containing the timestamp or datetime.
    - column: The column containing the time series values.
    - model: The type of decomposition ('additive' or 'multiplicative').
    - freq: Passed to seasonal_decompose as `period`; when None, the period is
      left for seasonal_decompose to determine.

    Returns:
    - The decomposition result returned by seasonal_decompose.
    """
    # Convert and index on a copy: the original assigned the converted column
    # back into the caller's frame, mutating it (and triggering pandas'
    # SettingWithCopyWarning when the frame is a filtered slice).
    indexed = df.copy()
    indexed[ts_column] = pd.to_datetime(indexed[ts_column])
    indexed = indexed.set_index(ts_column)

    # NOTE(review): extrapolate_trend=True is passed through unchanged;
    # statsmodels documents this argument as an int or 'freq' — confirm that
    # True (equal to 1 in Python) is the intended setting.
    decomposition = seasonal_decompose(indexed[column], model=model, period=freq, extrapolate_trend=True)

    # Plot the decomposition components (observed, trend, seasonal, residual)
    decomposition.plot()
    plt.show()

    return decomposition

In [None]:
# Aggregate to calendar months: index by timestamp, sum every column per
# month ('MS' = month start), then turn the index back into the 'dt' column
monthly_load_df = (
    load_df
    .set_index('dt')
    .resample('MS')
    .sum()
    .reset_index()
)
monthly_load_df

In [None]:
# Additive decomposition (the function's default model) of the monthly load
decomposed  = simple_seasonal_decompose( monthly_load_df , ts_column= 'dt', column = 'load')

In [None]:
# Inspecting the residuals helps to see whether the simple decomposition fails to capture the full behaviour, or some specific effects
#decomposed.resid.plot()
#decomposed.resid.hist()

In [None]:
## Different time series can behave differently!

### Generation 
L'analyse des séries temporelles peut donner des résultats différents selon la nature du phénomène observé.

In [None]:
## The generation data was already prepared beforehand
# Build the path with os.path.join instead of hard-coded Windows backslashes,
# so the file is found in ../data/curated_data on any operating system.
prd_in = pd.read_csv(os.path.join('..', 'data', 'curated_data', 'generation_clean.csv'))
prd_in['dt'] = pd.to_datetime( prd_in['dt'])
 
# Keep only the timestamp and the two sources analysed below
prd_df  = prd_in[['dt','solar','wind_onshore']]


In [None]:
# Solar and onshore wind generation on a shared time axis, one trace per source
f = go.Figure( layout = dict( title = 'Solar & Wind Generation [MW]' ) )
for j in ['solar','wind_onshore']:
    f.add_trace( go.Scatter( x = prd_df['dt'] , y = prd_df[j], name = j ))

f.show()

In [None]:
# Peek at a few rows (positions 10 to 14) of the generation data
prd_df[10:15]

In [None]:
# Average solar generation per month, one line per year
plot_timeseries_with_granularity(prd_df , 'dt' , 'solar', 'month','year')

In [None]:
# Average solar generation per week number, one line per year
plot_timeseries_with_granularity(prd_df , 'dt' , 'solar', 'week','year')

In [None]:
# Monthly totals of solar generation ('MS' = month start), from 2020 onwards
monthly_solar_df = (
    prd_df[['dt','solar']]
    .set_index('dt')
    .resample('MS')
    .sum()
    .reset_index()
)
monthly_solar_df = monthly_solar_df[monthly_solar_df.dt.dt.year >= 2020]

# Multiplicative decomposition with a 12-month period
decomposed  = simple_seasonal_decompose( monthly_solar_df, 'dt', 'solar', 'multiplicative', 12 )

In [None]:
# First rows of the monthly solar totals
monthly_solar_df.head()

In [None]:
## STL is a second decomposition technique, tried here to improve the shape
## of the decomposition obtained with seasonal_decompose.

from statsmodels.tsa.seasonal import STL

# period=12 states the yearly cycle of the monthly data explicitly (the same
# value as freq=12 used for seasonal_decompose) instead of relying on it being
# inferred from the index. seasonal=13 is the length of the seasonal smoother,
# which STL requires to be odd; it is not the cycle length.
stl = STL(monthly_solar_df.set_index('dt')['solar'], period=12, seasonal=13)
result = stl.fit()

# Plotting the components, one panel each on a shared time axis
fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, figsize=(10, 8), sharex=True)

# Original series
ax1.plot(result.observed)
ax1.set_ylabel('Observed')
ax1.set_title('STL Decomposition of Solar Generation Time Series')

# Trend component
ax2.plot(result.trend)
ax2.set_ylabel('Trend')

# Seasonal component
ax3.plot(result.seasonal)
ax3.set_ylabel('Seasonal')

# Residuals
ax4.plot(result.resid)
ax4.set_ylabel('Residuals')
# The original ended on a dangling `ax4.set`, which does nothing; label the
# shared x-axis and render the figure instead.
ax4.set_xlabel('Date')

fig.tight_layout()
plt.show()
