In [265]:
# !pip install ctgan

In [266]:
# from ctgan import CTGAN
import pandas as pd

import plotly.graph_objs as go
from plotly.subplots import make_subplots

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
    
def plotTimeSeries(time, values, label):
    """
    Plot time series data
    
    Args:
        time (list): Time values
        values (list): Time series values
        label (str): Label for the plot
        
    Returns:
        None
    """
    # Create figure with secondary y-axis
    
    # Create a subplot
    fig = make_subplots(rows=1, cols=1)

    # Add a trace for the time series data
    trace = go.Scatter(x=time, y=values, mode='lines', name='Value')
    fig.add_trace(trace)

    # Update layout for labels and title
    fig.update_layout(
        title=label,
        xaxis_title="Time",
        yaxis_title="Value",
        template="plotly",
        margin=dict(l=0, r=0, t=30, b=0)
    )

    # Show plot
    fig.show()
    
def timeSeries(numMonths=12,numYears=10,
                trend=1,event=1, #1 for positive effect and -1 for negative
                trendSteepness = 3, #between 1 and 10
                seasonalityIntensity=0.3, #between 0 and 1
                noiseIntensity=70, #between 0 and 100
                whiteNoiseIntensity=70, #between 0 and 100
                eventIntensity=40, #between 0 and 100
                verbose=False,
                ):
    """
    Create a time series with multiple patterns
    
    Args:
        numMonths: Number of months in the time series
        numYears: Number of years in the time series
        trend: Trend of the time series
        event: Event of the time series, like COVID or protest
        trendSteepness: Steepness of the trend
        seasonalityIntensity: Intensity of the seasonality
        noiseIntensity: Intensity of the noise
        whiteNoiseIntensity: Intensity of the white noise
        eventIntensity: Intensity of the event
        verbose: Whether to print the time series
        
    Returns:
        pandas.Series: The time series
    """
    # SEASONALITY
    # series that repeats the same pattern
    timePoints = numMonths*numYears
    timeSeasonal = np.arange(timePoints)
    scale = 10 #define the range of the generated values
    values = np.where(timeSeasonal < numMonths, (scale**3), (scale-9)**2)
    
    # Repeat the pattern 10 times for 10 years
    seasonal = []
    for i in range(numYears):
        for j in range(numMonths):
            seasonal.append(values[j]*seasonalityIntensity)
    if verbose:
        plotTimeSeries(timeSeasonal, seasonal, label="Seasonality")

    # ==============================================================================
    # ADD NOISE
    noise = np.random.randn(timePoints)*noiseIntensity
    seasonal += noise
    if verbose:
        plotTimeSeries(timeSeasonal, seasonal, label="Seasonality with Noise")

    # ==============================================================================
    # MULTIPLE PATTERNS
    #The following time series contain both an upward trend and seasonality.
    #There is also some noise
    seasonalTrend = seasonal + np.arange(timePoints)*trendSteepness
    seasonalTrend = seasonalTrend*trend
    if verbose:
        plotTimeSeries(timeSeasonal, seasonalTrend, label="Seasonality + Upward Trend + Noise")

    # ==============================================================================
    # WHITE NOISE
    values = np.random.randn(timePoints)*whiteNoiseIntensity
    if verbose:
        plotTimeSeries(timeSeasonal, values, label="White Noise")

    # ==============================================================================
    # NON-STATIONARY TIME SERIES
    bigEvent = np.zeros(timePoints)
    #COVID: good for upward trend. Projuventute helps commnities and has online presence
    bigEvent[-36:-24] = np.arange(numMonths)*eventIntensity*event
    nonStationary = (seasonalTrend + bigEvent)
    plotTimeSeries(timeSeasonal, nonStationary, label="Non-stationary Time Series")

    return nonStationary

def smoothTimeSeries(data, windowSize):
    """
    Smooths the variability of a time series while preserving the trend and overall structure.

    Args:
        data (pandas.Series): The time series data.
        windowSize (int): The size of the moving average window.

    Returns:
        pandas.Series: The smoothed time series.
    """
    # Create a copy of the original data
    smoothedData = data.copy()

    # Apply moving average smoothing
    smoothedData = smoothedData.rolling(window=windowSize, center=True).mean()

    # Interpolate missing values at the edges
    smoothedData = smoothedData.interpolate()
    return smoothedData


# !wget -O pj_sample_value_wide_filled_2023.csv https://www.dropbox.com/scl/fi/drv20mwc9zi1zq5v22ckh/pj_sample_value_wide_filled_2023.csv?rlkey=bcankkw7ff32zes221j8rxzgr&dl=0

# Generate synthetic data with CTGAN 

In [267]:
# # DO NOT RUN
# # import filled dataset for 2023
# df = pd.read_csv("pj_sample_value_wide_filled_2023.csv")

# # Extract target data types
# categoricals = df.select_dtypes(exclude="number").columns.tolist()

# # Fit CTGAN
# ctgan = CTGAN(epochs=10)
# ctgan.fit(df, categoricals)

# # Generate the data
# dfSynthetic = ctgan.sample(2000)

# # export synthetic data
# dfSynthetic.to_csv("/content/ctgan_toy_dataset_10years.csv", index=False)

# Process synthetic dataset

In [268]:
# import synthetic data and  DataFrame to fill with values
dfToy = pd.read_csv('/Users/diana/Dropbox/_hackathon/deploy_2023/_data/ctgan_toy_dataset_2014_2023.csv')
dfWide2023 = pd.read_csv('/Users/diana/Dropbox/_hackathon/deploy_2023/_data/pj_sample_value_wide_filled_2023.csv')
dfAllYears = pd.read_csv('/Users/diana/Dropbox/_hackathon/deploy_2023/_data/pj_sample_value_long_notfilled_2014_2023.csv')
if 'Unnamed: 0' in dfAllYears.columns:
    dfAllYears.drop(columns=['Unnamed: 0'],inplace=True)

dfToyProcessed = pd.DataFrame()
dates = dfToy.sort_values('date', ascending=True).date.unique().tolist()

# change dates as CTGAN repeated the same dates for the synthetic data
for d in dates:
    tempList = []
    for i in range(2014,2023):
        tempList.append(d.replace('2023',str(i)))
    # select 10 samples of each date
    dfTemp = dfToy.loc[dfToy.date==d].sample(9, random_state=1)
    dfTemp['date'] = tempList
    dfToyProcessed = pd.concat([dfToyProcessed,dfTemp])

# sort by date
dfToyProcessed.sort_values('date', ascending=True, inplace=True)
dfToyProcessed.reset_index(inplace=True,drop=True)

# get rid of negative values produced by CTGAN
dfToyProcessed = dfToyProcessed.set_index('date',drop=True).abs().round(2)

# export synthetic data cleaned
dfToyProcessed.to_csv('/Users/diana/Dropbox/_hackathon/deploy_2023/_data/ctgan_toy_dataset_2014_2023_processed.csv')
dfAllYears.head(10)

Unnamed: 0,circle,kpi,periodicity,range,period_year,period_month,date,value
0,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,1,2014-01-01,51000.0
1,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,2,2014-02-01,51000.0
2,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,3,2014-03-01,51000.0
3,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,4,2014-04-01,0.0
4,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,5,2014-05-01,0.0
5,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,6,2014-06-01,0.0
6,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,7,2014-07-01,0.0
7,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,8,2014-08-01,0.0
8,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,9,2014-09-01,0.0
9,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,10,2014-10-01,17000.0


In [269]:
dfToyProcessed

Unnamed: 0_level_0,share of teams constituted as circles,share short tern leave,involuntary headcount change (FTE),reachability,count sessions on .projuventute.ch,count leads,net promoter score,private donations,additional monetization/savings from CRM,additional monetization/savings from programs
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2014-01-01,116.99,3.02,1.92,44.40,162857.14,226.58,35.0,1263671.34,3909.12,48318.73
2014-02-01,130.40,2.12,2.63,42.40,189084.12,315.47,35.0,587662.81,27773.25,183751.30
2014-03-01,129.68,1.18,1.13,42.19,182746.75,271.14,35.0,887369.03,24514.12,252685.97
2014-04-01,80.83,0.39,2.54,44.96,217150.50,936.54,35.0,972077.04,1632.79,106050.46
2014-05-01,61.88,1.37,2.46,44.64,168749.43,411.36,35.0,910949.82,65369.21,169507.84
...,...,...,...,...,...,...,...,...,...,...
2022-08-01,116.20,4.26,2.27,42.43,178699.46,737.49,35.0,1001653.86,20576.48,260766.66
2022-09-01,137.46,2.44,1.80,43.44,182089.41,777.24,35.0,521897.07,16816.78,14621.48
2022-10-01,53.69,2.00,1.94,42.80,181204.92,415.08,35.0,1419935.68,15141.87,80060.57
2022-11-01,96.57,0.71,2.35,43.41,196062.38,73.34,35.0,1387686.90,3680.96,278931.42


In [270]:
# Creating subplots
fig = make_subplots(rows=5, cols=2, subplot_titles=dfToyProcessed.columns[1:].tolist())

row = 1
col = 1
for column in dfToyProcessed.columns[1:]:
    trace = go.Histogram(x=dfToyProcessed[column], name=column, nbinsx=50, opacity=0.6)
    fig.add_trace(trace, row=row, col=col)
    col += 1
    if col > 2:
        col = 1
        row += 1

fig.update_layout(height=1200, width=1000, title_text='Distribution of Each Column in the Dataset')
fig.show()

# Compare statisticall behaviour between actual data and synthetic data

In [271]:
# check stats on toy dataset
dfToyProcessed.drop(columns=['net promoter score',
                             'private donations',
                             'share of teams constituted as circles']).describe()

Unnamed: 0,share short tern leave,involuntary headcount change (FTE),reachability,count sessions on .projuventute.ch,count leads,additional monetization/savings from CRM,additional monetization/savings from programs
count,108.0,108.0,108.0,108.0,108.0,108.0,108.0
mean,1.505926,1.978148,44.273426,196304.943241,473.924352,19818.215463,115701.212685
std,1.014453,0.746665,1.19206,18215.342345,246.926143,18622.909995,93065.04346
min,0.01,0.01,41.95,147095.3,68.25,228.56,973.97
25%,0.6875,1.6225,43.4025,186611.98,284.6925,7664.455,46050.8925
50%,1.36,2.095,44.355,200658.66,437.705,14379.22,82655.375
75%,2.1575,2.545,45.0925,210691.3225,640.7275,24588.1275,170256.81
max,4.26,3.27,47.44,226323.75,1068.92,87600.48,346565.14


In [272]:
# check stats on actual dataset
dfWide2023.drop(columns=['net promoter score',
                         'private donations',
                         'share of teams constituted as circles']).describe()

Unnamed: 0,share short tern leave,involuntary headcount change (FTE),reachability,count sessions on .projuventute.ch,count leads,additional monetization/savings from CRM,additional monetization/savings from programs
count,12.0,12.0,12.0,12.0,12.0,12.0,12.0
mean,1.7775,0.885,44.8325,179728.563333,575.916667,17000.0,70752.6675
std,0.850979,0.681262,0.887285,13847.495742,157.015609,21746.473068,74498.730391
min,0.8,0.0,43.6,158611.0,337.0,0.0,0.0
25%,0.93,0.3825,44.5225,171746.0,467.0,0.0,19500.0
50%,2.04,0.97,44.865,178822.88,576.5,8500.0,48376.335
75%,2.285,1.1225,45.175,191521.75,701.0,25500.0,99629.0025
max,3.3,2.26,46.0,203755.0,771.0,51000.0,186258.0


In [273]:
# check the difference between toy and actual
dfToyProcessed.drop(columns=['net promoter score',
                             'private donations',
                             'share of teams constituted as circles']).describe()- dfWide2023.drop(columns=['net promoter score',
                                                                                                             'private donations',
                                                                                                             'share of teams constituted as circles']).describe()

Unnamed: 0,share short tern leave,involuntary headcount change (FTE),reachability,count sessions on .projuventute.ch,count leads,additional monetization/savings from CRM,additional monetization/savings from programs
count,96.0,96.0,96.0,96.0,96.0,96.0,96.0
mean,-0.271574,1.093148,-0.559074,16576.379907,-101.992315,2818.215463,44948.545185
std,0.163473,0.065403,0.304775,4367.846603,89.910534,-3123.563073,18566.313069
min,-0.79,0.01,-1.65,-11515.7,-268.75,228.56,973.97
25%,-0.2425,1.24,-1.12,14865.98,-182.3075,7664.455,26550.8925
50%,-0.68,1.125,-0.51,21835.78,-138.795,5879.22,34279.04
75%,-0.1275,1.4225,-0.0825,19169.5725,-60.2725,-911.8725,70627.8075
max,0.96,1.01,1.44,22568.75,297.92,36600.48,160307.14


In [274]:

# transform from wide to long format
kpis = dfAllYears.kpi.unique().tolist()

# items to be removed because already completed
remove = {'net promoter score','private donations'}
kpisToTransfer = [e for e in kpis if e not in remove]

# transfer wide toy df to long df with actual data
for y in range(2014,2023):
    for k in kpisToTransfer:
        dfAllYears.loc[(dfAllYears.period_year==y)
                    &(dfAllYears.kpi==k),'value'] = dfToyProcessed.loc[str(y)+'-01-01':str(y)+'-12-01',k].values

# replicate entries for quarterly data
for i in [3,6,9,12]:
    temp = dfAllYears.loc[(dfAllYears['periodicity'] == 'quarter')&(dfAllYears.period_month==i)].values.tolist()
    dfAllYears.loc[(dfAllYears['periodicity'] == 'quarter')&(dfAllYears.period_month==i-1)]= temp
    dfAllYears.loc[(dfAllYears['periodicity'] == 'quarter')&(dfAllYears.period_month==i-2)]= temp
    
# Limit HR KPI share of teams constituted as circles to 2023 (new kpi)
dfAllYears.drop(dfAllYears.loc[(dfAllYears.period_year<2023)&
               (dfAllYears.kpi=='share of teams constituted as circles')].index, inplace=True)
    
# export long format
dfAllYears.to_csv('/Users/diana/Dropbox/_hackathon/deploy_2023/_data/pj_sample_value_long_completed_2014_2023.csv')
dfAllYears

Unnamed: 0,circle,kpi,periodicity,range,period_year,period_month,date,value
0,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,3,2014-03-01,24514.12
1,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,3,2014-03-01,24514.12
2,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,3,2014-03-01,24514.12
3,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,6,2014-06-01,23346.09
4,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,6,2014-06-01,23346.09
...,...,...,...,...,...,...,...,...
1195,HR,share short tern leave,month,0 <= % <= 100,2023,8,2023-08-01,0.93
1196,HR,share short tern leave,month,0 <= % <= 100,2023,9,2023-09-01,0.93
1197,HR,share short tern leave,month,0 <= % <= 100,2023,10,2023-10-01,0.80
1198,HR,share short tern leave,month,0 <= % <= 100,2023,11,2023-11-01,0.80


In [275]:
# Creating subplots for the time series of each KPI 
fig = make_subplots(rows=5, cols=2, subplot_titles=kpis, shared_xaxes=False)

row = 1
col = 1
for kpi in kpis:
    dfKpi = dfAllYears[dfAllYears['kpi'] == kpi]
    trace = go.Scatter(x=dfKpi['date'], y=dfKpi['value'], name=kpi, mode='lines')
    fig.add_trace(trace, row=row, col=col)
    col += 1
    if col > 2:
        col = 1
        row += 1

fig.update_layout(height=1200, width=1000, title_text='Time Series of Each KPI')
fig.show()

# Add timeseries trend, pattern and noise to synthetic data

In [276]:
# # commented code, do not use
# # SEASONALITY
# # series that repeats the same pattern
# timeSeasonal = np.arange(120)
# scale = 10
# values = np.where(timeSeasonal < 12, (scale**3), (scale-9)**2)
# # Repeat the pattern 10 times for 10 years
# seasonal = []
# for i in range(10):
#     for j in range(12):
#         seasonal.append(values[j]*0.25)
# plotTimeSeries(timeSeasonal, seasonal, label="Seasonality")

# # ==============================================================================
# # ADD NOISE
# noise = np.random.randn(120)*50
# seasonal += noise
# plotTimeSeries(timeSeasonal, seasonal, label="Seasonality with Noise")

# # ==============================================================================
# # MULTIPLE PATTERNS
# #The following time series contain both an upward trend and seasonality.
# #There is also some noise
# seasonal_upward = seasonal + np.arange(120)*5
# plotTimeSeries(timeSeasonal, seasonal_upward, label="Seasonality + Upward Trend + Noise")

# # ==============================================================================
# # WHITE NOISE
# values = np.random.randn(120)*100
# plotTimeSeries(timeSeasonal, values, label="White Noise")

# # ==============================================================================
# # NON-STATIONARY TIME SERIES
# bigEvent = np.zeros(120)
# #COVID: good for upward trend. Projuventute helps commnities and has online presence
# bigEvent[-36:-24] = np.arange(12)*75 
# nonStationary = (seasonal_upward + bigEvent)*0.05
# plotTimeSeries(timeSeasonal, nonStationary, label="Non-stationary Time Series")


In [277]:
# define kpi to apply time series pattern to: they should be monthly
# exclude donations, and share of teams constituted as circles
kpisTimeSeries = list(dfAllYears[dfAllYears.periodicity=='month'].kpi.unique())
remove = {'private donations','share of teams constituted as circles'}
kpisTimeSeries = [e for e in kpisTimeSeries if e not in remove]

# create nonStationary time series pattern, taking into account COVID
mms = MinMaxScaler((0.5,1))
nonStationary = timeSeries(numMonths=12,numYears=10,trend=1,event=1)
timeSeriesPattern = mms.fit_transform(nonStationary.reshape(-1, 1)).flatten()

# apply time series pattern to time series (product)
for k in kpisTimeSeries:
    dfAllYears.loc[(dfAllYears.kpi==k)&
                   (dfAllYears.period_year<2023),
                   'value'] = dfAllYears.loc[(dfAllYears.kpi==k)&
                                             (dfAllYears.period_year<2023),
                                             'value']*timeSeriesPattern[:120-12]
                   

dfAllYears.head(10)

Unnamed: 0,circle,kpi,periodicity,range,period_year,period_month,date,value
0,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,3,2014-03-01,24514.12
1,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,3,2014-03-01,24514.12
2,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,3,2014-03-01,24514.12
3,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,6,2014-06-01,23346.09
4,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,6,2014-06-01,23346.09
5,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,6,2014-06-01,23346.09
6,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,9,2014-09-01,14161.84
7,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,9,2014-09-01,14161.84
8,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,9,2014-09-01,14161.84
9,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,12,2014-12-01,17558.45


In [278]:
# Creating subplots for the time series of each KPI 
fig = make_subplots(rows=5, cols=2, subplot_titles=kpis, shared_xaxes=False)

row = 1
col = 1
for kpi in kpis:
    dfKpi = dfAllYears[dfAllYears['kpi'] == kpi]
    trace = go.Scatter(x=dfKpi['date'], y=dfKpi['value'], name=kpi, mode='lines')
    fig.add_trace(trace, row=row, col=col)
    col += 1
    if col > 2:
        col = 1
        row += 1

fig.update_layout(height=1200, width=1000, title_text='Time Series of Each KPI')
fig.show()


# Smoothen final time series using moving average (less noisy)

In [279]:
# Smooth the time series with a window size of 5
dfAllYearsSmooth = dfAllYears.copy()
dfAllYearsSmooth.sort_values('date',inplace=True)

# apply smoothing to all kpis except these two
for kpi in kpis:
    if kpi == 'private donations' or kpi == 'share of teams constituted as circles':
        pass
    else:
        dfKpiSmooth = smoothTimeSeries(
            dfAllYears.loc[
                (dfAllYears.kpi==kpi)&
                (dfAllYears.periodicity=='month')].sort_values('date')['value'],windowSize=5).tolist()
        
        dfAllYearsSmooth.loc[
            (dfAllYearsSmooth.kpi==kpi)&
            (dfAllYearsSmooth.periodicity=='month'),'value'] = dfKpiSmooth
    
# reorder columns
dfAllYearsSmooth.sort_values(['circle','kpi','date'],inplace=True)
dfAllYearsSmooth.reset_index(inplace=True,drop=True)
dfAllYearsSmooth

Unnamed: 0,circle,kpi,periodicity,range,period_year,period_month,date,value
0,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,3,2014-03-01,24514.12
1,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,3,2014-03-01,24514.12
2,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,3,2014-03-01,24514.12
3,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,6,2014-06-01,23346.09
4,Digital,additional monetization/savings from CRM,quarter,0 <= X,2014,6,2014-06-01,23346.09
...,...,...,...,...,...,...,...,...
1087,Programs - Parents -Online,net promoter score,year,-100 <= X <= 100,2023,8,2023-08-01,35.00
1088,Programs - Parents -Online,net promoter score,year,-100 <= X <= 100,2023,9,2023-09-01,35.00
1089,Programs - Parents -Online,net promoter score,year,-100 <= X <= 100,2023,10,2023-10-01,35.00
1090,Programs - Parents -Online,net promoter score,year,-100 <= X <= 100,2023,11,2023-11-01,35.00


In [280]:
# Creating subplots for the time series of each KPI (smoothed)
fig = make_subplots(rows=5, cols=2, subplot_titles=kpis, shared_xaxes=False)

row = 1
col = 1
for kpi in kpis:
    dfKpi = dfAllYearsSmooth[dfAllYearsSmooth['kpi'] == kpi]
    trace = go.Scatter(x=dfKpi['date'], y=dfKpi['value'], name=kpi, mode='lines')
    fig.add_trace(trace, row=row, col=col)
    col += 1
    if col > 2:
        col = 1
        row += 1

fig.update_layout(height=1200, width=1000, title_text='Time Series of Each KPI')
fig.show()


In [283]:
# check for missing values caused by moving average. Replace nans if existent
if dfAllYearsSmooth[dfAllYearsSmooth.value.isna()]['period_year'].empty == False:
    dfNan = dfAllYearsSmooth.loc[dfAllYearsSmooth.value.isna(),'circle':'date']

    # retrieve values that the moving average cancelled out 
    for i in range(len(dfNan)):
        tempDict = dict(dfNan.iloc[i])
        val = dfAllYears.loc[
            (dfAllYears.circle == tempDict['circle'])&
            (dfAllYears.kpi == tempDict['kpi'])& 
            (dfAllYears.date == tempDict['date']),'value'].values[0]
        
        dfAllYearsSmooth.loc[
            (dfAllYearsSmooth.circle == tempDict['circle'])&
            (dfAllYearsSmooth.kpi == tempDict['kpi'])& 
            (dfAllYearsSmooth.date == tempDict['date']),'value'] = val
        
print(dfAllYearsSmooth.loc[dfAllYearsSmooth.value.isna(),'circle':].empty == True)

True


In [284]:
# Creating subplots for the time series of each KPI (smoothed)
fig = make_subplots(rows=5, cols=2, subplot_titles=kpis, shared_xaxes=False)

row = 1
col = 1
for kpi in kpis:
    dfKpi = dfAllYearsSmooth[dfAllYearsSmooth['kpi'] == kpi]
    trace = go.Scatter(x=dfKpi['date'], y=dfKpi['value'], name=kpi, mode='lines')
    fig.add_trace(trace, row=row, col=col)
    col += 1
    if col > 2:
        col = 1
        row += 1

fig.update_layout(height=1200, width=1000, title_text='Time Series of Each KPI')
fig.show()

In [285]:
# export long format
dfAllYearsSmooth.to_csv('/Users/diana/Dropbox/_hackathon/deploy_2023/_data/pj_sample_value_long_completed_timeSeries_smooth_2014_2023.csv')
dfAllYears.to_csv('/Users/diana/Dropbox/_hackathon/deploy_2023/_data/pj_sample_value_long_completed_timeSeries_2014_2023.csv')

In [288]:
dfAllYearsSmooth.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1092 entries, 0 to 1091
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   circle        1092 non-null   object 
 1   kpi           1092 non-null   object 
 2   periodicity   1092 non-null   object 
 3   range         1092 non-null   object 
 4   period_year   1092 non-null   int64  
 5   period_month  1092 non-null   int64  
 6   date          1092 non-null   object 
 7   value         1092 non-null   float64
dtypes: float64(1), int64(2), object(5)
memory usage: 68.4+ KB


In [None]:
codeSnippetJS = """
                // Initialize the echarts instance based on the prepared dom
                var myChart = echarts.init(document.getElementById('main'));

                // Apply Montserrat font to ECharts container
                document.getElementById('main').style.fontFamily ='Montserrat, sans-serif';

                // Specify the configuration items and data for the chart
                // (you should replace this with actual grouped data)

                // Prepare data
                var data = """ +"{name: '"+k.capitalize()+"', type: 'bar', stack: 'total', data: "+str(
                        values)+", itemStyle: {color: '"+palette[i]+"'}}"+ """;

                // Configure chart options
                var option = {
                    title: {
                        text: '"""+k.capitalize()+""" for Circle """+circle.capitalize()+""", Year """+str(year)+ """',
                        textStyle: {
                            fontFamily: 'Montserrat',
                            fontSize: 14 
                        }
                    },
                    tooltip: {
                        trigger: 'axis',
                        formatter: function (params) {
                            return params[0].axisValueLabel + '<br/>' +
                                params.map(function (item) {
                                    return item.marker + ' ' + item.seriesName + ': ' + item.data;
                                }).join('<br/>');
                        },
                        textStyle: {
                            fontFamily: 'Montserrat',
                            fontSize: 12
                        }
                    },
                    legend: {
                        data: ['"""+kpis[i]+"""'], //KPIs
                        bottom: 0,
                        textStyle: {
                            fontFamily: 'Montserrat'
                        }
                    },
                    xAxis: {
                        type: 'category',
                        boundaryGap: true,
                        data: """+str(monthsRange)+""", //MONTHS
                        axisLabel: {
                            textStyle: {
                                fontFamily: 'Montserrat'
                            }
                        }
                    },
                    yAxis: {
                        type: 'value',
                        axisLabel: {
                            textStyle: {
                                fontFamily: 'Montserrat'
                            }
                        }
                    },
                    series: data
                };

                // Add markLine to one of the series
                data[0].markLine = {
                    silent: true,
                    data: [
                        {yAxis: """+str(target)+""", lineStyle: {color: '#FF0000', width: 1.5, type: 'dotted'},
                                    label: {
                                            show: true,
                                            position: 'end', // position of the label, can be 'start', 'middle', or 'end'
                                            formatter: 'Target', // label text
                                            color: '#000000' // text color
                                    }
                        },
                        {yAxis: """+str(baseline)+""", lineStyle: {color: '#FF0000', width: 1.5, type: 'dotted'},
                                    label: {
                                            show: false,
                                            position: 'end', // position of the label, can be 'start', 'middle', or 'end'
                                            formatter: 'Baseline', // label text
                                            color: '#000000' // text color
                                    }
                        }
                    ],
                    symbol: 'none', // This will remove the arrow at the ends of the line
                    label: {
                        show: false // Set to true to display the value label near the line
                    }
                };

                // Display the chart using the configuration items and data just specified.
                myChart.setOption(option);
                """