In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# Mount Google Drive only when the notebook runs inside Google Colab.
# `google.colab` cannot be imported anywhere else, so an unguarded import would
# abort a local "Restart & Run All" before any data is loaded.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running in Colab: nothing to mount
else:
    drive.mount('/content/drive', force_remount=True)

In [1]:
# !wget -O pj_sample_value.csv https://www.dropbox.com/scl/fi/oeljyu7gqz1gddqpdf4l0/pj_sample_value.csv?rlkey=9m6zmi20wfvgi0u0ax467nn30&dl=0

import os

# Location of the KPI sample CSV. The original local path stays the default,
# but it can be overridden with the PJ_SAMPLE_VALUE_CSV environment variable so
# the notebook is not tied to one user's home directory.
path = os.environ.get(
    'PJ_SAMPLE_VALUE_CSV',
    '/Users/diana/Dropbox/_hackathon/deploy_2023/_data/pj_sample_value.csv',
)

In [3]:
# load dataset
# Reads the CSV found at `path` into `df`; the trailing expression renders the
# first five rows as the cell output.
df = pd.read_csv(path)
df.head(5)

Unnamed: 0,circle,kpi,periodicity,range,period_year,period_month,value
0,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,1,35.0
1,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,2,50.0
2,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,3,50.0
3,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,4,55.0
4,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,5,70.0


In [4]:
# Print the distinct values of every column to get a feel for the data
for col_name in df.columns:
    print(f"{col_name}------------------------")
    print(df[col_name].unique())
    print('')

# Keep the distinct values of the categorical / time columns for later cells
circles = list(df['circle'].unique())
kpis = list(df['kpi'].unique())
periods = list(df['periodicity'].unique())
ranges = list(df['range'].unique())
years = list(df['period_year'].unique())
months = list(df['period_month'].unique())

circle------------------------
['HR' 'Programs - Children - Counceling' 'Programs - Parents -Online'
 'Fundraising' 'Digital' 'Programs']

kpi------------------------
['share of teams constituted as circles' 'share short tern leave'
 'involuntary headcount change (FTE)' 'reachability'
 'count sessions on .projuventute.ch' 'count leads' 'net promoter score'
 'private donations' 'additional monetization/savings from CRM'
 'additional monetization/savings from programs']

periodicity------------------------
['month' 'quarter' 'year']

range------------------------
['0 <= % <= 100' '0 <= X' '0 <= X <= 100']

period_year------------------------
[2023]

period_month------------------------
[ 1  2  3  4  5  6  7  8  9 10 11 12]

value------------------------
[3.500000e+01 5.000000e+01 5.500000e+01 7.000000e+01 8.000000e+01
 8.500000e+01 9.000000e+01          nan 2.040000e+00 2.200000e+00
 2.540000e+00 2.170000e+00 3.300000e+00 2.580000e+00 9.300000e-01
 2.260000e+00 9.800000e-01 1.470000e+00 

In [5]:
# identify nan values
# A cell only renders its last expression automatically. The preview therefore
# has to be displayed explicitly, otherwise the dropna() call below (which
# evaluates to None) leaves the cell without output.
# `display` is provided by the IPython kernel; no import is needed in a notebook.
display(df[df.value.isna()].head(5))

# drop nan values
# Removes, in place, every row that has a missing entry in any column.
df.dropna(inplace=True)

Unnamed: 0,circle,kpi,periodicity,range,period_year,period_month,value
8,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,9,
9,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,10,
10,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,11,
11,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,12,
20,HR,share short tern leave,month,0 <= % <= 100,2023,9,


In [7]:
# normalise values in relation to their kpi only
# Group by 'kpi' column
df_grouped = df.groupby('kpi')


def normalize(x):
    """Min-max scale the values of one KPI group.

    The smallest value of the group maps to 0 and the largest to 1.
    Groups without any spread (a single observation, or several observations
    that are all equal) cannot be scaled because ``max - min`` is 0; they are
    mapped to 1 instead of producing NaN from a 0/0 division.
    """
    lowest, highest = np.min(x), np.max(x)
    spread = highest - lowest
    if np.size(x) == 1 or spread == 0:
        # scalar result: transform() broadcasts it over the whole group
        return 1
    return (x - lowest) / spread


# Normalize the values within each group
df['value_norm'] = df_grouped['value'].transform(normalize)
df.head(5)

Unnamed: 0,circle,kpi,periodicity,range,period_year,period_month,value,value_norm
0,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,1,35.0,0.0
1,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,2,50.0,0.272727
2,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,3,50.0,0.272727
3,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,4,55.0,0.363636
4,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,5,70.0,0.636364


## Overall relationships

In [8]:
def genSankey(df, cat_cols=(), value_cols='', title='Sankey Diagram'):
    """Build a Plotly Sankey figure specification from a tidy DataFrame.

    Every pair of neighbouring columns in ``cat_cols`` forms one stage of the
    diagram. The distinct values of those columns are the nodes; the width of
    a link is the sum of ``value_cols`` over all rows sharing the same
    (source, target) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the columns named in ``cat_cols`` and ``value_cols``.
    cat_cols : sequence of str
        At least two column names, ordered from the left-most to the
        right-most stage of the diagram.
    value_cols : str
        Name of the column that is summed per link.
    title : str
        Text used as the layout title.

    Returns
    -------
    dict
        ``{'data': [trace], 'layout': layout}`` where ``trace`` is a dict
        describing one sankey trace (``node`` labels/colours and ``link``
        source/target/value lists) and ``layout`` a dict of layout settings.

    Raises
    ------
    ValueError
        If fewer than two columns are passed in ``cat_cols``.

    Notes
    -----
    A value that occurs in more than one of the ``cat_cols`` columns is
    represented by a single node, because labels are de-duplicated globally.
    """
    cat_cols = list(cat_cols)
    if len(cat_cols) < 2:
        raise ValueError('cat_cols must name at least two columns '
                         '(one source stage and one target stage)')

    # Node labels in order of first appearance, column by column. Using the
    # order of appearance (instead of a set) keeps node order and colours
    # identical from one run to the next.
    label_list = list(dict.fromkeys(
        label for cat in cat_cols for label in df[cat].unique().tolist()
    ))

    # Ten-colour cycle used for the nodes.
    base_palette = [
        'rgba(31, 119, 180, 0.8)',
        'rgba(255, 127, 14, 0.8)',
        'rgba(44, 160, 44, 0.8)',
        'rgba(214, 39, 40, 0.8)',
        'rgba(148, 103, 189, 0.8)',
        'rgba(140, 86, 75, 0.8)',
        'rgba(227, 119, 194, 0.8)',
        'rgba(127, 127, 127, 0.8)',
        'rgba(188, 189, 34, 0.8)',
        'rgba(23, 190, 207, 0.8)',
    ]
    # One colour per node, however many nodes there are, so callers can always
    # look up the colour of a node by its index.
    color_list = [base_palette[i % len(base_palette)]
                  for i in range(len(label_list))]
    # The previous hand-written palette used magenta for the 36th node; keep
    # that so diagrams with that many nodes look the same as before.
    if len(color_list) > 35:
        color_list[35] = 'rgba(255,0,255, 0.8)'

    # transform df into source-target pairs, one frame per neighbouring pair
    stage_frames = []
    for source_col, target_col in zip(cat_cols[:-1], cat_cols[1:]):
        # copy so renaming the columns never touches (a view of) the input df
        pair_df = df[[source_col, target_col, value_cols]].copy()
        pair_df.columns = ['source', 'target', 'count']
        stage_frames.append(pair_df)

    # Aggregate once, after all stages are collected. This also covers the
    # two-column case, which previously returned one un-summed link per row.
    source_target_df = (
        pd.concat(stage_frames)
        .groupby(['source', 'target'])
        .agg({'count': 'sum'})
        .reset_index()
    )

    # translate labels into node indices through a dict (constant-time lookup)
    label_index = {label: position for position, label in enumerate(label_list)}
    source_ids = [label_index[label] for label in source_target_df['source']]
    target_ids = [label_index[label] for label in source_target_df['target']]

    # creating the sankey diagram
    data = {
        'type': 'sankey',
        'domain': {'x': [0, 1], 'y': [0, 1]},
        'orientation': 'h',
        'valueformat': '.0f',
        'valuesuffix': 'X',
        'node': {
            'pad': 15,
            'thickness': 15,
            'line': {'color': 'black', 'width': 0.5},
            'label': label_list,
            'color': color_list,
        },
        'link': {
            'source': source_ids,
            'target': target_ids,
            'value': source_target_df['count'].tolist(),
        },
    }

    def _menu(y, method, prop, choices):
        # One update menu: every (label, value) choice becomes a button that
        # sets `prop` to `value` through the given plotly update method.
        return {
            'y': y,
            'buttons': [
                {'label': label, 'method': method, 'args': [prop, value]}
                for label, value in choices
            ],
        }

    layout = {
        # the caller-supplied title (it used to be ignored in favour of "KPI")
        'title': {'text': title},
        'width': 1118,
        'height': 772,
        'font': {'size': 10},
        'updatemenus': [
            _menu(1, 'relayout', 'paper_bgcolor',
                  [('Light', 'white'), ('Dark', 'black')]),
            _menu(0.9, 'restyle', 'node.thickness',
                  [('Thick', 15), ('Thin', 8)]),
            _menu(0.8, 'restyle', 'node.pad',
                  [('Small gap', 15), ('Large gap', 20)]),
            _menu(0.7, 'restyle', 'arrangement',
                  [('Snap', 'snap'), ('Perpendicular', 'perpendicular'),
                   ('Freeform', 'freeform'), ('Fixed', 'fixed')]),
            _menu(0.6, 'restyle', 'orientation',
                  [('Horizontal', 'h'), ('Vertical', 'v')]),
        ],
    }

    fig = dict(data=[data], layout=layout)
    return fig

In [9]:
# @title
# Sankey over four stages: circle -> kpi -> periodicity -> range
data = genSankey(
    df,
    cat_cols=['circle', 'kpi', 'periodicity', 'range'],
    value_cols='value',
    title='KPI project',
)
sankey_spec = data['data'][0]
node_spec = sankey_spec['node']
link_spec = sankey_spec['link']

# Instead of the default grey, every link takes the colour of its source node,
# with the alpha component "0.8" swapped for a more transparent one.
opacity = 0.4
link_spec['color'] = [
    node_spec['color'][src].replace("0.8", str(opacity))
    for src in link_spec['source']
]

sankey_trace = go.Sankey(
    valueformat=".0f",
    valuesuffix=" total",
    node=dict(
        pad=15,
        thickness=15,
        line=dict(color="black", width=0.5),
        label=node_spec['label'],
        color=node_spec['color'],
    ),
    link=dict(
        source=link_spec['source'],
        target=link_spec['target'],
        value=link_spec['value'],
        color=link_spec['color'],
    ),
)

fig = go.Figure(data=[sankey_trace])
fig.update_layout(title_text="Data Exploration KPI project", font_size=10)
fig.show()

In [10]:
# @title
# Sankey over three stages: circle -> kpi -> range
data = genSankey(df, cat_cols=['circle', 'kpi', 'range'],
                 value_cols='value', title='KPI project')
trace_spec = data['data'][0]

# Links inherit the colour of their source node at a much lower alpha
trace_spec['link']['color'] = [
    trace_spec['node']['color'][src].replace("0.8", str(0.1))
    for src in trace_spec['link']['source']
]

node_kwargs = {
    'pad': 15,
    'thickness': 15,
    'line': {'color': "black", 'width': 0.5},
    'label': trace_spec['node']['label'],
    'color': trace_spec['node']['color'],
}
link_kwargs = {
    key: trace_spec['link'][key]
    for key in ('source', 'target', 'value', 'color')
}

fig = go.Figure(data=[
    go.Sankey(valueformat=".0f", valuesuffix=" total",
              node=node_kwargs, link=link_kwargs)
])
fig.update_layout(title_text="Data Exploration KPI project", font_size=10)
fig.show()

In [11]:
# @title
# Sunburst of the circle -> kpi -> range hierarchy (no `values` column passed)
fig = px.sunburst(df, path=['circle', 'kpi', 'range'])
fig.update_layout(
    margin=dict(t=0, l=0, r=0, b=0),
    uniformtext=dict(minsize=10, mode='hide'),
)
fig.show()

## By range

In [12]:
# Distinct circle / kpi / periodicity combinations measured as a percentage
df.loc[df['range'] == '0 <= % <= 100', 'circle':'periodicity'].drop_duplicates()

Unnamed: 0,circle,kpi,periodicity
0,HR,share of teams constituted as circles,month
12,HR,share short tern leave,month
24,HR,involuntary headcount change (FTE),month
36,Programs - Children - Counceling,reachability,quarter


In [13]:
# Distinct circle / kpi / periodicity combinations with range '0 <= X <= 100'
df.loc[df['range'] == '0 <= X <= 100', 'circle':'periodicity'].drop_duplicates()

Unnamed: 0,circle,kpi,periodicity
64,Programs - Parents -Online,net promoter score,year


In [14]:
# Distinct circle / kpi / periodicity combinations with range '0 <= X'
df.loc[df['range'] == '0 <= X', 'circle':'periodicity'].drop_duplicates()

Unnamed: 0,circle,kpi,periodicity
40,Programs - Parents -Online,count sessions on .projuventute.ch,month
54,Programs - Parents -Online,count leads,month
65,Fundraising,private donations,month
77,Digital,additional monetization/savings from CRM,quarter
81,Programs,additional monetization/savings from programs,quarter


## By circle

In [15]:
# All rows of the 'HR' circle
df.loc[df['circle'] == 'HR']

Unnamed: 0,circle,kpi,periodicity,range,period_year,period_month,value,value_norm
0,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,1,35.0,0.0
1,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,2,50.0,0.272727
2,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,3,50.0,0.272727
3,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,4,55.0,0.363636
4,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,5,70.0,0.636364
5,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,6,80.0,0.818182
6,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,7,85.0,0.909091
7,HR,share of teams constituted as circles,month,0 <= % <= 100,2023,8,90.0,1.0
12,HR,share short tern leave,month,0 <= % <= 100,2023,1,2.04,0.468354
13,HR,share short tern leave,month,0 <= % <= 100,2023,2,2.2,0.535865


In [16]:
# All rows of the 'Programs - Children - Counceling' circle
df.loc[df['circle'] == 'Programs - Children - Counceling']

Unnamed: 0,circle,kpi,periodicity,range,period_year,period_month,value,value_norm
36,Programs - Children - Counceling,reachability,quarter,0 <= % <= 100,2023,3,46.0,1.0
37,Programs - Children - Counceling,reachability,quarter,0 <= % <= 100,2023,6,44.9,0.541667
38,Programs - Children - Counceling,reachability,quarter,0 <= % <= 100,2023,9,43.6,0.0


In [17]:
# All rows of the 'Programs - Parents -Online' circle
df.loc[df['circle'] == 'Programs - Parents -Online']

Unnamed: 0,circle,kpi,periodicity,range,period_year,period_month,value,value_norm
40,Programs - Parents -Online,count sessions on .projuventute.ch,month,0 <= X,2023,1,158611.0,0.0
41,Programs - Parents -Online,count sessions on .projuventute.ch,month,0 <= X,2023,2,203755.0,1.0
42,Programs - Parents -Online,count sessions on .projuventute.ch,month,0 <= X,2023,3,190560.0,0.707713
43,Programs - Parents -Online,count sessions on .projuventute.ch,month,0 <= X,2023,4,164663.0,0.13406
44,Programs - Parents -Online,count sessions on .projuventute.ch,month,0 <= X,2023,5,180421.0,0.483121
45,Programs - Parents -Online,count sessions on .projuventute.ch,month,0 <= X,2023,6,174107.0,0.343257
46,Programs - Parents -Online,count sessions on .projuventute.ch,month,0 <= X,2023,7,164059.0,0.12068
47,Programs - Parents -Online,count sessions on .projuventute.ch,month,0 <= X,2023,8,194407.0,0.792929
54,Programs - Parents -Online,count leads,month,0 <= X,2023,3,825.0,1.0
57,Programs - Parents -Online,count leads,month,0 <= X,2023,6,276.0,0.0


In [18]:
# All rows of the 'Fundraising' circle
df.loc[df['circle'] == 'Fundraising']

Unnamed: 0,circle,kpi,periodicity,range,period_year,period_month,value,value_norm
65,Fundraising,private donations,month,0 <= X,2023,1,1369218.0,0.257017
66,Fundraising,private donations,month,0 <= X,2023,2,2144446.0,1.0
67,Fundraising,private donations,month,0 <= X,2023,3,1923875.0,0.788603
68,Fundraising,private donations,month,0 <= X,2023,4,1274911.0,0.166632
69,Fundraising,private donations,month,0 <= X,2023,5,1797114.0,0.667115
70,Fundraising,private donations,month,0 <= X,2023,6,1352463.0,0.240959
71,Fundraising,private donations,month,0 <= X,2023,7,1359887.0,0.248074
72,Fundraising,private donations,month,0 <= X,2023,8,1101047.0,0.0


In [19]:
# All rows of the 'Digital' circle
df.loc[df['circle'] == 'Digital']

Unnamed: 0,circle,kpi,periodicity,range,period_year,period_month,value,value_norm
77,Digital,additional monetization/savings from CRM,quarter,0 <= X,2023,3,51000.0,1.0
78,Digital,additional monetization/savings from CRM,quarter,0 <= X,2023,6,0.0,0.0
79,Digital,additional monetization/savings from CRM,quarter,0 <= X,2023,9,0.0,0.0


In [20]:
# All rows of the 'Programs' circle
df.loc[df['circle'] == 'Programs']

Unnamed: 0,circle,kpi,periodicity,range,period_year,period_month,value,value_norm
81,Programs,additional monetization/savings from programs,quarter,0 <= X,2023,3,186258.0,1.0
82,Programs,additional monetization/savings from programs,quarter,0 <= X,2023,6,0.0,0.0
83,Programs,additional monetization/savings from programs,quarter,0 <= X,2023,9,26000.0,0.139591


In [21]:
# @title
# One separate line chart per circle, with one coloured line per KPI.
# A make_subplots() figure used to be created before the loop, but it was
# overwritten by the first px.line() call without ever being shown, so that
# dead assignment has been removed.
for circle_name in circles:
    circle_df = df[df.circle == circle_name]
    fig = px.line(circle_df, x="period_month", y="value",
                  color='kpi', title='Circle ' + circle_name)

    # same month axis (1-12) and same size for every chart
    fig.update_xaxes(range=[1, 12])
    fig.update_layout(width=800, height=600)
    fig.show()


## By KPI

In [22]:
# Area chart of the normalised values: yearly KPIs are left out and only the
# months before September (period_month < 9) are kept.
not_yearly = df['periodicity'] != 'year'
before_september = df['period_month'] < 9
fig = px.area(
    df[not_yearly & before_september],
    x="period_month",
    y="value_norm",
    color="circle",
    line_group="kpi",
)
fig.show()

In [23]:
# @title
# Create a subplot grid with one cell per KPI.
# The number of rows is derived from the number of KPIs, so no KPI is silently
# dropped and no cell indexes past the end of `kpis`.
cols = 5
rows = max(1, (len(kpis) + cols - 1) // cols)
fig = make_subplots(rows=rows, cols=cols)

# One box plot of the normalised values per KPI, filling the grid row by row
for index, kpi in enumerate(kpis):
    i, j = divmod(index, cols)  # zero-based grid position of this KPI
    fig.add_trace(
        go.Box(y=df[df.kpi == kpi].value_norm, name=kpi),
        row=i + 1, col=j + 1
    )

# Update the layout of the figure: horizontal legend centred below the plots
fig.update_layout(
    title="Distribution of normalized KPI values",
    height=1000,
    width=1200,
    showlegend=True,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.2,
        xanchor="center",
        x=0.5,
        font=dict(size=10),
        bgcolor="rgba(0,0,0,0)"
    )
)

# Show the figure
fig.show()

In [25]:
# Wide table: one row per month (1-12), one column per KPI, raw values as cells.
# A single pivot replaces transposing and concatenating one small frame per
# month, and keeps the values numeric instead of object-typed.

# Column order = order in which the KPIs first show up when walking through
# the months (stable sort keeps the row order of df within a month).
kpi_order = (
    df.sort_values('period_month', kind='mergesort')['kpi']
    .drop_duplicates()
    .tolist()
)

df_transposed = (
    df.pivot(index='period_month', columns='kpi', values='value')
    # months without any value still get an (all-NaN) row
    .reindex(index=range(1, 13), columns=kpi_order)
)
df_transposed.index.name = None  # plain month numbers as the index

df_transposed.head(12)

kpi,share of teams constituted as circles,share short tern leave,involuntary headcount change (FTE),count sessions on .projuventute.ch,private donations,reachability,count leads,additional monetization/savings from CRM,additional monetization/savings from programs,net promoter score
1,35.0,2.04,2.26,158611.0,1369218.0,,,,,
2,50.0,2.2,0.98,203755.0,2144446.0,,,,,
3,50.0,2.54,1.47,190560.0,1923875.0,46.0,825.0,51000.0,186258.0,
4,55.0,2.17,1.46,164663.0,1274911.0,,,,,
5,70.0,3.3,0.0,180421.0,1797114.0,,,,,
6,80.0,2.58,0.0,174107.0,1352463.0,44.9,276.0,0.0,0.0,
7,85.0,2.04,0.51,164059.0,1359887.0,,,,,
8,90.0,0.93,1.01,194407.0,1101047.0,,,,,
9,,,,,,43.6,,0.0,26000.0,
10,,,,,,,,,,


In [26]:
# Create a subplot grid
# NOTE: the grid has rows * cols = 9 cells, so only the first 9 entries of
# `kpis` are drawn; any further KPI is left out of this figure.
rows = 3
cols = 3
fig = make_subplots(rows=rows, cols=cols)

# One line plot (normalised value over month) per KPI, filling the grid row by
# row. Slicing `kpis` keeps the loop inside the grid and avoids an IndexError
# when there are fewer KPIs than cells.
for index, kpi in enumerate(kpis[:rows * cols]):
    i, j = divmod(index, cols)  # zero-based grid position of this KPI
    kpi_df = df[df.kpi == kpi]  # filter once, reuse for both axes

    fig.add_trace(
        go.Scatter(x=kpi_df['period_month'],
                   y=kpi_df['value_norm'],
                   mode='lines', name=kpi),
        row=i + 1, col=j + 1
    )

# Update the layout: horizontal legend centred below the plots
fig.update_layout(
    title="Value lineplots per kpi",
    height=1000,
    width=1200,
    showlegend=True,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.2,
        xanchor="center",
        x=0.5,
        font=dict(size=10),
        bgcolor="rgba(0,0,0,0)"
    )
)

# Set matches='x' on every x axis of the grid
for i in range(1, rows*cols+1):
    fig['layout'][f'xaxis{i}'].update(matches='x')


# Show the plot
fig.show()