In [None]:
import logging
import math
import os
import sys
from pathlib import Path

import tomli
import numpy as np
import structlog

%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
# Default styling for seaborn/matplotlib figures created in this notebook.
sns.set_context('poster')
# NOTE(review): sns.set() is an alias of sns.set_theme(), which also applies a plotting
# context (default 'notebook'); it presumably overrides the 'poster' context chosen on the
# previous line — confirm which context is actually intended.
sns.set(rc={'figure.figsize': (16, 9.)})
sns.set_style('whitegrid')

import pandas as pd
# Show up to 120 rows and 120 columns when a DataFrame is displayed.
pd.set_option('display.max_rows', 120)
pd.set_option('display.max_columns', 120)

In [None]:
# Set the logging level: records at INFO and above are written to stdout,
# so they appear in the cell output.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

In [None]:
import pytanis
from pytanis import GSheetsClient, PretalxClient
from pytanis.pretalx import subs_as_df, reviews_as_df, speakers_as_df

In [None]:
# Be aware that this notebook might only run with the pytanis version shown below.
# Displaying it records the version used alongside the notebook's outputs.
pytanis.__version__

In [None]:
# Load event-specific settings from config.toml so they are not hard-coded in this notebook.
# The file is opened in binary mode ('rb') and parsed by tomli into the `cfg` dict.
with open('config.toml', 'rb') as fh:
    cfg = tomli.load(fh)

In [None]:
# Fetch submissions, speakers and reviews of the configured event from Pretalx.
pretalx_client = PretalxClient(blocking=True)
event_name = cfg['event_name']

# Each call returns a (count, items) pair; question answers are requested for
# submissions and speakers.
subs_count, subs = pretalx_client.submissions(event_name, params={'questions': 'all'})
spkrs_count, spkrs = pretalx_client.speakers(event_name, params={'questions': 'all'})
revs_count, revs = pretalx_client.reviews(event_name)

# Materialise the results as lists (same order as before: submissions, reviews, speakers).
subs = list(subs)
revs = list(revs)
spkrs = list(spkrs)

In [None]:
# Convert the fetched Pretalx objects into DataFrames via the pytanis helpers.
# with_questions=True is passed for submissions and speakers — presumably this adds the
# question answers as 'Q: ...' columns (used further below); confirm against the pytanis docs.
subs_df = subs_as_df(subs, with_questions=True)
revs_df = reviews_as_df(revs)
spkrs_df = speakers_as_df(spkrs, with_questions=True)

In [None]:
# Preview the first two rows of the submissions frame.
subs_df.head(2)

In [None]:
# Keep only the submissions whose state is 'submitted'.
is_submitted = subs_df['State'] == 'submitted'
talks_df = subs_df.loc[is_submitted]

In [None]:
# Top-level track families; every entry of `all_tracks` starts with one of them.
main_tracks = ['PyData', 'PyCon', 'General']

# Full track names in the form '<main track>: <topic>'.
all_tracks = [
    'PyCon: MLOps & DevOps',
    'PyCon: Programming & Software Engineering',
    'PyCon: Python Language & Ecosystem',
    'PyCon: Security',
    'PyCon: Testing',
    'PyCon: Django & Web',
    'PyData: Data Handling & Data Engineering',
    'PyData: Machine Learning & Deep Learning & Statistics',
    'PyData: Natural Language Processing & Audio (incl. Generative AI NLP)',
    'PyData: Computer Vision (incl. Generative AI CV)',
    'PyData: Generative AI',
    'PyData: Embedded Systems & Robotics',
    'PyData: PyData & Scientific Libraries Stack',
    'PyData: Visualisation & Jupyter',
    'PyData: Research Software Engineering',
    'General: Community & Diversity',
    'General: Education, Career & Life',
    'General: Ethics & Privacy',
    'General: Infrastructure - Hardware & Cloud',
    'General: Others',
]

# all available submission types
submission_types = talks_df['Submission type'].unique()

# all expertise categories (the two audience-expertise questions)
expertise_categories = ['Q: Expected audience expertise: Python', 'Q: Expected audience expertise: Domain']

# all available expertise levels across both questions, de-duplicated.
# Series.unique() keeps the order of first appearance, so the result is stable between
# runs (the previous list(set(...)) produced an arbitrary order).
expertise_levels = list(
    pd.concat([
        talks_df['Q: Expected audience expertise: Domain'],
        talks_df['Q: Expected audience expertise: Python'],
    ]).unique()
)

# create a dataframe with one row per entry of 'all_tracks'
tracks_df = pd.DataFrame(all_tracks, columns=['Track'])

### All independent of submission type

In [None]:
# Group submitted talks by track and expected audience expertise and count the submissions.

# Expertise levels in display order. The count tables are reindexed against this list so
# that all four columns exist even when a level was never selected in any submission
# (selecting a missing column would otherwise raise a KeyError).
expertise_order = ['None', 'Novice', 'Intermediate', 'Advanced']


def _count_talks_by_expertise(question_column):
    """Count submitted talks per track and expertise level for one expertise question.

    Args:
        question_column: Name of the column in `talks_df` holding the expertise answer.

    Returns:
        A DataFrame with one row per entry of `tracks_df` and the columns 'Track', one
        column per entry of `expertise_order`, and 'Main Track'. Tracks without any
        submission have NaN counts (they come from the left join on `tracks_df`).
    """
    counts = (
        talks_df.groupby(['Track', question_column])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=expertise_order, fill_value=0)
    )
    per_track = tracks_df.join(counts, on='Track')
    # The main track is the part before the first ':'; a name without ':' is kept as is.
    per_track['Main Track'] = per_track['Track'].str.split(':').str[0]
    return per_track


# Domain expertise: counts plus totals and shares.
talks_quantification_by_domain_expertise = _count_talks_by_expertise('Q: Expected audience expertise: Domain')
talks_quantification_by_domain_expertise['Total'] = talks_quantification_by_domain_expertise[expertise_order].sum(axis=1)
talks_quantification_by_domain_expertise['Total %'] = (
    talks_quantification_by_domain_expertise['Total'] / talks_quantification_by_domain_expertise['Total'].sum() * 100
).round(2)
talks_quantification_by_domain_expertise['Total % per Main Track'] = (
    talks_quantification_by_domain_expertise
    .groupby('Main Track')['Total']
    .transform(lambda x: (x / x.sum() * 100).round(2))
)

# reorder columns
talks_quantification_by_domain_expertise = talks_quantification_by_domain_expertise[
    ['Main Track', 'Track', 'Total', 'Total %', 'Total % per Main Track'] + expertise_order
]

# Python expertise: counts only.
talks_quantification_by_python_expertise = _count_talks_by_expertise('Q: Expected audience expertise: Python')
talks_quantification_by_python_expertise = talks_quantification_by_python_expertise[
    ['Main Track', 'Track'] + expertise_order
]

# Join both tables. The expertise columns exist in both frames, so the merge tags them
# with a suffix that is turned into a column group name below.
domain_suffix, python_suffix = '_x', '_y'
talks_quantification = pd.merge(
    talks_quantification_by_domain_expertise,
    talks_quantification_by_python_expertise,
    on=['Main Track', 'Track'],
    how='outer',
    suffixes=(domain_suffix, python_suffix),
)


def _grouped_column_label(col):
    """Map a merged column name to its (group, name) pair for the column MultiIndex."""
    # Slice the exact suffix off; str.rstrip("_xy") would strip a character set and could
    # eat trailing 'x', 'y' or '_' characters that belong to the level name itself.
    if col.endswith(domain_suffix):
        return ('Expected Domain Expertise by Audience', col[:-len(domain_suffix)])
    if col.endswith(python_suffix):
        return ('Expected Python Expertise by Audience', col[:-len(python_suffix)])
    # Track and total columns are not part of an expertise group.
    return ('', col)


talks_quantification.columns = pd.MultiIndex.from_tuples(
    [_grouped_column_label(col) for col in talks_quantification.columns]
)

# fill NaN values (tracks without submissions) with 0
talks_quantification = talks_quantification.fillna(0)

talks_quantification

In [None]:
# Condense the overall table for plotting: each expertise group becomes a single column
# whose cells hold the list of per-level counts of that row.
expertise_groups = ['Expected Domain Expertise by Audience', 'Expected Python Expertise by Audience']

talks_quantification_condensed = talks_quantification.copy()
for group in expertise_groups:
    per_row_counts = talks_quantification_condensed[group].to_numpy().tolist()
    talks_quantification_condensed[('', group)] = per_row_counts

# Remove the original per-level columns, then flatten the column index to one level.
talks_quantification_condensed = talks_quantification_condensed.drop(columns=expertise_groups, level=0)
talks_quantification_condensed.columns = talks_quantification_condensed.columns.droplevel(0)

# helper functions for plotting
def cell_histogram_with_labels(values, global_max_value=None, labels=('None', 'Novice', 'Intermediate', 'Advanced')):
    """Render a list of counts as a small horizontal bar chart (inline SVG).

    One bar is drawn per value, stacked top to bottom, each annotated with
    "<label> (<int value>)".

    Args:
        values: Sequence of numeric counts, one per label, in label order.
        global_max_value: Value that corresponds to a full-width bar. When None, the
            maximum of `values` is used, i.e. bars are scaled per cell.
        labels: Labels for the bars; must provide at least one label per value.

    Returns:
        The SVG markup as a string. For empty `values` an SVG without bars is returned.

    Raises:
        ValueError: If there are more values than labels.
    """
    values = list(values)
    if len(values) > len(labels):
        raise ValueError(f'Got {len(values)} values but only {len(labels)} labels')

    bars = ""
    if values:
        # Maximum value used for scaling the bar widths
        max_value = max(values) if global_max_value is None else global_max_value
        # All bars share the same height (in percent of the SVG height)
        bar_height = 100 / len(values)
        for i, value in enumerate(values):
            label = labels[i]
            bar_width = (value / max_value) * 100 if max_value > 0 else 0  # Width
            y_position = i * bar_height  # Y-Position of each bar
            # Rectangle (bar)
            bars += f'<rect x="0" y="{y_position}%" width="{bar_width}%" height="{bar_height}%" style="fill:#d65f5f50;" />'
            # Text (label), roughly centred vertically within its bar
            bars += f'<text x="2" y="{y_position + bar_height / 1.8}%" dominant-baseline="middle" font-size="10" fill="black">{label} ({int(value)})</text>'

    svg = f"""
    <svg width="100" height="50" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 50">
        {bars}
    </svg>
    """
    return svg

def single_value_histogram(value, max_value):
    """Render one percentage value as an inline SVG bar with a text label.

    The bar width is `value` relative to `max_value` (in percent); a non-positive
    `max_value` yields a zero-width bar. The label shows `value` rounded to two decimals
    followed by a percent sign.
    """
    # Width of the bar as a percentage of the SVG width
    if max_value > 0:
        width_pct = (value / max_value) * 100
    else:
        width_pct = 0

    rounded_value = round(value, 2)

    # Assemble the SVG markup
    return f"""
    <svg width="100" height="50" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 20">
        <!-- Rectangle (Bar) -->
        <rect x="0" y="-20" width="{width_pct}%" height="80" style="fill:#d65f5f50;" />
        <!-- Text (Label) -->
        <text x="5" y="15" font-size="14" fill="black">{rounded_value}%</text>
    </svg>
    """

# Generate output
# The total is computed outside the f-string: reusing the enclosing quote character inside
# an f-string expression is only valid syntax from Python 3.12 on.
total_talks = int(talks_quantification_condensed['Total'].sum())
title = f'All {total_talks} submitted talks, long talks and tutorials (excluding pending submissions) <br> ****'

# Scaling maxima for the bar charts, computed once instead of once per rendered cell.
max_total_pct = talks_quantification_condensed['Total %'].max()
max_total_pct_per_main_track = talks_quantification_condensed['Total % per Main Track'].max()
# The expertise columns hold one list of counts per row; flatten them to find the overall maximum.
max_domain_count = np.concatenate(
    talks_quantification_condensed['Expected Domain Expertise by Audience'].to_numpy()
).max()
max_python_count = np.concatenate(
    talks_quantification_condensed['Expected Python Expertise by Audience'].to_numpy()
).max()

talks_quantification_condensed_styled = (
    talks_quantification_condensed.style
    .set_caption(title)
    .set_table_styles([
        {'selector': 'caption', 'props': [('font-family', 'Arial'), ('font-size', '20px'), ('font-weight', 'bold')]},
        {'selector': 'th', 'props': [('font-family', 'Arial'), ('max-width', '160px')]},
    ])
    .set_properties(**{'font-family': 'Arial'})
    .format({
        # Plain integer for the total count
        'Total': '{:.0f}',
        # Percentages are rendered as a single SVG bar scaled to the column maximum
        'Total %': lambda value: single_value_histogram(value, max_total_pct),
        'Total % per Main Track': lambda value: single_value_histogram(value, max_total_pct_per_main_track),
        # Expertise counts are rendered as one labelled SVG bar per level
        'Expected Domain Expertise by Audience': lambda values: cell_histogram_with_labels(values, max_domain_count),
        'Expected Python Expertise by Audience': lambda values: cell_histogram_with_labels(values, max_python_count),
    })
)

# Export the styled table as a standalone HTML file.
# NOTE(review): `index` and `escape` are not documented parameters of Styler.to_html; they
# are presumably forwarded to the template and ignored. If the row index should be hidden,
# Styler.hide(axis='index') looks like the intended call — confirm.
talks_quantification_condensed_styled.to_html('talks_quantification.html', index=False, escape=False)

talks_quantification_condensed_styled

### Stats for Talks

### Stats for Tutorials

### Stats for Talks (long)

# Future Todos
- Compare against historical events
- Split by submission type
- Make independent of submission type