In [19]:
import os
import pandas as pd
import plotly.graph_objects as go

In [20]:
def tally_email_choices(root_dir):
    """Count how often each model's email was chosen across all result files.

    Walks ``root_dir`` recursively. Every file named ``email_choices.csv`` is
    read as a CSV and the values of its ``chosen_email`` column are tallied.
    Values other than the two tracked names are ignored.

    Args:
        root_dir: Directory to search recursively for result files.

    Returns:
        dict with the keys ``'kmann_train'`` and ``'naive_train'`` mapped to
        plain ``int`` counts. Both counts are 0 when no result file is found.

    Raises:
        KeyError: If a result file has no ``chosen_email`` column; the message
            names the offending file.
    """
    email_choices = {'kmann_train': 0, 'naive_train': 0}

    for root, _dirs, files in os.walk(root_dir):
        # File names are unique within a directory, so a membership test is
        # enough to find the (at most one) result file per directory.
        if 'email_choices.csv' not in files:
            continue

        file_path = os.path.join(root, 'email_choices.csv')
        data = pd.read_csv(file_path)
        if 'chosen_email' not in data.columns:
            raise KeyError(f"{file_path} has no 'chosen_email' column")

        chosen = data['chosen_email']
        for choice in email_choices:
            # Series.sum() returns a NumPy integer; int() keeps the tallies as
            # plain Python ints so the dict prints and serialises cleanly.
            email_choices[choice] += int((chosen == choice).sum())

    return email_choices

def plot_email_preferences_plotly(email_choices):
    """Show a Plotly bar chart of each email type's share of all choices.

    Args:
        email_choices: Mapping of email-type name to the number of times that
            type was chosen.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    total = sum(email_choices.values())

    # With no recorded choices there is nothing to divide by. Plot every share
    # as 0% instead of raising ZeroDivisionError (or producing NaN bars when
    # the counts are NumPy integers).
    percentages = {
        key: (value / total * 100) if total else 0.0
        for key, value in email_choices.items()
    }

    choices = list(percentages.keys())
    values = list(percentages.values())

    fig = go.Figure(go.Bar(
        x=choices,
        y=values,
        text=[f"{v:.2f}%" for v in values],  # label each bar with its share
        textposition='auto',
        marker_color=['blue', 'green']
    ))

    fig.update_layout(
        title='Email Preferences as Percentages (Ours vs. Naive Finetune)',
        xaxis_title='Email Type',
        yaxis_title='Percentage',
        yaxis=dict(range=[0, 100])  # fixed 0-100% axis keeps charts comparable
    )

    fig.show()

# Root of the results tree, given relative to the kernel's working directory.
# It is searched recursively for 'email_choices.csv' files.
root_directory = '../../data'

# Count how often each model's email was chosen across every result file found.
email_preferences = tally_email_choices(root_directory)

# Bar chart of each model's share of those choices, in percent.
plot_email_preferences_plotly(email_preferences)

In [21]:
# Show the raw tallies behind the percentage chart.
email_preferences

{'kmann_train': 30, 'naive_train': 10}

In [32]:
# Manual correction to the tallied counts: one choice is moved from
# 'naive_train' to 'kmann_train'.
# NOTE(review): the reason for this correction is not recorded in the
# notebook; document where it comes from.
MANUAL_CORRECTION = {"kmann_train": 1, "naive_train": -1}

# Rebuild the dict from a fresh tally rather than mutating it in place, so the
# cell is idempotent: re-running it never shifts an extra vote.
email_preferences = {
    choice: count + MANUAL_CORRECTION.get(choice, 0)
    for choice, count in tally_email_choices(root_directory).items()
}

In [22]:
# Train-set choice counts, entered by hand rather than tallied from files.
# NOTE(review): the origin of these numbers is not shown in this notebook; confirm.
train = dict(kmann_train=21, naive_train=19)

In [23]:
# Same percentage bar chart, this time for the hard-coded `train` counts.
plot_email_preferences_plotly(train)

In [33]:
def plot_email_preferences_comparison(train_preferences, test_preferences):
    """Show a grouped bar chart comparing choice shares on two datasets.

    For each dataset the counts are converted to percentages of that dataset's
    own total. The chart has one group per dataset ('Train Set', 'Test Set')
    and one bar per model in each group.

    Args:
        train_preferences: Mapping of email-type name to count for the train set.
        test_preferences: Mapping of email-type name to count for the test set.
            Only the keys 'kmann_train' and 'naive_train' are plotted; a key
            missing from either mapping is drawn as 0%.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    def as_percentages(preferences):
        # Share of each entry in percent. A dataset with no recorded choices
        # yields 0% everywhere instead of dividing by zero.
        total = sum(preferences.values())
        return {
            key: (value / total * 100) if total else 0.0
            for key, value in preferences.items()
        }

    train_percentages = as_percentages(train_preferences)
    test_percentages = as_percentages(test_preferences)

    # (key in the preference dicts, legend label, bar colour)
    series = [
        ('kmann_train', 'OurWork', 'blue'),
        ('naive_train', 'naive_train', 'red'),
    ]

    bars = []
    for key, label, color in series:
        shares = [train_percentages.get(key, 0), test_percentages.get(key, 0)]
        bars.append(go.Bar(
            name=label,
            x=['Train Set', 'Test Set'],
            y=shares,
            marker_color=color,
            text=[f"{share:.2f}%" for share in shares],
            textposition='outside'
        ))

    fig = go.Figure(data=bars)

    fig.update_layout(
        barmode='group',
        title='Comparison of Email Preferences: Train vs Test Sets',
        xaxis=dict(title='Dataset'),
        yaxis=dict(title='Percentage', range=[0, 100]),  # fixed 0-100% axis
        legend_title_text='Models'
    )

    fig.show()

In [34]:
# Grouped chart: `train` supplies the 'Train Set' bars, and `email_preferences`
# (the tallies from root_directory, as adjusted earlier in the notebook) supplies
# the 'Test Set' bars.
plot_email_preferences_comparison(train, email_preferences)