In [1]:
# Uncomment to install this notebook's extra dependencies into the active kernel's environment:
# %pip install altair scipy

In [2]:
import altair as alt
import pandas as pd

# --- Data -------------------------------------------------------------------
# Per-repository score statistics for gpt-3.5-turbo.
df_repo__stat = pd.read_csv('score_stat_by_repo_3.5-turbo.csv')

# Ground truth is reshaped to long form: every column other than `id` and
# `title` is treated as a repository and becomes one (repo, ground_truth) row.
gt = pd.read_csv('ground_truth.csv').melt(
    id_vars=['id', 'title'],
    var_name='repo',
    value_name='ground_truth',
)

# Default (inner) merge: only rows with a matching ground-truth entry remain.
df_repo__stat_with_gt = pd.merge(df_repo__stat, gt, on=['id', 'title', 'repo'])

# --- Chart ------------------------------------------------------------------
# Shared base restricted to three repositories; `min`/`max` are mean -/+ one
# standard deviation, clipped to the [0, 1] score range.
base = alt.Chart(
    df_repo__stat_with_gt.query('repo in ["lightfm", "qlib", "DeepSpeech"]')
).transform_calculate(
    min="max(0, datum.mean-datum.std)",
    max="min(1, datum.mean+datum.std)",
)

# Mean score per checklist item (small black dots). The label expression keeps
# x-axis labels only at multiples of 0.5.
points = base.mark_point(color='black', size=50, filled=True).encode(
    x=alt.X(
        'mean:Q',
        scale=alt.Scale(domainMin=0, domainMax=1),
        title="Score",
        axis=alt.Axis(labelExpr="datum.value % 0.5 ? null : datum.label"),
    ),
    y=alt.Y(
        'id_title:N',
        title=None,
        axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False),
    ),
)

# Ground-truth score per checklist item (large green diamonds).
gt_points = base.mark_point(
    shape="diamond",
    color='green',
    size=200,
    filled=True,
).encode(
    x='ground_truth:Q',
    y='id_title:N',
)

# Horizontal bars spanning the clipped mean -/+ 1 SD interval.
errorbars = base.mark_errorbar().encode(
    x=alt.X("min:Q", title='1 SD'),
    x2="max:Q",
    y=alt.Y("id_title:N"),
)

# Layer the three marks and draw one column per repository.
(gt_points + points + errorbars).facet(
    column=alt.Column('repo:N', title=None)
).configure_axis(
    labelFontSize=12,
    titleFontSize=12,
)

In [3]:
# Per-run scores for gpt-3.5-turbo, joined (inner merge) with the long-form
# ground truth `gt` built in an earlier cell.
df_repo_run = pd.read_csv('score_by_repo_run_3.5-turbo.csv').merge(
    gt,
    on=['id', 'title', 'repo'],
)

# Count how many runs produced each score value for every
# (repository, checklist item, ground truth) combination; combinations with
# no runs at a given score are shown as 0.
contingency_table = df_repo_run.pivot_table(
    values='run',
    index=['repo', 'id_title', 'ground_truth'],
    columns=['score'],
    aggfunc='count',
    fill_value=0,
).rename_axis(index=['Repository', 'Checklist Item', 'Ground Truth'])

# Display sorted by repository, then by ground-truth value.
contingency_table.sort_index(level=[0, 2])

Unnamed: 0_level_0,Unnamed: 1_level_0,score,0.0,0.5,1.0
Repository,Checklist Item,Ground Truth,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
lightfm,3.5. Check for Duplicate Records in Data,0.0,30,0,0
lightfm,5.3. Ensure Model Output Shape Aligns with Expectation,0.5,1,29,0
lightfm,2.1. Ensure Data File Loads as Expected,1.0,0,0,30
lightfm,3.2. Data in the Expected Format,1.0,0,30,0
lightfm,4.2. Verify Data Split Proportion,1.0,0,11,19
lightfm,6.1. Verify Evaluation Metrics Implementation,1.0,0,5,25
lightfm,6.2. Evaluate Model's Performance Against Thresholds,1.0,0,1,29
qlib,3.5. Check for Duplicate Records in Data,0.0,23,7,0
qlib,2.1. Ensure Data File Loads as Expected,0.5,0,0,30
qlib,4.2. Verify Data Split Proportion,0.5,3,25,2


In [4]:
# Long-form table of score standard deviations with columns
# (repo, id_title, value). The pivot -> melt round trip yields one row for
# every repo x checklist-item pair, with NaN where a pair has no statistic.
stds = (
    df_repo__stat[['repo', 'std', 'id_title']]
    .pivot(index='repo', columns='id_title')
    .droplevel(0, axis=1)  # drop the outer 'std' level, keep the item labels
    .reset_index()
    .melt(id_vars='repo', var_name='id_title')
)

base = alt.Chart(stds)

# One grey, semi-transparent box per checklist item.
box = base.mark_boxplot(size=20, opacity=0.5, color='grey').encode(
    x=alt.X('value:Q', title='Standard Deviation of Scores'),
    y=alt.Y(
        'id_title:N',
        title=None,
        axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False),
    ),
)

# One circle per repository, jittered vertically inside each item's band so
# overlapping values stay distinguishable. The jitter is drawn with Vega's
# unseeded random(), so exact vertical positions differ between renders.
stripplot = base.transform_calculate(
    # Generate Gaussian jitter with a Box-Muller transform
    jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
).mark_circle(size=100).encode(
    x='value:Q',
    y=alt.Y(
        'id_title:N',
        axis=alt.Axis(ticks=False, grid=True, labels=True),
        scale=alt.Scale(),
    ),
    yOffset="jitter:Q",
    color=alt.Color('id_title:N', legend=None),
    tooltip='repo',
)

# Overlay the strip plot on the boxes and apply presentation settings.
(
    box + stripplot
).configure_view(
    stroke=None
).configure_axis(
    labelFontSize=12,
    titleFontSize=12,
).properties(
    height=300,
    width=600,
    title="30 Runs on Openja's Repositories for each Checklist Item",
)

In [5]:
# --- Data -------------------------------------------------------------------
# gpt-4o score statistics, joined with the ground truth and tagged by model.
df_repo_4o__stat = pd.read_csv('score_stat_by_repo_4o.csv')
df_repo_4o__stat_with_gt = df_repo_4o__stat.merge(
    gt, on=['id', 'title', 'repo']
).assign(model='gpt-4o')

# gpt-3.5-turbo statistics (from the earlier cell), lightfm rows only.
# NOTE(review): the gpt-4o frame is not filtered by repo here; presumably its
# CSV only contains lightfm - confirm.
df_repo_35turbo__stat_with_gt = df_repo__stat_with_gt.query(
    "repo == 'lightfm'"
).assign(model='gpt-3.5-turbo')

# Stack both models' rows into one frame for faceting.
df_model_comp = pd.concat(
    [df_repo_35turbo__stat_with_gt, df_repo_4o__stat_with_gt]
)

# --- Chart ------------------------------------------------------------------
# `min`/`max` are mean -/+ one standard deviation, clipped to [0, 1].
base = alt.Chart(df_model_comp).transform_calculate(
    min="max(0, datum.mean-datum.std)",
    max="min(1, datum.mean+datum.std)",
)

# Mean score per checklist item (small black dots); x-axis labels are kept
# only at multiples of 0.5.
points = base.mark_point(color='black', size=50, filled=True).encode(
    x=alt.X(
        'mean:Q',
        scale=alt.Scale(domainMin=0, domainMax=1),
        title="Score",
        axis=alt.Axis(labelExpr="datum.value % 0.5 ? null : datum.label"),
    ),
    y=alt.Y(
        'id_title:N',
        title=None,
        axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False),
    ),
)

# Ground-truth score per checklist item (large green diamonds).
gt_points = base.mark_point(
    shape="diamond",
    color='green',
    size=200,
    filled=True,
).encode(
    x='ground_truth:Q',
    y='id_title:N',
)

# Horizontal bars spanning the clipped mean -/+ 1 SD interval.
errorbars = base.mark_errorbar().encode(
    x=alt.X("min:Q", title='1 SD'),
    x2="max:Q",
    y=alt.Y("id_title:N"),
)

# Layer the three marks and draw one column per model.
(
    gt_points + points + errorbars
).facet(
    column=alt.Column('model:N', title="30 Runs on ALL Checklist Items")
).configure_axis(
    labelFontSize=12,
    titleFontSize=12,
)


In [6]:
# --- Data -------------------------------------------------------------------
# gpt-4-turbo score statistics, joined with the ground truth and tagged by
# model. These frames get their own `4turbo` names: the statistics loaded here
# are for gpt-4-turbo, and reusing the `df_repo_4o__stat*` names would
# silently overwrite the gpt-4o frames built earlier in the notebook.
df_repo_4turbo__stat = pd.read_csv('score_stat_by_repo_4-turbo.csv')
df_repo_4turbo__stat_with_gt = df_repo_4turbo__stat.merge(
    gt, on=['id', 'title', 'repo']
).assign(model='gpt-4-turbo')

# gpt-3.5-turbo statistics (from the earlier cell), lightfm rows only.
# NOTE(review): the gpt-4-turbo frame is not filtered by repo here; presumably
# its CSV only contains lightfm - confirm.
df_repo_35turbo__stat_with_gt = df_repo__stat_with_gt.query(
    "repo == 'lightfm'"
).assign(model='gpt-3.5-turbo')

# Stack both models' rows into one frame for faceting.
df_model_comp = pd.concat(
    (df_repo_35turbo__stat_with_gt, df_repo_4turbo__stat_with_gt),
    axis=0
)

# --- Chart ------------------------------------------------------------------
# `min`/`max` are mean -/+ one standard deviation, clipped to [0, 1].
base = alt.Chart(
    df_model_comp
).transform_calculate(
    min="max(0, datum.mean-datum.std)",
    max="min(1, datum.mean+datum.std)"
)

# Mean score per checklist item (small black dots); x-axis labels are kept
# only at multiples of 0.5.
points = base.mark_point(
    filled=True,
    size=50,
    color='black'
).encode(
    x=alt.X(
        'mean:Q',
        scale=alt.Scale(domainMin=0, domainMax=1),
        title="Score",
        axis=alt.Axis(labelExpr="datum.value % 0.5 ? null : datum.label"),
    ),
    y=alt.Y(
        'id_title:N',
        title=None,
        axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False),
    ),
)

# Ground-truth score per checklist item (large green diamonds).
gt_points = base.mark_point(
    filled=True,
    size=200,
    color='green',
    shape="diamond"
).encode(
    x=alt.X('ground_truth:Q'),
    y=alt.Y('id_title:N')
)

# Horizontal bars spanning the clipped mean -/+ 1 SD interval.
errorbars = base.mark_errorbar().encode(
    x=alt.X("min:Q", title='1 SD'),
    x2="max:Q",
    y="id_title:N"
)

# Layer the three marks and draw one column per model.
(
    gt_points + points + errorbars
).facet(
    column=alt.Column('model:N', title="30 Runs on ALL Checklist Items")
).configure_axis(
    labelFontSize=12,
    titleFontSize=12
)
