In [5]:
!pip install scipy altair

Collecting altair
  Using cached altair-5.3.0-py3-none-any.whl.metadata (9.2 kB)
Collecting toolz (from altair)
  Using cached toolz-0.12.1-py3-none-any.whl.metadata (5.1 kB)
Using cached altair-5.3.0-py3-none-any.whl (857 kB)
Using cached toolz-0.12.1-py3-none-any.whl (56 kB)
Installing collected packages: toolz, altair
Successfully installed altair-5.3.0 toolz-0.12.1


In [1]:
import scipy
import pickle
import json
import yaml
import pandas as pd
import altair as alt
from collections import Counter

In [2]:
def get_report(response):
    """Flatten an attribute-style response object into one row per checklist result.

    Every entry of ``response.call_results`` contributes either its parsed
    result items (each tagged in place with the first evaluated file and the
    call's success flag) or, when nothing was parsed, one zero-score
    placeholder row.

    Returns
    -------
    pandas.DataFrame
        One row per result item / placeholder.
    """
    rows = []
    for call in response.call_results:
        if not call.parsed_response:
            # Nothing parsed for this call: record a zero-score stand-in row.
            rows.append({
                'ID': '2.1', # FIXME
                'Title': '',
                'Requirement': '',
                'Observation': '',
                'Functions': [],
                'Evaluation': '',
                'Score': 0,
                'file': call.files_evaluated[0],
                'success': call.success
            })
            continue

        for entry in call.parsed_response['results']:
            # The parsed items themselves are annotated (no copy is made).
            entry['file'] = call.files_evaluated[0]
            entry['success'] = call.success
            rows.append(entry)
    return pd.DataFrame(rows)

def get_report_json(response):
    """Flatten a dict-style (JSON-decoded) response into one row per checklist result.

    Dict counterpart of ``get_report``: every entry of
    ``response['call_results']`` contributes either its parsed result items
    (each tagged in place with the first evaluated file and the call's success
    flag) or, when nothing was parsed, one zero-score placeholder row.

    Parameters
    ----------
    response : dict
        Must contain ``'call_results'``, a list of dicts with the keys
        ``'parsed_response'``, ``'files_evaluated'`` and ``'success'``.

    Returns
    -------
    pandas.DataFrame
        One row per result item / placeholder.
    """
    report = []
    for result in response['call_results']:
        if result['parsed_response']:
            resp = result['parsed_response']['results']
            for item in resp:
                item['file'] = result['files_evaluated'][0]
                item['success'] = result['success']
                report.append(item)
        else:
            report.append({
                'ID': '2.1', # FIXME
                'Title': '',
                'Requirement': '',
                'Observation': '',
                'Functions': [],
                'Evaluation': '',
                'Score': 0,
                # `result` is a dict here, so it must be indexed by key;
                # attribute access raised AttributeError on this branch.
                'file': result['files_evaluated'][0],
                'success': result['success']
            })
    return pd.DataFrame(report)

def extract_file_and_scores(resp_path, verbose=False):
    """Load one saved response and return its scores as one row per evaluated file.

    The file is first read as a pickle; if that fails it is re-read from the
    start as JSON. Dict responses are flattened with ``get_report_json``,
    anything else with ``get_report``.

    Parameters
    ----------
    resp_path : str or path-like
        Location of the saved response (pickle or JSON).
    verbose : bool, default False
        Print ``resp_path`` before loading.

    Returns
    -------
    pandas.DataFrame
        Columns: ``file``, one column per checklist ``ID`` holding ``Score``,
        ``success`` (True only if every row for the file succeeded) and
        ``response_path``.

    Raises
    ------
    Exception
        Whatever ``json.load`` raises when the file is neither a pickle nor
        JSON; errors raised while building the report are no longer masked by
        the JSON fallback.
    """
    if verbose:
        print(resp_path)
    with open(resp_path, 'rb') as file:
        try:
            # SECURITY: pickle.load can execute arbitrary code -- only point
            # this at response files you produced yourself.
            response = pickle.load(file)
        except Exception:
            # Not a pickle. The failed unpickler has already consumed bytes,
            # so rewind before handing the stream to the JSON parser.
            file.seek(0)
            response = json.load(file)

    # Choose the flattener from the loaded type instead of relying on an
    # exception raised halfway through get_report.
    if isinstance(response, dict):
        report = get_report_json(response)
    else:
        report = get_report(response)

    df = (
        report
        .pivot(index='file', columns='ID', values='Score')
        .rename_axis(None, axis=1)
    )
    # Aligns on the `file` index produced by the pivot above.
    df['success'] = report.groupby(['file'])['success'].all()
    df['response_path'] = resp_path
    return df.reset_index()

In [3]:
# Checklist item IDs whose score columns read_and_preprocess aggregates;
# the same IDs are the keys of id_item_map.
checklist_ids = ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2']

def read_and_preprocess(result_path):
    """Load batch-run records and derive per-file, per-run and per-repo score tables.

    Parameters
    ----------
    result_path : str or path-like
        YAML file whose content ``pd.DataFrame`` can consume directly. It must
        provide at least the ``response_path``, ``repo`` and ``run`` columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_repo_run_file, df_repo_run, df_repo__stat, df_repo__count)``:

        * ``df_repo_run_file`` -- the config rows left-joined with the
          per-file scores; for repo ``qlib`` only files matching the tests
          path are kept.
        * ``df_repo_run`` -- per (repo, run), the maximum score of every
          checklist item across files.
        * ``df_repo__stat`` -- per (repo, id), the mean / std / count of the
          per-run scores.
        * ``df_repo__count`` -- per repo and score value (column
          ``level_1``), the number of runs with that score for every
          checklist item; missing combinations are filled with 0.
    """
    with open(result_path, 'r') as file:
        config = pd.DataFrame(yaml.safe_load(file))

    # prepare score data by repo, run, file
    file_scores = pd.concat(
        [
            extract_file_and_scores(path) for path in config['response_path'] # FIXME: excluded deepchem
        ],
        axis=0,
    ).reset_index(drop=True)

    raw_df_repo_run_file = config.merge(file_scores, on='response_path', how='left')

    # filter non-test files in qlib
    df_repo_run_file = raw_df_repo_run_file.query('(repo != "qlib") | (file.str.contains("../data/raw/openja/qlib/tests/"))')

    # prepare score data by repo, run
    df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({
        item_id: ['max'] for item_id in checklist_ids
    })
    # Flatten the (item, 'max') column MultiIndex back to plain item IDs.
    df_repo_run.columns = [col[0] for col in df_repo_run.columns]
    df_repo_run = df_repo_run.reset_index()

    # prepare statistics of scores by repo
    df_repo__stat = df_repo_run.groupby(['repo']).agg({
        item_id: ['mean', 'std', 'count'] for item_id in checklist_ids
    })
    # After reset_index the repo column is keyed ('repo', '') in the MultiIndex.
    df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])
    df_repo__stat.columns = ['repo', 'id', 'stat', 'value']
    df_repo__stat = (
        df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')
        .reset_index()
        .rename_axis(None, axis=1)
    )

    # prepare counting of scores by repo
    # Seed with the first checklist item (instead of a hard-coded '2.1'), then
    # outer-merge the remaining items on (repo, score value). `level_1` is the
    # name reset_index gives to the Counter keys, i.e. the score values.
    df_repo__count = (
        df_repo_run.groupby(['repo'])[checklist_ids[0]].apply(Counter).reset_index()
    )
    for item_id in checklist_ids[1:]:
        df_repo__count = df_repo__count.merge(
            df_repo_run.groupby(['repo'])[item_id].apply(Counter).reset_index(),
            on=['repo', 'level_1'],
            how='outer'
        )

    df_repo__count = df_repo__count.fillna(0)

    return (df_repo_run_file, df_repo_run, df_repo__stat, df_repo__count)

In [4]:
# Ground truth: one manually assigned score per (repo, checklist item).
# Each tuple lists a repo's scores in the order of the item IDs zipped below.
ground_truth = pd.DataFrame([
    {'repo': repo, 'id': item_id, 'score': score}
    for repo, scores in (
        ('lightfm', (1, 1, 0, 1, 0.5, 1, 1)),
        ('qlib', (0.5, 1, 0, 0.5, 1, 1, 1)),
        ('DeepSpeech', (0, 0, 0, 0, 0, 0, 0)),
    )
    for item_id, score in zip(('2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2'), scores)
])

In [5]:
# Checklist item ID -> human-readable title, used to label tables and charts.
id_item_map = dict([
    ('2.1', 'Ensure Data File Loads as Expected'),
    ('3.2', 'Data in the Expected Format'),
    ('3.5', 'Check for Duplicate Records in Data'),
    ('4.2', 'Verify Data Split Proportion'),
    ('5.3', 'Ensure Model Output Shape Aligns with Expectation'),
    ('6.1', 'Verify Evaluation Metrics Implementation'),
    ('6.2', "Evaluate Model's Performance Against Thresholds"),
])

In [6]:
# Build the per-file, per-run, per-repo statistics and per-repo count tables
# from the records listed in data/processed/batch_run.
df_repo_run_file, df_repo_run, df_repo__stat, df_repo__count = read_and_preprocess(
    '../data/processed/batch_run/record_combine.yml'
)

### Accuracy: Contingency table

In [7]:
# Contingency table for lightfm: for checklist items 3.5, 4.2 and 5.3, count
# how many runs produced each system score, alongside the ground-truth score.
cont_table = (
    df_repo_run.query('(repo == "lightfm")')[['repo', 'run', '3.5', '4.2', '5.3']]
    # long format: one row per (repo, run, checklist item)
    .melt(id_vars=['repo', 'run'], var_name='id', value_name='System Output')
    .merge(ground_truth, how='inner', on=['repo', 'id'])
    .rename(columns={'score': 'ground_truth'})
    .assign(title=lambda scores: scores['id'].apply(lambda item_id: id_item_map[item_id]))
    # counting the `run` values gives the number of runs per system score
    .pivot_table(
        values='run',
        index=['repo', 'id', 'title', 'ground_truth'],
        columns=['System Output'],
        aggfunc='count',
        fill_value=0,
    )
)
cont_table.index.names = ['Repository', 'ID', 'Title', 'Ground Truth']
# Display ordered by the ground-truth level of the index.
cont_table.sort_index(level=3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,System Output,0.0,0.5,1.0
Repository,ID,Title,Ground Truth,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
lightfm,3.5,Check for Duplicate Records in Data,0.0,19,6,5
lightfm,5.3,Ensure Model Output Shape Aligns with Expectation,0.5,18,12,0
lightfm,4.2,Verify Data Split Proportion,1.0,0,27,3


### Consistency: jitterbox plot

In [9]:
# Long table with one row per (repo, checklist item): the standard deviation
# of the per-run score (`value`) plus the item's title for plotting.
stds = df_repo__stat[['repo', 'std', 'id']].pivot(index='repo', columns='id')
# The pivot yields ('std', <id>) column pairs; keep only the checklist ID.
stds.columns = stds.columns.get_level_values(1).tolist()
stds = stds.reset_index().melt(id_vars='repo', var_name='id')
stds['title'] = stds['id'].apply(lambda item_id: id_item_map[item_id])

In [10]:
# Box layer: distribution of `value` for every checklist item title.
box = (
    alt.Chart()
    .mark_boxplot(color='grey', opacity=0.5, size=20)
    .encode(
        x=alt.X('value:Q', title='System Output Uncertainty'),
        y=alt.Y(
            'title:N',
            title=None,
            axis=alt.Axis(labelPadding=10, labelLimit=1000, grid=False),
        ),
    )
)

# Strip layer: one circle per row, spread vertically inside its row by a
# random offset so overlapping points stay visible.
stripplot = (
    alt.Chart()
    .mark_circle(size=100)
    .encode(
        y=alt.Y(
            'title:N',
            axis=alt.Axis(ticks=False, grid=True, labels=True),
            scale=alt.Scale(),
        ),
        x='value:Q',
        yOffset="jitter:Q",
        color=alt.Color('id:N', legend=None),
        tooltip='repo',
    )
    .transform_calculate(
        # Generate Gaussian jitter with a Box-Muller transform
        jitter="sqrt(-2*log(random()))*cos(2*PI*random())"
    )
)

# Both layers read the same data frame, supplied once at the layer level.
plot = (
    alt.layer(box, stripplot, data=stds)
    .configure_view(stroke=None)
    .configure_axis(labelFontSize=12, titleFontSize=12)
    .properties(
        height=300,
        width=600,
        title="30 Runs on Openja's Repositories for each Checklist Item",
    )
)

In [11]:
# Render the layered box/strip chart assembled in the previous cell.
plot

### Improvement from GPT-3.5 to GPT-4o

In [13]:
# Build the per-file, per-run, per-repo statistics and per-repo count tables
# from the records listed in data/processed/batch_run_4o.
df_repo_run_file_4o, df_repo_run_4o, df_repo_4o__stat, df_repo_4o__count = read_and_preprocess(
    '../data/processed/batch_run_4o/record_combine.yml'
)

In [14]:
# Per repo and checklist item: count, mean and std of the per-run scores
# for the batch_run_4o records.
df_repo_4o__stat

Unnamed: 0,repo,id,count,mean,std
0,lightfm,2.1,30.0,1.0,0.0
1,lightfm,3.2,30.0,1.0,0.0
2,lightfm,3.5,30.0,1.0,0.0
3,lightfm,4.2,30.0,1.0,0.0
4,lightfm,5.3,30.0,1.0,0.0
5,lightfm,6.1,30.0,1.0,0.0
6,lightfm,6.2,30.0,1.0,0.0


In [15]:
# Per repo and score value (`level_1`): number of runs that produced that
# score for each checklist item in the batch_run_4o records.
df_repo_4o__count

Unnamed: 0,repo,level_1,2.1,3.2,3.5,4.2,5.3,6.1,6.2
0,lightfm,1.0,30,30,30,30,30,30,30


In [93]:
# Point-and-error-bar chart of the mean score +/- 1 SD per repo.
# NOTE(review): read_and_preprocess, as defined in this notebook, only emits
# rows whose `id` is one of checklist_ids, so the `id == "total"` filter below
# looks like it selects nothing here -- confirm where a "total" row comes from.
# NOTE(review): this cell reads df_repo__stat (the batch_run tables), not the
# df_repo_4o__stat loaded in this section, and its labels mention "522" repos;
# together with the out-of-sequence execution count this may be a leftover
# from another notebook -- confirm before relying on it.
base = alt.Chart(df_repo__stat.query('id == "total"')).transform_calculate(
    # Clamp the mean +/- 1 SD band to the [0, 1] range.
    min="max(0, datum.mean-datum.std)",
    max="min(1, datum.mean+datum.std)"
)
    
# generate the points
points = base.mark_point(
    filled=True,
    size=50,
    color='black'
).encode(
    x=alt.X('mean:Q').scale(domainMin=0, domainMax=1).title("Total Completeness").axis(format='%'),#.axis(labelAngle=0),
    y=alt.Y('repo:N').title("522 Repos")#.scale(domainMin=0, domainMax=1).title('Score'),
)
    
# generate the error bars
errorbars = base.mark_errorbar().encode(
    x=alt.X("min:Q").title('1 SD'), #"id:N",
    x2="max:Q",
    y="repo:N"
)

# Overlay the points on the error bars and display the result.
(points + errorbars).properties(
    height=400,
    width=600,
    title="30 Runs (gpt-3.5-turbo) on each DSCI 522 Repository (Python only)"
)