In [1]:
# Uncomment to install missing dependencies into the kernel's environment:
# %pip install scipy altair

In [2]:
import scipy
import pickle
import yaml
import pandas as pd
import altair as alt
from collections import Counter

In [3]:
def get_report(response, placeholder_id='2.1'):
    """Flatten an evaluation response into one row per reported checklist item.

    Parameters
    ----------
    response : object
        Must expose ``call_results``, an iterable of objects carrying the
        attributes ``parsed_response``, ``files_evaluated`` and ``success``.
    placeholder_id : str, default '2.1'
        Checklist ID written into the placeholder row that is emitted for a
        call whose ``parsed_response`` is empty/falsy.  # FIXME: still arbitrary

    Returns
    -------
    pandas.DataFrame
        Every entry of ``parsed_response['results']`` as a row, extended with
        ``file`` (first element of ``files_evaluated``) and ``success``. Calls
        without a parsed response contribute a single placeholder row with
        ``Score`` 0 and empty text fields.

    Notes
    -----
    The dicts inside ``parsed_response`` are copied, not modified, so the
    ``response`` object passed in is left untouched.
    """
    rows = []
    for result in response.call_results:
        if result.parsed_response:  # FIXME: filter out success == False instead of this
            for item in result.parsed_response['results']:
                # Build a fresh dict instead of writing into `item`, which
                # belongs to the caller's response object.
                rows.append({
                    **item,
                    'file': result.files_evaluated[0],
                    'success': result.success,
                })
        else:
            # No parsed output for this call: keep the file visible in the
            # report with a zero score under the placeholder checklist ID.
            rows.append({
                'ID': placeholder_id,
                'Title': '',
                'Requirement': '',
                'Observation': '',
                'Functions': [],
                'Evaluation': '',
                'Score': 0,
                'file': result.files_evaluated[0],
                'success': result.success,
            })
    return pd.DataFrame(rows)

def extract_file_and_scores(resp_path, verbose=False):
    """Load one pickled response and return its scores in wide form.

    Parameters
    ----------
    resp_path : str
        Path of the pickle file holding a response object accepted by
        ``get_report``.
    verbose : bool, default False
        When true, print ``resp_path`` before loading.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated file with one column per checklist ``ID``
        (holding ``Score``), a ``success`` column that is true only if every
        report row of that file succeeded, and ``response_path``.
    """
    if verbose:
        print(resp_path)

    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    with open(resp_path, 'rb') as handle:
        response = pickle.load(handle)

    long_report = get_report(response)

    # Long -> wide: rows are files, columns are checklist IDs.
    wide_scores = long_report.pivot(index='file', columns='ID', values='Score')
    wide_scores = wide_scores.rename_axis(None, axis=1)

    # Assignment aligns on the `file` index shared with the pivoted frame.
    wide_scores['success'] = long_report.groupby(['file'])['success'].all()
    wide_scores['response_path'] = resp_path
    return wide_scores.reset_index()


In [4]:
# Checklist items aggregated in this notebook.
checklist_ids = ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2']

# The combined record is loaded into a frame; its `response_path` column
# points at the pickled response of each entry.
result_path = '../data/processed/522_batch_run/record_combine.yml'
with open(result_path, 'r') as stream:
    config = pd.DataFrame(yaml.safe_load(stream))

# prepare score data by repo, run, file
per_response_scores = [
    extract_file_and_scores(path, verbose=True)
    for path in config['response_path']
]
tmp = pd.concat(per_response_scores, axis=0).reset_index(drop=True)

# Left join keeps every config record, even if no scores were extracted for it.
raw_df_repo_run_file = config.merge(tmp, on='response_path', how='left')

../data/processed/522_batch_run/group_01_01.pickle
../data/processed/522_batch_run/group_01_02.pickle
../data/processed/522_batch_run/group_01_03.pickle
../data/processed/522_batch_run/group_01_04.pickle
../data/processed/522_batch_run/group_01_05.pickle
../data/processed/522_batch_run/group_01_06.pickle
../data/processed/522_batch_run/group_01_07.pickle
../data/processed/522_batch_run/group_01_08.pickle
../data/processed/522_batch_run/group_01_09.pickle
../data/processed/522_batch_run/group_01_10.pickle
../data/processed/522_batch_run/group_01_11.pickle
../data/processed/522_batch_run/group_01_12.pickle
../data/processed/522_batch_run/group_01_13.pickle
../data/processed/522_batch_run/group_01_14.pickle
../data/processed/522_batch_run/group_01_15.pickle
../data/processed/522_batch_run/group_01_16.pickle
../data/processed/522_batch_run/group_01_17.pickle
../data/processed/522_batch_run/group_01_18.pickle
../data/processed/522_batch_run/group_01_19.pickle
../data/processed/522_batch_run

In [5]:
# prepare score data by repo, run
# `checklist_ids` is rebound further down to also contain the derived 'total'
# column; exclude it here so this cell can be re-run at any point.
score_ids = [cid for cid in checklist_ids if cid != 'total']

# For every (repo, run) keep the best score reached by any file per checklist item.
df_repo_run = (
    raw_df_repo_run_file
    .groupby(['repo', 'run'])
    .agg({cid: 'max' for cid in score_ids})
    .reset_index()
)

# Average over exactly the checklist columns instead of a hard-coded
# "2.1":"6.2" label slice, so the total follows `checklist_ids`.
df_repo_run['total'] = df_repo_run[score_ids].mean(axis=1)

In [6]:
# Show the per-(repo, run) score table (the rendering is truncated for long frames).
df_repo_run

Unnamed: 0,repo,run,2.1,3.2,3.5,4.2,5.3,6.1,6.2,total
0,group_01,1,1.0,1.0,0.0,0.5,0.0,0.0,0.0,0.357143
1,group_01,2,1.0,1.0,0.0,0.5,0.0,0.0,0.0,0.357143
2,group_01,3,1.0,1.0,0.0,0.5,0.0,0.0,0.0,0.357143
3,group_01,4,1.0,1.0,0.0,0.5,0.0,0.0,0.0,0.357143
4,group_01,5,1.0,1.0,0.0,0.5,0.0,0.0,0.0,0.357143
...,...,...,...,...,...,...,...,...,...,...
595,group_23,26,1.0,1.0,0.0,0.0,0.0,0.5,0.0,0.357143
596,group_23,27,1.0,1.0,0.0,0.0,0.0,0.5,0.0,0.357143
597,group_23,28,1.0,1.0,0.0,0.0,0.0,0.5,0.0,0.357143
598,group_23,29,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.285714


In [7]:
# NOTE: this rebinds `checklist_ids` so that it also covers the derived 'total' column.
checklist_ids = ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2', 'total']

# prepare statistics of scores by repo
# Step 1: mean / std / count over runs for every score column (two-level column index).
repo_stats_wide = (
    df_repo_run
    .groupby(['repo'])
    .agg({score_id: ['mean', 'std', 'count'] for score_id in checklist_ids})
    .reset_index()
)

# Step 2: melt to long form, one row per (repo, id, stat).
repo_stats_long = repo_stats_wide.melt(id_vars=[('repo', '')])
repo_stats_long.columns = ['repo', 'id', 'stat', 'value']

# Step 3: spread the statistics back out into one column each.
df_repo__stat = (
    repo_stats_long
    .pivot(index=['repo', 'id'], columns='stat', values='value')
    .reset_index()
    .rename_axis(None, axis=1)
)
df_repo__stat

Unnamed: 0,repo,id,count,mean,std
0,group_01,2.1,30.0,1.000000,0.000000
1,group_01,3.2,30.0,1.000000,0.000000
2,group_01,3.5,30.0,0.000000,0.000000
3,group_01,4.2,30.0,0.300000,0.249136
4,group_01,5.3,30.0,0.000000,0.000000
...,...,...,...,...,...
155,group_23,4.2,30.0,0.000000,0.000000
156,group_23,5.3,30.0,0.116667,0.215092
157,group_23,6.1,30.0,0.483333,0.206920
158,group_23,6.2,30.0,0.000000,0.000000


In [8]:
# Per-repo "total" rows; min/max mark mean -/+ one std, clamped to [0, 1].
base = alt.Chart(
    df_repo__stat.query('id == "total"')
).transform_calculate(
    min="max(0, datum.mean-datum.std)",
    max="min(1, datum.mean+datum.std)",
)

# Mean total score of each repo, drawn as a filled black dot.
points = base.mark_point(filled=True, size=50, color='black').encode(
    x=alt.X(
        'mean:Q',
        scale=alt.Scale(domainMin=0, domainMax=1),
        title="Total Completeness",
        axis=alt.Axis(format='%'),
    ),
    y=alt.Y('repo:N', title="522 Repos"),
)

# Horizontal bars spanning the clamped [min, max] interval.
errorbars = base.mark_errorbar().encode(
    x=alt.X("min:Q", title='1 SD'),
    x2="max:Q",
    y="repo:N",
)

alt.layer(points, errorbars).properties(
    height=400,
    width=600,
    title="30 Runs (gpt-3.5-turbo) on each DSCI 522 Repository (Python only)",
)

In [9]:
# Summarise the per-repo means for every checklist id: mean, std and count
# taken over repos.
id_stats_wide = (
    df_repo__stat
    .groupby(['id'])
    .agg({'mean': ['mean', 'std', 'count']})
    .reset_index()
)

# Long form; the second column only carries the aggregated source column name ('mean').
id_stats_long = id_stats_wide.melt(id_vars=[('id', '')])
id_stats_long.columns = ['id', 'mean', 'stat', 'value']

# One row per id with the statistics as columns.
df_id__stat = (
    id_stats_long
    .pivot(index=['id'], columns='stat', values='value')
    .reset_index()
    .rename_axis(None, axis=1)
)
df_id__stat

Unnamed: 0,id,count,mean,std
0,2.1,20.0,0.871667,0.270212
1,3.2,20.0,0.68,0.297877
2,3.5,20.0,0.0425,0.18618
3,4.2,20.0,0.266983,0.33563
4,5.3,20.0,0.039167,0.068029
5,6.1,20.0,0.338333,0.317846
6,6.2,20.0,0.091667,0.159632
7,total,20.0,0.333452,0.095465


In [10]:
# All checklist items except the derived "total"; min/max mark mean -/+ one
# std, clamped to [0, 1].
base = alt.Chart(
    df_id__stat.query('id != "total"')
).transform_calculate(
    min="max(0, datum.mean-datum.std)",
    max="min(1, datum.mean+datum.std)",
)

# Mean score per checklist item; the label expression blanks every axis
# label whose value is not a multiple of 0.5.
points = base.mark_point(filled=True, size=50, color='black').encode(
    x=alt.X(
        'mean:Q',
        scale=alt.Scale(domainMin=0, domainMax=1),
        title="Completeness",
        axis=alt.Axis(labelExpr="datum.value % 0.5 ? null : datum.label"),
    ),
    y=alt.Y('id:N', title="Checklist Id"),
)

# Horizontal bars spanning the clamped [min, max] interval.
errorbars = base.mark_errorbar().encode(
    x=alt.X("min:Q", title='1 SD'),
    x2="max:Q",
    y="id:N",
)

alt.layer(points, errorbars).properties(
    height=300,
    width=500,
    title={
        'text': "Completeness Score by Checklist Items",
        'subtitle': "over all DSCI 522 Repositories",
    },
)

In [11]:
# Ad-hoc check: take the `run` values at row positions 1 and 31 and show the first distinct one.
df_repo_run['run'].iloc[[1,31]].unique()[0]

2