In [54]:
import pickle
import yaml
import pandas as pd

In [59]:
def get_report(response):
    """Flatten a response object into one DataFrame row per checklist item.

    Parameters
    ----------
    response : object
        Must expose ``call_results``, an iterable whose elements each have
        ``parsed_response['results']`` (an iterable of dicts) and
        ``files_evaluated`` (an indexable sequence).

    Returns
    -------
    pandas.DataFrame
        One row per result dict, with an added ``file`` column holding the
        first entry of the owning call's ``files_evaluated``.
    """
    # Build fresh dicts instead of writing into ``item``: the parsed response
    # belongs to the caller and must not gain a 'file' key as a side effect.
    report = [
        {**item, 'file': result.files_evaluated[0]}
        for result in response.call_results
        for item in result.parsed_response['results']
    ]
    return pd.DataFrame(report)

def extract_file_and_scores(resp_path):
    """Load one pickled response and return its scores in wide form.

    The long report from ``get_report`` is pivoted so that each evaluated
    file becomes one row and each checklist ``ID`` becomes a column holding
    that item's ``Score``. A ``response_path`` column records which pickle
    the rows came from.

    Parameters
    ----------
    resp_path : str
        Path of the pickled response to read.

    Returns
    -------
    pandas.DataFrame
        Columns: ``file``, one column per checklist ID, ``response_path``.
    """
    # NOTE: unpickling can execute arbitrary code; only point this at
    # response files you produced yourself.
    with open(resp_path, 'rb') as handle:
        response = pickle.load(handle)

    long_report = get_report(response)
    wide_scores = long_report.pivot(index='file', columns='ID', values='Score')
    # Drop the leftover 'ID' label on the column axis, then turn 'file' back
    # into an ordinary column.
    wide_scores = wide_scores.rename_axis(None, axis=1).reset_index()
    return wide_scores.assign(response_path=resp_path)

In [58]:
# Load the batch-run record (a YAML file) into a DataFrame. Its
# 'response_path' column lists the pickled response of each run and is used
# below as the join key against the extracted scores.
with open('../data/processed/batch_run/record_combine.yml', 'r') as file:
    config = pd.DataFrame(yaml.safe_load(file))
# Peek at the first response path as a quick sanity check of the load.
config['response_path'].iloc[0]

'../data/processed/batch_run/lightfm_01.pickle'

In [64]:
# Extract the per-file score table of every recorded run and stack them
# into one frame with a fresh 0..n-1 index.
res = pd.concat(
    [extract_file_and_scores(path) for path in config['response_path']],
    axis=0,
).reset_index(drop=True)

In [74]:
# Attach the run metadata from `config` to the per-file scores. The left
# join keeps every configured run, even one with no extracted scores.
df_repo_run_file = pd.merge(config, res, how='left', on='response_path')
df_repo_run_file

Unnamed: 0,repo,response_path,run,file,2.1,3.2,3.5,4.2,5.3,6.1,6.2
0,lightfm,../data/processed/batch_run/lightfm_01.pickle,1,../data/raw/openja/lightfm/tests/test_api.py,0.0,0.5,0.5,0.0,0.5,0.0,0.0
1,lightfm,../data/processed/batch_run/lightfm_01.pickle,1,../data/raw/openja/lightfm/tests/test_cross_va...,0.5,0.0,0.0,0.0,0.0,0.0,0.0
2,lightfm,../data/processed/batch_run/lightfm_01.pickle,1,../data/raw/openja/lightfm/tests/test_data.py,0.5,0.5,0.0,0.0,0.0,0.0,0.0
3,lightfm,../data/processed/batch_run/lightfm_01.pickle,1,../data/raw/openja/lightfm/tests/test_datasets.py,1.0,0.5,0.0,0.5,0.0,0.0,0.0
4,lightfm,../data/processed/batch_run/lightfm_01.pickle,1,../data/raw/openja/lightfm/tests/test_evaluati...,0.0,0.0,0.0,0.5,0.0,0.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1267,qlib,../data/processed/batch_run/qlib_06.pickle,6,../data/raw/openja/qlib/tests/test_get_data.py,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1268,qlib,../data/processed/batch_run/qlib_06.pickle,6,../data/raw/openja/qlib/tests/test_pit.py,1.0,0.0,0.0,0.0,0.0,0.5,0.0
1269,qlib,../data/processed/batch_run/qlib_06.pickle,6,../data/raw/openja/qlib/tests/test_register_op...,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1270,qlib,../data/processed/batch_run/qlib_06.pickle,6,../data/raw/openja/qlib/tests/test_structured_...,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [101]:
# Checklist item IDs. These are the score column names produced by pivoting
# on 'ID', and the columns aggregated in the cells below.
checklist_ids = ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2']

In [102]:
# Collapse the file-level scores to one row per (repo, run): the run-level
# score of a checklist item is the highest score any single file achieved.
df_repo_run = (
    df_repo_run_file
    .groupby(['repo', 'run'])
    .agg({checklist_id: ['max'] for checklist_id in checklist_ids})
    .droplevel(1, axis=1)  # drop the redundant 'max' level of the column index
    .reset_index()
)
df_repo_run

Unnamed: 0,repo,run,2.1,3.2,3.5,4.2,5.3,6.1,6.2
0,lightfm,1,1.0,0.5,0.5,0.5,0.5,1.0,1.0
1,lightfm,2,1.0,0.5,0.0,0.5,0.0,0.5,1.0
2,lightfm,3,1.0,0.5,1.0,0.5,0.5,0.5,0.5
3,lightfm,4,1.0,0.5,1.0,0.5,0.5,1.0,1.0
4,lightfm,5,1.0,0.5,0.0,0.5,0.0,0.5,1.0
5,lightfm,6,1.0,0.5,0.0,0.5,0.0,1.0,1.0
6,lightfm,7,1.0,0.5,0.0,0.5,0.0,0.5,1.0
7,lightfm,8,1.0,0.5,0.5,0.5,0.5,0.5,1.0
8,lightfm,9,1.0,0.5,0.0,0.5,0.0,1.0,1.0
9,lightfm,10,1.0,0.5,0.0,0.5,0.0,0.5,1.0


In [134]:
# Per-repo summary of the run-level scores: mean, std and count of every
# checklist item across runs, reshaped to one row per (repo, id) with the
# statistics as columns.
df_repo__stat = (
    df_repo_run
    .groupby(['repo'])
    .agg({checklist_id: ['mean', 'std', 'count'] for checklist_id in checklist_ids})
    .reset_index()
    # After reset_index the columns are still two-level, so the repo column
    # is addressed by its full ('repo', '') key.
    .melt(id_vars=[('repo', '')])
    .set_axis(['repo', 'id', 'stat', 'value'], axis=1)
    .pivot(index=['repo', 'id'], columns='stat', values='value')
    .reset_index()
    .rename_axis(None, axis=1)
)
df_repo__stat

Unnamed: 0,repo,id,count,mean,std
0,lightfm,2.1,30.0,0.983333,0.091287
1,lightfm,3.2,30.0,0.483333,0.091287
2,lightfm,3.5,30.0,0.266667,0.38804
3,lightfm,4.2,30.0,0.55,0.152564
4,lightfm,5.3,30.0,0.2,0.249136
5,lightfm,6.1,30.0,0.666667,0.239732
6,lightfm,6.2,30.0,0.933333,0.172873
7,qlib,2.1,10.0,1.0,0.0
8,qlib,3.2,10.0,0.85,0.241523
9,qlib,3.5,10.0,0.0,0.0


In [None]:
# Altair is the plotting library used by generate_stat_plot; display the
# installed version alongside the charts.
# NOTE(review): generate_stat_plot uses the method-chaining channel syntax
# (e.g. `alt.X(...).axis(...)`), which presumably needs Altair 5+ - confirm
# before running against an older version.
import altair as alt
alt.__version__

In [168]:
# Reference ("ground truth") score per checklist item for lightfm, as
# (id, score) pairs; passed to generate_stat_plot as the overlay.
lightfm_gt = pd.DataFrame(
    [
        ('2.1', 1),
        ('3.2', 1),
        ('3.5', 0),
        ('4.2', 1),
        ('5.3', 0.5),
        ('6.1', 1),
        ('6.2', 1),
    ],
    columns=['id', 'score'],
)

In [178]:
def generate_stat_plot(df_repo__stat, repo, ground_truth=None, width=400):
    """Plot the mean score per checklist item for one repo with 1-SD bars.

    Parameters
    ----------
    df_repo__stat : pandas.DataFrame
        Summary table with columns ``repo``, ``id``, ``mean`` and ``std``.
    repo : str
        Value of the ``repo`` column to plot.
    ground_truth : pandas.DataFrame, optional
        Table with columns ``id`` and ``score``; when given, its points are
        overlaid as green diamonds.
    width : int, default 400
        Width passed to the chart's ``properties``.

    Returns
    -------
    A layered Altair chart: black mean points, error bars, and (optionally)
    the ground-truth points.
    """
    # Filter with a boolean mask rather than an f-string `query`, so a repo
    # name containing quotes cannot break or alter the filter expression.
    repo_stats = df_repo__stat[df_repo__stat['repo'] == repo]

    # the base chart; error-bar ends are mean +/- 1 std, clamped to the
    # [0, 1] score range
    base = alt.Chart(repo_stats).transform_calculate(
        min="max(0, datum.mean-datum.std)",
        max="min(1, datum.mean+datum.std)"
    )

    # generate the points
    points = base.mark_point(
        filled=True,
        size=50,
        color='black'
    ).encode(
        x=alt.X('id:O').axis(labelAngle=0).title('Checklist Id'),
        y=alt.Y('mean:Q').scale(domainMin=0, domainMax=1).title('Score'),
    )

    # generate the error bars
    errorbars = base.mark_errorbar().encode(
        x="id:O",
        y=alt.Y("min:Q").title('1 SD'),
        y2="max:Q"
    )

    plot = points + errorbars

    if ground_truth is not None:
        # generate points of ground truth
        gt_points = alt.Chart(ground_truth).mark_point(
            filled=True,
            size=100,
            color='green',
            shape="diamond"
        ).encode(
            x=alt.X('id:O'),
            y=alt.Y('score:Q')
        )

        plot += gt_points

    return plot.properties(width=width)

# Render the lightfm summary with its ground-truth scores overlaid.
generate_stat_plot(df_repo__stat, repo="lightfm", ground_truth=lightfm_gt)
#generate_stat_plot(df_repo__stat, "qlib")

In [121]:
from collections import Counter

# For every repo, count how many runs landed on each score value, one column
# per checklist item. Each item's Counter result is outer-merged on
# (repo, score value); 'level_1' is the score value that `reset_index`
# exposes from the Counter keys.
# The loop variable is deliberately not called `id`: at notebook scope that
# would shadow the builtin `id()` for the rest of the kernel session.
df_repo__count = df_repo_run.groupby(['repo'])[checklist_ids[0]].apply(Counter).reset_index()
for checklist_id in checklist_ids[1:]:
    df_repo__count = df_repo__count.merge(
        df_repo_run.groupby(['repo'])[checklist_id].apply(Counter).reset_index(),
        on=['repo', 'level_1'],
        how='outer'
    )

# A score value that never occurred for an item comes back as NaN; treat it
# as a count of zero.
df_repo__count = df_repo__count.fillna(0)
df_repo__count #.query('repo == "lightfm"')

Unnamed: 0,repo,level_1,2.1,3.2,3.5,4.2,5.3,6.1,6.2
0,lightfm,0.0,0.0,1.0,19.0,0.0,18.0,0.0,0.0
1,lightfm,0.5,1.0,29.0,6.0,27.0,12.0,20.0,4.0
2,lightfm,1.0,29.0,0.0,5.0,3.0,0.0,10.0,26.0
3,qlib,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
4,qlib,0.5,0.0,3.0,0.0,10.0,3.0,2.0,6.0
5,qlib,1.0,10.0,7.0,0.0,0.0,7.0,8.0,4.0


In [105]:
# Inspect the raw Counter output for a single checklist item ('2.1'). A
# score value that one repo never produced shows up as NaN for that repo.
df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()

Unnamed: 0,repo,level_1,2.1
0,lightfm,1.0,29.0
1,lightfm,0.5,1.0
2,qlib,1.0,10.0
3,qlib,0.5,


In [124]:
# In the visible notebook `response` is only ever bound as a local inside
# extract_file_and_scores, so this cell raised NameError on a fresh kernel.
# Load one run explicitly before previewing its long-format report.
# NOTE(review): the first configured run is used here; confirm it is the
# run this preview was meant to show.
with open(config['response_path'].iloc[0], 'rb') as file:
    response = pickle.load(file)
get_report(response)

Unnamed: 0,ID,Title,Requirement,Observation,Functions,Evaluation,Score,file
0,2.1,Ensure Data File Loads as Expected,Ensure that data-loading functions correctly f...,The code includes tests for fitting data into ...,"[test_fitting, test_fitting_no_identity, test_...",Partially Satisfied,0.5,../data/raw/openja/lightfm/tests/test_data.py
1,3.2,Data in the Expected Format,Verify that the data matches the expected form...,The code includes tests for building user feat...,[test_build_features],Partially Satisfied,0.5,../data/raw/openja/lightfm/tests/test_data.py
2,3.5,Check for Duplicate Records in Data,Verify that there are no duplicate records in ...,No specific test related to checking for dupli...,[],Not Satisfied,0.0,../data/raw/openja/lightfm/tests/test_data.py
3,4.2,Verify Data Split Proportion,Check that the data is split into training and...,No specific test related to verifying data spl...,[],Not Satisfied,0.0,../data/raw/openja/lightfm/tests/test_data.py
4,5.3,Ensure Model Output Shape Aligns with Expectation,Ensure that the structure of the model's outpu...,No specific test related to ensuring model out...,[],Not Satisfied,0.0,../data/raw/openja/lightfm/tests/test_data.py
5,6.1,Verify Evaluation Metrics Implementation,Verify that the evaluation metrics are correct...,No specific test related to verifying evaluati...,[],Not Satisfied,0.0,../data/raw/openja/lightfm/tests/test_data.py
6,6.2,Evaluate Model's Performance Against Thresholds,Compute evaluation metrics for both the traini...,No specific test related to evaluating model's...,[],Not Satisfied,0.0,../data/raw/openja/lightfm/tests/test_data.py
7,2.1,Ensure Data File Loads as Expected,Ensure that data-loading functions correctly f...,The code imports necessary libraries and defin...,"[_assert_disjoint, test_random_train_test_split]",Partially Satisfied,0.5,../data/raw/openja/lightfm/tests/test_cross_va...
8,3.2,Data in the Expected Format,Verify that the data matches the expected form...,,,Not Satisfied,0.0,../data/raw/openja/lightfm/tests/test_cross_va...
9,3.5,Check for Duplicate Records in Data,Verify that there are no duplicate records in ...,,,Not Satisfied,0.0,../data/raw/openja/lightfm/tests/test_cross_va...


'5.3.0'