#### NOTE: the batch run is based on the code base [abb9a21](https://github.com/UBC-MDS/test-creation/pull/136/commits/abb9a21828cb257ff8f2629261dc1ad64ad7dcb0), which is similar to the commit [69d61a9](https://github.com/UBC-MDS/test-creation/tree/69d61a9f5ac62baefca23ee293a6cd09fe41eeb2) in the main branch

In [1]:
import pickle
import yaml
import pandas as pd
import altair as alt

In [2]:
def get_report(response):
    report = []
    for result in response.call_results:
        if result.parsed_response:
            resp = result.parsed_response['results']
            for item in resp:
                item['file'] = result.files_evaluated[0] 
                item['success'] = result.success
                report.append(item)
        else:
            report.append({
                'ID': '2.1', # FIXME
                'Title': '',
                'Requirement': '',
                'Observation': '',
                'Functions': [],
                'Evaluation': '',
                'Score': 0,
                'file': result.files_evaluated[0],
                'success': result.success
            })
    return pd.DataFrame(report)

def extract_file_and_scores(resp_path):
    #print(resp_path)
    with open(resp_path, 'rb') as file:
        response = pickle.load(file)
    report = get_report(response)
    df = (
        report
        .pivot(index='file', columns='ID', values='Score')
        .rename_axis(None, axis=1)
    )
    df['success'] = report.groupby(['file'])['success'].all()
    df['response_path'] = resp_path
    return df.reset_index()

def generate_stat_plot(df_repo__stat, repo, ground_truth=None):
    # the base chart
    base = alt.Chart(df_repo__stat.query(f'repo == "{repo}"')).transform_calculate(
        min="max(0, datum.mean-datum.std)",
        max="min(1, datum.mean+datum.std)"
    )
    
    # generate the points
    points = base.mark_point(
        filled=True,
        size=50,
        color='black'
    ).encode(
        x=alt.X('id:O').axis(labelAngle=0).title('Checklist Id'),
        y=alt.Y('mean:Q').scale(domainMin=0, domainMax=1).title('Score'),
    )
    
    # generate the error bars
    errorbars = base.mark_errorbar().encode(
        x="id:O",
        y=alt.Y("min:Q").title('1 SD'),
        y2="max:Q"
    )

    plot = points + errorbars
    
    if ground_truth is not None:
        # generate points of ground truth
        gt_points = alt.Chart(ground_truth).mark_point(
            filled=True,
            size=100,
            color='green',
            shape="diamond"
        ).encode(
            x=alt.X('id:O'),
            y=alt.Y('score:Q')
        )

        plot += gt_points

    return plot.properties(width=400)

### Scores by file, by run, by repo

In [3]:
with open('../data/processed/batch_run/record_combine.yml', 'r') as file:
    config = pd.DataFrame(yaml.safe_load(file))
#config['response_path'].iloc[0]

tmp = [
    extract_file_and_scores(path) for path in config['response_path']
]
tmp = pd.concat(tmp, axis=0).reset_index(drop=True)

df_repo_run_file = config.merge(tmp, on='response_path', how='left')
df_repo_run_file

Unnamed: 0,repo,response_path,run,file,2.1,3.2,3.5,4.2,5.3,6.1,6.2,success
0,lightfm,../data/processed/batch_run/lightfm_01.pickle,1,../data/raw/openja/lightfm/tests/test_api.py,0.0,0.5,0.5,0.0,0.5,0.0,0.0,True
1,lightfm,../data/processed/batch_run/lightfm_01.pickle,1,../data/raw/openja/lightfm/tests/test_cross_va...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,True
2,lightfm,../data/processed/batch_run/lightfm_01.pickle,1,../data/raw/openja/lightfm/tests/test_data.py,0.5,0.5,0.0,0.0,0.0,0.0,0.0,True
3,lightfm,../data/processed/batch_run/lightfm_01.pickle,1,../data/raw/openja/lightfm/tests/test_datasets.py,1.0,0.5,0.0,0.5,0.0,0.0,0.0,True
4,lightfm,../data/processed/batch_run/lightfm_01.pickle,1,../data/raw/openja/lightfm/tests/test_evaluati...,0.0,0.0,0.0,0.5,0.0,0.5,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
5575,magenta,../data/processed/batch_run/magenta_30.pickle,30,../data/raw/openja/magenta/magenta/models/coco...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
5576,magenta,../data/processed/batch_run/magenta_30.pickle,30,../data/raw/openja/magenta/magenta/models/musi...,0.0,,,,,,,False
5577,magenta,../data/processed/batch_run/magenta_30.pickle,30,../data/raw/openja/magenta/magenta/models/onse...,0.0,0.5,0.0,0.0,0.0,0.0,0.0,True
5578,magenta,../data/processed/batch_run/magenta_30.pickle,30,../data/raw/openja/magenta/magenta/models/scor...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True


In [4]:
df_repo_run_file[~df_repo_run_file.success]['file'].unique()

array(['../data/raw/openja/magenta/magenta/models/music_vae/data_test.py'],
      dtype=object)

In [116]:
pd.set_option('display.max_rows', 300)
df_repo_run_file.query('run == 1').groupby(['repo'])['file'].count().reset_index()
#df_repo_run_file.query('repo == "qlib"').groupby('file')['run'].count() #.groupby('run').count()

Unnamed: 0,repo,file
0,lightfm,7
1,magenta,8
2,mmf,70
3,nanodet,42
4,qlib,31


In [6]:
df_repo_run_file.query('(repo == "qlib") & (file == "../data/raw/openja/qlib/tests/backtest/test_file_strategy.py")')

Unnamed: 0,repo,response_path,run,file,2.1,3.2,3.5,4.2,5.3,6.1,6.2,success
238,qlib,../data/processed/batch_run/qlib_27.pickle,27,../data/raw/openja/qlib/tests/backtest/test_fi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
297,qlib,../data/processed/batch_run/qlib_28.pickle,28,../data/raw/openja/qlib/tests/backtest/test_fi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
356,qlib,../data/processed/batch_run/qlib_29.pickle,29,../data/raw/openja/qlib/tests/backtest/test_fi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
415,qlib,../data/processed/batch_run/qlib_30.pickle,30,../data/raw/openja/qlib/tests/backtest/test_fi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
474,qlib,../data/processed/batch_run/qlib_01.pickle,1,../data/raw/openja/qlib/tests/backtest/test_fi...,0.0,0.0,0.0,0.0,0.0,0.5,0.0,True
533,qlib,../data/processed/batch_run/qlib_02.pickle,2,../data/raw/openja/qlib/tests/backtest/test_fi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
592,qlib,../data/processed/batch_run/qlib_03.pickle,3,../data/raw/openja/qlib/tests/backtest/test_fi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
651,qlib,../data/processed/batch_run/qlib_04.pickle,4,../data/raw/openja/qlib/tests/backtest/test_fi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
710,qlib,../data/processed/batch_run/qlib_05.pickle,5,../data/raw/openja/qlib/tests/backtest/test_fi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
769,qlib,../data/processed/batch_run/qlib_06.pickle,6,../data/raw/openja/qlib/tests/backtest/test_fi...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True


In [7]:
# FIXME: clean non-test files (mainly in qlib)
df_repo_run_file = df_repo_run_file.query('(repo != "qlib") | (file.str.contains("../data/raw/openja/qlib/tests/"))')

In [121]:
df_repo_run_file.query('repo == "nanodet"')['file'].unique()

array(['../data/raw/openja/nanodet/nanodet/trainer/task.py',
       '../data/raw/openja/nanodet/tests/test_configs/test_config.py',
       '../data/raw/openja/nanodet/tests/test_data/test_batch_process.py',
       '../data/raw/openja/nanodet/tests/test_data/test_collate.py',
       '../data/raw/openja/nanodet/tests/test_data/test_dataset/test_cocodataset.py',
       '../data/raw/openja/nanodet/tests/test_data/test_dataset/test_xmldataset.py',
       '../data/raw/openja/nanodet/tests/test_data/test_dataset/test_yolodataset.py',
       '../data/raw/openja/nanodet/tests/test_data/test_transform/test_color.py',
       '../data/raw/openja/nanodet/tests/test_data/test_transform/test_warp.py',
       '../data/raw/openja/nanodet/tests/test_evaluator/test_coco_detection.py',
       '../data/raw/openja/nanodet/tests/test_models/test_backbone/test_custom_csp.py',
       '../data/raw/openja/nanodet/tests/test_models/test_backbone/test_efficient_lite.py',
       '../data/raw/openja/nanodet/tests/te

In [8]:
df_repo_run_file.success.all()

False

In [9]:
#df_repo_run_file[df_repo_run_file.success]['file'].unique()

### Scores by run, by repo

In [10]:
checklist_ids = ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2']

In [11]:
df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({
    id: ['max'] for id in checklist_ids
})
df_repo_run.columns = [col[0] for col in df_repo_run.columns]
df_repo_run = df_repo_run.reset_index()
df_repo_run

Unnamed: 0,repo,run,2.1,3.2,3.5,4.2,5.3,6.1,6.2
0,lightfm,1,1.0,0.5,0.5,0.5,0.5,1.0,1.0
1,lightfm,2,1.0,0.5,0.0,0.5,0.0,0.5,1.0
2,lightfm,3,1.0,0.5,1.0,0.5,0.5,0.5,0.5
3,lightfm,4,1.0,0.5,1.0,0.5,0.5,1.0,1.0
4,lightfm,5,1.0,0.5,0.0,0.5,0.0,0.5,1.0
5,lightfm,6,1.0,0.5,0.0,0.5,0.0,1.0,1.0
6,lightfm,7,1.0,0.5,0.0,0.5,0.0,0.5,1.0
7,lightfm,8,1.0,0.5,0.5,0.5,0.5,0.5,1.0
8,lightfm,9,1.0,0.5,0.0,0.5,0.0,1.0,1.0
9,lightfm,10,1.0,0.5,0.0,0.5,0.0,0.5,1.0


### stat(Score) by repo

In [12]:
df_repo__stat = df_repo_run.groupby(['repo']).agg({
    id: ['mean', 'std', 'count'] for id in checklist_ids
})

df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])
df_repo__stat.columns = ['repo', 'id', 'stat', 'value']
df_repo__stat = (
    df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')
    .reset_index()
    .rename_axis(None, axis=1)
)
df_repo__stat

Unnamed: 0,repo,id,count,mean,std
0,lightfm,2.1,30.0,0.983333,0.091287
1,lightfm,3.2,30.0,0.483333,0.091287
2,lightfm,3.5,30.0,0.266667,0.38804
3,lightfm,4.2,30.0,0.55,0.152564
4,lightfm,5.3,30.0,0.2,0.249136
5,lightfm,6.1,30.0,0.666667,0.239732
6,lightfm,6.2,30.0,0.933333,0.172873
7,magenta,2.1,30.0,0.6,0.423451
8,magenta,3.2,30.0,0.416667,0.189525
9,magenta,3.5,30.0,0.283333,0.252003


In [13]:
lightfm_gt = pd.DataFrame([
    {'id': '2.1', 'score': 1},
    {'id': '3.2', 'score': 1},
    {'id': '3.5', 'score': 0},
    {'id': '4.2', 'score': 1},
    {'id': '5.3', 'score': 0.5},
    {'id': '6.1', 'score': 1},
    {'id': '6.2', 'score': 1},
])

In [14]:
generate_stat_plot(df_repo__stat, "lightfm", lightfm_gt)

In [15]:
generate_stat_plot(df_repo__stat, "qlib")

In [16]:
generate_stat_plot(df_repo__stat, "mmf")

In [17]:
generate_stat_plot(df_repo__stat, "nanodet")

In [18]:
generate_stat_plot(df_repo__stat, "magenta")

In [19]:
#generate_stat_plot(df_repo__stat, "qlib") # before filtering

### count(Score) by repo

In [20]:
from collections import Counter
df_repo__count = df_repo_run.groupby(['repo'])['2.1'].apply(Counter).reset_index()
for id in checklist_ids[1:]:
    df_repo__count = df_repo__count.merge(
        df_repo_run.groupby(['repo'])[id].apply(Counter).reset_index(),
        on=['repo', 'level_1'],
        how='outer'
    )

df_repo__count = df_repo__count.fillna(0)
df_repo__count #.query('repo == "lightfm"')

Unnamed: 0,repo,level_1,2.1,3.2,3.5,4.2,5.3,6.1,6.2
0,lightfm,0.0,0.0,1.0,19.0,0.0,18.0,0.0,0.0
1,lightfm,0.5,1.0,29.0,6.0,27.0,12.0,20.0,4.0
2,lightfm,1.0,29.0,0.0,5.0,3.0,0.0,10.0,26.0
3,magenta,0.0,8.0,5.0,13.0,29.0,30.0,17.0,30.0
4,magenta,0.5,8.0,25.0,17.0,1.0,0.0,13.0,0.0
5,magenta,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0
6,mmf,0.0,0.0,0.0,28.0,0.0,0.0,0.0,1.0
7,mmf,0.5,0.0,19.0,2.0,28.0,28.0,9.0,25.0
8,mmf,1.0,30.0,11.0,0.0,2.0,2.0,21.0,4.0
9,nanodet,0.0,0.0,0.0,17.0,15.0,0.0,7.0,9.0


### consistency examination

In [28]:
df_repo__stat.groupby('id')['std'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2.1,5.0,0.102948,0.183475,0.0,0.0,0.0,0.091287,0.423451
3.2,5.0,0.210257,0.074752,0.091287,0.189525,0.239732,0.245066,0.285673
3.5,5.0,0.222038,0.117801,0.091287,0.126854,0.252003,0.252003,0.38804
4.2,5.0,0.155509,0.060658,0.091287,0.126854,0.152564,0.152564,0.254274
5.3,5.0,0.12594,0.125723,0.0,0.0,0.126854,0.249136,0.253708
6.1,5.0,0.247068,0.023755,0.224888,0.233046,0.239732,0.252003,0.285673
6.2,5.0,0.16343,0.097474,0.0,0.172873,0.182574,0.201289,0.260415


In [32]:
alt.Chart(df_repo__stat).mark_boxplot().encode(
    x="std:Q",
    y='id:N'
).properties(
    height=200,
    width=400
)
# Obs: 2.1 is particularly stable. When evaluating a repository, it usually has the lowest variance of scores.

In [56]:
stds = df_repo__stat[['repo', 'std', 'id']].pivot(index='repo', columns='id')
stds

Unnamed: 0_level_0,std,std,std,std,std,std,std
id,2.1,3.2,3.5,4.2,5.3,6.1,6.2
repo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
lightfm,0.091287,0.091287,0.38804,0.152564,0.249136,0.239732,0.172873
magenta,0.423451,0.189525,0.252003,0.091287,0.0,0.252003,0.0
mmf,0.0,0.245066,0.126854,0.126854,0.126854,0.233046,0.201289
nanodet,0.0,0.239732,0.252003,0.254274,0.0,0.285673,0.260415
qlib,0.0,0.285673,0.091287,0.152564,0.253708,0.224888,0.182574


In [57]:
F = stds.iloc[1:] / stds.iloc[0]
F

Unnamed: 0_level_0,std,std,std,std,std,std,std
id,2.1,3.2,3.5,4.2,5.3,6.1,6.2
repo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
magenta,4.638668,2.076137,0.649427,0.598352,0.0,1.05119,0.0
mmf,0.0,2.684566,0.32691,0.831479,0.509175,0.972111,1.164375
nanodet,0.0,2.626129,0.649427,1.666667,0.0,1.191638,1.506397
qlib,0.0,3.129393,0.235252,1.0,1.01835,0.938083,1.056118


In [83]:
base = alt.Chart(
    F.melt(ignore_index=False).reset_index()[['repo', 'id', 'value']]
).transform_calculate(
    benchmark="1",
)

point = base.mark_point(
    filled=True,
    size=100,
).encode(
    x=alt.X('value:Q').title("std"),
    y='id:N',
    color='repo'
).properties(
    height=200,
    width=400
)

base.mark_rule(color='black').encode(x="benchmark:Q") + point

In [94]:
F.index

Index(['magenta', 'mmf', 'nanodet', 'qlib'], dtype='object', name='repo')

In [100]:
import scipy

p_value = pd.DataFrame(1 - scipy.stats.f.cdf(F, 29, 29)) 
p_value.columns = [x[1] for x in F.columns]
p_value.index = F.index
p_value

Unnamed: 0_level_0,2.1,3.2,3.5,4.2,5.3,6.1,6.2
repo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
magenta,4.4e-05,0.026853,0.874501,0.913657,1.0,0.446994,1.0
mmf,1.0,0.004858,0.998208,0.688747,0.962891,0.530094,0.34235
nanodet,1.0,0.005699,0.874501,0.087487,1.0,0.319986,0.13786
qlib,1.0,0.001493,0.999898,0.5,0.480643,0.567735,0.442062


In [112]:
base = alt.Chart(
    p_value.melt(
        var_name="id", 
        ignore_index=False
    ).reset_index()
).transform_calculate(
    benchmark="0.5",
    threshold="0.025",
)

point = base.mark_point(
    filled=True,
    size=100,
).encode(
    x=alt.X('value:Q').title("p-value"),
    y='id:N',
    color='repo'
).properties(
    height=200,
    width=400
)

point \
+ base.mark_rule(color='black').encode(x="benchmark:Q") \
+ base.mark_rule(color='red').encode(x="threshold:Q")
#point

In [48]:
(p_value < 0.025) | (p_value > 0.0975)

Unnamed: 0,0,1,2,3,4,5,6
0,True,False,True,True,True,True,True
1,True,True,True,True,True,True,True
2,True,True,True,False,True,True,True
3,True,True,True,True,True,True,True


In [22]:
with open("../data/processed/batch_run/magenta_01.pickle", 'rb') as file:
    response = pickle.load(file)

pd.DataFrame(response.call_results[0].parsed_response['results'])

Unnamed: 0,ID,Title,Requirement,Observation,Functions,Evaluation,Score
0,2.1,Ensure Data File Loads as Expected,Ensure that data-loading functions correctly f...,The code does not contain any data-loading fun...,[],Not Satisfied,0
1,3.2,Data in the Expected Format,Verify that the data matches the expected form...,The code does not contain any data format veri...,[],Not Satisfied,0
2,3.5,Check for Duplicate Records in Data,Verify that there are no duplicate records in ...,The code does not include any checks for dupli...,[],Not Satisfied,0
3,4.2,Verify Data Split Proportion,Check that the data is split into training and...,No data splitting or proportion verification f...,[],Not Satisfied,0
4,5.3,Ensure Model Output Shape Aligns with Expectation,Ensure that the structure of the model's outpu...,There is no code related to verifying the mode...,[],Not Satisfied,0
5,6.1,Verify Evaluation Metrics Implementation,Verify that the evaluation metrics are correct...,The code does not contain any implementation o...,[],Not Satisfied,0
6,6.2,Evaluate Model's Performance Against Thresholds,Compute evaluation metrics for both the traini...,No code is present for evaluating the model's ...,[],Not Satisfied,0


In [23]:
response.call_results[2].parsed_response['results'][0]

{'ID': '2.1',
 'Title': 'Ensure Data File Loads as Expected',
 'Requirement': 'Ensure that data-loading functions correctly fetch datasets from predefined sources or online repositories. Additionally, verify that the functions handle errors or edge cases gracefully.',
 'Observation': 'The code does not directly address data-loading functions or error handling.',
 'Functions': [],
 'Evaluation': 'Not Satisfied',
 'Score': 0}