#### NOTE: This batch run was produced from commit [abb9a21](https://github.com/UBC-MDS/test-creation/pull/136/commits/abb9a21828cb257ff8f2629261dc1ad64ad7dcb0) of the code base, which is similar to commit [69d61a9](https://github.com/UBC-MDS/test-creation/tree/69d61a9f5ac62baefca23ee293a6cd09fe41eeb2) on the main branch.

In [1]:
import pickle
import yaml
import pandas as pd
import altair as alt

In [40]:
def get_report(response, fallback_id='2.1'):
    """Flatten a batch-run response into one row per evaluated checklist item.

    Parameters
    ----------
    response
        Object exposing ``call_results``. Each entry is read for its
        ``parsed_response``, ``files_evaluated`` and ``success`` attributes.
    fallback_id : str, default '2.1'
        Checklist ID given to the placeholder row that stands in for a call
        without a parsed response.

    Returns
    -------
    pandas.DataFrame
        The items of every ``parsed_response['results']`` list, each extended
        with ``file`` (first entry of ``files_evaluated``) and ``success``.
        A call with a falsy ``parsed_response`` contributes one placeholder
        row with empty fields and ``Score`` 0.
    """
    report = []
    for result in response.call_results:
        if result.parsed_response:
            for item in result.parsed_response['results']:
                # Build a new dict instead of writing into `item`, so the
                # caller's response object is left untouched.
                report.append({
                    **item,
                    'file': result.files_evaluated[0],
                    'success': result.success,
                })
        else:
            # FIXME: a failed call is recorded under a single checklist ID
            # with Score 0, so downstream pivots still get a row for the file.
            report.append({
                'ID': fallback_id,
                'Title': '',
                'Requirement': '',
                'Observation': '',
                'Functions': [],
                'Evaluation': '',
                'Score': 0,
                'file': result.files_evaluated[0],
                'success': result.success
            })
    return pd.DataFrame(report)

def extract_file_and_scores(resp_path):
    """Load one pickled batch-run response and tabulate its scores per file.

    Parameters
    ----------
    resp_path
        Path of the pickle file to read.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated file with one column per checklist ID holding
        its ``Score``, a ``success`` column that is True only when every
        report row of that file succeeded, and ``response_path``.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only open
    # response files that come from a trusted batch run.
    with open(resp_path, 'rb') as handle:
        response = pickle.load(handle)

    report = get_report(response)

    # Wide layout: files as rows, checklist IDs as columns.
    scores = report.pivot(index='file', columns='ID', values='Score')
    scores = scores.rename_axis(None, axis=1)

    # Assignment aligns on the shared 'file' index.
    scores['success'] = report.groupby(['file'])['success'].all()
    scores['response_path'] = resp_path
    return scores.reset_index()

def generate_stat_plot(df_repo__stat, repo, ground_truth=None):
    """Plot mean score with a one-standard-deviation error bar per checklist id.

    Parameters
    ----------
    df_repo__stat
        Data frame with ``repo``, ``id``, ``mean`` and ``std`` columns.
    repo
        Value of the ``repo`` column whose rows are plotted.
    ground_truth
        Optional data frame with ``id`` and ``score`` columns; when given,
        its points are overlaid as green diamonds.

    Returns
    -------
    A layered Altair chart, 400 wide.
    """
    # Select rows with a boolean mask rather than interpolating `repo` into a
    # query expression, so names containing quotes cannot break the filter.
    repo_stats = df_repo__stat[df_repo__stat['repo'] == repo]

    # the base chart; the error-bar ends are mean -/+ std clipped to [0, 1]
    base = alt.Chart(repo_stats).transform_calculate(
        min="max(0, datum.mean-datum.std)",
        max="min(1, datum.mean+datum.std)"
    )

    # generate the points
    points = base.mark_point(
        filled=True,
        size=50,
        color='black'
    ).encode(
        x=alt.X('id:O').axis(labelAngle=0).title('Checklist Id'),
        y=alt.Y('mean:Q').scale(domainMin=0, domainMax=1).title('Score'),
    )

    # generate the error bars
    errorbars = base.mark_errorbar().encode(
        x="id:O",
        y=alt.Y("min:Q").title('1 SD'),
        y2="max:Q"
    )

    plot = points + errorbars

    if ground_truth is not None:
        # generate points of ground truth
        gt_points = alt.Chart(ground_truth).mark_point(
            filled=True,
            size=100,
            color='green',
            shape="diamond"
        ).encode(
            x=alt.X('id:O'),
            y=alt.Y('score:Q')
        )

        plot += gt_points

    return plot.properties(width=400)

### Scores by file, by run, by repo

In [41]:
# The batch-run record is loaded into a data frame; its 'response_path'
# column lists the pickled responses to read.
with open('../data/processed/batch_run/record_combine.yml', 'r') as file:
    config = pd.DataFrame(yaml.safe_load(file))

# Per-file scores of every response, stacked into a single frame.
tmp = pd.concat(
    list(map(extract_file_and_scores, config['response_path'])),
    axis=0,
    ignore_index=True,
)

# Attach the record columns to each file-level row.
df_repo_run_file = config.merge(tmp, on='response_path', how='left')
df_repo_run_file

Unnamed: 0,repo,response_path,run,file,2.1,3.2,3.5,4.2,5.3,6.1,6.2,success
0,lightfm,../data/processed/batch_run/lightfm_01.pickle,1,../data/raw/openja/lightfm/tests/test_api.py,0.0,0.5,0.5,0.0,0.5,0.0,0.0,True
1,lightfm,../data/processed/batch_run/lightfm_01.pickle,1,../data/raw/openja/lightfm/tests/test_cross_va...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,True
2,lightfm,../data/processed/batch_run/lightfm_01.pickle,1,../data/raw/openja/lightfm/tests/test_data.py,0.5,0.5,0.0,0.0,0.0,0.0,0.0,True
3,lightfm,../data/processed/batch_run/lightfm_01.pickle,1,../data/raw/openja/lightfm/tests/test_datasets.py,1.0,0.5,0.0,0.5,0.0,0.0,0.0,True
4,lightfm,../data/processed/batch_run/lightfm_01.pickle,1,../data/raw/openja/lightfm/tests/test_evaluati...,0.0,0.0,0.0,0.5,0.0,0.5,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
6047,magenta,../data/processed/batch_run/magenta_30.pickle,30,../data/raw/openja/magenta/magenta/models/coco...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
6048,magenta,../data/processed/batch_run/magenta_30.pickle,30,../data/raw/openja/magenta/magenta/models/musi...,0.0,,,,,,,False
6049,magenta,../data/processed/batch_run/magenta_30.pickle,30,../data/raw/openja/magenta/magenta/models/onse...,0.0,0.5,0.0,0.0,0.0,0.0,0.0,True
6050,magenta,../data/processed/batch_run/magenta_30.pickle,30,../data/raw/openja/magenta/magenta/models/scor...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True


In [42]:
# FIXME: clean non-test files (mainly in qlib)
# Keep every non-qlib row; for qlib keep only files under its tests/ directory.
# The path is matched literally (regex=False) so the dots are not treated as
# regex wildcards, and rows without a file name count as non-matching.
df_repo_run_file = df_repo_run_file[
    (df_repo_run_file['repo'] != 'qlib')
    | df_repo_run_file['file'].str.contains(
        '../data/raw/openja/qlib/tests/', regex=False, na=False
    )
]

In [43]:
# True only if every remaining row is marked successful.
df_repo_run_file['success'].all()

False

In [52]:
#df_repo_run_file[df_repo_run_file.success]['file'].unique()

### Scores by run, by repo

In [45]:
# Checklist item IDs; used below to select the per-item score columns.
checklist_ids = [
    '2.1',
    '3.2',
    '3.5',
    '4.2',
    '5.3',
    '6.1',
    '6.2',
]

In [46]:
# Score of a run = best (max) score over all of its files, per checklist item.
df_repo_run = (
    df_repo_run_file
    .groupby(['repo', 'run'])[checklist_ids]
    .max()
    .reset_index()
)
df_repo_run

Unnamed: 0,repo,run,2.1,3.2,3.5,4.2,5.3,6.1,6.2
0,lightfm,1,1.0,0.5,0.5,0.5,0.5,1.0,1.0
1,lightfm,2,1.0,0.5,0.0,0.5,0.0,0.5,1.0
2,lightfm,3,1.0,0.5,1.0,0.5,0.5,0.5,0.5
3,lightfm,4,1.0,0.5,1.0,0.5,0.5,1.0,1.0
4,lightfm,5,1.0,0.5,0.0,0.5,0.0,0.5,1.0
...,...,...,...,...,...,...,...,...,...
145,qlib,26,1.0,1.0,0.0,0.5,0.0,0.5,0.5
146,qlib,27,1.0,1.0,0.0,0.5,0.5,0.5,0.5
147,qlib,28,1.0,1.0,0.0,0.5,0.0,0.5,0.5
148,qlib,29,1.0,0.5,0.0,0.5,0.0,0.5,0.5


### stat(Score) by repo

In [48]:
# Mean, standard deviation and count of the run-level scores, per repo and
# checklist item (columns form an (id, stat) MultiIndex at this point).
stat_wide = df_repo_run.groupby(['repo']).agg({
    checklist_id: ['mean', 'std', 'count'] for checklist_id in checklist_ids
})

# Long layout: one row per (repo, checklist id, statistic).
stat_long = pd.melt(stat_wide.reset_index(), id_vars=[('repo', '')])
stat_long.columns = ['repo', 'id', 'stat', 'value']

# Final layout: one row per (repo, id) with one column per statistic.
df_repo__stat = stat_long.pivot(index=['repo', 'id'], columns='stat', values='value')
df_repo__stat = df_repo__stat.reset_index().rename_axis(None, axis=1)
df_repo__stat

Unnamed: 0,repo,id,count,mean,std
0,lightfm,2.1,30.0,0.983333,0.091287
1,lightfm,3.2,30.0,0.483333,0.091287
2,lightfm,3.5,30.0,0.266667,0.38804
3,lightfm,4.2,30.0,0.55,0.152564
4,lightfm,5.3,30.0,0.2,0.249136
5,lightfm,6.1,30.0,0.666667,0.239732
6,lightfm,6.2,30.0,0.933333,0.172873
7,magenta,2.1,30.0,0.6,0.423451
8,magenta,3.2,30.0,0.416667,0.189525
9,magenta,3.5,30.0,0.283333,0.252003


In [49]:
# Ground-truth score per checklist item for lightfm, overlaid on its plot.
lightfm_gt = pd.DataFrame({
    'id': ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2'],
    'score': [1, 1, 0, 1, 0.5, 1, 1],
})

In [50]:
# lightfm: score statistics with the ground-truth scores overlaid.
generate_stat_plot(df_repo__stat, repo="lightfm", ground_truth=lightfm_gt)

In [51]:
# qlib: score statistics only (no ground truth passed).
generate_stat_plot(df_repo__stat, repo="qlib")

In [53]:
# mmf: score statistics only (no ground truth passed).
generate_stat_plot(df_repo__stat, repo="mmf")

In [54]:
# nanodet: score statistics only (no ground truth passed).
generate_stat_plot(df_repo__stat, repo="nanodet")

In [55]:
# magenta: score statistics only (no ground truth passed).
generate_stat_plot(df_repo__stat, repo="magenta")

In [None]:
#generate_stat_plot(df_repo__stat, "qlib") # before filtering

### count(Score) by repo

In [None]:
from collections import Counter

# Tally, per repo, how often each score value occurs across runs, one column
# per checklist item. After reset_index() the tallied score value sits in the
# auto-named 'level_1' column, which is why the merges join on it.
# The loop variable is deliberately not called `id`: a module-level loop would
# leave the builtin `id` shadowed for the rest of the session.
df_repo__count = (
    df_repo_run.groupby(['repo'])[checklist_ids[0]].apply(Counter).reset_index()
)
for checklist_id in checklist_ids[1:]:
    df_repo__count = df_repo__count.merge(
        df_repo_run.groupby(['repo'])[checklist_id].apply(Counter).reset_index(),
        on=['repo', 'level_1'],
        how='outer'
    )

# Missing counts left by the outer merges become 0.
df_repo__count = df_repo__count.fillna(0)
df_repo__count #.query('repo == "lightfm"')

In [6]:
# NOTE(review): pickle.load can execute arbitrary code; only open response
# files that come from a trusted batch run.
with open("../data/processed/batch_run/magenta_01.pickle", 'rb') as file:
    response = pickle.load(file)

# Tabulate the parsed checklist results of the first call in this response.
first_call = response.call_results[0]
pd.DataFrame(first_call.parsed_response['results'])

Unnamed: 0,ID,Title,Requirement,Observation,Functions,Evaluation,Score
0,2.1,Ensure Data File Loads as Expected,Ensure that data-loading functions correctly f...,The code does not contain any data-loading fun...,[],Not Satisfied,0
1,3.2,Data in the Expected Format,Verify that the data matches the expected form...,The code does not contain any data format veri...,[],Not Satisfied,0
2,3.5,Check for Duplicate Records in Data,Verify that there are no duplicate records in ...,The code does not include any checks for dupli...,[],Not Satisfied,0
3,4.2,Verify Data Split Proportion,Check that the data is split into training and...,No data splitting or proportion verification f...,[],Not Satisfied,0
4,5.3,Ensure Model Output Shape Aligns with Expectation,Ensure that the structure of the model's outpu...,There is no code related to verifying the mode...,[],Not Satisfied,0
5,6.1,Verify Evaluation Metrics Implementation,Verify that the evaluation metrics are correct...,The code does not contain any implementation o...,[],Not Satisfied,0
6,6.2,Evaluate Model's Performance Against Thresholds,Compute evaluation metrics for both the traini...,No code is present for evaluating the model's ...,[],Not Satisfied,0


In [23]:
# First parsed result item of the third call in the loaded response.
third_call = response.call_results[2]
third_call.parsed_response['results'][0]

{'ID': '2.1',
 'Title': 'Ensure Data File Loads as Expected',
 'Requirement': 'Ensure that data-loading functions correctly fetch datasets from predefined sources or online repositories. Additionally, verify that the functions handle errors or edge cases gracefully.',
 'Observation': 'The code does not directly address data-loading functions or error handling.',
 'Functions': [],
 'Evaluation': 'Not Satisfied',
 'Score': 0}