#### NOTE: the result is based on the code base [abb9a21](https://github.com/UBC-MDS/test-creation/pull/136/commits/abb9a21828cb257ff8f2629261dc1ad64ad7dcb0), which is similar to the commit [69d61a9](https://github.com/UBC-MDS/test-creation/tree/69d61a9f5ac62baefca23ee293a6cd09fe41eeb2) in the main branch

In [1]:
import pickle
from collections import Counter

import altair as alt
import pandas as pd
import scipy
import scipy.stats
import yaml

In [2]:
def get_report(response):
    """
    Flatten the evaluation results held by `response` into one DataFrame.

    Every element of `response.call_results` contributes:
      - one row per item in `parsed_response['results']` when the call has a
        (truthy) parsed response, or
      - a single placeholder row with `Score` 0 otherwise.

    Each row is tagged with the first entry of `files_evaluated` (column
    `file`) and the call's `success` flag (column `success`).

    The parsed items are copied before being tagged, so the dictionaries
    stored inside `response` are left untouched.
    """
    report = []
    for result in response.call_results:
        if result.parsed_response:
            # build new dicts instead of writing 'file'/'success' into the
            # caller's parsed response
            report.extend(
                {
                    **item,
                    'file': result.files_evaluated[0],
                    'success': result.success,
                }
                for item in result.parsed_response['results']
            )
        else:
            # no parsed response for this call: emit one zero-score placeholder
            report.append({
                'ID': '2.1', # FIXME: placeholder ID is hard-coded
                'Title': '',
                'Requirement': '',
                'Observation': '',
                'Functions': [],
                'Evaluation': '',
                'Score': 0,
                'file': result.files_evaluated[0],
                'success': result.success
            })
    return pd.DataFrame(report)

def extract_file_and_scores(resp_path, verbose=False):
    """
    Load one pickled response and return its scores in wide format.

    The result has one row per evaluated file and one column per checklist
    ID, plus `success` (True only if every row of that file succeeded) and
    `response_path` (the path the response was loaded from).
    """
    if verbose:
        print(resp_path)

    # NOTE: unpickling can execute arbitrary code; only load trusted files
    with open(resp_path, 'rb') as handle:
        response = pickle.load(handle)

    report = get_report(response)

    # long -> wide: files as rows, checklist IDs as columns
    scores = report.pivot(index='file', columns='ID', values='Score')
    scores = scores.rename_axis(None, axis=1)

    # both sides are indexed by file, so the assignment aligns on it
    scores['success'] = report.groupby(['file'])['success'].all()
    scores['response_path'] = resp_path
    return scores.reset_index()

def generate_stat_plot(df_repo__stat, ground_truth=None, facet_col='repo', repo=None, id=None):
    """
    Generate Stat plot across all repo and all checklist item
    Optional to incorporate ground truth and select specific repo/checklist item

    Parameters:
      - df_repo__stat: DataFrame with the columns `repo`, `id`, `mean`, `std`
      - ground_truth: optional DataFrame with the columns `repo`, `id`,
        `score`; drawn as green diamonds when given
      - facet_col: 'repo' (one panel per repo, checklist IDs on the x axis)
        or 'id' (one panel per checklist ID, repos on the x axis)
      - repo / id: optional values used to keep only the matching rows

    Returns the faceted altair chart.

    Raises ValueError if `facet_col` is neither 'repo' nor 'id'.
    """
    # validate first, otherwise an unsupported value would only surface
    # later as an unbound local variable
    x_axis_by_facet = {
        'repo': ('id', 'Checklist ID'),
        'id': ('repo', 'Repository'),
    }
    if facet_col not in x_axis_by_facet:
        raise ValueError(
            f"facet_col must be 'repo' or 'id', got {facet_col!r}"
        )
    x_col, x_title = x_axis_by_facet[facet_col]

    # the base chart
    if repo:
        df_repo__stat = df_repo__stat.query(f'repo == "{repo}"')
    if id:
        df_repo__stat = df_repo__stat.query(f'id == "{id}"')

    # mean +/- 1 SD, clipped to the [0, 1] score range
    base = alt.Chart().transform_calculate(
        min="max(0, datum.mean-datum.std)",
        max="min(1, datum.mean+datum.std)"
    )

    # generate the points
    points = base.mark_point(
        filled=True,
        size=50,
        color='black'
    ).encode(
        x=alt.X(f'{x_col}:O').axis(labelAngle=0).title(x_title),
        y=alt.Y('mean:Q').scale(domainMin=0, domainMax=1).title('Score'),
    )

    # generate the error bars
    errorbars = base.mark_errorbar().encode(
        x=f"{x_col}:O",
        y=alt.Y("min:Q").title('1 SD'),
        y2="max:Q"
    )

    plot = points + errorbars

    if ground_truth is not None:
        # generate points of ground truth
        if repo:
            ground_truth = ground_truth.query(f'repo == "{repo}"')
        if id:
            ground_truth = ground_truth.query(f'id == "{id}"')

        # left merge keeps every stat row; rows without ground truth get a
        # missing `score`
        df_repo__stat = pd.merge(df_repo__stat, ground_truth, how='left', on=['repo', 'id'])

        gt_points = alt.Chart().mark_point(
            filled=True,
            size=100,
            color='green',
            shape="diamond"
        ).encode(
            x=alt.X(f'{x_col}:O'),
            y=alt.Y('score:Q')
        )

        plot += gt_points

    # the data is attached once, at the layer level, so that all layers
    # share it and the layered chart can be faceted
    plot = alt.layer(
        plot,
        data=df_repo__stat
    ).properties(
        width=400,
    ).facet(
        column=f'{facet_col}',
        columns=2
    )

    return plot

### preprocess data

In [4]:
# checklist items covered by this analysis
checklist_ids = ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2']

# the batch-run record: one entry per run, including its `response_path`
result_path = '../data/processed/batch_run/record_combine.yml'
with open(result_path, 'r') as file:
    config = pd.DataFrame(yaml.safe_load(file))

# prepare score data by repo, run, file
# FIXME: excluded deepchem
tmp = pd.concat(
    [extract_file_and_scores(path) for path in config['response_path']],
    axis=0,
).reset_index(drop=True)

# attach the per-file scores to the run records they were loaded from
raw_df_repo_run_file = config.merge(tmp, on='response_path', how='left')

In [5]:
# filter non-test files in qlib
# NOTE: `str.contains` treats its pattern as a regular expression by default,
# so each "." in the path below matches any character.
df_repo_run_file = raw_df_repo_run_file.query('(repo != "qlib") | (file.str.contains("../data/raw/openja/qlib/tests/"))')

# prepare score data by repo, run
# (the score of a run for a checklist item is the maximum over its files)
df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({
    checklist_id: ['max'] for checklist_id in checklist_ids
})
# flatten the (item, 'max') column tuples back to the item ID
df_repo_run.columns = [col[0] for col in df_repo_run.columns]
df_repo_run = df_repo_run.reset_index()

# prepare statistics of scores by repo
df_repo__stat = df_repo_run.groupby(['repo']).agg({
    checklist_id: ['mean', 'std', 'count'] for checklist_id in checklist_ids
})
df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])
df_repo__stat.columns = ['repo', 'id', 'stat', 'value']
# one row per (repo, id) with the columns `count`, `mean`, `std`
df_repo__stat = (
    df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')
    .reset_index()
    .rename_axis(None, axis=1)
)

# prepare counting of scores by repo
# The loop variable is deliberately not called `id`: at module level that
# would rebind the builtin `id()` for the rest of the session.
df_repo__count = df_repo_run.groupby(['repo'])[checklist_ids[0]].apply(Counter).reset_index()
for checklist_id in checklist_ids[1:]:
    df_repo__count = df_repo__count.merge(
        df_repo_run.groupby(['repo'])[checklist_id].apply(Counter).reset_index(),
        on=['repo', 'level_1'],
        how='outer'
    )

# a score value that never occurred for an item has a count of 0
df_repo__count = df_repo__count.fillna(0)

### Runs Quality

#### 1. Some non-test files are included in the evaluation

For example, the `./nanodet/nanodet/trainer/task.py`

In [6]:
raw_df_repo_run_file.query('repo == "nanodet"')['file'].unique()[:3]

array(['../data/raw/openja/nanodet/nanodet/trainer/task.py',
       '../data/raw/openja/nanodet/tests/test_configs/test_config.py',
       '../data/raw/openja/nanodet/tests/test_data/test_batch_process.py'],
      dtype=object)

#### 2. Evaluation of the file `magenta/magenta/models/music_vae/data_test.py` always fails

In [7]:
df_repo_run_file[~df_repo_run_file.success]['file'].unique()

array(['../data/raw/openja/magenta/magenta/models/music_vae/data_test.py',
       '../data/raw/openja/paperless-ng/src/documents/tests/test_api.py'],
      dtype=object)

#### 3. `DeepSpeech`, `lightfm` and `magenta` have the fewest (Python) test files

In [8]:
df_repo_run_file.query('run == 1').groupby(['repo'])['file'].count().reset_index()

Unnamed: 0,repo,file
0,DeepSpeech,3
1,apollo,14
2,lightfm,7
3,magenta,8
4,mmf,70
5,mycroft-core,64
6,nanodet,42
7,paperless-ng,35
8,qlib,31


#### 4. The test files are not always in a `tests/` folder. Is it good practice to always do that? Should it be one of the checklist items to ensure all tests are placed under a `tests/` folder?

For example, `magenta`

In [9]:
df_repo_run_file.query('repo == "magenta"')['file'].unique()

array(['../data/raw/openja/magenta/conftest.py',
       '../data/raw/openja/magenta/magenta/common/state_util_test.py',
       '../data/raw/openja/magenta/magenta/models/coconet/export_saved_model_test.py',
       '../data/raw/openja/magenta/magenta/models/coconet/lib_data.py',
       '../data/raw/openja/magenta/magenta/models/music_vae/data_test.py',
       '../data/raw/openja/magenta/magenta/models/onsets_frames_transcription/create_dataset_lib_test.py',
       '../data/raw/openja/magenta/magenta/models/score2perf/datagen_beam_test.py',
       '../data/raw/openja/magenta/magenta/pipelines/pipeline_test.py'],
      dtype=object)

### Findings on 9 repos

In [10]:
df_repo_run_file.repo.unique()

array(['lightfm', 'qlib', 'mmf', 'nanodet', 'magenta', 'DeepSpeech',
       'paperless-ng', 'mycroft-core', 'apollo'], dtype=object)

#### 1. Overview of the accuracy and consistency of the `lightfm` evaluation

Let the ground truth for `lightfm` be as [follows](https://github.com/UBC-MDS/test-creation/blob/69d61a9f5ac62baefca23ee293a6cd09fe41eeb2/report/repo_human_evaluation/human_evaluation_report-lightfm.md):

In [11]:
# Ground truth: one (repo, checklist id, score) record per row
ground_truth = pd.DataFrame(
    [
        ('lightfm', '2.1', 1),
        ('lightfm', '3.2', 1),
        ('lightfm', '3.5', 0),
        ('lightfm', '4.2', 1),
        ('lightfm', '5.3', 0.5),
        ('lightfm', '6.1', 1),
        ('lightfm', '6.2', 1),
        ('qlib', '2.1', 0.5),
        ('qlib', '3.2', 1),
        ('qlib', '3.5', 0),
        ('qlib', '4.2', 0.5),
        ('qlib', '5.3', 1),
        ('qlib', '6.1', 1),
        ('qlib', '6.2', 1),
    ],
    columns=['repo', 'id', 'score'],
)
ground_truth[ground_truth.repo == 'lightfm']

Unnamed: 0,repo,id,score
0,lightfm,2.1,1.0
1,lightfm,3.2,1.0
2,lightfm,3.5,0.0
3,lightfm,4.2,1.0
4,lightfm,5.3,0.5
5,lightfm,6.1,1.0
6,lightfm,6.2,1.0


In [12]:
generate_stat_plot(df_repo__stat, ground_truth=ground_truth, repo="lightfm", facet_col='repo')

The distribution of the scores for each checklist item:

In [13]:
df_repo__count.query('repo == "lightfm"')

Unnamed: 0,repo,level_1,2.1,3.2,3.5,4.2,5.3,6.1,6.2
6,lightfm,0.0,0.0,1.0,19.0,0.0,18.0,0.0,0.0
7,lightfm,0.5,1.0,29.0,6.0,27.0,12.0,20.0,4.0
8,lightfm,1.0,29.0,0.0,5.0,3.0,0.0,10.0,26.0


**Observations**:
The system evaluation roughly aligns with our evaluation, that is,
 - for those items that we believe are "Satisfied" (Score = 1), the system mostly outputs 0.5 or 1
 - for those items that we believe are "Partially Satisfied" or "Not Satisfied", the system mostly outputs 0.5 or 0
 - some checklist items display high variance, e.g. 3.5, 5.3 and 6.1.

#### 2. Overview of `qlib`
Let the ground truth for `qlib` be as follows (FIXME: to be confirmed):

In [14]:
# Ground truth
ground_truth[ground_truth.repo == 'qlib']

Unnamed: 0,repo,id,score
7,qlib,2.1,0.5
8,qlib,3.2,1.0
9,qlib,3.5,0.0
10,qlib,4.2,0.5
11,qlib,5.3,1.0
12,qlib,6.1,1.0
13,qlib,6.2,1.0


In [15]:
generate_stat_plot(df_repo__stat, ground_truth=ground_truth, repo="qlib", facet_col='repo')

In [16]:
df_repo__count.query('repo == "qlib"')

Unnamed: 0,repo,level_1,2.1,3.2,3.5,4.2,5.3,6.1,6.2
24,qlib,0.0,0.0,1.0,29.0,3.0,14.0,4.0,1.0
25,qlib,0.5,0.0,12.0,1.0,27.0,16.0,24.0,26.0
26,qlib,1.0,30.0,17.0,0.0,0.0,0.0,2.0,3.0


**Observations**: 
- There is more disagreement between the system and the manual evaluation
  - especially for 5.3, 6.1, 6.2.
- The per-item consistency in this repo differs from that in `lightfm`.
  - e.g. Variance for 3.5 is greatly reduced. Variance for 3.2 becomes larger.
- However, `qlib` is not just a machine learning project, it also contains a software inside.
  - e.g. It has a lot of randomly generated data by itself, instead of reading a data to perform analysis, it seems to deviate from the objective of 2.1.

#### 3. The consistency of each checklist item
 - Why is it important?
   If the score of a particular item varies a lot when evaluating a repository, it might mean that its prompt (`Requirement`) is confusing to the LLM, or the checklist item itself is not well defined.

In [17]:
df_repo__stat.pivot(index='id', columns='repo', values='std')

repo,DeepSpeech,apollo,lightfm,magenta,mmf,mycroft-core,nanodet,paperless-ng,qlib
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2.1,0.479463,0.152564,0.091287,0.423451,0.0,0.0,0.0,0.242117,0.0
3.2,0.406838,0.215092,0.091287,0.189525,0.245066,0.278027,0.239732,0.091287,0.285673
3.5,0.0,0.0,0.38804,0.252003,0.126854,0.0,0.252003,0.0,0.091287
4.2,0.0,0.0,0.152564,0.091287,0.126854,0.0,0.254274,0.0,0.152564
5.3,0.0,0.0,0.249136,0.0,0.126854,0.0,0.0,0.0,0.253708
6.1,0.351107,0.172873,0.239732,0.252003,0.233046,0.0,0.285673,0.0,0.224888
6.2,0.0,0.0,0.172873,0.0,0.201289,0.253708,0.260415,0.126854,0.182574


In [18]:
# Box plot of the `std` column of df_repo__stat, one box per checklist item (`id`)
alt.Chart(df_repo__stat).mark_boxplot().encode(
    x="std:Q",
    y='id:N'
).properties(
    height=200,
    width=400
)

**Observations**:
 - The evaluation of the checklist item 2.1 `Ensure Data File Loads as Expected` is usually stable.
   - When evaluating a repository, 50% of the time its standard deviation is smaller than 0.05, the smallest among the others.

Below shows the breakdown of item scores for each repository:  
(NOTE: only `lightfm` and `qlib` have ground truth, in green diamond)

In [19]:
generate_stat_plot(df_repo__stat, ground_truth=ground_truth, facet_col='id')

**Observations**:
 - (TBC) The standard deviations for Items 3.5 and 5.3 vary greatly across repositories, which might imply that the test cases in some repos are confusing to the LLM while those in others are clear.
 - (TBC) The standard deviations for Items 5.3, 6.1 and 6.2 are relatively high and consistent, which might imply that there is room for refining the prompts to reduce the inconsistency.

#### 4. The consistency of each checklist item, compared to `lightfm`
 - Why is it important? We optimized the consistency of our system using `lightfm`. Therefore, we treat this repository as a benchmark. If a particular checklist item has much worse consistency in other repositories, that might mean that the prompt for that item is not generalizable.

Below are the standard deviations across 30 runs for each checklist item in each repository:

In [20]:
stds = df_repo__stat[['repo', 'std', 'id']].pivot(index='repo', columns='id')
stds

Unnamed: 0_level_0,std,std,std,std,std,std,std
id,2.1,3.2,3.5,4.2,5.3,6.1,6.2
repo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
DeepSpeech,0.479463,0.406838,0.0,0.0,0.0,0.351107,0.0
apollo,0.152564,0.215092,0.0,0.0,0.0,0.172873,0.0
lightfm,0.091287,0.091287,0.38804,0.152564,0.249136,0.239732,0.172873
magenta,0.423451,0.189525,0.252003,0.091287,0.0,0.252003,0.0
mmf,0.0,0.245066,0.126854,0.126854,0.126854,0.233046,0.201289
mycroft-core,0.0,0.278027,0.0,0.0,0.0,0.0,0.253708
nanodet,0.0,0.239732,0.252003,0.254274,0.0,0.285673,0.260415
paperless-ng,0.242117,0.091287,0.0,0.0,0.0,0.0,0.126854
qlib,0.0,0.285673,0.091287,0.152564,0.253708,0.224888,0.182574


In [21]:
# long format of `stds`: one row per (repo, checklist id) with the SD in `value`
stds_p = (
    stds
    .droplevel(0, axis=1)  # drop the outer 'std' level, keep the checklist IDs
    .reset_index()
    .melt(id_vars='repo', var_name='id')
)

In [22]:
stds_p.head()

Unnamed: 0,repo,id,value
0,DeepSpeech,2.1,0.479463
1,apollo,2.1,0.152564
2,lightfm,2.1,0.091287
3,magenta,2.1,0.423451
4,mmf,2.1,0.0


In [23]:
# stripplot = (
#     alt.Chart(stds_p)
#     .mark_point(filled=True, size=100)
#     .transform_calculate( 
#         # Generate Gaussian jitter with a Box-Muller transform 
#         jitter='sqrt(-2*log(random()))*cos(2*PI*random())'
#         # jitter='random()'
#     ).encode( 
#         y=alt.Y( 
#             'jitter:Q', 
#             title=None, 
#             axis=alt.Axis(ticks=False, grid=True, labels=False), 
#             scale=alt.Scale(), 
#         ), 
#         x=alt.X('value:Q'), 
#         color=alt.Color('repo:N'),
#         row=alt.Row( 
#             'id:N',
#             header=alt.Header(
#                 labelFontSize=16,
#                 labelAngle=0
#             )
#         ),
#         tooltip='repo'
#     ).configure_facet( 
#         spacing=0
#     ).configure_view( 
#         stroke=None
#     ).configure_axis( 
#         labelFontSize=16, 
#         titleFontSize=16
#     ).properties(
#         height=50, 
#         width=600
#     ) 
# )
    
# stripplot 

In [42]:
def generate_jitterbox_plot(df_stds_p):
    """
    Generate jitterbox plot across all repo and all checklist item

    Layers a grey box plot of `value` per checklist `id` with one circle
    per row of `df_stds_p`, vertically jittered inside its `id` band and
    carrying the `repo` as tooltip.
    """
    # Generate Gaussian jitter with a Box-Muller transform
    gaussian_jitter = "sqrt(-2*log(random()))*cos(2*PI*random())"

    box_layer = alt.Chart().mark_boxplot(
        color='grey',
        opacity=0.5,
        size=20,
    ).encode(
        x=alt.X('value:Q').title('SD(Score)'),
        y=alt.Y('id:N', title=None, axis=alt.Axis(labelPadding=10, grid=False))
    )

    point_layer = alt.Chart().mark_circle(size=100).encode(
        y=alt.Y(
            'id:N',
            axis=alt.Axis(ticks=False, grid=True, labels=True),
            scale=alt.Scale(),
        ),
        x='value:Q',
        yOffset="jitter:Q",
        color=alt.Color('id:N', legend=None),
        tooltip='repo'
    ).transform_calculate(
        jitter=gaussian_jitter
    )

    # the data is supplied once, on the layered chart
    return alt.layer(
        box_layer,
        point_layer,
        data=df_stds_p
    ).configure_view(
        stroke=None
    ).configure_axis(
        labelFontSize=16,
        titleFontSize=16
    ).properties(
        height=300,
        width=600
    )

In [43]:
generate_jitterbox_plot(stds_p)

In [None]:
# Box plot of the `std` column of df_repo__stat, one box per checklist item (`id`)
# NOTE(review): same chart as the earlier boxplot cell in section 3
alt.Chart(df_repo__stat).mark_boxplot().encode(
    x="std:Q",
    y='id:N'
).properties(
    height=200,
    width=400
)

In [None]:
# !pip install altair_catplot
# !pip install seaborn

In [None]:
# import altair_catplot

# altair_catplot.catplot(
#     stds_p, 
#     transform ='jitterbox', 
#     mark ='point', 
#     encoding = dict(
#         x = alt.X('value:Q'), 
#         y = alt.Y('id:N'), 
#         color = alt.Color('repo:N')
#     ) 
# )

In [None]:
# ratio of each repo's SD to the `lightfm` SD, per checklist item
F = stds.drop(index='lightfm') / stds.loc['lightfm']

# `F` holds ratios of standard deviations, whereas the F distribution
# describes ratios of variances (squared SDs). The critical value is
# therefore square-rooted to put the threshold on the plotted scale.
# 29 degrees of freedom: assumes 30 runs per repository - TODO confirm
sd_ratio_threshold = scipy.stats.f.ppf(0.975, 29, 29) ** 0.5

base = alt.Chart(
    F.melt(ignore_index=False).reset_index()[['repo', 'id', 'value']]
).transform_calculate(
    benchmark="1",
    threshold=f"{sd_ratio_threshold}"
)

point = base.mark_point(
    filled=True,
    size=100,
).encode(
    x=alt.X('value:Q').title("std ratio (c.f. lightfm)"),
    y='id:N',
    color='repo',
    tooltip='repo'
).properties(
    height=200,
    width=400
)

# black rule: ratio of 1 (same SD as lightfm); red rule: upper critical value
point \
+ base.mark_rule(color='black').encode(x="benchmark:Q") \
+ base.mark_rule(color='red').encode(x="threshold:Q")
# jitter instead of mark_point <-- prompt vs. repo problem?
# prompt: sd of checklist item for all repo is high
# repo: most of repo have low sd, the repo we're looking at has outlier

**Observations**:
 - The evaluation of the checklist item 3.2 `Data in the Expected Format` becomes much more unstable in most other repositories.
 - The evaluation of item 2.1 is significantly less stable in the repos `paperless-ng`, `magenta` and `DeepSpeech`, but this may be due to the repositories themselves.

TODO: to look into the 3.2's scores.

#### TODO: Given ground truth == 1, distribution of system score?
#### TODO: Given ground truth == 0, distribution of system score?

In [None]:
def generate_histogram_plot(df_repo_run_long, df_ground_truth=None, repo=None, id=None):
    """
    Generate histogram across all repo and all checklist item
    Optional to incorporate ground truth and select specific repo/checklist item

    `df_repo_run_long` provides the columns `repo`, `id`, `eval_score`;
    `df_ground_truth`, when given, provides `repo`, `id`, `score`.
    The chart is faceted with repos as rows and checklist IDs as columns.
    """
    def _select(frame):
        # copy of `frame` restricted to the requested repo / checklist item
        subset = frame.copy()
        if repo:
            subset = subset.query(f'repo == "{repo}"')
        if id:
            subset = subset.query(f'id == "{id}"')
        return subset

    scores = _select(df_repo_run_long)

    # grey bars counting the runs at each score
    layers = alt.Chart().mark_bar().encode(
        x=alt.X('eval_score:Q', title='Score'),
        y=alt.Y('count()'),
        color=alt.value('grey'),
        size=alt.value(20),
    )

    if df_ground_truth is not None:
        truth = _select(df_ground_truth)

        # flag the runs whose score matches the ground truth
        scores = pd.merge(scores, truth, how='left', on=['repo', 'id'])
        scores['is_equal_to_gt'] = scores['eval_score'] == scores['score']

        # colour the bars by the flag and label the ground-truth position
        bars = layers.encode(
            color=alt.Color('is_equal_to_gt', scale=alt.Scale(range=['grey', 'green']), legend=None)
        )
        labels = bars.mark_text().encode(
            text=alt.value('Ground Truth'),
            x='score',
            size=alt.value(10),
            color=alt.value('green'),
        )
        layers = bars + labels

    return alt.layer(
        layers,
        data=scores
    ).properties(
        width=200,
        height=200,
    ).facet(
        row='repo',
        column='id'
    )

In [None]:
# reshape to long format: one row per (repo, run, checklist id),
# with the score of that run in `eval_score`
df_repo_run_long = df_repo_run.melt(
    id_vars=['repo', 'run'],
    var_name='id',
    value_name='eval_score',
)

In [None]:
generate_histogram_plot(df_repo_run_long, df_ground_truth=ground_truth)