#### NOTE: the results are based on the code base at commit [abb9a21](https://github.com/UBC-MDS/test-creation/pull/136/commits/abb9a21828cb257ff8f2629261dc1ad64ad7dcb0), which is similar to commit [69d61a9](https://github.com/UBC-MDS/test-creation/tree/69d61a9f5ac62baefca23ee293a6cd09fe41eeb2) in the main branch

In [None]:
import scipy
import pickle
import yaml
import pandas as pd
import altair as alt
from collections import Counter

In [None]:
def get_report(response):
    """Flatten an evaluation response into a table of checklist-item results.

    Parameters
    ----------
    response : object
        Must expose ``call_results``, an iterable whose elements provide
        ``parsed_response`` (a mapping with a ``'results'`` list of dicts, or
        a falsy value), ``files_evaluated`` (a sequence; only its first
        element is used) and ``success``.

    Returns
    -------
    pandas.DataFrame
        One row per item found in ``parsed_response['results']``, extended
        with the ``file`` and ``success`` of the call that produced it. A call
        without a parsed response contributes a single placeholder row with
        ``Score`` 0.

    Notes
    -----
    The rows are built from copies, so the dictionaries stored inside
    ``response`` are left untouched.
    """
    rows = []
    for result in response.call_results:
        if result.parsed_response:
            # Merge into a new dict instead of writing 'file'/'success' into
            # the response's own items, so the input is never mutated.
            rows.extend(
                {
                    **item,
                    'file': result.files_evaluated[0],
                    'success': result.success,
                }
                for item in result.parsed_response['results']
            )
        else:
            # No parsed output for this call: keep the file in the report
            # through a zero-score placeholder row.
            rows.append({
                'ID': '2.1',  # FIXME: placeholder ID used for unparsed calls
                'Title': '',
                'Requirement': '',
                'Observation': '',
                'Functions': [],
                'Evaluation': '',
                'Score': 0,
                'file': result.files_evaluated[0],
                'success': result.success,
            })
    return pd.DataFrame(rows)

def extract_file_and_scores(resp_path, verbose=False):
    """Load one pickled response and return its scores, one row per file.

    Parameters
    ----------
    resp_path : str
        Path of the pickle file holding the response object.
    verbose : bool, default False
        When true, print ``resp_path`` before loading it.

    Returns
    -------
    pandas.DataFrame
        A ``file`` column, one column per checklist ``ID`` holding its
        ``Score``, a ``success`` column that is true only when every report
        row of that file succeeded, and a ``response_path`` column set to
        ``resp_path``.
    """
    if verbose:
        print(resp_path)

    # NOTE: unpickling executes arbitrary code; only load trusted files.
    with open(resp_path, 'rb') as handle:
        loaded = pickle.load(handle)
    report = get_report(loaded)

    # Wide layout: files as rows, checklist IDs as columns.
    scores = report.pivot(index='file', columns='ID', values='Score')
    scores = scores.rename_axis(None, axis=1)

    # Both sides are indexed by file, so the assignment aligns on it.
    scores['success'] = report.groupby(['file'])['success'].all()
    scores['response_path'] = resp_path
    return scores.reset_index()

def generate_stat_plot(df_repo__stat, repo, ground_truth=None):
    """Chart the mean score of each checklist item, with 1-SD error bars.

    Parameters
    ----------
    df_repo__stat : pandas.DataFrame
        Statistics table; the chart reads its ``repo``, ``id``, ``mean`` and
        ``std`` columns.
    repo : str
        Repository whose rows are plotted.
    ground_truth : pandas.DataFrame, optional
        Table with ``id`` and ``score`` columns. When given, it is overlaid
        as green diamonds.

    Returns
    -------
    A layered altair chart, 400 wide.
    """
    repo_stats = df_repo__stat.query(f'repo == "{repo}"')

    # Shared base: error-bar bounds are mean -/+ std, clamped into [0, 1].
    base = alt.Chart(repo_stats).transform_calculate(
        min="max(0, datum.mean-datum.std)",
        max="min(1, datum.mean+datum.std)"
    )

    # Black dots for the mean score of each checklist item.
    x_ids = alt.X('id:O').axis(labelAngle=0).title('Checklist Id')
    y_mean = alt.Y('mean:Q').scale(domainMin=0, domainMax=1).title('Score')
    mean_layer = base.mark_point(filled=True, size=50, color='black').encode(
        x=x_ids,
        y=y_mean,
    )

    # Error bars spanning the clamped [min, max] interval.
    sd_layer = base.mark_errorbar().encode(
        x="id:O",
        y=alt.Y("min:Q").title('1 SD'),
        y2="max:Q"
    )

    plot = mean_layer + sd_layer

    if ground_truth is not None:
        # Green diamonds for the ground-truth score of each item.
        truth_layer = alt.Chart(ground_truth).mark_point(
            filled=True, size=100, color='green', shape="diamond"
        ).encode(
            x=alt.X('id:O'),
            y=alt.Y('score:Q')
        )
        plot += truth_layer

    return plot.properties(width=400)

### preprocess data

In [None]:
# Checklist items analysed throughout this notebook.
checklist_ids = ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2']

# Batch-run records, loaded into a DataFrame; its `response_path` column
# points at the pickled responses.
with open('../data/processed/batch_run/record_combine.yml', 'r') as file:
    config = pd.DataFrame(yaml.safe_load(file))

# prepare score data by repo, run, file
tmp = pd.concat(
    [extract_file_and_scores(path) for path in config['response_path']],
    axis=0,
).reset_index(drop=True)

# Attach the per-file scores to the record they came from.
raw_df_repo_run_file = config.merge(tmp, on='response_path', how='left')

In [None]:
# filter non-test files in qlib
# The path is matched literally (`regex=False`): as a regular expression the
# dots in `../` would act as wildcards and match any two characters.
is_not_qlib = raw_df_repo_run_file['repo'] != 'qlib'
is_qlib_test = raw_df_repo_run_file['file'].str.contains(
    '../data/raw/openja/qlib/tests/', regex=False
)
df_repo_run_file = raw_df_repo_run_file[is_not_qlib | is_qlib_test]

# prepare score data by repo, run
# A run's score for an item is the best score obtained over its files.
df_repo_run = df_repo_run_file.groupby(['repo', 'run']).agg({
    checklist_id: ['max'] for checklist_id in checklist_ids
})
# Drop the aggregation level from the column names, keeping the checklist id.
df_repo_run.columns = [col[0] for col in df_repo_run.columns]
df_repo_run = df_repo_run.reset_index()

# prepare statistics of scores by repo
df_repo__stat = df_repo_run.groupby(['repo']).agg({
    checklist_id: ['mean', 'std', 'count'] for checklist_id in checklist_ids
})
# Reshape to long form, then back to one row per (repo, id) with one column
# per statistic.
df_repo__stat = pd.melt(df_repo__stat.reset_index(), id_vars=[('repo', '')])
df_repo__stat.columns = ['repo', 'id', 'stat', 'value']
df_repo__stat = (
    df_repo__stat.pivot(index=['repo', 'id'], columns='stat', values='value')
    .reset_index()
    .rename_axis(None, axis=1)
)

# prepare counting of scores by repo
# Start from the first checklist item and outer-merge the remaining ones.
# `level_1` is the column `reset_index` creates for the counted score values.
df_repo__count = (
    df_repo_run.groupby(['repo'])[checklist_ids[0]].apply(Counter).reset_index()
)
for checklist_id in checklist_ids[1:]:
    df_repo__count = df_repo__count.merge(
        df_repo_run.groupby(['repo'])[checklist_id].apply(Counter).reset_index(),
        on=['repo', 'level_1'],
        how='outer'
    )

# A score value never observed for an item counts as zero occurrences.
df_repo__count = df_repo__count.fillna(0)

### Runs Quality

#### 1. Some non-test files are included in the evaluation

For example, the `./nanodet/nanodet/trainer/task.py`

In [None]:
# First three distinct files evaluated for `nanodet`, taken from the unfiltered data.
raw_df_repo_run_file.query('repo == "nanodet"')['file'].unique()[:3]

#### 2. Evaluation of the file `magenta/magenta/models/music_vae/data_test.py` always fails

In [None]:
# Distinct files with at least one row whose `success` flag is false.
df_repo_run_file[~df_repo_run_file.success]['file'].unique()

#### 3. `DeepSpeech`, `lightfm` and `magenta` have the fewest (Python) test files

In [None]:
# Number of evaluated files per repository, counted on run 1 only.
df_repo_run_file.query('run == 1').groupby(['repo'])['file'].count().reset_index()

#### 4. The test files are not always in a `tests/` folder. Would it be good practice to always do that? Should one of the checklist items ensure that all tests are placed under a `tests/` folder?

For example, `magenta`

In [None]:
# Distinct files evaluated for the `magenta` repository.
df_repo_run_file.query('repo == "magenta"')['file'].unique()

### Findings on 8 repos

In [None]:
# Repositories present in the filtered evaluation data.
df_repo_run_file.repo.unique()

#### 1. Overview of the accuracy and consistency of the `lightfm` evaluation

Let the ground truth for `lightfm` be as [follows](https://github.com/UBC-MDS/test-creation/blob/69d61a9f5ac62baefca23ee293a6cd09fe41eeb2/report/repo_human_evaluation/human_evaluation_report-lightfm.md):

In [None]:
# Ground truth: one row per checklist item with its expected score
lightfm_gt = pd.DataFrame({
    'id': ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2'],
    'score': [1, 1, 0, 1, 0.5, 1, 1],
})
lightfm_gt

In [None]:
# Mean score with 1-SD error bars per checklist item for `lightfm`, with the ground truth overlaid.
generate_stat_plot(df_repo__stat, "lightfm", lightfm_gt)

The distribution of the scores for each checklist item:

In [None]:
# For `lightfm`: how many runs produced each score value, per checklist item.
df_repo__count.query('repo == "lightfm"')

**Observations**:
The system evaluation roughly aligns with our evaluation, that is,
 - for the items that we believe are "Satisfied" (Score = 1), the system mostly outputs 0.5 or 1
 - for the items that we believe are "Partially Satisfied" or "Not Satisfied", the system mostly outputs 0.5 or 0

#### 2. Overview of `qlib`
Let the ground truth for `qlib` be as follows (FIXME: to be confirmed):

In [None]:
# Ground truth: one row per checklist item with its expected score
qlib_gt = pd.DataFrame({
    'id': ['2.1', '3.2', '3.5', '4.2', '5.3', '6.1', '6.2'],
    'score': [0.5, 1, 0, 0, 1, 1, 1],
})
qlib_gt

In [None]:
# Mean score with 1-SD error bars per checklist item for `qlib`, with the ground truth overlaid.
generate_stat_plot(df_repo__stat, "qlib", qlib_gt)

In [None]:
# For `qlib`: how many runs produced each score value, per checklist item.
df_repo__count.query('repo == "qlib"')

**Observations**: There is more disagreement between the system and the manual evaluation.

#### 3. The consistency of each checklist item
 - Why is it important?
   If the score of a particular item varies a lot when evaluating a repository, it might mean that its prompt (`Requirement`) is confusing to the LLM, or the checklist item itself is not well defined.

In [None]:
# Box plot of the standard deviations in `df_repo__stat`, one box per checklist item.
(
    alt.Chart(df_repo__stat)
    .mark_boxplot()
    .encode(x="std:Q", y='id:N')
    .properties(height=200, width=400)
)

**Observations**:
 - The evaluation of the checklist item 2.1 `Ensure Data File Loads as Expected` is usually stable. For 50% of the repositories its standard deviation is smaller than 0.05, the smallest among all the checklist items.

#### 4. The consistency of each checklist item, compared to `lightfm`
 - Why is it important? We optimized the consistency of our system using `lightfm`. Therefore, we treat this repository as a benchmark. If a particular checklist item has a much worse consistency in other repositories, that might mean that the prompt for that item is not generalizable.

The table below shows the standard deviations over 30 runs for each checklist item and each repository:

In [None]:
# Wide table of standard deviations: one row per repo, one column per checklist id.
# NOTE(review): no `values=` is passed to `pivot`, so the columns presumably form a
# two-level index (`std`, id); the melt in the following cell selects an `id` column
# that depends on this shape — confirm before simplifying.
stds = df_repo__stat[['repo', 'std', 'id']].pivot(index='repo', columns='id')
stds

In [None]:
# Ratio of each repository's standard deviation to that of the benchmark
# repository (`lightfm`), per checklist item.
F = stds.drop(index='lightfm') / stds.loc['lightfm']

# The F distribution describes a ratio of sample *variances*. `F` holds ratios
# of standard deviations, so the upper 2.5% critical value (29 degrees of
# freedom on each side, i.e. 30 runs per repository) is square-rooted to put
# the threshold on the same scale as the plotted values.
std_ratio_threshold = scipy.stats.f.ppf(0.975, 29, 29) ** 0.5

base = alt.Chart(
    F.melt(ignore_index=False).reset_index()[['repo', 'id', 'value']]
).transform_calculate(
    benchmark="1",
    threshold=f"{std_ratio_threshold}"
)

# One point per (repo, checklist item), coloured by repository.
point = base.mark_point(
    filled=True,
    size=100,
).encode(
    x=alt.X('value:Q').title("std ratio (c.f. lightfm)"),
    y='id:N',
    color='repo',
    tooltip='repo'
).properties(
    height=200,
    width=400
)

# Black rule: ratio of 1 (same spread as lightfm); red rule: critical threshold.
point \
+ base.mark_rule(color='black').encode(x="benchmark:Q") \
+ base.mark_rule(color='red').encode(x="threshold:Q")

**Observations**:
 - The evaluation of the checklist item 3.2 `Data in the Expected Format` becomes much more unstable in most of the other repositories.
 - The evaluation of item 2.1 is significantly more unstable in the repos `paperless-ng`, `magenta` and `DeepSpeech`, but this may be due to the repos themselves.

TODO: to look into the 3.2's scores.

#### TODO: Given ground truth == 1, distribution of system score?
#### TODO: Given ground truth == 0, distribution of system score?