## Advanced usage

##### 🌠 Import Evalverse

In [1]:
import evalverse as ev

evaluator = ev.Evaluator()
print("evalverse version:", ev.__version__)

evalverse version: 0.0.1


##### 🌠 Check the number of GPUs

In [2]:
import torch

num_gpus = torch.cuda.device_count()
print("num_gpus:", num_gpus)

num_gpus: 8


##### 🌠 Common Arguments

In [3]:
# Common args
model = "upstage/SOLAR-10.7B-Instruct-v1.0"
output_path = "./results"

### 🌌 1. How to run H6

In [4]:
# Arguments for H6
benchmark = "h6_en"
data_parallel = num_gpus # Multi-GPU

# Run H6 evaluation
evaluator.run(
    model=model,
    benchmark=benchmark,
    output_path=output_path,
    )

[2024-04-02 16:14:18][INFO][evalverse - evaluator.py:104] >> The value of argument "output_path" has been changed to "./results".
[2024-04-02 16:14:18][INFO][evalverse - evaluator.py:115] >> The value of argument "h6_en" has been changed to "True".
[2024-04-02 16:14:18][INFO][evalverse - evaluator.py:133] >> Args {'ckpt_path': 'upstage/SOLAR-10.7B-Instruct-v1.0', 'output_path': './results', 'model_name': 'SOLAR-10.7B-Instruct-v1.0', 'use_fast_tokenizer': False, 'devices': '0', 'use_flash_attention_2': False, 'h6_en': True, 'batch_size': 16, 'use_vllm': False, 'gpu_memory_utilization': 0.8, 'model_parallel': 1, 'data_parallel': 1, 'load_in_8bit': False, 'load_in_4bit': False, 'mt_bench': False, 'baselines': None, 'judge_model': 'gpt-4', 'num_gpus_total': 1, 'num_gpus_per_model': 1, 'parallel_api': 1, 'ifeval': False, 'gpu_per_inst_eval': 1, 'eq_bench': False, 'eq_bench_prompt_type': 'ChatML', 'eq_bench_lora_path': None, 'eq_bench_quantization': None}
[2024-04-02 16:14:18][INFO][evalvers

The result already exists: /data/private/new_lib/evalverse/examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/arc_challenge_25.json
The result already exists: /data/private/new_lib/evalverse/examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/hellaswag_10.json
The result already exists: /data/private/new_lib/evalverse/examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/mmlu_5.json
The result already exists: /data/private/new_lib/evalverse/examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/truthfulqa_mc2_0.json
The result already exists: /data/private/new_lib/evalverse/examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/winogrande_5.json
The result already exists: /data/private/new_lib/evalverse/examples/results/SOLAR-10.7B-Instruct-v1.0/h6_en/gsm8k_5.json
ARC-Challenge (25-shot) {
    "acc,none": 0.6885665529010239,
    "acc_stderr,none": 0.01353247209985083,
    "acc_norm,none": 0.7133105802047781,
    "acc_norm_stderr,none": 0.013214986329274855,
    "alias": "arc_challenge"
}
Hellaswag (10-shot)

### 🌌 2. How to run MT-Bench

##### 🌠 Add OpenAI API Key
- `OPENAI_API_KEY` is required to run MT-Bench.

- How to add `OPENAI_API_KEY`
  - Method 1. `export OPENAI_API_KEY=<Your OpenAI API Key>` in your shell.
  - Method 2. Setup with .env file. (Copy from `.env_sample` and rename to `.env`)

In [5]:
# Check you OpenAI API Key (Warning: Do not commit!)
import os
from dotenv import load_dotenv

load_dotenv()
os.getenv("OPENAI_API_KEY")

'sk-...'

##### 🌠 Run MT-Bench

In [6]:
# Arguments for MT-Bench
benchmark = "mt_bench"
num_gpus_total = num_gpus # Multi-GPU
parallel_api = 4          # Concurrent API calls

# Run MT-Bench evaluation
evaluator.run(
    model=model,
    benchmark=benchmark,
    output_path=output_path,
    num_gpus_total=num_gpus_total,
    parallel_api=parallel_api,
    )

[2024-04-02 16:14:18][INFO][evalverse - evaluator.py:104] >> The value of argument "output_path" has been changed to "./results".
[2024-04-02 16:14:18][INFO][evalverse - evaluator.py:104] >> The value of argument "num_gpus_total" has been changed to "8".
[2024-04-02 16:14:18][INFO][evalverse - evaluator.py:104] >> The value of argument "parallel_api" has been changed to "4".
[2024-04-02 16:14:18][INFO][evalverse - evaluator.py:115] >> The value of argument "mt_bench" has been changed to "True".
[2024-04-02 16:14:18][INFO][evalverse - evaluator.py:133] >> Args {'ckpt_path': 'upstage/SOLAR-10.7B-Instruct-v1.0', 'output_path': './results', 'model_name': 'SOLAR-10.7B-Instruct-v1.0', 'use_fast_tokenizer': False, 'devices': '0', 'use_flash_attention_2': False, 'h6_en': False, 'batch_size': 16, 'use_vllm': False, 'gpu_memory_utilization': 0.8, 'model_parallel': 1, 'data_parallel': 1, 'load_in_8bit': False, 'load_in_4bit': False, 'mt_bench': True, 'baselines': None, 'judge_model': 'gpt-4', 'nu

The result already exists: /data/private/new_lib/evalverse/examples/results/SOLAR-10.7B-Instruct-v1.0/mt_bench/scores.txt
Mode: single
Input file: /data/private/new_lib/evalverse/results/SOLAR-10.7B-Instruct-v1.0/mt_bench/model_judgment/gpt-4_single.jsonl

########## First turn ##########
                                  score
model                     turn         
SOLAR-10.7B-Instruct-v1.0 1     7.66875

########## Second turn ##########
                                  score
model                     turn         
SOLAR-10.7B-Instruct-v1.0 2     7.21519

########## Average ##########
                              score
model                              
SOLAR-10.7B-Instruct-v1.0  7.443396



### 🌌 3. How to run IFEval

In [7]:
# Arguments for IFEval
benchmark = "ifeval"

# Run IFEval evaluation
evaluator.run(
    model=model,
    benchmark=benchmark,
    output_path=output_path,
    )

[2024-04-02 16:14:18][INFO][evalverse - evaluator.py:104] >> The value of argument "output_path" has been changed to "./results".
[2024-04-02 16:14:18][INFO][evalverse - evaluator.py:115] >> The value of argument "ifeval" has been changed to "True".
[2024-04-02 16:14:18][INFO][evalverse - evaluator.py:133] >> Args {'ckpt_path': 'upstage/SOLAR-10.7B-Instruct-v1.0', 'output_path': './results', 'model_name': 'SOLAR-10.7B-Instruct-v1.0', 'use_fast_tokenizer': False, 'devices': '0', 'use_flash_attention_2': False, 'h6_en': False, 'batch_size': 16, 'use_vllm': False, 'gpu_memory_utilization': 0.8, 'model_parallel': 1, 'data_parallel': 1, 'load_in_8bit': False, 'load_in_4bit': False, 'mt_bench': False, 'baselines': None, 'judge_model': 'gpt-4', 'num_gpus_total': 1, 'num_gpus_per_model': 1, 'parallel_api': 1, 'ifeval': True, 'gpu_per_inst_eval': 1, 'eq_bench': False, 'eq_bench_prompt_type': 'ChatML', 'eq_bench_lora_path': None, 'eq_bench_quantization': None}
[2024-04-02 16:14:18][INFO][evalver

The result already exists: /data/private/new_lib/evalverse/examples/results/SOLAR-10.7B-Instruct-v1.0/ifeval/scores.txt
/data/private/new_lib/results/SOLAR-10.7B-Instruct-v1.0/ifeval/eval_results_strict.jsonl Accuracy Scores:
prompt-level: 0.5157116451016636
instruction-level: 0.5796519410977242

change_case 0.5595238095238095
combination 0.16417910447761194
detectable_content 0.8727272727272727
detectable_format 0.6666666666666666
keywords 0.744
language 0.8
length_constraints 0.5631067961165048
punctuation 0.1791044776119403
startend 0.6060606060606061

change_case:capital_word_frequency 0.7
change_case:english_capital 0.4230769230769231
change_case:english_lowercase 0.5789473684210527
combination:repeat_prompt 0.047619047619047616
combination:two_responses 0.36
detectable_content:number_placeholders 0.8076923076923077
detectable_content:postscript 0.9310344827586207
detectable_format:constrained_response 0.9
detectable_format:json_format 0.7647058823529411
detectable_format:multiple

### 🌌 4. How to run EQ-Bench

In [8]:
# Arguments for EQ-Bench
benchmark = "eq_bench"

# Run EQ-Bench evaluation
evaluator.run(
    model=model,
    benchmark=benchmark,
    output_path=output_path,
    )

[2024-04-02 16:14:18][INFO][evalverse - evaluator.py:104] >> The value of argument "output_path" has been changed to "./results".
[2024-04-02 16:14:18][INFO][evalverse - evaluator.py:115] >> The value of argument "eq_bench" has been changed to "True".
[2024-04-02 16:14:18][INFO][evalverse - evaluator.py:133] >> Args {'ckpt_path': 'upstage/SOLAR-10.7B-Instruct-v1.0', 'output_path': './results', 'model_name': 'SOLAR-10.7B-Instruct-v1.0', 'use_fast_tokenizer': False, 'devices': '0', 'use_flash_attention_2': False, 'h6_en': False, 'batch_size': 16, 'use_vllm': False, 'gpu_memory_utilization': 0.8, 'model_parallel': 1, 'data_parallel': 1, 'load_in_8bit': False, 'load_in_4bit': False, 'mt_bench': False, 'baselines': None, 'judge_model': 'gpt-4', 'num_gpus_total': 1, 'num_gpus_per_model': 1, 'parallel_api': 1, 'ifeval': False, 'gpu_per_inst_eval': 1, 'eq_bench': True, 'eq_bench_prompt_type': 'ChatML', 'eq_bench_lora_path': None, 'eq_bench_quantization': None}
[2024-04-02 16:14:18][INFO][evalv

The result already exists: /data/private/new_lib/evalverse/examples/results/SOLAR-10.7B-Instruct-v1.0/eq_bench/raw_results.json
{
    "first_pass_score": 70.06006445118443,
    "first_pass_parseable": 169,
    "revised_score": 0,
    "revised_parseable": 0,
    "final_score": 70.06006445118443,
    "final_parseable": 169
}


### 🌌 5. How to run all benchmarks

In [None]:
# Arguments for EQ-Bench
benchmark = "all" # is same with ["h6_en", "mt_bench", "ifeval", "eq_bench"]

# Run EQ-Bench evaluation
evaluator.run(
    model=model,
    benchmark=benchmark,
    output_path=output_path,
    )