In [1]:
import pandas as pd
from config import get_system_prompt, PERSONA_PROFILE, TEST_SCENARIOS
from evaluator import ModelEvaluator

In [2]:
# Quantized (Q4_K_M) GGUF checkpoints to benchmark; each entry pairs a
# display name with the path of the local model file.
MODELS = [
    dict(name="Gemma-2-2B-It", path="./models/Gemma-2-2b-it-Q4_K_M.gguf"),
    dict(name="Llama-3.2-3B-Instruct", path="./models/Llama-3.2-3B-Instruct-Q4_K_M.gguf"),
    dict(name="Qwen2.5-3B-Instruct", path="./models/Qwen2.5-3B-Instruct-Q4_K_M.gguf"),
    dict(name="EXAONE-3.5-2.4B-Instruct", path="./models/EXAONE-3.5-2.4B-Instruct-Q4_K_M.gguf"),
]

In [3]:
# def main():
#     results = []
#     system_prompt = get_system_prompt(PERSONA_PROFILE)

#     for model in MODELS:
#         evaluator = ModelEvaluator(model['path'], model['name'])
#         evaluator.load_model()
        
#         if not evaluator.llm:
#             continue

#         # 시나리오별 테스트
#         print(f"▶ {model['name']} 테스트")
#         for scenario in TEST_SCENARIOS:
#             result = evaluator.generate_and_measure(system_prompt, scenario)
#             if result:
#                 results.append(result)
#                 print(f"   [Query] {scenario[:30]}...")
#                 print(f"   [Resp]  {result['response'][:30]}...")
#                 print(f"   [Perf]  TTFT: {result['ttft']}s | TPS: {result['tps']}")
        
#         # 모델 언로드 (다음 모델을 위해 메모리 비우기)
#         evaluator.unload_model()

#     # 결과 저장 및 출력
#     df = pd.DataFrame(results)
#     df = df[['model', 'ttft', 'tps', 'input', 'response']]
    
#     print("\n[테스트 결과]")
#     print(df.groupby('model')[['ttft', 'tps']].mean()) # 모델별 평균 성능
    
#     # CSV 저장
#     # df.to_csv("sllm_persona_test_result.csv", index=False, encoding='utf-8-sig')
#     # print("\nResults saved to 'sllm_persona_test_result.csv'")

In [4]:
# How many times each scenario is generated per model.
NUM_ITERATIONS: int = 5
# When true, the first run of every scenario is excluded from the statistics.
SKIP_COLD: bool = True

def main():
    """Benchmark each model in MODELS against every scenario and print a summary.

    For every entry in MODELS a ModelEvaluator is created and loaded, each
    scenario in TEST_SCENARIOS is generated NUM_ITERATIONS times, and the
    measurements returned by ``generate_and_measure`` are collected. When
    SKIP_COLD is true, the first iteration of each scenario is still executed
    but left out of the statistics. Finally the per-model mean and standard
    deviation of the ``ttft`` and ``tps`` fields are printed.

    Returns:
        None. Results are reported via ``print`` only.
    """
    results = []
    system_prompt = get_system_prompt(PERSONA_PROFILE)

    for model in MODELS:
        evaluator = ModelEvaluator(model['path'], model['name'])
        try:
            evaluator.load_model()

            # Skip models that did not load instead of benchmarking a missing
            # model. Mirrors the guard used by the earlier version of main();
            # assumes load_model() leaves ``llm`` falsy on failure.
            if not evaluator.llm:
                continue

            print(f"▶ {model['name']} 테스트")

            # Repeat every scenario NUM_ITERATIONS times.
            for scenario_idx, scenario in enumerate(TEST_SCENARIOS):
                for i in range(NUM_ITERATIONS):
                    result = evaluator.generate_and_measure(system_prompt, scenario)

                    if not result:
                        continue

                    # The first run of a scenario is treated as a cold start
                    # and kept out of the statistics when SKIP_COLD is set.
                    if SKIP_COLD and i == 0:
                        continue

                    # Tag the measurement with 1-based iteration/scenario numbers.
                    result['iteration'] = i + 1
                    result['scenario_id'] = scenario_idx + 1
                    results.append(result)

                    print(f"      [Iter {i+1}] TTFT: {result['ttft']}s | TPS: {result['tps']}")
        finally:
            # Always release the model, even on an error or KeyboardInterrupt,
            # so it does not stay resident in the kernel for the next run.
            evaluator.unload_model()

    print("\n[테스트 결과]")

    # With no measurements the frame has no 'model' column and groupby would
    # raise KeyError, so report the empty run explicitly instead.
    if not results:
        print("측정된 결과가 없습니다.")
        return

    df = pd.DataFrame(results)

    # Per-model mean and standard deviation across all scenarios/iterations.
    summary = df.groupby('model')[['ttft', 'tps']].agg(['mean', 'std'])
    print(summary)

In [5]:
# Entry-point guard: run the benchmark only when this code executes as the
# top-level module (which is the case for a notebook cell or a direct script run).
if __name__ == "__main__":
    main()

llama_model_loader: loaded meta data with 39 key-value pairs and 288 tensors from ./models/Gemma-2-2b-it-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = gemma2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Gemma 2 2b It
llama_model_loader: - kv   3:                           general.finetune str              = it
llama_model_loader: - kv   4:                           general.basename str              = gemma-2
llama_model_loader: - kv   5:                         general.size_label str              = 2B
llama_model_loader: - kv   6:                            general.license str              = gemma
llama_model_loader: - kv   7:                               general

모델 로드 중...: Gemma-2-2B-It


llama_model_loader: - kv  25:                  tokenizer.ggml.token_type arr[i32,256000]  = [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
llama_model_loader: - kv  26:                tokenizer.ggml.bos_token_id u32              = 2
llama_model_loader: - kv  27:                tokenizer.ggml.eos_token_id u32              = 1
llama_model_loader: - kv  28:            tokenizer.ggml.unknown_token_id u32              = 3
llama_model_loader: - kv  29:            tokenizer.ggml.padding_token_id u32              = 0
llama_model_loader: - kv  30:               tokenizer.ggml.add_bos_token bool             = true
llama_model_loader: - kv  31:               tokenizer.ggml.add_eos_token bool             = false
llama_model_loader: - kv  32:                    tokenizer.chat_template str              = {{ bos_token }}{% if messages[0]['rol...
llama_model_loader: - kv  33:            tokenizer.ggml.add_space_prefix bool             = false
llama_model_loader: - kv  34:               general.quantization_

❌ 모델 로드 실패: 'Llama' object has no attribute 'n_gpu_layers'
▶ Gemma-2-2B-It 테스트


llama_perf_context_print:        load time =    1682.40 ms
llama_perf_context_print: prompt eval time =    1682.18 ms /   222 tokens (    7.58 ms per token,   131.97 tokens per second)
llama_perf_context_print:        eval time =    5374.02 ms /    98 runs   (   54.84 ms per token,    18.24 tokens per second)
llama_perf_context_print:       total time =    7285.89 ms /   320 tokens
llama_perf_context_print:    graphs reused =         94
Llama.generate: 221 prefix-match hit, remaining 1 prompt tokens to eval


KeyboardInterrupt: 