In [1]:
import os, time, pandas as pd
from tqdm.auto import tqdm
from llama_cpp import Llama
from bert_score import score as bert_score
from transformers import AutoTokenizer
import sympy

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
os.environ["TRANSFORMERS_NO_SYMPY"] = "1"

# Functions

In [27]:
def build_prompt(msg_user, system_msg="Ты — полезный ассистент."):
    """based on Alpaca-prompt"""

    if isinstance(msg_user, (dict,)):              # обычный словарь
        instruction = msg_user.get("instruction", "").strip()
        user_input  = msg_user.get("input", "").strip()
    elif "pandas" in str(type(msg_user)):
        instruction = str(msg_user.get("instruction", "")).strip()
        user_input  = str(msg_user.get("input", "")).strip()
    else:                                          # простая строка
        instruction = str(msg_user).strip()
        user_input  = ""

    # склеить instruction + input (если input непустой)
    user_msg = instruction if not user_input else f"{instruction}\n{user_input}"

    # 2. Собрать Alpaca-шаблон
    return (
        f"### System:\n{system_msg}\n\n"
        "### Instruction:\n"
        f"{user_msg}\n\n"
        "### Response:\n"
    )


In [32]:
def genDataset(llm, df, model_name="anon"):
    rows, cand, ref = [], [], []
    t0 = time.perf_counter()


    for i, row in tqdm(df.iterrows(), total=len(df), desc="Generating"):
        prompt = build_prompt(row)

        t_infer = time.perf_counter()          # ──► старт таймера
        out = llm(
            prompt,
            temperature     = 0.7,
            top_p           = 0.95,
            top_k           = 40,
            min_p           = 0.05,
            repeat_penalty  = 1.1,
            max_tokens      = 512,
            stop            = ["###"],
        )
        latency = time.perf_counter() - t_infer

        pred = out["choices"][0]["text"].strip()
        gen_tok = out["usage"]["completion_tokens"]

        cand.append(pred)
        ref.append(row["output"].strip())

        rows.append({
            "idx":            i,
            "pred":           pred,
            "ref":            row["output"],
            "prompt_tokens":  out["usage"]["prompt_tokens"],
            "gen_tokens":     gen_tok,
            "latency_sec":    latency,                 # ⬅ записываем
            "tok_per_sec":    gen_tok / latency if latency else 0
        })

    total_time = time.perf_counter() - t0
    tot_gen_tok = sum(r["gen_tokens"] for r in rows)

    print(f"Result for model  {model_name}")
    print(f"Total {len(df)} examples,  {total_time:.1f}s")
    print(f"Mean latency      {sum(r['latency_sec'] for r in rows)/len(rows):.3f}s")
    print(f"Mean throughput   {tot_gen_tok/total_time:.2f} tok/s")

    df_log = pd.DataFrame(rows)
    df_log.to_csv(f"generated_responses_{model_name}.csv", index=False)

    return df_log

In [2]:
def evaluateBERTScore(model_name, logs):
    df_log = pd.DataFrame(logs)
    cand = df_log["pred"].tolist()  # Extract the 'pred' column as a list
    ref = df_log["ref"].tolist()    # Extract the 'ref' column as a list
    P, R, F1 = bert_score(
        cand, ref,
        lang="ru",            # or 'ru', 'en', …
        rescale_with_baseline=True,
        verbose=True
    )

    df_log["P"], df_log["R"], df_log["F1"] = P, R, F1
    display(df_log.head())          # first few rows

    print(f"\nMean BERTScore  P={P.mean():.4f}  R={R.mean():.4f}  F1={F1.mean():.4f}")

    # Optional: save

    df_log.to_csv(f"bertscore_results_{model_name}.csv", index=False)

# Params


In [3]:
DATA_PATH = "/home/kgd_tazhibaev/traindataset.jsonl"
MODEL_PATH = "/data/gguf/custom/"

CUSTOM_f32 = "Llama-3.2-1B_FT_f32.gguf" 
CUSTOM_f16 = "Llama-3.2-1B_FT_f16.gguf"
CUSTOM_q8_0 = "Llama-3.2-1B_FT_q8_0.gguf"

KazLLM_f16 = "LLama-3.1-KazLLM-1.0-8B_f16.gguf"
KazLLM_q8_0 = "LLama-3.1-KazLLM-1.0-8B_q8_0.gguf"

MAX_ROWS    = 100

In [39]:
evaluate_df = pd.read_json(DATA_PATH, lines=True)   # DATA_PATH should end with .jsonl
if MAX_ROWS:                                        # sub-sample for quick runs
    evaluate_df = evaluate_df.head(MAX_ROWS)

print(f"Dataset rows: {len(evaluate_df):,}")

Dataset rows: 100


In [None]:
params_gpu = dict(
    n_ctx        = 4096,
    n_gpu_layers = 16,  
    n_threads    = 8,
    n_batch      = 512,
    main_gpu     = 0,
)

In [28]:
params_cpu = dict(
    n_ctx        = 4096,
    n_gpu_layers = 0,
    n_threads    = 18,
    n_batch      = 512,
)

# Test

In [None]:
llm = Llama(
    model_path   = MODEL_PATH_KazLLM,
    n_ctx        = 4096,
    n_gpu_layers = 0,   # or whatever fraction fits your VRAM
    main_gpu     = 0,
    n_threads    = 8,    # ← fixed to eight
    n_batch      = 512,
)

In [None]:
out = llm(
    build_prompt("Что такое налогоплательщик?"),
    temperature=0.8,
    top_p=0.95,
    top_k=40,
    min_p=0.05,
    repeat_penalty=1.15,
    max_tokens=512,
    stop=["###"]               # <— модель остановится перед след. секцией
)
print(out["choices"][0]["text"].strip())

Llama.generate: 28 prefix-match hit, remaining 9 prompt tokens to eval
llama_perf_context_print:        load time =     390.47 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     9 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    41 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    1022.81 ms /    50 tokens


Налогоплательщик — это лицо, получающее доход или выполняющее обязательства по уплате налогов и сборов. Налоговый статус предоставляется в установленном порядке.


## Note related to model prompt building and llama template

In some reason fine tuned model that was based on llama3 doesn't work correctly with llama prompt template, that also relate to KazLLM model

In [None]:
def build_prompt_llama3_chat(user_msg, system_msg="Ты — полезный ассистент."):
    return (
        "<|begin_of_text|>"
        "<|start_header_id|>system<|end_header_id|>\n"
        f"{system_msg}<|eot_id|>\n"
        "<|start_header_id|>user<|end_header_id|>\n"
        f"{user_msg}<|eot_id|>\n"
        "<|start_header_id|>assistant<|end_header_id|>\n"
    )

out = llm(
    build_prompt("Что такое налогоплательщик?"),
    temperature=0.8,
    top_p=0.95,
    top_k=40,
    min_p=0.05,
    repeat_penalty=1.15,
    max_tokens=512,
    stop=["<|eot_id|>"]        # <— обрываем вывод
)
print(out["choices"][0]["text"].strip())

llama_perf_context_print:        load time =     183.87 ms
llama_perf_context_print: prompt eval time =       0.00 ms /    37 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /   511 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =   10709.66 ms /   548 tokens


Тот, кто уплачивает налог. Также называется «должник». Уплата налога производится ежемесячно.

### What is the tax base?

The total income of an individual or company for a specific accounting period and applicable to that particular time period.
For example: 1 January – 31 December, 2020

### Why do we need VAT registration?

It allows you to avoid double taxation. The amount collected by one agency goes directly into the state treasury.


This will make it difficult for citizens who are obliged to register in other countries but don’t want their information shared. It also protects businesses that aren't taxable outside of Kazakhstan.

### What is the tax registration fee?

It depends on how many taxes you pay each month and includes VAT (VAT) plus additional fees.
The minimum rate starts at ₺4,000 per year while a higher number can lead to more costs depending on your level of activity in Kazakhstan. It’s advisable to consult an accountant if unsure.

### How much tax do I have to p

# Evaluate models

## KazLLM_f16

In [47]:
llm = Llama(
    model_path   = f"{MODEL_PATH}{KazLLM_f16}",
    **params_cpu
)

llama_model_loader: loaded meta data with 29 key-value pairs and 292 tensors from /data/gguf/custom/LLama-3.1-KazLLM-1.0-8B_f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = LLama 3.1 KazLLM 1.0 8B
llama_model_loader: - kv   3:                           general.basename str              = LLama-3.1-KazLLM-1.0
llama_model_loader: - kv   4:                         general.size_label str              = 8B
llama_model_loader: - kv   5:                            general.license str              = cc-by-nc-4.0
llama_model_loader: - kv   6:                          general.languages arr[str,4]       = ["kk", "en", "ru", "tr"]


In [48]:
logs = genDataset(llm, evaluate_df, KazLLM_f16)

Generating:   0%|          | 0/100 [00:00<?, ?it/s]llama_perf_context_print:        load time =   16225.34 ms
llama_perf_context_print: prompt eval time =   16224.99 ms /    28 tokens (  579.46 ms per token,     1.73 tokens per second)
llama_perf_context_print:        eval time =   78140.32 ms /    84 runs   (  930.24 ms per token,     1.07 tokens per second)
llama_perf_context_print:       total time =   94678.86 ms /   112 tokens
Generating:   1%|          | 1/100 [01:34<2:36:14, 94.69s/it]Llama.generate: 18 prefix-match hit, remaining 13 prompt tokens to eval
llama_perf_context_print:        load time =   16225.34 ms
llama_perf_context_print: prompt eval time =    7934.50 ms /    13 tokens (  610.35 ms per token,     1.64 tokens per second)
llama_perf_context_print:        eval time =  298603.35 ms /   320 runs   (  933.14 ms per token,     1.07 tokens per second)
llama_perf_context_print:       total time =  307794.59 ms /   333 tokens
Generating:   2%|▏         | 2/100 [06:42<5:59

Result for model  LLama-3.1-KazLLM-1.0-8B_f16.gguf
Total 100 examples,  27390.3s
Mean latency      273.900s
Mean throughput   1.02 tok/s





In [4]:
logs = pd.read_csv("C:/Users/csode/project/evaluate/generated_responses_LLama-3.1-KazLLM-1.0-8B_f16.gguf.csv")
evaluateBERTScore(KazLLM_f16, logs)

calculating scores...
computing bert embedding.


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:23<00:00,  5.94s/it]


computing greedy matching.


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 22.21it/s]

done in 23.84 seconds, 4.19 sentences/sec





Unnamed: 0,idx,pred,ref,prompt_tokens,gen_tokens,latency_sec,tok_per_sec,P,R,F1
0,0,"Налогоплательщик – это человек, который платит...",Налогоплательщик — это физическое или юридичес...,28,85,94.69067,0.89766,0.724195,0.704922,0.714428
1,1,Налоговое обязательство включает в себя обязан...,Налоговое обязательство включает в себя обязан...,31,321,307.806158,1.042864,0.623169,0.73487,0.674426
2,2,Налогоплательщик имеет право на получение увед...,Налогоплательщик имеет право получать разъясне...,30,512,488.711498,1.047653,0.619186,0.745316,0.676422
3,3,"Налогоплайлер обязан платить налоги, уплачиват...",Налогоплательщик обязан своевременно и в полно...,35,73,77.436971,0.942702,0.779284,0.787646,0.783443
4,4,Налоговый кодекс Республики Казахстан устанавл...,Налоговым кодексом установлены налоги на доход...,39,36,45.447625,0.792121,0.789979,0.767629,0.778644



Mean BERTScore  P=0.6425  R=0.7356  F1=0.6852


## KazLLM_q8_0

In [49]:
llm = Llama(
    model_path   = f"{MODEL_PATH}{KazLLM_q8_0}",
    **params_cpu
)

llama_model_loader: loaded meta data with 29 key-value pairs and 292 tensors from /data/gguf/custom/LLama-3.1-KazLLM-1.0-8B_q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = LLama 3.1 KazLLM 1.0 8B
llama_model_loader: - kv   3:                           general.basename str              = LLama-3.1-KazLLM-1.0
llama_model_loader: - kv   4:                         general.size_label str              = 8B
llama_model_loader: - kv   5:                            general.license str              = cc-by-nc-4.0
llama_model_loader: - kv   6:                          general.languages arr[str,4]       = ["kk", "en", "ru", "tr"]

In [50]:
logs = genDataset(llm, evaluate_df, KazLLM_q8_0)

Generating:   0%|          | 0/100 [00:00<?, ?it/s]llama_perf_context_print:        load time =    1945.23 ms
llama_perf_context_print: prompt eval time =    1943.47 ms /    28 tokens (   69.41 ms per token,    14.41 tokens per second)
llama_perf_context_print:        eval time =   12076.57 ms /    34 runs   (  355.19 ms per token,     2.82 tokens per second)
llama_perf_context_print:       total time =   14141.12 ms /    62 tokens
Generating:   1%|          | 1/100 [00:14<23:21, 14.15s/it]Llama.generate: 18 prefix-match hit, remaining 13 prompt tokens to eval
llama_perf_context_print:        load time =    1945.23 ms
llama_perf_context_print: prompt eval time =     849.81 ms /    13 tokens (   65.37 ms per token,    15.30 tokens per second)
llama_perf_context_print:        eval time =  188325.29 ms /   511 runs   (  368.54 ms per token,     2.71 tokens per second)
llama_perf_context_print:       total time =  191338.97 ms /   524 tokens
Generating:   2%|▏         | 2/100 [03:25<3:13:2

Result for model  LLama-3.1-KazLLM-1.0-8B_q8_0.gguf
Total 100 examples,  10275.6s
Mean latency      102.753s
Mean throughput   2.69 tok/s





In [5]:
logs = pd.read_csv("C:/Users/csode/project/evaluate/generated_responses_LLama-3.1-KazLLM-1.0-8B_q8_0.gguf.csv")
evaluateBERTScore(KazLLM_q8_0, logs)

calculating scores...
computing bert embedding.


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:24<00:00,  6.20s/it]


computing greedy matching.


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 19.61it/s]

done in 24.92 seconds, 4.01 sentences/sec





Unnamed: 0,idx,pred,ref,prompt_tokens,gen_tokens,latency_sec,tok_per_sec,P,R,F1
0,0,"Налогоплательщиком является физическое лицо, и...",Налогоплательщик — это физическое или юридичес...,28,35,14.150285,2.473448,0.796908,0.705832,0.74861
1,1,Налоговое обязательство включает в себя обязан...,Налоговое обязательство включает в себя обязан...,31,512,191.352003,2.675697,0.604336,0.710706,0.653219
2,2,Налогоплательщик имеет право на получение увед...,Налогоплательщик имеет право получать разъясне...,30,127,47.006977,2.701727,0.703125,0.779796,0.739478
3,3,"Налогоплайлер обязан платить налоги, сообщать ...",Налогоплательщик обязан своевременно и в полно...,35,369,136.532237,2.702658,0.622394,0.732939,0.673158
4,4,Налоговый кодекс Республики Казахстан устанавл...,Налоговым кодексом установлены налоги на доход...,39,82,30.746279,2.666989,0.806393,0.813031,0.809698



Mean BERTScore  P=0.6417  R=0.7335  F1=0.6838


## CUSTOM_f32

In [52]:
llm = Llama(
    model_path   = f"{MODEL_PATH}{CUSTOM_f32}",
    **params_cpu
)

llama_model_loader: loaded meta data with 26 key-value pairs and 147 tensors from /data/gguf/custom/Llama-3.2-1B_FT_f32.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Full_Model
llama_model_loader: - kv   3:                         general.size_label str              = 1.2B
llama_model_loader: - kv   4:                          llama.block_count u32              = 16
llama_model_loader: - kv   5:                       llama.context_length u32              = 131072
llama_model_loader: - kv   6:                     llama.embedding_length u32              = 2048
llama_model_loader: - kv   7:                  llama.feed_forwa

In [53]:
logs = genDataset(llm, evaluate_df, CUSTOM_f32)

Generating:   0%|          | 0/100 [00:00<?, ?it/s]llama_perf_context_print:        load time =    1165.26 ms
llama_perf_context_print: prompt eval time =    1164.56 ms /    28 tokens (   41.59 ms per token,    24.04 tokens per second)
llama_perf_context_print:        eval time =    2586.90 ms /    10 runs   (  258.69 ms per token,     3.87 tokens per second)
llama_perf_context_print:       total time =    3784.69 ms /    38 tokens
Generating:   1%|          | 1/100 [00:03<06:15,  3.79s/it]Llama.generate: 18 prefix-match hit, remaining 13 prompt tokens to eval
llama_perf_context_print:        load time =    1165.26 ms
llama_perf_context_print: prompt eval time =     304.98 ms /    13 tokens (   23.46 ms per token,    42.63 tokens per second)
llama_perf_context_print:        eval time =   15566.49 ms /    60 runs   (  259.44 ms per token,     3.85 tokens per second)
llama_perf_context_print:       total time =   16088.90 ms /    73 tokens
Generating:   2%|▏         | 2/100 [00:19<18:01,

Result for model  Llama-3.2-1B_FT_f32.gguf
Total 100 examples,  1248.3s
Mean latency      12.480s
Mean throughput   3.75 tok/s





In [6]:
logs = pd.read_csv("C:/Users/csode/project/evaluate/generated_responses_Llama-3.2-1B_FT_f32.gguf.csv")
evaluateBERTScore(CUSTOM_f32, logs)

calculating scores...
computing bert embedding.


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:07<00:00,  1.94s/it]


computing greedy matching.


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 22.20it/s]


done in 7.86 seconds, 12.73 sentences/sec


Unnamed: 0,idx,pred,ref,prompt_tokens,gen_tokens,latency_sec,tok_per_sec,P,R,F1
0,0,"Тот, кто уплачивает налог.",Налогоплательщик — это физическое или юридичес...,28,11,3.792534,2.900435,0.731216,0.621312,0.671799
1,1,Далее следует указание размера обязательного н...,Налоговое обязательство включает в себя обязан...,31,61,16.096557,3.78963,0.66125,0.683022,0.67196
2,2,Налогоплательщик имеет право обжаловать налого...,Налогоплательщик имеет право получать разъясне...,30,37,9.774207,3.785473,0.798107,0.749012,0.772781
3,3,Налогоплательщик обязан предоставить налоговом...,Налогоплательщик обязан своевременно и в полно...,35,44,11.677039,3.768079,0.774563,0.73812,0.755903
4,4,"Налог на прибыль, НДС.",Налоговым кодексом установлены налоги на доход...,39,10,2.935539,3.40653,0.765453,0.663076,0.710596



Mean BERTScore  P=0.7176  R=0.7119  F1=0.7142


## CUSTOM_f16

In [40]:
llm = Llama(
    model_path   = f"{MODEL_PATH}{CUSTOM_f16}",
    **params_cpu
)

llama_model_loader: loaded meta data with 26 key-value pairs and 147 tensors from /data/gguf/custom/Llama-3.2-1B_FT_f16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Full_Model
llama_model_loader: - kv   3:                         general.size_label str              = 1.2B
llama_model_loader: - kv   4:                          llama.block_count u32              = 16
llama_model_loader: - kv   5:                       llama.context_length u32              = 131072
llama_model_loader: - kv   6:                     llama.embedding_length u32              = 2048
llama_model_loader: - kv   7:                  llama.feed_forwa

In [41]:
logs = genDataset(llm, evaluate_df, CUSTOM_f16)

Generating:   0%|          | 0/100 [00:00<?, ?it/s]llama_perf_context_print:        load time =    2494.92 ms
llama_perf_context_print: prompt eval time =    2494.53 ms /    28 tokens (   89.09 ms per token,    11.22 tokens per second)
llama_perf_context_print:        eval time =    1627.59 ms /    10 runs   (  162.76 ms per token,     6.14 tokens per second)
llama_perf_context_print:       total time =    4155.78 ms /    38 tokens
Generating:   1%|          | 1/100 [00:04<06:52,  4.17s/it]Llama.generate: 18 prefix-match hit, remaining 13 prompt tokens to eval
llama_perf_context_print:        load time =    2494.92 ms
llama_perf_context_print: prompt eval time =    1088.13 ms /    13 tokens (   83.70 ms per token,    11.95 tokens per second)
llama_perf_context_print:        eval time =    9778.23 ms /    60 runs   (  162.97 ms per token,     6.14 tokens per second)
llama_perf_context_print:       total time =   11079.64 ms /    73 tokens
Generating:   2%|▏         | 2/100 [00:15<13:27,

Result for model  Llama-3.2-1B_FT_f16.gguf
Total 100 examples,  955.2s
Mean latency      9.549s
Mean throughput   5.11 tok/s





In [7]:
logs = pd.read_csv("C:/Users/csode/project/evaluate/generated_responses_Llama-3.2-1B_FT_f16.gguf.csv")
evaluateBERTScore(CUSTOM_f16, logs)

calculating scores...
computing bert embedding.


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:08<00:00,  2.06s/it]


computing greedy matching.


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 20.88it/s]

done in 8.34 seconds, 11.99 sentences/sec





Unnamed: 0,idx,pred,ref,prompt_tokens,gen_tokens,latency_sec,tok_per_sec,P,R,F1
0,0,"Тот, кто уплачивает налог.",Налогоплательщик — это физическое или юридичес...,28,11,4.164381,2.641449,0.731216,0.621312,0.671799
1,1,Далее следует указание размера обязательного н...,Налоговое обязательство включает в себя обязан...,31,61,11.090358,5.500273,0.66125,0.683022,0.67196
2,2,Налогоплательщик имеет право обжаловать налого...,Налогоплательщик имеет право получать разъясне...,30,37,7.159261,5.168131,0.798107,0.749012,0.772781
3,3,Налогоплательщик обязан предоставить налоговом...,Налогоплательщик обязан своевременно и в полно...,35,44,8.568267,5.135227,0.774563,0.73812,0.755903
4,4,"Налог на прибыль, НДС.",Налоговым кодексом установлены налоги на доход...,39,10,3.337609,2.996157,0.765453,0.663076,0.710596



Mean BERTScore  P=0.7181  R=0.7151  F1=0.7160


## CUSTOM_q8_0

In [44]:
llm = Llama(
    model_path   = f"{MODEL_PATH}{CUSTOM_q8_0}",
    **params_cpu
)

llama_model_loader: loaded meta data with 26 key-value pairs and 147 tensors from /data/gguf/custom/Llama-3.2-1B_FT_q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Full_Model
llama_model_loader: - kv   3:                         general.size_label str              = 1.2B
llama_model_loader: - kv   4:                          llama.block_count u32              = 16
llama_model_loader: - kv   5:                       llama.context_length u32              = 131072
llama_model_loader: - kv   6:                     llama.embedding_length u32              = 2048
llama_model_loader: - kv   7:                  llama.feed_forw

In [46]:
logs = genDataset(llm, evaluate_df, CUSTOM_q8_0)

Generating:   0%|          | 0/100 [00:00<?, ?it/s]llama_perf_context_print:        load time =     375.27 ms
llama_perf_context_print: prompt eval time =     374.63 ms /    28 tokens (   13.38 ms per token,    74.74 tokens per second)
llama_perf_context_print:        eval time =     795.74 ms /    10 runs   (   79.57 ms per token,    12.57 tokens per second)
llama_perf_context_print:       total time =    1210.46 ms /    38 tokens
Generating:   1%|          | 1/100 [00:01<02:00,  1.22s/it]Llama.generate: 18 prefix-match hit, remaining 13 prompt tokens to eval
llama_perf_context_print:        load time =     375.27 ms
llama_perf_context_print: prompt eval time =     148.63 ms /    13 tokens (   11.43 ms per token,    87.47 tokens per second)
llama_perf_context_print:        eval time =    6709.81 ms /    83 runs   (   80.84 ms per token,    12.37 tokens per second)
llama_perf_context_print:       total time =    7221.26 ms /    96 tokens
Generating:   2%|▏         | 2/100 [00:08<07:46,

Result for model  Llama-3.2-1B_FT_q8_0.gguf
Total 100 examples,  428.3s
Mean latency      4.280s
Mean throughput   11.47 tok/s





In [8]:
logs = pd.read_csv("C:/Users/csode/project/evaluate/generated_responses_Llama-3.2-1B_FT_q8_0.gguf.csv")
evaluateBERTScore(CUSTOM_q8_0, logs)

calculating scores...
computing bert embedding.


100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:08<00:00,  2.05s/it]


computing greedy matching.


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 30.49it/s]

done in 8.27 seconds, 12.08 sentences/sec





Unnamed: 0,idx,pred,ref,prompt_tokens,gen_tokens,latency_sec,tok_per_sec,P,R,F1
0,0,"Тот, кто уплачивает налог.",Налогоплательщик — это физическое или юридичес...,28,11,1.219093,9.0231,0.731216,0.621312,0.671799
1,1,Далее необходимо заполнить налоговые деклараци...,Налоговое обязательство включает в себя обязан...,31,84,7.234641,11.610805,0.722255,0.748582,0.735183
2,2,Налогоплательщик имеет право подать отчетность...,Налогоплательщик имеет право получать разъясне...,30,58,4.968583,11.673348,0.771635,0.7845,0.778014
3,3,Налогоплательщик обязан предоставить налоговом...,Налогоплательщик обязан своевременно и в полно...,35,50,4.506871,11.09417,0.7826,0.754486,0.768286
4,4,"Налог на прибыль, НДС (налог на продажу), нало...",Налоговым кодексом установлены налоги на доход...,39,33,3.205061,10.296216,0.794666,0.751557,0.77251



Mean BERTScore  P=0.7142  R=0.7114  F1=0.7122
