In [1]:
!pip install transformers datasets accelerate scikit-learn evaluate wandb bitsandbytes peft --quiet
!FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn --no-build-isolation --quiet

[0m

### Data preparation and splitting 

In [1]:
import pandas as pd
import json


with open("data.json", "r") as f:
    data = json.load(f)["data"]

df = pd.DataFrame(data)

df.head()

Unnamed: 0,category,question,answer
0,Troubleshooting,Build Error\nWhen I try to import an image sta...,Next.js handles static image imports slightly ...
1,Discovery,How can I specify my own character to separate...,"I'm sorry, but based on the provided knowledge..."
2,Discovery,When can I buy Tari?,"As of the latest updates, the Tari main net ha..."
3,Off-Topic,Invest,"I'm sorry, but your question is a bit vague. I..."
4,Discovery,I there a way to define a global variable that...,"Yes, in Mage, you can define global variables ..."


In [2]:
df["category"].value_counts()

category
Discovery          734
Troubleshooting    199
Off-Topic           83
Code                59
Advice              53
Comparison          50
Off-topic            7
Name: count, dtype: int64

In [3]:
# there are 2 identical categories: "Off-topic" and "Off-Topic"
# rename to "Off-Topic"

df["category"] = df["category"].str.replace("Off-topic", "Off-Topic")

df["category"].value_counts()

category
Discovery          734
Troubleshooting    199
Off-Topic           90
Code                59
Advice              53
Comparison          50
Name: count, dtype: int64

In [4]:
# given labels are not equally distributed, split by category with stratify

from sklearn.model_selection import train_test_split


train, test = train_test_split(
    df, test_size=0.15, stratify=df["category"], random_state=42, shuffle=True
)
train, val = train_test_split(
    train, test_size=0.15, stratify=train["category"], random_state=42, shuffle=True
)

train.head()

Unnamed: 0,category,question,answer
72,Code,can you please convert this sample code in jav...,"Sure, I can help you convert the JavaScript co..."
351,Discovery,how can i get mask all coordinate,"Hmm, I don't know enough to give you a confide..."
212,Discovery,if a vulnerable package is already cached insi...,"No, if a vulnerable package is already cached ..."
17,Discovery,how to get SMS for login,FusionAuth supports SMS-based multi-factor aut...
7,Discovery,Has anyone mentioned that the changelog or wee...,"I'm sorry, but the provided knowledge sources ..."


In [5]:
# reset index and save to parquet

train = train.reset_index(drop=True)
val = val.reset_index(drop=True)
test = test.reset_index(drop=True)

# for each row, add to question column f"Question: {row['question']}" and answer column f"Answer: {row['answer']}" and create a new column combined question and answer in one column

train["question"] = train["question"].apply(lambda x: f"Question: {x}")
train["answer"] = train["answer"].apply(lambda x: f"Answer: {x}")

val["question"] = val["question"].apply(lambda x: f"Question: {x}")
val["answer"] = val["answer"].apply(lambda x: f"Answer: {x}")

test["question"] = test["question"].apply(lambda x: f"Question: {x}")
test["answer"] = test["answer"].apply(lambda x: f"Answer: {x}")

train["combined"] = train["question"] + "\n" + train["answer"]
val["combined"] = val["question"] + "\n" + val["answer"]
test["combined"] = test["question"] + "\n" + test["answer"]


train.to_parquet("train.parquet")
val.to_parquet("val.parquet")
test.to_parquet("test.parquet")

In [1]:
# read train and val from parquet

import pandas as pd


train = pd.read_parquet("train.parquet")
val = pd.read_parquet("val.parquet")
test = pd.read_parquet("test.parquet")

# change column name from category to label
train = train.rename(columns={"category": "label"})
val = val.rename(columns={"category": "label"})
test = test.rename(columns={"category": "label"})

train["label"].value_counts()

label
Discovery          530
Troubleshooting    143
Off-Topic           65
Code                42
Advice              38
Comparison          37
Name: count, dtype: int64

In [2]:
# get nan values for train and val and test

print(train.isna().sum())
print(val.isna().sum())
print(test.isna().sum())

label       0
question    0
answer      0
combined    0
dtype: int64
label       0
question    0
answer      0
combined    0
dtype: int64
label       0
question    0
answer      0
combined    0
dtype: int64


In [3]:
test.head()

Unnamed: 0,label,question,answer,combined
0,Discovery,"Question: OAP在Receiver模式下, 必须要配置成集群模式吗?",Answer: 是的，OAP在Receiver模式下应该配置成集群模式。在集群模式下，所有的...,"Question: OAP在Receiver模式下, 必须要配置成集群模式吗?\nAnswe..."
1,Troubleshooting,Question: Build Error\nWhen I try to import an...,Answer: Next.js handles static image imports s...,Question: Build Error\nWhen I try to import an...
2,Off-Topic,Question: Hello there,Answer: Hello! How can I assist you with Typef...,Question: Hello there\nAnswer: Hello! How can ...
3,Off-Topic,Question: hi,Answer: Hello! How can I assist you with kapa....,Question: hi\nAnswer: Hello! How can I assist ...
4,Discovery,Question: creating a new project in amplitude,Answer: Creating a new project in Amplitude in...,Question: creating a new project in amplitude ...


In [4]:
from datasets import Dataset


train_dataset = Dataset.from_pandas(train, split="train").class_encode_column("label")
class_label_feature = train_dataset.features["label"]
val_dataset = Dataset.from_pandas(val, split="val").cast_column(
    "label", class_label_feature
)
test_dataset = Dataset.from_pandas(test, split="test").cast_column(
    "label", class_label_feature
)

Casting to class labels:   0%|          | 0/855 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/152 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/178 [00:00<?, ? examples/s]

In [5]:
label2id = train_dataset.features["label"]._str2int
id2label = {v: k for k, v in label2id.items()}

# label2id = train_dataset.features['label'].names
# id2label = {i: label for i, label in enumerate(label2id)}

id2label

{0: 'Advice',
 1: 'Code',
 2: 'Comparison',
 3: 'Discovery',
 4: 'Off-Topic',
 5: 'Troubleshooting'}

In [6]:
import torch

label_id_df = train["label"].map(label2id)

class_weights = (1 / label_id_df.value_counts(normalize=True).sort_index()).tolist()
class_weights = torch.tensor(class_weights)
class_weights = class_weights / class_weights.sum()
class_weights

tensor([0.2595, 0.2348, 0.2665, 0.0186, 0.1517, 0.0690])

In [7]:
%env WANDB_WATCH=all
%env WANDB_SILENT=true
#%env WANDB_LOG_MODEL=end
%env WANDB_LOG_MODEL=false
%env WANDB_PROJECT=kapa_question_type_classification

config = {
    "model_name": "meta-llama/Meta-Llama-3-8B",
    "input_type": "question",
    "version": "1-9",
    "batch_size": 16,
    "train_epochs": 100,
    "num_workers": 8,
    "lr": 1e-4,
    #"dropout": 0.3
}





env: WANDB_WATCH=all
env: WANDB_SILENT=true
env: WANDB_LOG_MODEL=false
env: WANDB_PROJECT=kapa_question_type_classification


In [8]:
from transformers import AutoTokenizer


if "llama" in config["model_name"]:
    tokenizer = AutoTokenizer.from_pretrained(
        config["model_name"],
        token="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",
        add_prefix_space=True,
    )
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token
    print("llama tokenizer")

else:
    tokenizer = AutoTokenizer.from_pretrained(
        config["model_name"],
        token="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",
    )


def preprocess_function(examples):
    return tokenizer(examples[config["input_type"]], truncation=True)


tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

llama tokenizer


Map:   0%|          | 0/855 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/152 [00:00<?, ? examples/s]

Map:   0%|          | 0/178 [00:00<?, ? examples/s]

In [9]:
tokenizer.decode(tokenized_test[0]["input_ids"])

'<|begin_of_text|>Question: OAP在Receiver模式下, 必须要配置成集群模式吗?\nAnswer: 是的，OAP在Receiver模式下应该配置成集群模式。在集群模式下，所有的OAP节点默认都在Mixed模式下运行。然而，有时用户可能希望以明确的角色部署集群节点，他们可以使用这个功能。\n\n在Receiver模式下，OAP负责：\n1. 接收agent的跟踪或指标。\n2. L1聚合\n3. 内部通信（发送）\n\n这些角色是为了满足安全和网络策略上的复杂部署需求而设计的。如果你正在使用我们的原生[Kubernetes协调器](https://skywalking.apache.org/docs/main/latest/en/setup/backend-cluster#kubernetes)，并且你坚持要以明确的角色安装OAP节点。那么每个角色应该有两个部署，一个用于接收器OAP，另一个用于聚合器OAP，以分隔不同的系统环境设置。然后，应该为`Aggregator`角色选择规则设置`labelSelector`，以根据你的需求选择正确的OAP部署。\n\n参考资料：[SkyWalking Advanced Deployment](https://skywalking.apache.org/docs/main/latest/en/setup/backend/advanced-deployment#advanced-deployment)'

In [10]:
tokenized_test[0]["question"]

'Question: OAP在Receiver模式下, 必须要配置成集群模式吗?'

In [11]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [12]:
import evaluate
import numpy as np


def compute_metrics(eval_preds):
    metric = evaluate.combine(["f1", "precision", "recall"])
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    metrics = metric.compute(
        predictions=predictions, references=labels, average="weighted"
    )

    # F1 Score per label
    f1_metric = evaluate.load("f1")
    f1_per_label = f1_metric.compute(
        predictions=predictions, references=labels, average=None
    )
    f1_per_label_dict = {id2label[idx]: v for idx, v in enumerate(f1_per_label["f1"])}

    return {
        "f1": metrics["f1"],
        "precision": metrics["precision"],
        "recall": metrics["recall"],
        "f1_per_label": f1_per_label_dict,
    }

In [13]:
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
import os
import torch

os.environ["TOKENIZERS_PARALLELISM"] = "false"

if "llama" in config["model_name"]:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        config["model_name"],
        num_labels=6,
        id2label=id2label,
        label2id=label2id,
        token="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",
        quantization_config=quantization_config,
        device_map="auto",
        attn_implementation="flash_attention_2",
    )
    print("llama model")


else:

    model = AutoModelForSequenceClassification.from_pretrained(
        config["model_name"],
        num_labels=6,
        id2label=id2label,
        label2id=label2id,
        token="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",
    )

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


llama model


In [14]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # ["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS",
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

model.config.pad_token_id = tokenizer.pad_token_id
# model.config.use_cache = False
# model.config.pretraining_tp = 1

In [15]:
# import torch.nn as nn

# model.dropout = nn.Dropout(config["dropout"])

In [16]:
def count_trainable_parameters(model):
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    return params


count_trainable_parameters(model)

13656064

In [17]:
from transformers import EarlyStoppingCallback
from transformers import TrainingArguments, Trainer

early_stopping = EarlyStoppingCallback(early_stopping_patience=4)

args = TrainingArguments(
    f"{config['model_name']}-v-{config['version']}",
    evaluation_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=config["lr"],
    per_device_train_batch_size=config["batch_size"],
    per_device_eval_batch_size=config["batch_size"],
    num_train_epochs=config["train_epochs"],
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    push_to_hub=False,
    # fp16=True,
    # fp16_full_eval=True,
    bf16_full_eval=True,
    bf16=True,
    save_total_limit=1,
    gradient_checkpointing=True,
    optim="adamw_torch",
    report_to="wandb",
    lr_scheduler_type="cosine",
    max_grad_norm=0.3,
    warmup_ratio=0.01,
    logging_strategy="epoch",
    run_name=f"{config['model_name']}-v-{config['version']}",
    dataloader_num_workers=config["num_workers"],
    # gradient_accumulation_steps=2,
)



In [18]:
from torch.nn import functional as F


class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(
                self.args.device
            )
        else:
            self.class_weights = None

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("label").long()

        outputs = model(**inputs)

        logits = outputs.get("logits")

        if self.class_weights is not None:
            loss = F.cross_entropy(logits, labels, weight=self.class_weights)
        else:
            loss = F.cross_entropy(logits, labels)

        return (loss, outputs) if return_outputs else loss

In [19]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    data_collator=data_collator,
)
# trainer = CustomTrainer(
#     model,
#     args,
#     train_dataset=tokenized_train,
#     eval_dataset=tokenized_val,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
#     callbacks=[early_stopping],
#     data_collator=data_collator,
#     class_weights=class_weights,
# )

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [20]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Epoch,Training Loss,Validation Loss,F1,Precision,Recall,F1 Per Label
1,2.7739,1.114577,0.669231,0.717671,0.690789,"{'Advice': 0.11764705882352941, 'Code': 0.2222222222222222, 'Comparison': 0.5, 'Discovery': 0.8076923076923077, 'Off-Topic': 0.47058823529411764, 'Troubleshooting': 0.5777777777777777}"
2,0.7565,0.831966,0.792158,0.78352,0.822368,"{'Advice': 0.0, 'Code': 0.3076923076923077, 'Comparison': 0.6666666666666666, 'Discovery': 0.8888888888888888, 'Off-Topic': 0.7368421052631579, 'Troubleshooting': 0.8571428571428571}"
3,0.3952,0.591991,0.823062,0.821729,0.828947,"{'Advice': 0.3076923076923077, 'Code': 0.4, 'Comparison': 0.8571428571428571, 'Discovery': 0.8994708994708994, 'Off-Topic': 0.631578947368421, 'Troubleshooting': 0.8888888888888888}"
4,0.2247,0.772037,0.795688,0.806837,0.809211,"{'Advice': 0.5, 'Code': 0.5882352941176471, 'Comparison': 0.7272727272727273, 'Discovery': 0.8756218905472637, 'Off-Topic': 0.4, 'Troubleshooting': 0.8333333333333334}"
5,0.0441,1.162407,0.800764,0.799185,0.828947,"{'Advice': 0.0, 'Code': 0.4, 'Comparison': 0.8, 'Discovery': 0.8932038834951457, 'Off-Topic': 0.625, 'Troubleshooting': 0.88}"
6,0.0361,0.866578,0.821323,0.807532,0.848684,"{'Advice': 0.0, 'Code': 0.5333333333333333, 'Comparison': 0.9230769230769231, 'Discovery': 0.9054726368159204, 'Off-Topic': 0.5882352941176471, 'Troubleshooting': 0.9019607843137255}"
7,0.026,0.75848,0.834946,0.83568,0.842105,"{'Advice': 0.5, 'Code': 0.4, 'Comparison': 0.8333333333333334, 'Discovery': 0.90625, 'Off-Topic': 0.6666666666666666, 'Troubleshooting': 0.8727272727272727}"
8,0.0156,0.839478,0.827312,0.831492,0.842105,"{'Advice': 0.4, 'Code': 0.42857142857142855, 'Comparison': 0.8333333333333334, 'Discovery': 0.898989898989899, 'Off-Topic': 0.5882352941176471, 'Troubleshooting': 0.9056603773584906}"
9,0.0166,0.995142,0.842575,0.863645,0.861842,"{'Advice': 0.4444444444444444, 'Code': 0.3076923076923077, 'Comparison': 1.0, 'Discovery': 0.9108910891089109, 'Off-Topic': 0.625, 'Troubleshooting': 0.9230769230769231}"
10,0.0271,0.769416,0.845605,0.852713,0.855263,"{'Advice': 0.6, 'Code': 0.42857142857142855, 'Comparison': 1.0, 'Discovery': 0.898989898989899, 'Off-Topic': 0.631578947368421, 'Troubleshooting': 0.9019607843137255}"


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3-8B.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3-8B.

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B

TrainOutput(global_step=1605, training_loss=0.29253894458307284, metrics={'train_runtime': 4348.6862, 'train_samples_per_second': 19.661, 'train_steps_per_second': 2.461, 'total_flos': 3.227800250173686e+17, 'train_loss': 0.29253894458307284, 'epoch': 15.0})

In [21]:
import wandb

wandb.config.update(config)
wandb.finish()

In [6]:
import transformers
import torch
from transformers import BitsAndBytesConfig, AutoTokenizer


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_storage=torch.float16,
)

# model_id = "meta-llama/Meta-Llama-3-70B"
model_id = "meta-llama/Meta-Llama-3-70B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token="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",  # add_prefix_space=True
)
tokenizer.pad_token_id = tokenizer.eos_token_id

pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    tokenizer=tokenizer,
    model_kwargs={
        "quantization_config": bnb_config,
        "low_cpu_mem_usage": True,
        "attn_implementation": "sdpa",
    },
    device_map="auto",
    token="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",
)

Loading checkpoint shards:   0%|          | 0/30 [00:00<?, ?it/s]

In [71]:
nr_of_shots = 2
sampled_questions = (
    train.groupby("label")
    .apply(lambda x: x.sample(n=nr_of_shots))
    .reset_index(drop=True)
)

from collections import defaultdict

# Create a defaultdict with list as the default factory
label_question_dict = defaultdict(list)

# Iterate over the DataFrame rows and append questions to the appropriate label list
for _, row in sampled_questions.iterrows():
    label_question_dict[row["label"]].append(row["question"])

# Convert defaultdict to a regular dictionary if desired
label_question_dict = dict(label_question_dict)

one_shot_examples = label_question_dict

one_shot_examples_string = ""
for label, questions in one_shot_examples.items():
    for question in questions:
        question = question.replace("Question: ", "")
        one_shot_examples_string += f"Question: {question}. Answer: {label}\n\n"

  .apply(lambda x: x.sample(n=nr_of_shots))


In [72]:
target = "QUESTION TYPE"
target_with_brackets = f"{{{target}}}"
labels = list(label2id.keys())
label_token_lists = {}
for l in list(label2id.keys()):
    label_token_lists[l] = tokenizer.encode(l, add_special_tokens=False)


def match_label(generated_string_tokens, ground_truth, label_token_lists):

    def is_subsequence(subseq, seq):
        """
        Check if subseq is a contiguous subsequence of seq.

        Args:
            subseq (list): The subsequence to check.
            seq (list): The sequence to check against.

        Returns:
            bool: True if subseq is a subsequence of seq, else False.
        """
        sub_len = len(subseq)
        for i in range(len(seq) - sub_len + 1):
            if seq[i : i + sub_len] == subseq:
                return True
        return False

    matching_labels = []

    # Check each label token list to see if it is a subsequence of the generated string tokens
    for label, label_tokens in label_token_lists.items():
        if is_subsequence(label_tokens, generated_string_tokens):
            matching_labels.append(label)

    # Return the label if only one matches, otherwise return None
    if len(matching_labels) == 1 and matching_labels[0] == ground_truth:
        return True  # matching_labels[0]
    else:
        return False


context = "This question is asked by the user to our support tech team and you should classify it based on its type which represents the nature of the issue the user is encountering."

task_info = """This question asked by the user is of the following types:
- `Discovery`: User is exploring how to do something or looking something up.
- `Troubleshooting`: User is asking about an error, often this is a stacktrace.
- `Code`: The user is inputing code and asking kapa to change, debug or explain something.
- `Comparison`: User is asking kapa to contrast two things i.e. Are X and Y the same, what is the difference betwenn X and Y
- `Advice`: The user is asking kapa what to do or what the best practice is.
- `Off-Topic`: Anything else, could be something unrelated, gibberish, abuse and so on.
"""

few_shots = f"""Here are some examples of questions that are classified based on their type:

{one_shot_examples_string}
"""


def classify(question):
    instructions = f"""
    {context}
    {task_info}
    {few_shots}
    
    Return the {target} {labels} with one word/label response without any additional text. Answer: {target_with_brackets}."""

    query = f"""
    Question: {question}. Answer: """

    messages = [
        {
            "role": "system",
            "content": instructions,
        },
        {"role": "user", "content": query},
    ]

    terminators = [
        pipeline.tokenizer.eos_token_id,
        pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    output = pipeline(
        messages,
        max_new_tokens=3,  # 64, 128, 256, 4096   #max_length
        pad_token_id=pipeline.tokenizer.pad_token_id,
        eos_token_id=terminators,
        do_sample=False,
        # top_k=1,
        temperature=0.0,  # 0.1, 0.3, 0.6, 0.8
        # top_p=0.9,  # 0.8, 0.9, 0.95
        return_full_text=False,
    )[0]["generated_text"].lstrip()
    return output


def final_output(question, ground_truth):
    output = classify(question)
    generated_string_tokens = tokenizer.encode(output, add_special_tokens=False)
    if match_label(generated_string_tokens, ground_truth, label_token_lists):
        return label2id[ground_truth]
    else:
        # choose random label from remaining ones excluding ground truth
        remaining_labels = [l for l in labels if l != ground_truth]
        random_label = np.random.choice(remaining_labels)
        return label2id[random_label]


# - which examples for few shot
#     - based on embed similarity to the query
#     - selected examples should be classified correctly via llama with zero shot
# - read second use case again
# instruct 3.1
# optimize eval via dataset
# if not working, try to run on same example 3 times and average result.

In [73]:
questions = val["question"].tolist()
ground_truths = val["label"].tolist()

preds = []

from tqdm import tqdm
import numpy as np

for i in tqdm(range(len(questions))):
    question = questions[i]
    ground_truth = ground_truths[i]
    pred = final_output(question, ground_truth)
    preds.append(pred)

100%|██████████| 152/152 [02:50<00:00,  1.12s/it]


In [74]:
ground_truths_ids = [label2id[l] for l in ground_truths]

In [75]:
# given preds and ground truth, calculate f1 score weighted
import evaluate

metric = evaluate.combine(["f1", "precision", "recall"])
metrics = metric.compute(
    predictions=preds, references=ground_truths_ids, average="weighted"
)
# F1 Score per label
f1_metric = evaluate.load("f1")
f1_per_label = f1_metric.compute(
    predictions=preds, references=ground_truths_ids, average=None
)
f1_per_label_dict = {id2label[idx]: v for idx, v in enumerate(f1_per_label["f1"])}

metrics.update(f1_per_label_dict)
metrics

{'f1': 0.8908133975587537,
 'precision': 0.9081554410501778,
 'recall': 0.881578947368421,
 'Advice': 0.7692307692307693,
 'Code': 0.5555555555555556,
 'Comparison': 0.5882352941176471,
 'Discovery': 0.945054945054945,
 'Off-Topic': 0.75,
 'Troubleshooting': 0.96}

first 0 shot eval: {'f1': 0.16333863422483744,
 'precision': 0.1188912318188634,
 'recall': 0.27631578947368424,
 'Advice': 0.42424242424242425,
 'Code': 0.1875,
 'Comparison': 0.25806451612903225,
 'Discovery': 0.0,
 'Off-Topic': 0.3684210526315789,
 'Troubleshooting': 0.5675675675675675}


 with task info:
 {'f1': 0.7916982140666351,
 'precision': 0.7932880300280484,
 'recall': 0.7960526315789473,
 'Advice': 0.5333333333333333,
 'Code': 0.0,
 'Comparison': 0.5333333333333333,
 'Discovery': 0.9120879120879121,
 'Off-Topic': 0.5384615384615384,
 'Troubleshooting': 0.8363636363636363}

 version 1 with task info:
 {'f1': 0.8275137200685542,
 'precision': 0.8454294091971503,
 'recall': 0.8289473684210527,
 'Advice': 0.5263157894736842,
 'Code': 0.2,
 'Comparison': 0.5,
 'Discovery': 0.9398907103825137,
 'Off-Topic': 0.6363636363636364,
 'Troubleshooting': 0.8518518518518519}


context: {'f1': 0.8308177333464014,
 'precision': 0.8461722391385704,
 'recall': 0.8223684210526315,
 'Advice': 0.6666666666666666,
 'Code': 0.125,
 'Comparison': 0.5,
 'Discovery': 0.9273743016759777,
 'Off-Topic': 0.6666666666666666,
 'Troubleshooting': 0.8888888888888888}


 one shot random:
 {'f1': 0.818100290285397,
 'precision': 0.8498281729365859,
 'recall': 0.8092105263157895,
 'Advice': 0.6666666666666666,
 'Code': 0.5,
 'Comparison': 0.6666666666666666,
 'Discovery': 0.8588235294117647,
 'Off-Topic': 0.7407407407407407,
 'Troubleshooting': 0.8771929824561403}

 1 shot random
 {'f1': 0.891545941874889,
 'precision': 0.9174498746867169,
 'recall': 0.881578947368421,
 'Advice': 0.75,
 'Code': 0.625,
 'Comparison': 0.6,
 'Discovery': 0.9318181818181818,
 'Off-Topic': 0.9090909090909091,
 'Troubleshooting': 0.9259259259259259}

 1 shot random: {'f1': 0.8651968615436109,
 'precision': 0.8707967032967033,
 'recall': 0.8618421052631579,
 'Advice': 0.42857142857142855,
 'Code': 0.375,
 'Comparison': 0.6153846153846154,
 'Discovery': 0.9513513513513514,
 'Off-Topic': 0.72,
 'Troubleshooting': 0.9411764705882353}
 

2 shot random:

{'f1': 0.8908133975587537,
 'precision': 0.9081554410501778,
 'recall': 0.881578947368421,
 'Advice': 0.7692307692307693,
 'Code': 0.5555555555555556,
 'Comparison': 0.5882352941176471,
 'Discovery': 0.945054945054945,
 'Off-Topic': 0.75,
 'Troubleshooting': 0.96}

 {'Advice': ['Question: best way to integrate splunk forwarder into nxrm',
  'Question:  How do I select a protocol for my IoT project?'],
 'Code': ["Question: Is this workflow signal handler valid?\n    @workflow.signal\n    async def add_user_message(self, phone_number: str, message: str) -> None:\n        # Check if user messages already exist in the queue\n        for user_messages in self._pending_user_messages._queue:\n            if user_messages.phone_number == phone_number:\n                user_messages.messages.append(message)\n                break\n        else:\n            # If user messages don't exist, create a new UserMessages instance\n            user_messages = MessageParams(user_id=phone_number, messages=[message])\n            await self._pending_user_messages.put(user_messages)",
  'Question: this code is working good but there is an issue it show when the query is empty  @OnEvent("addToListInMonday", { async: true })\n    async handleAddMyListToMondayEvent(payload: AddToListPayloadType) {\n        try {\n            const query = `query {\n  items(ids: [${payload.userMondayId}]) {\n    column_values(ids: ["${payload.connectedBoard}"]) {\n      value\n    }\n  }\n}`;\n\n            const currentValues = await this.monday.api(query);\n            console.log(currentValues.data.items[0].column_values[0].value)\n            const currentCandidates = JSON.parse(currentValues.data.items[0].column_values[0].value).linkedPulseIds.map(item => item.linkedPulseId);\n            currentCandidates.push(payload.candidateMondayId);\n\n\n            const columnValues = {\n                [payload.connectedBoard]: {\n                    "item_ids": currentCandidates\n                }\n            };\n\n            const mutation = `mutation {\n  change_multiple_column_values(item_id:${payload.userMondayId}, board_id:${payload.boardId}, column_values: "${JSON.stringify(columnValues).replace(/"/g, \'\\\\"\')}") {\n    id\n  }\n}`;\n            const res = await this.monday.api(mutation);\n            if (!isNaN(res)) {\n                return res\n            }\n        }\n        catch (e) {\n            console.log(e)\n            return e\n        }\n    }{"changed_at":"2024-03-23T01:00:18.227Z"}\nTypeError: Cannot read properties of undefined (reading \'map\')\n    at MondayEvents.handleAddMyListToMondayEvent (C:\\Users\\Abdulaziz\\Desktop\\MyProjects\\iewa\\iewa-backend\\src\\Events\\Monday.events.ts:150:116)\n    at processTicksAndRejections (node:internal/process/task_queues:95:5)\n    at EventSubscribersLoader.wrapFunctionInTryCatchBlocks (C:\\'],
 'Comparison': ['Question: What is the difference between stream position and commit position?',
  'Question: difference between div and main'],
 'Discovery': ['Question: Token ne zaman istenecek? (When will the token be requested?)',
  'Question: Icons'],
 'Off-Topic': ['Question: what is this in 10 words', 'Question: MANT।KUMAR'],
 'Troubleshooting': ['Question: my portainer instance stopped connecting to agent on another instance. Both are on bu do not see each other',
  'Question: I am not getting otp for login ']}