In [1]:
!pip install transformers datasets accelerate scikit-learn evaluate wandb bitsandbytes peft --quiet

[0m

### Data preparation and splitting 

In [1]:
import pandas as pd
import json


with open("data.json", "r") as f:
    data = json.load(f)["data"]

df = pd.DataFrame(data)

df.head()

Unnamed: 0,category,question,answer
0,Troubleshooting,Build Error\nWhen I try to import an image sta...,Next.js handles static image imports slightly ...
1,Discovery,How can I specify my own character to separate...,"I'm sorry, but based on the provided knowledge..."
2,Discovery,When can I buy Tari?,"As of the latest updates, the Tari main net ha..."
3,Off-Topic,Invest,"I'm sorry, but your question is a bit vague. I..."
4,Discovery,I there a way to define a global variable that...,"Yes, in Mage, you can define global variables ..."


In [2]:
df["category"].value_counts()

category
Discovery          734
Troubleshooting    199
Off-Topic           83
Code                59
Advice              53
Comparison          50
Off-topic            7
Name: count, dtype: int64

In [3]:
# there are 2 identical categories: "Off-topic" and "Off-Topic"
# rename to "Off-Topic"

df["category"] = df["category"].str.replace("Off-topic", "Off-Topic")

df["category"].value_counts()

category
Discovery          734
Troubleshooting    199
Off-Topic           90
Code                59
Advice              53
Comparison          50
Name: count, dtype: int64

In [4]:
# given labels are not equally distributed, split by category with stratify

from sklearn.model_selection import train_test_split


train, test = train_test_split(
    df, test_size=0.15, stratify=df["category"], random_state=42, shuffle=True
)
train, val = train_test_split(
    train, test_size=0.15, stratify=train["category"], random_state=42, shuffle=True
)

train.head()

Unnamed: 0,category,question,answer
72,Code,can you please convert this sample code in jav...,"Sure, I can help you convert the JavaScript co..."
351,Discovery,how can i get mask all coordinate,"Hmm, I don't know enough to give you a confide..."
212,Discovery,if a vulnerable package is already cached insi...,"No, if a vulnerable package is already cached ..."
17,Discovery,how to get SMS for login,FusionAuth supports SMS-based multi-factor aut...
7,Discovery,Has anyone mentioned that the changelog or wee...,"I'm sorry, but the provided knowledge sources ..."


In [5]:
# reset index and save to parquet

train = train.reset_index(drop=True)
val = val.reset_index(drop=True)
test = test.reset_index(drop=True)

# for each row, add to question column f"Question: {row['question']}" and answer column f"Answer: {row['answer']}" and create a new column combined question and answer in one column

train["question"] = train["question"].apply(lambda x: f"Question: {x}")
train["answer"] = train["answer"].apply(lambda x: f"Answer: {x}")

val["question"] = val["question"].apply(lambda x: f"Question: {x}")
val["answer"] = val["answer"].apply(lambda x: f"Answer: {x}")

test["question"] = test["question"].apply(lambda x: f"Question: {x}")
test["answer"] = test["answer"].apply(lambda x: f"Answer: {x}")

train["combined"] = train["question"] + "\n" + train["answer"]
val["combined"] = val["question"] + "\n" + val["answer"]
test["combined"] = test["question"] + "\n" + test["answer"]


train.to_parquet("train.parquet")
val.to_parquet("val.parquet")
test.to_parquet("test.parquet")

In [1]:
# read train and val from parquet

import pandas as pd


train = pd.read_parquet("train.parquet")
val = pd.read_parquet("val.parquet")
test = pd.read_parquet("test.parquet")

# change column name from category to label
train = train.rename(columns={"category": "label"})
val = val.rename(columns={"category": "label"})
test = test.rename(columns={"category": "label"})

train["label"].value_counts()

label
Discovery          530
Troubleshooting    143
Off-Topic           65
Code                42
Advice              38
Comparison          37
Name: count, dtype: int64

In [2]:
# get nan values for train and val and test

print(train.isna().sum())
print(val.isna().sum())
print(test.isna().sum())

label       0
question    0
answer      0
combined    0
dtype: int64
label       0
question    0
answer      0
combined    0
dtype: int64
label       0
question    0
answer      0
combined    0
dtype: int64


In [3]:
test.head()

Unnamed: 0,label,question,answer,combined
0,Discovery,"Question: OAP在Receiver模式下, 必须要配置成集群模式吗?",Answer: 是的，OAP在Receiver模式下应该配置成集群模式。在集群模式下，所有的...,"Question: OAP在Receiver模式下, 必须要配置成集群模式吗?\nAnswe..."
1,Troubleshooting,Question: Build Error\nWhen I try to import an...,Answer: Next.js handles static image imports s...,Question: Build Error\nWhen I try to import an...
2,Off-Topic,Question: Hello there,Answer: Hello! How can I assist you with Typef...,Question: Hello there\nAnswer: Hello! How can ...
3,Off-Topic,Question: hi,Answer: Hello! How can I assist you with kapa....,Question: hi\nAnswer: Hello! How can I assist ...
4,Discovery,Question: creating a new project in amplitude,Answer: Creating a new project in Amplitude in...,Question: creating a new project in amplitude ...


In [4]:
from datasets import Dataset

label2id = {
    "Discovery": 0,
    "Troubleshooting": 1,
    "Off-Topic": 2,
    "Code": 3,
    "Advice": 4,
    "Comparison": 5,
}

train_dataset = Dataset.from_pandas(train, split="train").class_encode_column("label")
class_label_feature = train_dataset.features["label"]
val_dataset = Dataset.from_pandas(val, split="val").cast_column(
    "label", class_label_feature
)
test_dataset = Dataset.from_pandas(test, split="test").cast_column(
    "label", class_label_feature
)

Casting to class labels:   0%|          | 0/855 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/152 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/178 [00:00<?, ? examples/s]

In [5]:
label2id = train_dataset.features["label"]._str2int
id2label = {v: k for k, v in label2id.items()}

id2label

{0: 'Advice',
 1: 'Code',
 2: 'Comparison',
 3: 'Discovery',
 4: 'Off-Topic',
 5: 'Troubleshooting'}

In [6]:
%env WANDB_WATCH=all
%env WANDB_SILENT=true
%env WANDB_LOG_MODEL=end
%env WANDB_PROJECT=kapa_question_type_classification

config = {
    "model_name": "meta-llama/Meta-Llama-3-8B",
    "input_type": "question",
    "version": "1",
    "batch_size": 16,
    "train_epochs": 100,
    "num_workers": 8,
    "lr": 1e-5 ,
    #"dropout": 0.3
}





env: WANDB_WATCH=all
env: WANDB_SILENT=true
env: WANDB_LOG_MODEL=end
env: WANDB_PROJECT=kapa_question_type_classification


In [7]:
from transformers import AutoTokenizer


if "llama" in config["model_name"]:
    tokenizer = AutoTokenizer.from_pretrained(
        config["model_name"],
        token="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",
        add_prefix_space=True,
    )
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.pad_token = tokenizer.eos_token

else:
    tokenizer = AutoTokenizer.from_pretrained(
        config["model_name"],
        token="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",
    )


def preprocess_function(examples):
    return tokenizer(examples[config["input_type"]], truncation=True)


tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/855 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/152 [00:00<?, ? examples/s]

Map:   0%|          | 0/178 [00:00<?, ? examples/s]

In [8]:
tokenizer.decode(tokenized_test[0]["input_ids"])

'<|begin_of_text|>Question: OAP在Receiver模式下, 必须要配置成集群模式吗?'

In [9]:
tokenized_test[0]["question"]

'Question: OAP在Receiver模式下, 必须要配置成集群模式吗?'

In [10]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
import evaluate
import numpy as np


def compute_metrics(eval_preds):
    metric = evaluate.combine(["f1", "precision", "recall"])
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    metrics = metric.compute(
        predictions=predictions, references=labels, average="weighted"
    )

    # F1 Score per label
    f1_metric = evaluate.load("f1")
    f1_per_label = f1_metric.compute(
        predictions=predictions, references=labels, average=None
    )
    f1_per_label_dict = {id2label[idx]: v for idx, v in enumerate(f1_per_label["f1"])}

    return {
        "f1": metrics["f1"],
        "precision": metrics["precision"],
        "recall": metrics["recall"],
        "f1_per_label": f1_per_label_dict,
    }

In [12]:
from transformers import (
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
import os
import torch

os.environ["TOKENIZERS_PARALLELISM"] = "false"

if "llama" in config["model_name"]:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        config["model_name"],
        num_labels=6,
        id2label=id2label,
        label2id=label2id,
        token="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",
        quantization_config=quantization_config,
        device_map="auto",
    )


else:

    model = AutoModelForSequenceClassification.from_pretrained(
        config["model_name"],
        num_labels=6,
        id2label=id2label,
        label2id=label2id,
        token="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",
    )

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

lora_config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS",
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False
model.config.pretraining_tp = 1

In [14]:
# import torch.nn as nn

# model.dropout = nn.Dropout(config["dropout"])

In [15]:
def count_trainable_parameters(model):
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    return params


count_trainable_parameters(model)

13656064

In [16]:
from transformers import EarlyStoppingCallback
from transformers import TrainingArguments, Trainer

early_stopping = EarlyStoppingCallback(early_stopping_patience=7)

args = TrainingArguments(
    f"{config['model_name']}-v-{config['version']}",
    evaluation_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=config["lr"],
    per_device_train_batch_size=config["batch_size"],
    per_device_eval_batch_size=config["batch_size"],
    num_train_epochs=config["train_epochs"],
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    push_to_hub=False,
    fp16=True,
    fp16_full_eval=True,
    # bf16_full_eval=True,
    # bf16=True,
    save_total_limit=1,
    gradient_checkpointing=True,
    optim="adamw_torch",
    report_to="wandb",
    lr_scheduler_type="cosine",
    warmup_ratio=0.01,
    logging_strategy="epoch",
    run_name=f"{config['model_name']}-v-{config['version']}",
    dataloader_num_workers=config["num_workers"],
)



In [17]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    data_collator=data_collator,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [18]:
trainer.train()



Epoch,Training Loss,Validation Loss,F1,Precision,Recall,F1 Per Label
1,3.3536,2.517055,0.352346,0.411936,0.315789,"{'Advice': 0.0, 'Code': 0.12903225806451613, 'Comparison': 0.10526315789473684, 'Discovery': 0.524390243902439, 'Off-Topic': 0.0, 'Troubleshooting': 0.1}"
2,2.5663,1.958161,0.493073,0.457018,0.552632,"{'Advice': 0.0, 'Code': 0.2222222222222222, 'Comparison': 0.0, 'Discovery': 0.7476635514018691, 'Off-Topic': 0.0, 'Troubleshooting': 0.1111111111111111}"
3,1.9985,1.663658,0.571038,0.551156,0.611842,"{'Advice': 0.0, 'Code': 0.26666666666666666, 'Comparison': 0.0, 'Discovery': 0.780952380952381, 'Off-Topic': 0.23529411764705882, 'Troubleshooting': 0.3333333333333333}"
4,1.558,1.445163,0.614202,0.601202,0.657895,"{'Advice': 0.0, 'Code': 0.15384615384615385, 'Comparison': 0.18181818181818182, 'Discovery': 0.8075117370892019, 'Off-Topic': 0.25, 'Troubleshooting': 0.47619047619047616}"
5,1.1862,1.259983,0.649441,0.636122,0.690789,"{'Advice': 0.0, 'Code': 0.3076923076923077, 'Comparison': 0.2222222222222222, 'Discovery': 0.8301886792452831, 'Off-Topic': 0.35294117647058826, 'Troubleshooting': 0.5}"
6,0.881,1.145774,0.675524,0.662727,0.703947,"{'Advice': 0.0, 'Code': 0.46153846153846156, 'Comparison': 0.4, 'Discovery': 0.8252427184466019, 'Off-Topic': 0.42105263157894735, 'Troubleshooting': 0.5531914893617021}"
7,0.6543,1.066433,0.689768,0.678642,0.717105,"{'Advice': 0.0, 'Code': 0.46153846153846156, 'Comparison': 0.36363636363636365, 'Discovery': 0.8349514563106796, 'Off-Topic': 0.5263157894736842, 'Troubleshooting': 0.5652173913043478}"



Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3-8B.

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3-8B.

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3-8B.

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
Acces

In [18]:
import wandb

wandb.config.update(config)
wandb.finish()

In [None]:
text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model")
inputs = tokenizer(text, return_tensors="pt")

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model")
with torch.no_grad():
    logits = model(**inputs).logits

In [None]:
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]