In [15]:
!pip install transformers datasets accelerate scikit-learn evaluate --quiet

[0m

### Data preparation and splitting 

In [1]:
import pandas as pd
import json


# Read explicitly as UTF-8: the Q&A records contain non-ASCII text (e.g. Chinese
# questions), and the platform's default text encoding is not guaranteed to be UTF-8.
with open("data.json", "r", encoding="utf-8") as f:
    data = json.load(f)["data"]

# Build the frame from the records stored under the top-level "data" key.
df = pd.DataFrame(data)

df.head()

Unnamed: 0,category,question,answer
0,Troubleshooting,Build Error\nWhen I try to import an image sta...,Next.js handles static image imports slightly ...
1,Discovery,How can I specify my own character to separate...,"I'm sorry, but based on the provided knowledge..."
2,Discovery,When can I buy Tari?,"As of the latest updates, the Tari main net ha..."
3,Off-Topic,Invest,"I'm sorry, but your question is a bit vague. I..."
4,Discovery,I there a way to define a global variable that...,"Yes, in Mage, you can define global variables ..."


In [2]:
# Label distribution before any cleanup.
df.loc[:, "category"].value_counts()

category
Discovery          734
Troubleshooting    199
Off-Topic           83
Code                59
Advice              53
Comparison          50
Off-topic            7
Name: count, dtype: int64

In [3]:
# "Off-topic" and "Off-Topic" are two spellings of the same category; merge them
# under "Off-Topic". Series.replace matches whole cell values only, so no other
# label can be altered by accident the way a substring replacement could.
df["category"] = df["category"].replace({"Off-topic": "Off-Topic"})

df["category"].value_counts()

category
Discovery          734
Troubleshooting    199
Off-Topic           90
Code                59
Advice              53
Comparison          50
Name: count, dtype: int64

In [4]:
# the labels are imbalanced, so stratify every split by category

from sklearn.model_selection import train_test_split


# Both splits share the same settings: 15% held out, fixed seed, shuffled.
split_kwargs = dict(test_size=0.15, random_state=42, shuffle=True)

# First carve the test set out of the full frame, then carve the validation set
# out of what remains; each split is stratified on its own category column.
train_val, test = train_test_split(df, stratify=df["category"], **split_kwargs)
train, val = train_test_split(
    train_val, stratify=train_val["category"], **split_kwargs
)

train.head()

Unnamed: 0,category,question,answer
72,Code,can you please convert this sample code in jav...,"Sure, I can help you convert the JavaScript co..."
351,Discovery,how can i get mask all coordinate,"Hmm, I don't know enough to give you a confide..."
212,Discovery,if a vulnerable package is already cached insi...,"No, if a vulnerable package is already cached ..."
17,Discovery,how to get SMS for login,FusionAuth supports SMS-based multi-factor aut...
7,Discovery,Has anyone mentioned that the changelog or wee...,"I'm sorry, but the provided knowledge sources ..."


In [5]:
# Reset each split's index, turn the texts into labelled prompts, and persist the splits.
# NOTE: the prefixes are applied to whatever is currently bound to `train`/`val`/`test`,
# so running this cell twice on the same frames prefixes the text twice.


def _with_prompt_columns(frame):
    """Return `frame` re-indexed from 0 with prefixed texts and a `combined` column."""
    out = frame.reset_index(drop=True)
    out["question"] = out["question"].map("Question: {}".format)
    out["answer"] = out["answer"].map("Answer: {}".format)
    # `combined` is the prefixed question and the prefixed answer joined by a newline.
    out["combined"] = out["question"] + "\n" + out["answer"]
    return out


train, val, test = (_with_prompt_columns(split) for split in (train, val, test))

for path, split in (
    ("train.parquet", train),
    ("val.parquet", val),
    ("test.parquet", test),
):
    split.to_parquet(path)

In [1]:
# read train, val and test from parquet

import pandas as pd


# Load the three persisted splits back, in train / val / test order.
train, val, test = map(
    pd.read_parquet, ("train.parquet", "val.parquet", "test.parquet")
)

# Label distribution of the training split.
train["category"].value_counts()

category
Discovery          530
Troubleshooting    143
Off-Topic           65
Code                42
Advice              38
Comparison          37
Name: count, dtype: int64

In [2]:
# Print the number of missing values per column for every split (train, val, test).
for split in (train, val, test):
    print(split.isna().sum())

category    0
question    0
answer      0
combined    0
dtype: int64
category    0
question    0
answer      0
combined    0
dtype: int64
category    0
question    0
answer      0
combined    0
dtype: int64


In [3]:
# Preview the first five rows of the test split.
test.head(5)

Unnamed: 0,category,question,answer,combined
0,Discovery,"Question: OAP在Receiver模式下, 必须要配置成集群模式吗?",Answer: 是的，OAP在Receiver模式下应该配置成集群模式。在集群模式下，所有的...,"Question: OAP在Receiver模式下, 必须要配置成集群模式吗?\nAnswe..."
1,Troubleshooting,Question: Build Error\nWhen I try to import an...,Answer: Next.js handles static image imports s...,Question: Build Error\nWhen I try to import an...
2,Off-Topic,Question: Hello there,Answer: Hello! How can I assist you with Typef...,Question: Hello there\nAnswer: Hello! How can ...
3,Off-Topic,Question: hi,Answer: Hello! How can I assist you with kapa....,Question: hi\nAnswer: Hello! How can I assist ...
4,Discovery,Question: creating a new project in amplitude,Answer: Creating a new project in Amplitude in...,Question: creating a new project in amplitude ...


In [4]:
from datasets import Dataset

# Encode the string labels of the training split as a ClassLabel feature. The integer
# ids are assigned by class_encode_column (alphabetical for this data), not by any
# hand-written order, so the name -> id mapping is read back from the feature itself
# instead of being typed out by hand.
train_dataset = Dataset.from_pandas(train, split="train").class_encode_column(
    "category"
)
class_label_feature = train_dataset.features["category"]

label2id = {
    name: class_label_feature.str2int(name) for name in class_label_feature.names
}

# Cast validation and test labels with the *same* feature so that every split
# shares one label <-> id mapping.
val_dataset = Dataset.from_pandas(val, split="val").cast_column(
    "category", class_label_feature
)
test_dataset = Dataset.from_pandas(test, split="test").cast_column(
    "category", class_label_feature
)

Casting to class labels:   0%|          | 0/855 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/152 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/178 [00:00<?, ? examples/s]

In [5]:
# Build both lookup tables from the ClassLabel's public `names` list (a name's position
# is its class id) rather than reaching into the private `_str2int` attribute.
label_names = train_dataset.features["category"].names
label2id = {name: idx for idx, name in enumerate(label_names)}
id2label = dict(enumerate(label_names))

id2label

{0: 'Advice',
 1: 'Code',
 2: 'Comparison',
 3: 'Discovery',
 4: 'Off-Topic',
 5: 'Troubleshooting'}

In [6]:
# Run configuration: the checkpoint to load and the text column fed to the tokenizer.
config = dict(
    model_name="bert-base-multilingual-cased",
    input_type="question",
)

In [7]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the configured checkpoint.
checkpoint = config["model_name"]
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def preprocess_function(examples):
    """Tokenize the configured text column of a batch of examples.

    The column is chosen by ``config["input_type"]``; truncation is enabled.
    Returns whatever the module-level ``tokenizer`` returns for those texts.
    """
    text_column = config["input_type"]
    texts = examples[text_column]
    return tokenizer(texts, truncation=True)


# Tokenize every split in batched mode with the same preprocessing function.
tokenized_train, tokenized_val, tokenized_test = (
    dataset.map(preprocess_function, batched=True)
    for dataset in (train_dataset, val_dataset, test_dataset)
)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



Map:   0%|          | 0/855 [00:00<?, ? examples/s]

Map:   0%|          | 0/152 [00:00<?, ? examples/s]

Map:   0%|          | 0/178 [00:00<?, ? examples/s]

In [8]:
# Sanity check: decode the token ids of the first test example back into text.
first_test_ids = tokenized_test[0]["input_ids"]
tokenizer.decode(first_test_ids)

'[CLS] Question : OAP 在 Receiver 模 式 下, 必 须 要 配 置 成 集 群 模 式 吗? [SEP]'

In [9]:
# Original question text of the first test example.
first_test_example = tokenized_test[0]
first_test_example["question"]

'Question: OAP在Receiver模式下, 必须要配置成集群模式吗?'

In [10]:
from transformers import DataCollatorWithPadding

# Padding collator built around the tokenizer loaded for the configured checkpoint.
data_collator = DataCollatorWithPadding(tokenizer)

In [11]:
import evaluate
import numpy as np


def compute_metrics(eval_preds):
    """Compute weighted and per-label classification metrics for one evaluation pass.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted class of each row is
            the argmax over the last axis of ``logits``.

    Returns:
        A dict with the weighted ``"f1"``, ``"precision"`` and ``"recall"``, plus
        ``"f1_per_label"``, which maps every label name in ``id2label`` to its F1.
    """
    logits, references = eval_preds
    predictions = np.argmax(logits, axis=-1)

    weighted = evaluate.combine(["f1", "precision", "recall"]).compute(
        predictions=predictions, references=references, average="weighted"
    )

    # Ask for a score for every known label id explicitly. Without `labels`, scores are
    # only produced for classes that occur in the predictions or references, so a class
    # missing from both would shorten the array and shift every following score onto
    # the wrong label name.
    label_ids = sorted(id2label)
    per_label = evaluate.load("f1").compute(
        predictions=predictions,
        references=references,
        average=None,
        labels=label_ids,
    )
    f1_per_label_dict = {
        id2label[label_id]: score
        for label_id, score in zip(label_ids, per_label["f1"])
    }

    return {
        "f1": weighted["f1"],
        "precision": weighted["precision"],
        "recall": weighted["recall"],
        "f1_per_label": f1_per_label_dict,
    }

In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Size the classification head from the label mapping instead of a hard-coded count,
# so it cannot drift out of sync with the classes present in the dataset.
model = AutoModelForSequenceClassification.from_pretrained(
    config["model_name"],
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]