In [1]:
import time
import torch
import pandas as pd
from datasets import load_dataset
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer
from torch.cuda import is_available

# Pick the device every model in this notebook is moved to.
# The second GPU (index 1) is preferred, as before. On a host with a single
# visible GPU that index does not exist, so fall back to the first GPU instead
# of failing later when the model is moved. Without CUDA, run on the CPU.
if is_available():
    device = 'cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0'
else:
    device = 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_custom_dataset(category: str, num_samples: int):
    """Load the train/eval CSV splits for one category.

    The CSV paths are derived from the arguments:
    ``data/df_{category}_{num_samples}_train.csv`` becomes the ``"train"``
    split and ``data/df_{category}_{num_samples}_test.csv`` becomes the
    ``"eval"`` split.

    Args:
        category: Category name embedded in the CSV file names.
        num_samples: Sample count embedded in the CSV file names.

    Returns:
        The object returned by ``load_dataset`` for the ``'csv'`` builder,
        containing the ``"train"`` and ``"eval"`` splits.
    """
    train_files = [f"data/df_{category}_{num_samples}_train.csv"]
    eval_files = [f"data/df_{category}_{num_samples}_test.csv"]
    split_files = {"train": train_files, "eval": eval_files}

    # The datasets cache is placed in the current working directory.
    return load_dataset('csv', data_files=split_files, cache_dir="./")

In [3]:
def load_model():
    """Create a SetFit model from the ``jhgan/ko-sroberta-multitask`` checkpoint.

    The freshly loaded model is moved to the notebook-level ``device`` and
    then returned, so every call yields a new, untrained-for-this-task model.
    """
    setfit_model = SetFitModel.from_pretrained("jhgan/ko-sroberta-multitask")
    setfit_model.to(device)
    return setfit_model

In [4]:
def load_trainer(model, dataset, category):
    """Build a ``SetFitTrainer`` for one category; training is not started here.

    Args:
        model: The model to train (e.g. the result of ``load_model``).
        dataset: Mapping that provides the ``'train'`` and ``'eval'`` splits.
        category: Name of the dataset column that is mapped to ``"label"``.

    Returns:
        The configured ``SetFitTrainer`` instance.
    """
    # Map dataset columns to the text/label names expected by the trainer.
    columns = {"text": "text", category: "label"}

    trainer_kwargs = dict(
        model=model,
        train_dataset=dataset['train'],
        eval_dataset=dataset['eval'],
        loss_class=CosineSimilarityLoss,
        metric="f1",
        batch_size=16,
        # Number of text pairs to generate for contrastive learning.
        num_iterations=20,
        # Number of epochs to use for contrastive learning.
        num_epochs=1,
        column_mapping=columns,
    )
    return SetFitTrainer(**trainer_kwargs)

In [5]:
# Category name as used in the data (CSV file name and label column)
# -> ASCII slug used in the names of the output artefacts.
categories = {
    # NOTE(review): "disign" looks like a typo for "design". It is kept as-is
    # because it determines the output file names written below; rename only
    # together with whatever reads those files.
    "디자인": "disign",
    "서비스": "service",
    "브랜드제품기타": "brand",
    "가격": "price",
    "품질": "quality"
}

# Sample count that is part of the input CSV names and of the output names.
num_samples = 20000

# Train and evaluate one independent model per category.
model = trainer = None
for category, category_en in categories.items():
    # Drop the previous category's model/trainer BEFORE loading the next one.
    # Otherwise the old model stays referenced until the new one has been
    # built, so two models are alive on the device at the same time.
    model = trainer = None
    if is_available():
        torch.cuda.empty_cache()

    # One-row result table for this category (written to its own CSV below).
    contents = {
        "category": [],
        "f1": []
    }

    model = load_model()
    dataset = load_custom_dataset(category=category, num_samples=num_samples)
    print(dataset)

    trainer = load_trainer(model, dataset, category)
    trainer.train()
    metrics = trainer.evaluate()

    contents["category"].append(category)
    contents["f1"].append(metrics.get('f1'))
    print(metrics, '\n')

    # Persist the metric and the trained model under the category's slug.
    df = pd.DataFrame(contents)
    df.to_csv(f"result-{category_en}-{num_samples}.csv")
    model.save_pretrained(f"{category_en}-{num_samples}.pt")

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Found cached dataset csv (/home/shawn/workspace/setfit_modify/csv/default-0afb0212e27b43bf/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 2/2 [00:00<00:00, 566.99it/s]
Applying column mapping to training dataset


DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', '가격'],
        num_rows: 20000
    })
    eval: Dataset({
        features: ['Unnamed: 0', 'text', '가격'],
        num_rows: 3040
    })
})


***** Running training *****
  Num examples = 800000
  Num epochs = 1
  Total optimization steps = 50000
  Total train batch size = 16
Iteration: 100%|██████████| 50000/50000 [1:29:10<00:00,  9.34it/s]
Epoch: 100%|██████████| 1/1 [1:29:10<00:00, 5350.98s/it]
Applying column mapping to evaluation dataset
***** Running evaluation *****


{'f1': 0.9516235917826376} 



model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


Downloading and preparing dataset csv/default to /home/shawn/workspace/setfit_modify/csv/default-89a294f39df41b18/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|██████████| 2/2 [00:00<00:00, 5928.34it/s]
Extracting data files: 100%|██████████| 2/2 [00:00<00:00, 347.18it/s]
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)
                                                       

Dataset csv downloaded and prepared to /home/shawn/workspace/setfit_modify/csv/default-89a294f39df41b18/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|██████████| 2/2 [00:00<00:00, 624.90it/s]
Applying column mapping to training dataset


DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', '품질'],
        num_rows: 20000
    })
    eval: Dataset({
        features: ['Unnamed: 0', 'text', '품질'],
        num_rows: 12992
    })
})


***** Running training *****
  Num examples = 800000
  Num epochs = 1
  Total optimization steps = 50000
  Total train batch size = 16
Iteration: 100%|██████████| 50000/50000 [1:29:51<00:00,  9.27it/s]
Epoch: 100%|██████████| 1/1 [1:29:51<00:00, 5391.13s/it]
Applying column mapping to evaluation dataset
***** Running evaluation *****


{'f1': 0.8855621575738869} 

