In [1]:

%%capture
!pip install datasets mteb

In [2]:
from datasets import load_dataset

# GLUE에서 MNLI 데이터셋을 로드합니다.
# 0 = 수반, 1 = 중립, 2 = 모순
train_dataset = load_dataset("glue", "mnli", split="train").select(range(50_000))
train_dataset = train_dataset.remove_columns("idx")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
train_dataset[2]

{'premise': 'One of our number will carry out your instructions minutely.',
 'hypothesis': 'A member of my team will execute your orders with immense precision.',
 'label': 0}

In [3]:
from sentence_transformers import SentenceTransformer

# BERT 베이스 모델을 사용합니다.
embedding_model = SentenceTransformer('bert-base-uncased')



In [4]:

from sentence_transformers import losses

# 손실 함수를 정의합니다. 소프트맥스 손실을 위해 명시적으로 레이블의 개수를 지정해야 합니다.
train_loss = losses.SoftmaxLoss(
    model=embedding_model,
    sentence_embedding_dimension=embedding_model.get_sentence_embedding_dimension(),
    num_labels=3
)

In [5]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# STSB를 위해 임베딩 유사도 평가자를 만듭니다.
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine",
    similarity_fn_names=["cosine", "euclidean", "manhattan", "dot"]
)

In [6]:
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# 훈련 매개변수를 정의합니다.
args = SentenceTransformerTrainingArguments(
    output_dir="base_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
    report_to=[]
)

In [7]:

from sentence_transformers.trainer import SentenceTransformerTrainer

# 임베딩 모델을 훈련합니다.
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])


Step,Training Loss
100,1.0741
200,0.9439
300,0.8884
400,0.844
500,0.8302
600,0.8267
700,0.8087
800,0.7887
900,0.773
1000,0.7684


TrainOutput(global_step=1563, training_loss=0.8124786542915642, metrics={'train_runtime': 310.6076, 'train_samples_per_second': 160.975, 'train_steps_per_second': 5.032, 'total_flos': 0.0, 'train_loss': 0.8124786542915642, 'epoch': 1.0})

In [8]:
evaluator(embedding_model)

{'pearson_cosine': 0.5331466559843028,
 'spearman_cosine': 0.5967158406255931,
 'pearson_euclidean': 0.5680242156929675,
 'spearman_euclidean': 0.5944779600776491,
 'pearson_manhattan': 0.5777038567858184,
 'spearman_manhattan': 0.5978652826999742,
 'pearson_dot': 0.5057131124841571,
 'spearman_dot': 0.5410485642557797,
 'pearson_max': 0.5777038567858184,
 'spearman_max': 0.5978652826999742}

In [9]:
from mteb import MTEB

# 평가 작업을 선택합니다.
evaluation = MTEB(tasks=["Banking77Classification"])

# 결과를 계산합니다.
results = evaluation.run(embedding_model)
results



ValidationError: 1 validation error for ModelMeta
languages
  Value error, not enough values to unpack (expected 2, got 1) [type=value_error, input_value=['en'], input_type=list]
    For further information visit https://errors.pydantic.dev/2.11/v/value_error

In [14]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

In [20]:
from datasets import Dataset,load_dataset

train_dataset = load_dataset("glue","mnli",split = "train").select(range(50_000))
train_dataset = train_dataset.remove_columns('idx')

mapping = {2:0,1:0,0:1}

train_dataset = Dataset.from_dict({
    "sentence1":train_dataset["premise"],
    "sentence2":train_dataset["hypothesis"],
    "label":[float(mapping[label]) for label in train_dataset["label"]]
})

In [21]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# STSB를 위한 임베딩 유사도 평가자를 만듭니다.
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine",
    similarity_fn_names=["cosine", "euclidean", "manhattan", "dot"]
)

In [22]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# 모델
embedding_model = SentenceTransformer('bert-base-uncased')

# 손실 함수
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# 훈련 매개변수
args = SentenceTransformerTrainingArguments(
    output_dir="base_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
    report_to=[]
)

# 모델 훈련
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()



Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
100,0.2325
200,0.1706
300,0.1721
400,0.1598
500,0.1531
600,0.1594
700,0.1513
800,0.1561
900,0.1474
1000,0.1478


TrainOutput(global_step=1563, training_loss=0.1573614467052184, metrics={'train_runtime': 316.3851, 'train_samples_per_second': 158.035, 'train_steps_per_second': 4.94, 'total_flos': 0.0, 'train_loss': 0.1573614467052184, 'epoch': 1.0})

In [23]:
# 훈련된 모델을 평가합니다.
evaluator(embedding_model)

{'pearson_cosine': 0.7280159513154746,
 'spearman_cosine': 0.730839025712125,
 'pearson_euclidean': 0.740970026543814,
 'spearman_euclidean': 0.7389411362856845,
 'pearson_manhattan': 0.7412202492496264,
 'spearman_manhattan': 0.7393051899177047,
 'pearson_dot': 0.6727015457261541,
 'spearman_dot': 0.6734432014051451,
 'pearson_max': 0.7412202492496264,
 'spearman_max': 0.7393051899177047}

In [25]:

import gc
import torch

gc.collect()
torch.cuda.empty_cache()

In [24]:
import random
from tqdm import tqdm
from datasets import Dataset, load_dataset

mnli = load_dataset("glue", "mnli", split="train").select(range(50_000))
mnli = mnli.remove_columns("idx")
mnli = mnli.filter(lambda x: True if x['label'] == 0 else False)

train_dataset = {"anchor": [], "positive": [], "negative": []}
soft_negatives = mnli["hypothesis"]
random.shuffle(soft_negatives)
for row, soft_negative in tqdm(zip(mnli, soft_negatives)):
    train_dataset["anchor"].append(row["premise"])
    train_dataset["positive"].append(row["hypothesis"])
    train_dataset["negative"].append(soft_negative)
train_dataset = Dataset.from_dict(train_dataset)
len(train_dataset)

16875it [00:01, 15193.62it/s]


16875

In [26]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# STSB를 위해 임베딩 유사도 평가자를 만듭니다.
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine",
    similarity_fn_names=["cosine", "euclidean", "manhattan", "dot"]
)

In [27]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# 모델
embedding_model = SentenceTransformer('bert-base-uncased')

# 손실 함수
train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

# 훈련 매개변수
args = SentenceTransformerTrainingArguments(
    output_dir="mnrloss_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
    report_to=[]
)

# 모델 훈련
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()



Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
100,0.3292
200,0.1061
300,0.0763
400,0.0627
500,0.0688


TrainOutput(global_step=528, training_loss=0.12481096319176933, metrics={'train_runtime': 135.5581, 'train_samples_per_second': 124.485, 'train_steps_per_second': 3.895, 'total_flos': 0.0, 'train_loss': 0.12481096319176933, 'epoch': 1.0})

In [28]:
# 훈련된 모델을 평가합니다.
evaluator(embedding_model)

{'pearson_cosine': 0.808470579569148,
 'spearman_cosine': 0.8105882035236845,
 'pearson_euclidean': 0.8228737930534832,
 'spearman_euclidean': 0.8170865176167813,
 'pearson_manhattan': 0.8227436791532433,
 'spearman_manhattan': 0.8171403486147848,
 'pearson_dot': 0.74697460924984,
 'spearman_dot': 0.7339588054451895,
 'pearson_max': 0.8228737930534832,
 'spearman_max': 0.8171403486147848}

In [29]:

import gc
import torch

gc.collect()
torch.cuda.empty_cache()

In [30]:
from datasets import load_dataset
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# GLUE에서 MNLI 데이터셋을 로드합니다.
# 0 = 수반, 1 = 중립, 2 = 모순
train_dataset = load_dataset("glue", "mnli", split="train").select(range(50_000))
train_dataset = train_dataset.remove_columns("idx")

# STSB를 위해 임베딩 유사도 평가자를 만듭니다.
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine",
    similarity_fn_names=["cosine", "euclidean", "manhattan", "dot"]
)

In [31]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# 모델
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# 손실 함수
train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

# 훈련 매개변수
args = SentenceTransformerTrainingArguments(
    output_dir="finetuned_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
    report_to=[]
)

# 모델 훈련
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])


Step,Training Loss
100,0.1573
200,0.1105
300,0.1199
400,0.1188
500,0.1083
600,0.1011
700,0.1196
800,0.0987
900,0.1041
1000,0.1052


TrainOutput(global_step=1563, training_loss=0.10938254702342906, metrics={'train_runtime': 93.7355, 'train_samples_per_second': 533.416, 'train_steps_per_second': 16.675, 'total_flos': 0.0, 'train_loss': 0.10938254702342906, 'epoch': 1.0})

In [32]:
evaluator(embedding_model)

{'pearson_cosine': 0.8495060616360056,
 'spearman_cosine': 0.84888622639484,
 'pearson_euclidean': 0.8525534019383613,
 'spearman_euclidean': 0.84888622639484,
 'pearson_manhattan': 0.8516566173119285,
 'spearman_manhattan': 0.8481567027597154,
 'pearson_dot': 0.8495060585577587,
 'spearman_dot': 0.84888622639484,
 'pearson_max': 0.8525534019383613,
 'spearman_max': 0.84888622639484}

In [33]:
original_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
evaluator(original_model)

{'pearson_cosine': 0.8696194532655239,
 'spearman_cosine': 0.8671631197908374,
 'pearson_euclidean': 0.8678715917211143,
 'spearman_euclidean': 0.8671631197908374,
 'pearson_manhattan': 0.8670398993426292,
 'spearman_manhattan': 0.8663946139224048,
 'pearson_dot': 0.869619453465613,
 'spearman_dot': 0.8671631197908374,
 'pearson_max': 0.869619453465613,
 'spearman_max': 0.8671631197908374}

In [34]:
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset, Dataset
from sentence_transformers import InputExample
from sentence_transformers.datasets import NoDuplicatesDataLoader

dataset = load_dataset("glue", "mnli", split="train").select(range(10_000))
mapping = {2: 0, 1: 0, 0:1}

gold_examples = [
    InputExample(texts=[row["premise"], row["hypothesis"]], label=mapping[row["label"]])
    for row in tqdm(dataset)
]
gold_dataloader = NoDuplicatesDataLoader(gold_examples, batch_size=32)
gold = pd.DataFrame(
    {
    'sentence1': dataset['premise'],
    'sentence2': dataset['hypothesis'],
    'label': [mapping[label] for label in dataset['label']]
    }
)

100%|██████████| 10000/10000 [00:00<00:00, 28880.76it/s]


In [38]:
import os
os.environ["WANDB_MODE"] = "disabled"

In [41]:
from sentence_transformers.cross_encoder import CrossEncoder

# 골드 데이터셋에서 크로스 인코더를 훈련합니다.
cross_encoder = CrossEncoder('bert-base-uncased', num_labels=2)
cross_encoder.fit(
    train_dataloader=gold_dataloader,
    epochs=1,
    show_progress_bar=True,
    warmup_steps=100,
    use_amp=False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


In [44]:
silver = load_dataset("glue", "mnli", split="train").select(range(10_000, 50_000))
pairs = list(zip(silver['premise'], silver['hypothesis']))

In [45]:
import numpy as np

# 미세 튜닝된 크로스 인코더를 사용해 문장 쌍에 레이블을 할당합니다.
output = cross_encoder.predict(pairs, apply_softmax=True,
                               show_progress_bar=True)
silver = pd.DataFrame(
    {
        "sentence1": silver["premise"],
        "sentence2": silver["hypothesis"],
        "label": np.argmax(output, axis=1)
    }
)

Batches:   0%|          | 0/1250 [00:00<?, ?it/s]

In [46]:
data = pd.concat([gold, silver], ignore_index=True, axis=0)
data = data.drop_duplicates(subset=['sentence1', 'sentence2'], keep="first")
train_dataset = Dataset.from_pandas(data, preserve_index=False)

In [47]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# STSB를 위한 임베딩 유사도 평가자를 만듭니다.
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine",
    similarity_fn_names=["cosine", "euclidean", "manhattan", "dot"]
)

In [48]:
from sentence_transformers import losses, SentenceTransformer
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# 모델
embedding_model = SentenceTransformer('bert-base-uncased')

# 손실 함수
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# 훈련 매개변수
args = SentenceTransformerTrainingArguments(
    output_dir="augmented_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
    report_to=[]
)

# 모델 훈련
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()



Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
100,0.2164
200,0.1593
300,0.1471
400,0.1422
500,0.1398
600,0.136
700,0.1345
800,0.1373
900,0.1342
1000,0.133


TrainOutput(global_step=1563, training_loss=0.1410853144303393, metrics={'train_runtime': 312.6878, 'train_samples_per_second': 159.897, 'train_steps_per_second': 4.999, 'total_flos': 0.0, 'train_loss': 0.1410853144303393, 'epoch': 1.0})

In [49]:
evaluator(embedding_model)


{'pearson_cosine': 0.704614240559309,
 'spearman_cosine': 0.7137668270976784,
 'pearson_euclidean': 0.7243127511484728,
 'spearman_euclidean': 0.7224954665733628,
 'pearson_manhattan': 0.7244276815549119,
 'spearman_manhattan': 0.7229148705302998,
 'pearson_dot': 0.6560921552623229,
 'spearman_dot': 0.6581639702202152,
 'pearson_max': 0.7244276815549119,
 'spearman_max': 0.7229148705302998}

In [52]:
trainer.accelerator.clear()

[]

In [50]:
data = pd.concat([gold], ignore_index=True, axis=0)
data = data.drop_duplicates(subset=['sentence1', 'sentence2'], keep="first")
train_dataset = Dataset.from_pandas(data, preserve_index=False)

# 모델
embedding_model = SentenceTransformer('bert-base-uncased')

# 손실 함수
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# 훈련 매개변수
args = SentenceTransformerTrainingArguments(
    output_dir="gold_only_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
    report_to=[]
)

# 모델 훈련
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()



Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
100,0.2268
200,0.1714
300,0.16


TrainOutput(global_step=313, training_loss=0.18524515514556592, metrics={'train_runtime': 62.1577, 'train_samples_per_second': 160.881, 'train_steps_per_second': 5.036, 'total_flos': 0.0, 'train_loss': 0.18524515514556592, 'epoch': 1.0})

In [51]:
evaluator(embedding_model)

{'pearson_cosine': 0.6209082962469472,
 'spearman_cosine': 0.6476197138331911,
 'pearson_euclidean': 0.6507275662728851,
 'spearman_euclidean': 0.660126546307706,
 'pearson_manhattan': 0.6525194330292942,
 'spearman_manhattan': 0.6616165544534143,
 'pearson_dot': 0.5484171045884517,
 'spearman_dot': 0.546236683013587,
 'pearson_max': 0.6525194330292942,
 'spearman_max': 0.6616165544534143}

In [53]:
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

In [54]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [56]:
from tqdm import tqdm
from datasets import Dataset, load_dataset
from sentence_transformers.datasets import DenoisingAutoEncoderDataset

mnli = load_dataset("glue","mnli",split="train").select(range(25_000))
flat_sentences = mnli["premise"]+mnli["hypothesis"]

damaged_data = DenoisingAutoEncoderDataset(list(set(flat_sentences)))

train_dataset = {"damaged_sentence":[],"original_sentence":[]}
for data in tqdm(damaged_data):
  train_dataset["damaged_sentence"].append(data.texts[0])
  train_dataset["original_sentence"].append(data.texts[1])
train_dataset = Dataset.from_dict(train_dataset)

100%|██████████| 48353/48353 [00:08<00:00, 5632.51it/s]


In [57]:

train_dataset[0]

{'damaged_sentence': 'was Johnson, trying to Kennedy out him',
 'original_sentence': 'This was Johnson at his most evocative, trying to ease Kennedy out of his way while sucking up to him.'}

In [58]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# STSB를 위한 임베딩 유사도 평가자를 만듭니다.
val_sts = load_dataset('glue', 'stsb', split='validation')
evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts["sentence1"],
    sentences2=val_sts["sentence2"],
    scores=[score/5 for score in val_sts["label"]],
    main_similarity="cosine",
    similarity_fn_names=["cosine", "euclidean", "manhattan", "dot"]
)

In [59]:
from sentence_transformers import models, SentenceTransformer

# 임베딩 모델을 만듭니다.
word_embedding_model = models.Transformer('bert-base-uncased')
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 'cls')
embedding_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

In [60]:
from sentence_transformers import losses

# 잡음제거 오토 인코더 손실
train_loss = losses.DenoisingAutoEncoderLoss(
    embedding_model, tie_encoder_decoder=True
)
train_loss.decoder = train_loss.decoder.to("cuda")

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

In [61]:
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

args = SentenceTransformerTrainingArguments(
    output_dir="tsdae_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
    report_to=[]
)

In [62]:
from sentence_transformers.trainer import SentenceTransformerTrainer
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# 훈련 매개변수
args = SentenceTransformerTrainingArguments(
    output_dir="tsdae_embedding_model",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100,
    report_to=[]
)

# 모델 훈련
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)
trainer.train()

Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss
100,7.1578
200,4.9332
300,4.6254
400,4.4809
500,4.3869
600,4.2941
700,4.2151
800,4.1648
900,4.0798
1000,4.0767


TrainOutput(global_step=3023, training_loss=4.046167164814224, metrics={'train_runtime': 899.6803, 'train_samples_per_second': 53.745, 'train_steps_per_second': 3.36, 'total_flos': 0.0, 'train_loss': 4.046167164814224, 'epoch': 1.0})

In [63]:
evaluator(embedding_model)

{'pearson_cosine': 0.7338751127785723,
 'spearman_cosine': 0.7413590222959303,
 'pearson_euclidean': 0.7355969003015073,
 'spearman_euclidean': 0.7381553706188639,
 'pearson_manhattan': 0.73555541544949,
 'spearman_manhattan': 0.738128216451968,
 'pearson_dot': 0.6415048620695478,
 'spearman_dot': 0.6393363824939999,
 'pearson_max': 0.7355969003015073,
 'spearman_max': 0.7413590222959303}