In [2]:
! pip install -U accelerate



In [3]:
!pip install transformers==4.45.2 sentence-transformers==3.1.1



In [4]:
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments, SentencesDataset, losses, CrossEncoder, InputExample, evaluation
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
from datasets import Dataset
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator
from sklearn.model_selection import train_test_split
import torch
import random
from transformers import TrainerCallback

def validation_callback(score, epoch, steps):
    print(f"Epoch {epoch}, Step {steps}: Validation score = {score:.4f}")

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"

import torch
torch.cuda.empty_cache()

  from tqdm.autonotebook import tqdm, trange


In [5]:
seed = 42

random.seed(seed)
np.random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [6]:
train = pd.read_csv('/home/pret/PycharmProjects/Vseros_classification/Datasets/Patents/us-patent-phrase-to-phrase-matching/train.csv')
train['combined'] = train['context'] + ' [SEP] ' +  train['anchor'] # </s>
test = pd.read_csv('/home/pret/PycharmProjects/Vseros_classification/Datasets/Patents/us-patent-phrase-to-phrase-matching/test.csv')
test['combined'] = test['context'] + ' [SEP] ' +  test['anchor'] # </s>

train = train.drop(columns = ['id'])
train

Unnamed: 0,anchor,target,context,score,combined
0,abatement,abatement of pollution,A47,0.50,A47 [SEP] abatement
1,abatement,act of abating,A47,0.75,A47 [SEP] abatement
2,abatement,active catalyst,A47,0.25,A47 [SEP] abatement
3,abatement,eliminating process,A47,0.50,A47 [SEP] abatement
4,abatement,forest region,A47,0.00,A47 [SEP] abatement
...,...,...,...,...,...
36468,wood article,wooden article,B44,1.00,B44 [SEP] wood article
36469,wood article,wooden box,B44,0.50,B44 [SEP] wood article
36470,wood article,wooden handle,B44,0.50,B44 [SEP] wood article
36471,wood article,wooden material,B44,0.75,B44 [SEP] wood article


In [6]:
DEBUGE = True

if DEBUGE == True:
    train = train[:10]

In [30]:
train['score'].values#.value_counts()

array([0.5 , 0.75, 0.25, ..., 0.5 , 0.75, 0.5 ])

In [76]:
Train, Val = train_test_split(train, test_size=0.2, random_state=42)

train = [
    InputExample(
        texts=[f"{row['context']}: {row['anchor']}", row['target']], 
        label=row['score']
    ) 
    for _, row in Train.iterrows()
]

val = [
    InputExample(
        texts=[f"{row['context']}: {row['anchor']}", row['target']], 
        label=row['score']
    ) 
    for _, row in Val.iterrows()
]

train_dataloader = DataLoader(train, shuffle=True, batch_size=64)
evaluator = CECorrelationEvaluator.from_input_examples(val, name="val-eval")

model = CrossEncoder('bert-base-uncased', num_labels=1, device='cuda')#CrossEncoder("deepvk/USER-bge-m3")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [77]:
model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,  
    epochs=1,
    warmup_steps=100,
    evaluation_steps=100,
    callback=validation_callback,
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/456 [00:00<?, ?it/s]

Epoch 0, Step 100: Validation score = 0.6111
Epoch 0, Step 200: Validation score = 0.7148
Epoch 0, Step 300: Validation score = 0.7356
Epoch 0, Step 400: Validation score = 0.7481
Epoch 0, Step -1: Validation score = 0.7493


In [84]:
test_data = [[f"{row['context']}: {row['anchor']}", row['target']] for _, row in test.iterrows()]

predictions = model.predict(test_data)
predictions

array([0.50035423, 0.62227666, 0.2462741 , 0.32038915, 0.29248467,
       0.5420276 , 0.3847338 , 0.03904689, 0.19619338, 0.77877873,
       0.29205957, 0.3865197 , 0.53094923, 0.68894434, 0.796887  ,
       0.3746111 , 0.2211302 , 0.0938981 , 0.3784528 , 0.2990763 ,
       0.4139352 , 0.13323483, 0.20715374, 0.3177856 , 0.47404665,
       0.06920658, 0.06518138, 0.02806062, 0.05655253, 0.759483  ,
       0.24089573, 0.0472682 , 0.7164284 , 0.37073332, 0.34768647,
       0.33161643], dtype=float32)

In [None]:
sample = pd.read_csv('/home/pret/PycharmProjects/Vseros_classification/Datasets/Patents/us-patent-phrase-to-phrase-matching/sample_submission.csv')
sample['score'] = predictions
sample.to_csv('/home/pret/PycharmProjects/Vseros_classification/Submissions/Patents/submit.csv')

In [7]:
Train, Val = train_test_split(train, test_size=0.2, random_state=42)

train_dataset = Dataset.from_dict({
    "sentence1": [row['combined'] for _, row in train.iterrows()],
    "sentence2": [row['target'] for _, row in train.iterrows()],
    "score": [row['score'] for _, row in train.iterrows()],
})

val_dataset = Dataset.from_dict({
    "sentence1": [row['combined'] for _, row in Val.iterrows()],
    "sentence2": [row['target'] for _, row in Val.iterrows()],
    "score": [row['score'] for _, row in Val.iterrows()],
})

evaluator = evaluation.EmbeddingSimilarityEvaluator(
    val_dataset["sentence1"],
    val_dataset["sentence2"],
    val_dataset["score"]
)

# Функция для вычисления метрик
# def compute_metrics(pred):
#     predictions = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
#     labels = pred.label_ids
    
#     # Косинусная схожесть
#     cosine_sim = np.dot(predictions, labels.T) / (np.linalg.norm(predictions, axis=1) * np.linalg.norm(labels, axis=1))
    
#     # Рассчитываем корреляцию Пирсона
#     pearson_corr = np.corrcoef(cosine_sim, labels)[0, 1]
    
#     return {"pearson_corr": pearson_corr}

# Объявление кастомного колбэка для вывода метрик
# class CustomCallback(TrainerCallback):
#     def on_evaluate(self, args, state, control, logs=None, **kwargs):
#         print(f"Validation metrics at step {state.global_step}: {logs}")
    
#     def on_log(self, args, state, control, logs=None, **kwargs):
#         if "eval_loss" in logs:
#             print(f"Step {state.global_step} - Eval Loss: {logs['eval_loss']}")
#         else:
#             print(f"Step {state.global_step} - Training Loss: {logs.get('loss', 'N/A')}")
#         if "pearson_corr" in logs:
#             print(f"Step {state.global_step} - Pearson Correlation: {logs['pearson_corr']}")

# Настройки для тренировки
training_args = SentenceTransformerTrainingArguments(
    output_dir="/home/pret/PycharmProjects/Vseros_classification/Models/mpnet-base",
    num_train_epochs=3,
    eval_strategy="steps",
    eval_steps=500,
    logging_dir="/home/pret/PycharmProjects/Vseros_classification/Models/",
    logging_steps=100,
    save_steps=1000,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=100,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True
)

# Инициализация модели и loss
model = SentenceTransformer("all-mpnet-base-v2")
loss = losses.CoSENTLoss(model)

# Обучение модели с Trainer
trainer = SentenceTransformerTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    evaluator=evaluator,
 #   compute_metrics=compute_metrics,
    loss=loss,
 #   callbacks=[CustomCallback()],
    args=training_args
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss,Validation Loss,Pearson Cosine,Spearman Cosine,Pearson Manhattan,Spearman Manhattan,Pearson Euclidean,Spearman Euclidean,Pearson Dot,Spearman Dot,Pearson Max,Spearman Max
500,6.7858,6.513357,0.772795,0.805656,0.796361,0.804529,0.797919,0.805656,0.772795,0.805656,0.797919,0.805656
1000,6.5297,6.151829,0.829306,0.873796,0.856665,0.873032,0.858287,0.873796,0.829306,0.873795,0.858287,0.873796
1500,6.125,5.785528,0.860601,0.901102,0.884859,0.900015,0.886709,0.901102,0.860601,0.901102,0.886709,0.901102


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=1710, training_loss=6.59267731159054, metrics={'train_runtime': 313.0144, 'train_samples_per_second': 349.565, 'train_steps_per_second': 5.463, 'total_flos': 0.0, 'train_loss': 6.59267731159054, 'epoch': 3.0})

In [21]:
a = model.encode('G02 [SEP] opc drum')
b = model.encode('')
model.similarity(a, b)[0][0].item()

0.6438003182411194

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

similarities = [model.similarity(model.encode(row['combined']), model.encode(row['target']))[0][0].item() for _, row in test.iterrows()]
#similarities = [cosine_similarity([model.encode(row['combined'])], [model.encode(row['target'])])[0][0] for _, row in test.iterrows()]
     
sample = pd.read_csv('/home/pret/PycharmProjects/Vseros_classification/Datasets/Patents/us-patent-phrase-to-phrase-matching/sample_submission.csv')
sample['score'] = similarities
sample

Unnamed: 0,id,score
0,4112d61851461f60,0.904539
1,09e418c93a776564,0.971072
2,36baf228038e314b,0.892689
3,1f37ead645e7f0c8,0.82058
4,71a5b6ad068d531f,0.730881
5,474c874d0c07bd21,0.919533
6,442c114ed5c4e3c9,0.880129
7,b8ae62ea5e1d8bdb,0.754825
8,faaddaf8fcba8a3f,0.825281
9,ae0262c02566d2ce,0.978021


In [33]:
model.tokenizer

MPNetTokenizerFast(name_or_path='sentence-transformers/all-mpnet-base-v2', vocab_size=30527, model_max_length=384, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '[UNK]', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	104: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	30526: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, nor