In [None]:
!pip install pyyaml
!pip install pandas
!pip install transformers datasets torch accelerate
!pip install datasets
!pip install jsonlines
!pip install rank_bm25


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [2]:
import random
import os
import pandas as pd
import yaml
import json
from datasets import Dataset, concatenate_datasets
from google.colab import drive
# Mount Google Drive at /content/drive; the data paths used in this notebook live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Build (anchor, positive) training pairs:
#   anchor   - one query sampled from each JSONL record,
#   positive - the text of the YAML process description named after the record's UUID.
# The pairs are written to `output_file` as JSON Lines.

# Input directories and output location on the mounted Google Drive.
json_directory = "/content/drive/My Drive/out"
yaml_directory = "/content/drive/My Drive/processes_yaml_2"
output_file = "/content/drive/My Drive/jina/positive_pairs.jsonl"

# JSONL batch files to process.
file_names = [
    "batch_gpt-4o-mini_0_200.jsonl",
    "batch_gpt-4o-mini_200_400.jsonl",
    "batch_gpt-4o-mini_400_600.jsonl",
    "batch_gpt-4o-mini_600_800.jsonl",
    "batch_gpt-4o-mini_800_1000.jsonl"
]

# One of these record fields is sampled per record and used as the anchor.
query_keys = ["basic_query", "general_query", "specific_query"]

# Private, seeded generator: re-running the cell reproduces the same pairs
# and leaves the global `random` state untouched.
pair_rng = random.Random(42)

pairs = []

# Intended to map query -> list of texts; this cell never fills it.
# Left defined in case a cell outside this view reads it.
result = {}

for file_name in file_names:
    json_file_path = os.path.join(json_directory, file_name)
    print(f"Обработка файла: {json_file_path}")

    # A missing batch file is reported and skipped, not treated as fatal.
    if not os.path.exists(json_file_path):
        print(f"Файл {json_file_path} не найден. Пропускаем.")
        continue

    with open(json_file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            # Blank lines (e.g. a trailing newline) are not records; skip them quietly.
            if not line:
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Ошибка при чтении строки: {line}. Ошибка: {e}")
                continue

            # A valid JSON line that is not an object has no UUID either.
            process_uuid = data.get("process_uuid") if isinstance(data, dict) else None
            if not process_uuid:
                print(f"UUID не найден в строке: {data}")
                continue

            # Sample which of the query variants becomes the anchor.
            selected_key = pair_rng.choice(query_keys)
            query = data.get(selected_key, "")

            # The matching process description is stored as <uuid>.yaml.
            yaml_file_path = os.path.join(yaml_directory, f"{process_uuid}.yaml")
            yaml_text = ""

            if os.path.exists(yaml_file_path):
                with open(yaml_file_path, "r", encoding="utf-8") as yaml_file:
                    try:
                        yaml_data = yaml.safe_load(yaml_file)
                        if yaml_data is None:
                            # An empty document would otherwise be dumped as "null".
                            print(f"YAML-файл {yaml_file_path} пуст. Пропускаем.")
                        else:
                            # Re-serialise the parsed document to text.
                            yaml_text = yaml.dump(yaml_data, allow_unicode=True)
                    except yaml.YAMLError as e:
                        print(f"Ошибка при чтении YAML-файла {yaml_file_path}: {e}")
            else:
                print(f"YAML-файл для UUID {process_uuid} не найден: {yaml_file_path}")

            # Keep the pair only when both sides are non-empty; quote characters
            # in the positive text are replaced by spaces.
            if query and yaml_text:
                positive = yaml_text.replace("'", " ").replace('"', " ")
                pairs.append({"anchor": query, "positive": positive})

# Make sure the target folder exists before writing.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Save the pairs, one JSON object per line.
with open(output_file, "w", encoding="utf-8") as file:
    for pair in pairs:
        file.write(json.dumps(pair, ensure_ascii=False) + "\n")

print(f"Результат сохранен в файл {output_file}.")

Обработка файла: /content/drive/My Drive/out/batch_gpt-4o-mini_0_200.jsonl
Обработка файла: /content/drive/My Drive/out/batch_gpt-4o-mini_200_400.jsonl
Обработка файла: /content/drive/My Drive/out/batch_gpt-4o-mini_400_600.jsonl
Обработка файла: /content/drive/My Drive/out/batch_gpt-4o-mini_600_800.jsonl
Обработка файла: /content/drive/My Drive/out/batch_gpt-4o-mini_800_1000.jsonl
Результат сохранен в файл /content/drive/My Drive/jina/positive_pairs.jsonl.


In [None]:
from datasets import load_dataset, Dataset, concatenate_datasets

# Path to the JSONL file with (anchor, positive) pairs.
similar_data_path = "/content/drive/My Drive/jina/positive_pairs.jsonl"

# Load the pairs as a single Dataset and shuffle them deterministically.
similar_dataset = load_dataset("json", data_files=similar_data_path, split="train")
similar_dataset = similar_dataset.shuffle(seed=42)

# 80/20 train/validation split. The seed makes the split reproducible,
# matching the seeded split used later in the notebook.
train_test_split = similar_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

# Sanity check: show one training example.
print(train_dataset[0])

Generating train split: 0 examples [00:00, ? examples/s]

{'anchor': 'calcium oxide manufacturing', 'positive': 'Class:\n- Herstellung von Glas und Glaswaren, Keramik, Verarbeitung von Steinen und Erden\n  / Herstellung von Zement, Kalk und gebranntem Gips / Herstellung von gebranntem\n  Gips\nGeography: DE\nMain Output:\n- 1 kg  Branntkalk (CaO)\n- Main Output Flow: Flow( lime (CaO)  output, 1 kg (Mass), type= Product flow , class= Systems\n    / Other systems )\nName: Steine-Erden\\CaO-mix-DE-2050\nYear:\n-  2050 \n'}


In [None]:
import os
import torch
from datasets import load_dataset, Dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
    SentenceTransformerModelCardData,
)
from sentence_transformers.losses import MultipleNegativesRankingLoss, ContrastiveLoss
from sentence_transformers.training_args import BatchSamplers

# Configure the CUDA caching allocator before the first CUDA call in this cell,
# so the setting is already in the environment when the allocator reads it.
# NOTE(review): if CUDA was already initialised earlier in this kernel session the
# variable is presumably ignored - restart the kernel to be sure it applies.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Release cached GPU memory that is no longer in use.
torch.cuda.empty_cache()

# 1. Load the base model to fine-tune.
# NOTE: trust_remote_code=True runs model code shipped in the Hub repository.
model = SentenceTransformer(
    "jinaai/jina-embeddings-v2-base-de",
    trust_remote_code=True,
    model_card_data=SentenceTransformerModelCardData(
        language="de",
        license="apache-2.0",
        model_name="jina-embeddings-v2-base-de trained on custom pairs",
    ),
)
# Maximum sequence length the model is configured to handle.
model.max_seq_length = 768
model_name = "jina-v2-base-custom-pairs"

# 2. Load the (anchor, positive) pairs from the JSONL file.
similar_data_path = "/content/drive/My Drive/jina/positive_pairs.jsonl"

# Without `split=`, the result is indexed by split name; the data is under "train".
similar_dataset = load_dataset('json', data_files=similar_data_path)
print(f"Similar dataset size: {len(similar_dataset['train'])}")
train_dataset = similar_dataset["train"]

Generating train split: 0 examples [00:00, ? examples/s]

Similar dataset size: 3000


In [None]:
# Split the full set loaded into `similar_dataset`, not `train_dataset`: that name is
# rebound just below, so splitting it would carve another 20% off the already
# reduced training set every time this cell is re-run.
# The seed keeps the split reproducible.
split_dataset = similar_dataset["train"].train_test_split(test_size=0.2, seed=42)

train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# The two parts must add up to the full set.
assert len(train_dataset) + len(eval_dataset) == len(similar_dataset["train"])

# Check the sizes.
print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

# 3. Loss function used for fine-tuning.
loss = MultipleNegativesRankingLoss(model)

Train dataset size: 2400
Eval dataset size: 600


In [None]:
# Show the training example at index 1 as a quick look at the data.
example_row = train_dataset[1]
print(example_row)

{'anchor': 'aluminum production', 'positive': 'Class:\n- Metallerzeugung und -bearbeitung / Erzeugung und erste Bearbeitung von NE-Metallen\n  / Erzeugung und erste Bearbeitung von Aluminium\nGeography: DE\nMain Output:\n- 1 kg  Aluminium\n- Main Output Flow: Flow( aluminium  output, 1 kg (Mass), type= Product flow , class= Systems\n    / Other systems )\nName: Metall\\Aluminium-mix-DE-2015\nTechnology:\n-  Mix zur Aufteilung der Primäraluminiumherstellung bezgl. Nachfrage BRD.\\nAllokation:\\\n  \\ keine\\nGenese der Daten: Aus #1 geht hervor, daß im Jahr 1994 die Primäraluminiumproduktion\\\n  \\ der Bundesrepublik (ca. 0,5 Mio t) nur ein Drittel des inländischen Verbrauchs\\\n  \\ (ca. 1,5 Mio t) abdeckte. Die Statistik zeigt ferner, daß die Direktimporte der\\\n  \\ BRD von Primäraluminium auf mehrere Dutzend Länder verteilt sind, wobei ein Schwerpunkt\\\n  \\ auf  West- und Osteuropa liegt. Von einer Berücksichtigung der Primärproduktion\\\n  \\ dieser Staaten wurde Abstand genomm

In [None]:
# 4. Training arguments, collected in one mapping and unpacked into the arguments object.
training_options = {
    "output_dir": f"models/{model_name}",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "learning_rate": 1e-5,
    "warmup_ratio": 0.1,
    "fp16": True,
    "bf16": False,
    "batch_sampler": BatchSamplers.NO_DUPLICATES,
    # Evaluate, checkpoint and log on the same 100-step cadence.
    "eval_strategy": "steps",
    "eval_steps": 100,
    "save_strategy": "steps",
    "save_steps": 100,
    # "save_total_limit": 2,
    "logging_steps": 100,
    "logging_first_step": True,
    "run_name": model_name,
    "weight_decay": 0.1,
}
args = SentenceTransformerTrainingArguments(**training_options)

# 5. Build the trainer from the model, the two dataset parts and the loss, then train.
trainer_options = {
    "model": model,
    "args": args,
    "train_dataset": train_dataset,  # training part
    "eval_dataset": eval_dataset,  # evaluation part
    "loss": loss,
}
trainer = SentenceTransformerTrainer(**trainer_options)

trainer.train()

# 6. Save the fine-tuned model to Google Drive.
save_path = "/content/drive/My Drive/jina/tuning_param_jina_fine_tuned"
model.save_pretrained(save_path)

# Final evaluation on the held-out part.
eval_results = trainer.evaluate(eval_dataset=eval_dataset)
final_eval_loss = eval_results['eval_loss']
print("Evaluation results:")
print(f"Evaluation loss: {final_eval_loss}")


Step,Training Loss,Validation Loss
100,0.0124,0.138371
