text cls

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Checkpoint used by the text-classification example.
model_name = "cointegrated/rubert-tiny2"  # or your own model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=2 requests a two-class sequence-classification head.
# NOTE(review): if this checkpoint ships no fine-tuned classification head,
# the head weights are freshly initialised and predictions are meaningless
# until the model is fine-tuned — confirm.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Inference pipeline built from the model and tokenizer loaded above.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

def predict(text):
    """Run the module-level ``classifier`` on ``text`` and return its first result.

    Args:
        text: Input passed unchanged to ``classifier``.

    Returns:
        The element at index 0 of whatever ``classifier(text)`` returns.
    """
    predictions = classifier(text)
    first_prediction = predictions[0]
    return first_prediction

qa

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

def predict_qa(context, question, model_name="deepset/roberta-base-squad2"):
    """Extract an answer span for ``question`` from ``context``.

    The tokenizer and model are loaded from ``model_name`` on every call.

    Args:
        context: Passage in which to look for the answer.
        question: Question to answer.
        model_name: Hugging Face checkpoint for extractive question answering.

    Returns:
        The decoded answer text with special tokens removed and surrounding
        whitespace stripped. An empty string is returned when the predicted
        end position precedes the predicted start position or when the span
        contains only special tokens.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    # "only_second" truncates the context (second segment) and never the
    # question when the pair is too long for the model.
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation="only_second",
        padding=True,
    )
    with torch.no_grad():
        outputs = model(**inputs)

    start_idx = torch.argmax(outputs.start_logits, dim=1).item()
    end_idx = torch.argmax(outputs.end_logits, dim=1).item()

    # Start and end are chosen independently, so the span can be inverted.
    if end_idx < start_idx:
        return ""

    span_ids = inputs["input_ids"][0][start_idx:end_idx + 1]
    # Skip special tokens so a span that lands on them decodes to "" rather
    # than to the literal marker text.
    return tokenizer.decode(span_ids, skip_special_tokens=True).strip()

get embed

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Checkpoint whose tokenizer and encoder are read as module-level globals by
# get_embeddings() below.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embeddings(texts):
    """Embed ``texts`` with the module-level ``tokenizer`` and ``model``.

    Token embeddings are mean-pooled using the attention mask as weights, so
    padding positions added when batching texts of different lengths do not
    contribute to the sentence vector.

    Args:
        texts: A string or a list of strings accepted by ``tokenizer``.

    Returns:
        A tensor with one pooled embedding per input text
        (first dimension = number of texts).
    """
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Broadcast the mask over the hidden dimension: 1 for real tokens, 0 for padding.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against division by zero for an all-padding row.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / token_counts

txt similarity

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Sentence encoder used by similarity() below.
# NOTE(review): this rebinds the module-level name `model`, which
# get_embeddings() also reads; if the cells share one namespace the result of
# get_embeddings() depends on execution order — consider a distinct name.
model = SentenceTransformer('all-MiniLM-L6-v2')

def similarity(text1, text2):
    """Return the cosine similarity between the encodings of two texts.

    Each text is encoded separately, as a one-element batch, with the
    module-level ``model``.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The single entry of the 1x1 cosine-similarity matrix.
    """
    first_vec = model.encode([text1])
    second_vec = model.encode([text2])
    pairwise = cosine_similarity(first_vec, second_vec)
    return pairwise[0][0]

text2json

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# T5 checkpoint read as module-level globals by generate_json() below.
# NOTE(review): this rebinds `model_name`, `tokenizer` and `model`, which
# get_embeddings() also reads; if the cells share one namespace, execution
# order determines which model that function uses — confirm.
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def generate_json(text):
    """Generate text from ``text`` prefixed with ``"parse: "``.

    Uses the module-level ``tokenizer`` and ``model``. The input is truncated
    to 512 tokens and decoding uses beam search with 4 beams.

    Args:
        text: Source text to feed to the model.

    Returns:
        The first generated sequence decoded to a string without special tokens.
    """
    prompt = f"parse: {text}"
    encoded = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    with torch.no_grad():
        generated = model.generate(
            encoded.input_ids,
            max_length=512,
            num_beams=4,
            early_stopping=True,
        )
    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

zero-shot classification (NLI pipeline)

In [None]:
from transformers import pipeline

# Zero-shot classification pipeline used by classify() below.
# NOTE(review): this rebinds the module-level name `classifier`, which
# predict() also reads; if the cells share one namespace, predict() will call
# this pipeline instead of the text-classification one — confirm.
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

def classify(text, labels):
    """Classify ``text`` against ``labels`` with the module-level ``classifier``.

    Args:
        text: Text to classify.
        labels: Candidate labels, forwarded as ``candidate_labels``.

    Returns:
        Whatever ``classifier`` returns for this input, unmodified.
    """
    return classifier(text, candidate_labels=labels)

rag1

In [None]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
from transformers import DPRQuestionEncoder, DPRContextEncoder
import faiss
import numpy as np

class RAGPipeline:
    """Thin wrapper around a pretrained RAG sequence model and its retriever.

    The retriever is created with ``index_name="exact"`` and
    ``use_dummy_dataset=True`` and is attached to the generator, so that
    ``generate`` can retrieve passages for the question on its own.
    """

    def __init__(self, rag_model_name="facebook/rag-sequence-base"):
        """Load tokenizer, retriever and generator from ``rag_model_name``.

        Args:
            rag_model_name: Hugging Face RAG checkpoint name or path.
        """
        self.tokenizer = RagTokenizer.from_pretrained(rag_model_name)
        self.retriever = RagRetriever.from_pretrained(
            rag_model_name,
            index_name="exact",
            use_dummy_dataset=True,
        )
        # Without `retriever=...` the generator has no way to fetch context
        # documents and generate() cannot run from a bare question.
        self.generator = RagSequenceForGeneration.from_pretrained(
            rag_model_name,
            retriever=self.retriever,
        )

    def generate(self, question):
        """Generate an answer for ``question``.

        Args:
            question: Question text.

        Returns:
            The first generated sequence decoded without special tokens.
        """
        inputs = self.tokenizer(question, return_tensors="pt")
        with torch.no_grad():
            # Pass only the tensors generate() documents; the tokenizer may
            # emit additional keys that generate() does not expect.
            outputs = self.generator.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
            )
        return self.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

semantic txt similarity

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def compute_sts_score(text1, text2, model_name="all-MiniLM-L6-v2"):
    """Score the semantic similarity of two texts via cosine similarity.

    A ``SentenceTransformer`` is instantiated from ``model_name`` on every
    call; each text is encoded separately as a one-element batch.

    Args:
        text1: First text.
        text2: Second text.
        model_name: Sentence-transformers checkpoint to load.

    Returns:
        The single entry of the 1x1 cosine-similarity matrix.
    """
    encoder = SentenceTransformer(model_name)
    left = encoder.encode([text1])
    right = encoder.encode([text2])
    return cosine_similarity(left, right)[0][0]

cls paraphrase detection

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

def predict_paraphrase(text1, text2, model_name="sbert-base-uncased"):
    """Return class probabilities for the sentence pair ``(text1, text2)``.

    The tokenizer and sequence-classification model are loaded from
    ``model_name`` on every call.

    NOTE(review): the default checkpoint name could not be verified from this
    file — confirm it resolves to a model fine-tuned for paraphrase detection.

    Args:
        text1: First sentence.
        text2: Second sentence.
        model_name: Hugging Face checkpoint name or path.

    Returns:
        A tensor of softmax probabilities taken over dimension 1 of the logits.
    """
    pair_tokenizer = AutoTokenizer.from_pretrained(model_name)
    pair_model = AutoModelForSequenceClassification.from_pretrained(model_name)

    encoded = pair_tokenizer(
        text1,
        text2,
        return_tensors="pt",
        truncation=True,
        padding=True,
    )
    with torch.no_grad():
        logits = pair_model(**encoded).logits
    return torch.softmax(logits, dim=1)

zero-shot prompting (prompt-based classification with a generative model)

In [None]:
from transformers import pipeline

def few_shot_classify(text, labels, model_name="gpt2", max_new_tokens=20):
    """Ask a text-generation model to name a category for ``text``.

    GPT-2 has no classification head, so classification is done by inference
    through a prompt: a ``text-generation`` pipeline is given a custom prompt
    that lists ``labels`` and ends with ``"Category:"``.

    Args:
        text: Text to classify.
        labels: Iterable of category names (strings) listed in the prompt.
        model_name: Checkpoint used for both model and tokenizer.
        max_new_tokens: Number of tokens to generate after the prompt.

    Returns:
        The ``generated_text`` of the first returned sequence, as produced by
        the pipeline.
    """
    generator = pipeline("text-generation", model=model_name, tokenizer=model_name)
    prompt = f"Classify the following text into one of these categories: {', '.join(labels)}.\nText: {text}\nCategory:"
    # `max_length` counts the prompt tokens as well, so a long prompt would
    # leave no room for the answer; `max_new_tokens` budgets only the
    # continuation.
    result = generator(prompt, max_new_tokens=max_new_tokens, num_return_sequences=1)
    return result[0]['generated_text']

qlora + peft

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import TrainingArguments, Trainer
import torch

def setup_lora_training(model_name, dataset):
    """Fine-tune ``model_name`` with LoRA adapters on a 4-bit quantized base.

    The base model is loaded in 4-bit NF4 with double quantization and
    bfloat16 compute, prepared for k-bit training, wrapped with a LoRA
    config, trained for 3 epochs with ``Trainer``, and the adapter is saved
    to ``./lora_adapted``.

    Args:
        model_name: Causal-LM checkpoint name or path.
        dataset: Training dataset handed to ``Trainer`` as ``train_dataset``.
            NOTE(review): no data collator is configured here, so the dataset
            presumably has to be tokenised already and provide labels — confirm.

    Returns:
        None. Checkpoints go to ``./lora_results`` and the final adapter to
        ``./lora_adapted``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Only fall back to EOS when the tokenizer defines no padding token;
    # an existing pad token must not be overwritten.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto"
    )

    model = prepare_model_for_kbit_training(model)
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        # NOTE(review): these module names must exist in the chosen
        # architecture — verify for the model being tuned.
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM"
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    training_args = TrainingArguments(
        output_dir="./lora_results",
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        logging_dir="./logs",
        save_steps=100,
        logging_steps=10,
        # The string "none" disables reporting integrations; Python None is
        # not the "off" value and may enable every installed integration.
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer,
    )

    trainer.train()
    model.save_pretrained("./lora_adapted")

fully sharded data parallel (FSDP)

In [None]:
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import MixedPrecision
from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
from transformers.models.gpt2.modeling_gpt2 import GPT2Block

def setup_fsdp(model, transformer_layer_cls=None):
    """Wrap ``model`` in FullyShardedDataParallel with fp16 mixed precision.

    Args:
        model: Module to shard.
        transformer_layer_cls: Optional iterable of layer classes that should
            each become their own FSDP unit. Defaults to ``{GPT2Block}``.

    Returns:
        The FSDP-wrapped model, placed on the current CUDA device.

    NOTE(review): FSDP presumably needs the default process group to be
    initialised before this is called — confirm at the call site.
    """
    import functools

    if transformer_layer_cls is None:
        transformer_layer_cls = {GPT2Block}

    # FSDP invokes the policy itself with (module, recurse, nonwrapped_numel),
    # so the policy must be passed as a callable with only the layer classes
    # bound. Calling transformer_auto_wrap_policy directly raises TypeError.
    auto_wrap_policy = functools.partial(
        transformer_auto_wrap_policy,
        transformer_layer_cls=set(transformer_layer_cls),
    )
    mp_policy = MixedPrecision(
        param_dtype=torch.float16,
        reduce_dtype=torch.float16,
        buffer_dtype=torch.float16
    )
    model = FSDP(
        model,
        auto_wrap_policy=auto_wrap_policy,
        mixed_precision=mp_policy,
        device_id=torch.cuda.current_device()
    )
    return model

DeepSpeed

In [None]:
from transformers import Trainer, TrainingArguments

def setup_deepspeed(model, dataset, output_dir="./deepspeed_results"):
    """Train ``model`` with the Hugging Face ``Trainer`` using DeepSpeed ZeRO-2.

    Optimizer state is offloaded to pinned CPU memory and fp16 is enabled.
    Batch-size and fp16 entries in the DeepSpeed config are set to ``"auto"``
    so they are filled from ``TrainingArguments`` and cannot drift from them.

    Args:
        model: Model to train.
        dataset: Training dataset handed to ``Trainer`` as ``train_dataset``.
        output_dir: Directory for checkpoints.

    Returns:
        None.
    """
    ds_config = {
        # "auto" lets the Trainer integration copy these from TrainingArguments;
        # DeepSpeed requires the batch size to be specified in its config.
        "train_batch_size": "auto",
        "train_micro_batch_size_per_gpu": "auto",
        "gradient_accumulation_steps": "auto",
        "zero_optimization": {
            "stage": 2,
            # Parameter offload ("offload_param") is a ZeRO stage-3 feature,
            # so only the optimizer state is offloaded at stage 2.
            "offload_optimizer": {"device": "cpu", "pin_memory": True}
        },
        "fp16": {"enabled": "auto"}
    }

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=1,
        # Must agree with the DeepSpeed fp16 section; a mismatch between the
        # two is rejected when the config is processed.
        fp16=True,
        deepspeed=ds_config,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
    )

    trainer.train()

inf + cache

In [None]:
import pickle
import os
from hashlib import md5

def get_cached_embeddings(texts, model, cache_path="embeddings.pkl"):
    """Return embeddings for ``texts``, reusing an on-disk pickle cache.

    The cache is a dict keyed by the MD5 hex digest of ``str(texts)``. On a
    miss the embeddings are computed with ``get_embeddings`` (the function
    from the embedding example) and the cache file is rewritten.

    Args:
        texts: Input texts; their ``str()`` form determines the cache key.
        model: Unused by this function; kept so existing callers keep working.
            Embeddings are produced by ``get_embeddings``, and the cache key
            does not depend on this argument.
        cache_path: Path of the pickle file holding the cache dict.

    Returns:
        The cached or freshly computed embeddings.
    """
    key = md5(str(texts).encode()).hexdigest()

    # SECURITY: pickle.load executes arbitrary code embedded in the file; only
    # point cache_path at files this process (or a trusted one) has written.
    cache = {}
    try:
        with open(cache_path, "rb") as f:
            loaded = pickle.load(f)
    except FileNotFoundError:
        pass  # first run: start with an empty cache
    except (EOFError, pickle.UnpicklingError):
        pass  # truncated or corrupt cache file: rebuild it from scratch
    else:
        if isinstance(loaded, dict):
            cache = loaded

    if key in cache:
        return cache[key]

    embeddings = get_embeddings(texts)  # function from the embedding example
    cache[key] = embeddings

    # Write to a sibling temp file and swap it in, so an interrupted write
    # cannot leave a half-written cache behind.
    tmp_path = f"{cache_path}.tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(cache, f)
    os.replace(tmp_path, cache_path)

    return embeddings

faiss

In [None]:
import faiss
import numpy as np

def build_faiss_index(embeddings):
    """Build an inner-product FAISS index over L2-normalised embeddings.

    With unit-length vectors the inner product equals cosine similarity.

    Args:
        embeddings: 2-D array-like of shape (n_vectors, dim).

    Returns:
        A ``faiss.IndexFlatIP`` containing the normalised vectors.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional.
    """
    # normalize_L2 works in place, so operate on a private C-contiguous
    # float32 copy: the cast happens before normalisation and the caller's
    # array is left untouched.
    vectors = np.array(embeddings, dtype=np.float32, order="C")
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (n_vectors, dim), got shape {vectors.shape}"
        )

    index = faiss.IndexFlatIP(vectors.shape[1])
    faiss.normalize_L2(vectors)
    index.add(vectors)
    return index

def search_faiss(query_embedding, index, k=5):
    """Search ``index`` for the ``k`` nearest neighbours of the query vector(s).

    Args:
        query_embedding: Array-like of shape (n_queries, dim), or a single
            1-D vector of length dim.
        index: FAISS index to search.
        k: Number of neighbours to return per query.

    Returns:
        A ``(scores, indices)`` tuple as returned by ``index.search``.
    """
    # normalize_L2 works in place, so normalise a private C-contiguous
    # float32 copy and leave the caller's array untouched.
    queries = np.array(query_embedding, dtype=np.float32, order="C")
    if queries.ndim == 1:
        queries = queries.reshape(1, -1)
    faiss.normalize_L2(queries)
    scores, indices = index.search(queries, k)
    return scores, indices

rag2

In [None]:
from transformers import DPRQuestionEncoder, DPRContextEncoder, DPRReader
import torch

class DPRRAG:
    """Retrieve-then-read pipeline built from DPR encoders and a DPR reader.

    Contexts are embedded with the context encoder and stored in a FAISS
    index (via ``build_faiss_index``); queries are embedded with the question
    encoder and matched with ``search_faiss``; the reader extracts an answer
    span from a retrieved passage.
    """

    # Upper bound on tokens fed to the encoders and the reader.
    _MAX_LENGTH = 512

    def __init__(self, question_model, context_model, reader_model):
        """Load the three DPR models together with their tokenizers.

        Args:
            question_model: Checkpoint for the question encoder.
            context_model: Checkpoint for the context encoder.
            reader_model: Checkpoint for the reader.
        """
        # Imported here so the class does not depend on names that the
        # surrounding file may not import.
        from transformers import (
            DPRContextEncoderTokenizer,
            DPRQuestionEncoderTokenizer,
            DPRReaderTokenizer,
        )

        # The models consume token tensors, not raw strings, so each one
        # needs its matching tokenizer.
        self.question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(question_model)
        self.question_encoder = DPRQuestionEncoder.from_pretrained(question_model)
        self.context_tokenizer = DPRContextEncoderTokenizer.from_pretrained(context_model)
        self.context_encoder = DPRContextEncoder.from_pretrained(context_model)
        self.reader_tokenizer = DPRReaderTokenizer.from_pretrained(reader_model)
        self.reader = DPRReader.from_pretrained(reader_model)
        self.index = None
        self.contexts = []

    def _embed(self, tokenizer, encoder, text):
        """Return the pooled embedding of ``text`` as a (1, dim) NumPy array."""
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=self._MAX_LENGTH,
        )
        with torch.no_grad():
            pooled = encoder(**inputs).pooler_output
        return pooled.cpu().numpy()

    def encode_contexts(self, contexts):
        """Embed ``contexts`` and build the search index over them.

        Args:
            contexts: Iterable of passage strings.

        Raises:
            ValueError: If ``contexts`` is empty.
        """
        contexts = list(contexts)
        if not contexts:
            raise ValueError("contexts must contain at least one passage")

        embeddings = np.vstack([
            self._embed(self.context_tokenizer, self.context_encoder, ctx)
            for ctx in contexts
        ])
        self.contexts = contexts
        self.index = build_faiss_index(embeddings)

    def retrieve(self, query, k=5):
        """Return up to ``k`` stored contexts most similar to ``query``.

        Args:
            query: Question text.
            k: Maximum number of passages to return.

        Returns:
            List of passages, best match first.

        Raises:
            RuntimeError: If ``encode_contexts`` has not been called yet.
        """
        if self.index is None:
            raise RuntimeError("call encode_contexts() before retrieve()")

        # Asking FAISS for more neighbours than stored vectors yields -1
        # placeholders, which would silently index the last context.
        k = min(k, len(self.contexts))
        query_emb = self._embed(self.question_tokenizer, self.question_encoder, query)
        scores, indices = search_faiss(query_emb, self.index, k=k)
        return [self.contexts[i] for i in indices[0] if i >= 0]

    def read(self, question, context):
        """Extract an answer to ``question`` from a single ``context`` passage.

        Args:
            question: Question text.
            context: Passage text; it is paired with an empty title.

        Returns:
            The decoded answer span without special tokens, or an empty
            string when the predicted end precedes the predicted start.
        """
        # The reader *tokenizer* builds the question/title/text input; the
        # reader *model* then consumes the resulting tensors.
        inputs = self.reader_tokenizer(
            questions=question,
            titles=[""],
            texts=[context],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self._MAX_LENGTH,
        )
        with torch.no_grad():
            outputs = self.reader(**inputs)

        # Only one passage is encoded, so row 0 holds its logits.
        start_idx = outputs.start_logits[0].argmax().item()
        end_idx = outputs.end_logits[0].argmax().item()
        if end_idx < start_idx:
            return ""

        span_ids = inputs["input_ids"][0][start_idx:end_idx + 1]
        return self.reader_tokenizer.decode(span_ids, skip_special_tokens=True)

submission format

In [None]:
import json
import pandas as pd

def create_submission(predictions, output_path="submission.json"):
    """Write ``predictions`` to a JSON submission file and report the path.

    The file contains a single object, ``{"answers": predictions}``, encoded
    as UTF-8 with non-ASCII characters kept verbatim and a 2-space indent.

    Args:
        predictions: Answers, in the same order as the questions.
        output_path: Destination path of the JSON file.
    """
    payload = dict(answers=predictions)
    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump(payload, out_file, ensure_ascii=False, indent=2)
    print(f"Submission saved to {output_path}")

def load_sample_submission(path="sample_submission.json"):
    """Read the UTF-8 JSON file at ``path`` and return its parsed content.

    Args:
        path: Location of the sample submission file.

    Returns:
        The deserialised JSON value.
    """
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)

def match_submission_to_sample(sample_path, predictions):
    """Load the sample submission and replace its ``"answers"`` entry.

    Args:
        sample_path: Path of the sample submission JSON file.
        predictions: Value stored under the ``"answers"`` key.

    Returns:
        The loaded sample object with ``"answers"`` set to ``predictions``;
        every other entry of the sample is kept as loaded.
    """
    template = load_sample_submission(sample_path)
    template["answers"] = predictions
    return template

def save_submission_with_sample(sample_path, predictions, output_path="submission.json"):
    """Fill the sample submission with ``predictions`` and save it as JSON.

    Args:
        sample_path: Path of the sample submission JSON file used as template.
        predictions: Value stored under the template's ``"answers"`` key.
        output_path: Destination path; written as UTF-8 with a 2-space indent
            and non-ASCII characters kept verbatim.
    """
    filled = match_submission_to_sample(sample_path, predictions)
    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump(filled, out_file, ensure_ascii=False, indent=2)
    print(f"Submission saved to {output_path}")

In [None]:
def run_rag_pipeline(
    queries,
    contexts,
    output_path="submission.json",
    sample_path="sample_submission.json",
):
    """Answer ``queries`` over ``contexts`` with DPR and save a submission.

    For every query the single best-matching context is retrieved and passed
    to the reader; the answers are then written through
    ``save_submission_with_sample``.

    Args:
        queries: Iterable of question strings.
        contexts: Passages to index and retrieve from.
        output_path: Where the submission JSON is written.
        sample_path: Sample submission used as the template. The default is
            the path that used to be hard-coded.

    Returns:
        The list of answers, in query order.
    """
    rag = DPRRAG(
        question_model="facebook/dpr-question_encoder-single-nq-base",
        context_model="facebook/dpr-ctx_encoder-single-nq-base",
        reader_model="facebook/dpr-reader-single-nq-base"
    )
    rag.encode_contexts(contexts)

    # retrieve(..., k=1) returns a one-element list; [0] is the top passage.
    answers = [
        rag.read(query, rag.retrieve(query, k=1)[0])
        for query in queries
    ]

    save_submission_with_sample(sample_path, answers, output_path)
    return answers
