Faiss + base RAG

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
import faiss
from tqdm.auto import tqdm

# Configuration for the baseline pipeline: input data, model checkpoints, batching and output artifacts.
INPUT_CSV = "texts.csv"
TEXT_COL = "text"
EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LLM_MODEL = "google/flan-t5-small"
BATCH_SIZE = 32
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
TOP_K = 5
OUT_EMB_NPZ = "embeddings_faiss.npz"
INDEX_FILE = "faiss.index"

# Load the corpus; missing values become empty strings so every row yields a str.
df = pd.read_csv(INPUT_CSV)
texts = df[TEXT_COL].fillna("").astype(str).tolist()

# Embedding encoder, moved to DEVICE and switched to eval mode.
emb_tokenizer = AutoTokenizer.from_pretrained(EMB_MODEL)
emb_model = AutoModel.from_pretrained(EMB_MODEL).to(DEVICE)
emb_model.eval()

def mean_pooling(last_hidden_state, attention_mask):
    """Average token vectors over dim=1, ignoring positions whose mask is zero.

    Args:
        last_hidden_state: token-level hidden states; reduced over dim=1.
        attention_mask: per-token mask; a trailing axis is added and it is
            expanded to the shape of ``last_hidden_state``.

    Returns:
        The masked sum divided by the mask count. The count is clamped to at
        least 1e-9 so a row with an all-zero mask yields zeros instead of
        dividing by zero.
    """
    weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    numerator = torch.sum(last_hidden_state * weights, dim=1)
    denominator = torch.clamp(weights.sum(dim=1), min=1e-9)
    return numerator / denominator

# Embed the corpus in batches of BATCH_SIZE without tracking gradients.
all_embs = []
with torch.no_grad():
    for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="embed"):
        batch_texts = texts[i:i+BATCH_SIZE]
        enc = emb_tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=256)
        enc = {k: v.to(DEVICE) for k, v in enc.items()}
        out = emb_model(**enc, output_hidden_states=False, return_dict=True)
        # Use the model's pooler_output when it exists; otherwise fall back to masked mean pooling.
        # NOTE(review): BERT-style encoders usually expose pooler_output, so the mean-pooling
        # fallback may never run for EMB_MODEL, while sentence-transformers checkpoints are
        # typically used with mean pooling — confirm which is intended. Whatever is chosen
        # here must match the pooling used for queries in retrieve().
        if getattr(out, "pooler_output", None) is not None:
            emb = out.pooler_output
        else:
            emb = mean_pooling(out.last_hidden_state, enc["attention_mask"])
        emb = emb.cpu().numpy()
        all_embs.append(emb)
# Stack the per-batch arrays into one float32 matrix (one row per text).
all_embs = np.vstack(all_embs).astype(np.float32)

# L2-normalize in place so inner-product search (IndexFlatIP) ranks by cosine similarity.
faiss.normalize_L2(all_embs)
dim = all_embs.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(all_embs)
# Persist the index and the (texts, embeddings) pair to disk.
faiss.write_index(index, INDEX_FILE)
np.savez_compressed(OUT_EMB_NPZ, texts=np.array(texts), embeddings=all_embs)

# Seq2seq model used to generate the final answer.
llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL).to(DEVICE)
llm_model.eval()

def retrieve(query, top_k=TOP_K):
    """Return the texts most similar to ``query`` together with their scores.

    The query is encoded with ``emb_tokenizer``/``emb_model`` using the same
    pooling rule as the indexing loop (``pooler_output`` when present, masked
    mean pooling otherwise), L2-normalized, and searched against ``index``.

    Args:
        query: query string.
        top_k: maximum number of hits to return.

    Returns:
        ``(retrieved_texts, scores)`` — two parallel lists, best hit first.
        Fewer than ``top_k`` items are returned when the index holds fewer
        vectors; both lists are empty for an empty index or ``top_k <= 0``.
    """
    if top_k <= 0 or index.ntotal == 0:
        return [], []
    q_enc = emb_tokenizer(query, padding=True, truncation=True, return_tensors="pt", max_length=256)
    q_enc = {k: v.to(DEVICE) for k, v in q_enc.items()}
    with torch.no_grad():
        out = emb_model(**q_enc, output_hidden_states=False, return_dict=True)
        pooled = getattr(out, "pooler_output", None)
        if pooled is None:
            pooled = mean_pooling(out.last_hidden_state, q_enc["attention_mask"])
    q_emb = pooled.cpu().numpy()
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, top_k)
    # FAISS pads missing results with id -1; indexing texts[-1] would silently
    # return the last document, so those slots are dropped.
    hits = [(int(idx), float(score)) for idx, score in zip(I[0], D[0]) if idx >= 0]
    retrieved_texts = [texts[idx] for idx, _ in hits]
    scores = [score for _, score in hits]
    return retrieved_texts, scores

def answer_query(query, retrieved_texts):
    """Generate an answer to ``query`` grounded in ``retrieved_texts``.

    The passages are numbered and placed in a prompt that ends with the
    question. The tokenizer truncates from the right at 512 tokens, which
    would cut off the question when the context is long, so the context is
    trimmed first until the whole prompt fits.

    Args:
        query: the user question.
        retrieved_texts: passages to use as context, most relevant first.

    Returns:
        The decoded text of the top beam.
    """
    max_input_tokens = 512
    context = "\n\n".join(f"[{i+1}] {t}" for i, t in enumerate(retrieved_texts))

    def build_prompt(ctx):
        return f"Use the following context to answer the question.\n\nContext:\n{ctx}\n\nQuestion: {query}\nAnswer:"

    prompt = build_prompt(context)
    n_tokens = len(llm_tokenizer(prompt)["input_ids"])
    if n_tokens > max_input_tokens:
        overflow = n_tokens - max_input_tokens
        ctx_ids = llm_tokenizer(context, add_special_tokens=False)["input_ids"]
        # Small safety margin: decoding and re-encoding a cut token sequence
        # can produce a few extra tokens at the cut point.
        keep = max(len(ctx_ids) - overflow - 8, 0)
        context = llm_tokenizer.decode(ctx_ids[:keep], skip_special_tokens=True)
        prompt = build_prompt(context)
    # truncation=True stays as a last resort (e.g. the question alone is too long).
    enc = llm_tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=max_input_tokens).to(DEVICE)
    with torch.no_grad():
        out = llm_model.generate(**enc, max_length=256, num_beams=4, early_stopping=True)
    return llm_tokenizer.decode(out[0], skip_special_tokens=True)

# Demo: retrieve TOP_K passages for a sample question, generate an answer, and print both.
example_query = "What is the main topic of the first document?"
retrieved, scores = retrieve(example_query, top_k=TOP_K)
answer = answer_query(example_query, retrieved)
print("Retrieved:")
for t, s in zip(retrieved, scores):
    # Show only the first 200 characters of each passage.
    print(f"score={s:.4f}\t{t[:200]}")
print("Answer:")
print(answer)


Faiss + Reranker + LLM

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
import faiss
from tqdm.auto import tqdm

# Configuration for the FAISS + reranker pipeline.
INPUT_CSV = "/kaggle/input/vseros-nlp-qual/train.tsv"
TEXT_COL = "shortDescription"
EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
RERANKER_MODEL = "Qwen/Qwen3-Reranker-0.6B"
LLM_MODEL = "google/flan-t5-small"
BATCH_SIZE = 32
RERANK_BATCH = 2
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
TOP_K = 5
OUT_EMB_NPZ = "embeddings_faiss.npz"
INDEX_FILE = "faiss.index"

# Tab-separated input; only the first 100 rows are kept.
df = pd.read_csv(INPUT_CSV, sep = '\t')
df = df[:100]
# Missing values become empty strings so every row yields a str.
texts = df[TEXT_COL].fillna("").astype(str).tolist()

# Embedding encoder, moved to DEVICE and switched to eval mode.
emb_tokenizer = AutoTokenizer.from_pretrained(EMB_MODEL)
emb_model = AutoModel.from_pretrained(EMB_MODEL).to(DEVICE)
emb_model.eval()

def mean_pooling(last_hidden_state, attention_mask):
    """Masked mean of the token vectors along dim=1.

    Args:
        last_hidden_state: token-level hidden states; reduced over dim=1.
        attention_mask: per-token mask, broadcast to the hidden-state shape.

    Returns:
        Sum of the unmasked token vectors divided by the number of unmasked
        tokens; the divisor is clamped to >= 1e-9 so fully masked rows give
        zeros rather than NaN.
    """
    expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    token_count = expanded.sum(dim=1).clamp(min=1e-9)
    masked_total = (last_hidden_state * expanded).sum(dim=1)
    return masked_total / token_count

# Embed the corpus in batches of BATCH_SIZE without tracking gradients.
all_embs = []
with torch.no_grad():
    for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="embed"):
        batch_texts = texts[i:i+BATCH_SIZE]
        enc = emb_tokenizer(batch_texts, padding=True, truncation=True, return_tensors="pt", max_length=256)
        enc = {k: v.to(DEVICE) for k, v in enc.items()}
        out = emb_model(**enc, output_hidden_states=False, return_dict=True)
        # Use the model's pooler_output when it exists; otherwise fall back to masked mean pooling.
        # NOTE(review): BERT-style encoders usually expose pooler_output, so the mean-pooling
        # fallback may never run for EMB_MODEL — confirm which pooling is intended. The same
        # choice must be used for queries in retrieve().
        if getattr(out, "pooler_output", None) is not None:
            emb = out.pooler_output
        else:
            emb = mean_pooling(out.last_hidden_state, enc["attention_mask"])
        emb = emb.cpu().numpy()
        all_embs.append(emb)
# Stack the per-batch arrays into one float32 matrix (one row per text).
all_embs = np.vstack(all_embs).astype(np.float32)

# L2-normalize in place so inner-product search (IndexFlatIP) ranks by cosine similarity.
faiss.normalize_L2(all_embs)
dim = all_embs.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(all_embs)
# Persist the index and the (texts, embeddings) pair to disk.
faiss.write_index(index, INDEX_FILE)
np.savez_compressed(OUT_EMB_NPZ, texts=np.array(texts), embeddings=all_embs)

# Reranker loaded with a sequence-classification head.
# NOTE(review): RERANKER_MODEL appears to be published as a causal LM; loading it through
# AutoModelForSequenceClassification may attach a freshly initialized head whose scores
# carry no signal — confirm against the model card before trusting reranker scores.
reranker_tokenizer = AutoTokenizer.from_pretrained(RERANKER_MODEL, trust_remote_code=True)
reranker_model = AutoModelForSequenceClassification.from_pretrained(RERANKER_MODEL, trust_remote_code=True).to(DEVICE)
# Batched inputs are padded, so reuse EOS as the pad token when none is defined
# and mirror the pad id into the model config.
if reranker_tokenizer.pad_token is None:
    reranker_tokenizer.pad_token = reranker_tokenizer.eos_token
reranker_model.config.pad_token_id = reranker_tokenizer.pad_token_id
reranker_model.eval()

# Seq2seq model used to generate the final answer.
llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL).to(DEVICE)

torch.cuda.empty_cache()
llm_model.eval()
def retrieve(query, top_k=TOP_K):
    """Return the texts most similar to ``query``, their scores and their row ids.

    The query is encoded with ``emb_tokenizer``/``emb_model`` using the same
    pooling rule as the indexing loop (``pooler_output`` when present, masked
    mean pooling otherwise), L2-normalized, and searched against ``index``.

    Args:
        query: query string.
        top_k: maximum number of hits to return.

    Returns:
        ``(retrieved_texts, scores, ids)`` — three parallel lists, best hit
        first; ``ids`` are positions in ``texts``. Fewer than ``top_k`` items
        are returned when the index holds fewer vectors; all lists are empty
        for an empty index or ``top_k <= 0``.
    """
    if top_k <= 0 or index.ntotal == 0:
        return [], [], []
    q_enc = emb_tokenizer(query, padding=True, truncation=True, return_tensors="pt", max_length=256)
    q_enc = {k: v.to(DEVICE) for k, v in q_enc.items()}
    with torch.no_grad():
        out = emb_model(**q_enc, output_hidden_states=False, return_dict=True)
        pooled = getattr(out, "pooler_output", None)
        if pooled is None:
            pooled = mean_pooling(out.last_hidden_state, q_enc["attention_mask"])
    q_emb = pooled.cpu().numpy()
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, top_k)
    # FAISS pads missing results with id -1; indexing texts[-1] would silently
    # return the last document, so those slots are dropped.
    hits = [(int(idx), float(score)) for idx, score in zip(I[0], D[0]) if idx >= 0]
    ids = [idx for idx, _ in hits]
    scores = [score for _, score in hits]
    retrieved_texts = [texts[idx] for idx in ids]
    return retrieved_texts, scores, ids

def rerank_scores(query, candidates, batch_size=RERANK_BATCH):
    """Score each candidate against ``query`` with the reranker model.

    Every candidate is formatted as ``"Query: ...\\nCandidate: ..."`` and run
    through ``reranker_model`` in batches of ``batch_size``.

    Args:
        query: query string.
        candidates: candidate passages.
        batch_size: number of pairs per forward pass.

    Returns:
        A list of floats in [0, 1], one per candidate, in input order:
        sigmoid of the logit for single-logit heads, otherwise the softmax
        probability of the last class (class 1 for a two-class head).
        Batches whose output has no logits score 0.0.
    """
    def _to_scores(logits, n_items):
        # Convert one batch of logits into a list of Python floats.
        if logits is None:
            return [0.0] * n_items
        # Cast to float32 on CPU so half/bfloat16 models work as well.
        logits = logits.detach().float().cpu()
        if logits.dim() == 1:
            return torch.sigmoid(logits).tolist()
        if logits.dim() == 2 and logits.shape[1] == 1:
            return torch.sigmoid(logits.squeeze(1)).tolist()
        # For two classes the last column is also column 1.
        return F.softmax(logits, dim=1)[:, -1].tolist()

    pairs = [f"Query: {query}\nCandidate: {c}" for c in candidates]
    scores = []
    with torch.no_grad():
        for i in range(0, len(pairs), batch_size):
            batch_pairs = pairs[i:i+batch_size]
            enc = reranker_tokenizer(batch_pairs, padding=True, truncation=True, return_tensors="pt", max_length=512)
            enc = {k: v.to(DEVICE) for k, v in enc.items()}
            out = reranker_model(**enc)
            scores.extend(_to_scores(out.logits, len(batch_pairs)))
    return scores

def combined_rerank(query, candidates, sim_scores, alpha=0.5):
    """Re-order ``candidates`` by a blend of similarity and reranker scores.

    Each similarity score ``s`` is first mapped to ``(s + 1) / 2`` and then
    combined with the reranker score as
    ``sim * (1 - alpha) + reranker * alpha``.

    Args:
        query: query string passed to ``rerank_scores``.
        candidates: candidate passages.
        sim_scores: similarity scores parallel to ``candidates``.
        alpha: weight of the reranker score in the blend.

    Returns:
        ``(texts_sorted, scores_sorted, rer_scores)``: candidates and blended
        scores ordered best first, plus the raw reranker scores in the
        original candidate order.
    """
    sim_norm = [(value + 1.0) / 2.0 for value in sim_scores]
    rer_scores = rerank_scores(query, candidates)
    blended = [
        (cand, (sim * (1-alpha) + rer * alpha))
        for cand, sim, rer in zip(candidates, sim_norm, rer_scores)
    ]
    blended.sort(key=lambda pair: pair[1], reverse=True)
    texts_sorted = [pair[0] for pair in blended]
    scores_sorted = [pair[1] for pair in blended]
    return texts_sorted, scores_sorted, rer_scores

def answer_query(query, retrieved_texts):
    """Generate an answer to ``query`` grounded in ``retrieved_texts``.

    The passages are numbered and placed in a prompt that ends with the
    question. The tokenizer truncates from the right at 512 tokens, which
    would cut off the question when the context is long, so the context is
    trimmed first until the whole prompt fits.

    Args:
        query: the user question.
        retrieved_texts: passages to use as context, most relevant first.

    Returns:
        The decoded text of the top beam.
    """
    max_input_tokens = 512
    context = "\n\n".join(f"[{i+1}] {t}" for i, t in enumerate(retrieved_texts))

    def build_prompt(ctx):
        return f"Use the following context to answer the question.\n\nContext:\n{ctx}\n\nQuestion: {query}\nAnswer:"

    prompt = build_prompt(context)
    n_tokens = len(llm_tokenizer(prompt)["input_ids"])
    if n_tokens > max_input_tokens:
        overflow = n_tokens - max_input_tokens
        ctx_ids = llm_tokenizer(context, add_special_tokens=False)["input_ids"]
        # Small safety margin: decoding and re-encoding a cut token sequence
        # can produce a few extra tokens at the cut point.
        keep = max(len(ctx_ids) - overflow - 8, 0)
        context = llm_tokenizer.decode(ctx_ids[:keep], skip_special_tokens=True)
        prompt = build_prompt(context)
    # truncation=True stays as a last resort (e.g. the question alone is too long).
    enc = llm_tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=max_input_tokens).to(DEVICE)
    with torch.no_grad():
        out = llm_model.generate(**enc, max_length=256, num_beams=4, early_stopping=True)
    return llm_tokenizer.decode(out[0], skip_special_tokens=True)

# Demo: FAISS retrieval, blended reranking (alpha=0.5), then answer generation.
example_query = "I want to play the game with Motorcycles?"
retrieved, sim_scores, ids = retrieve(example_query, top_k=TOP_K)
combined_texts, combined_scores, rer_scores = combined_rerank(example_query, retrieved, sim_scores, alpha=0.5)
answer = answer_query(example_query, combined_texts[:TOP_K])
# Print the raw FAISS ranking, then the blended ranking, for comparison.
print("Retrieved (FAISS):")
for t, s in zip(retrieved, sim_scores):
    print(f"score={s:.4f}\t{t[:200]}")
print("\nCombined (FAISS+Reranker):")
for t, s in zip(combined_texts[:TOP_K], combined_scores[:TOP_K]):
    print(f"score={s:.4f}\t{t[:200]}")
print("\nAnswer:")
print(answer)

Variant without score blending: candidates are ordered by the reranker score alone

In [None]:
# This cell reuses objects created by the previous cell (models, tokenizers, index, texts, constants).
torch.cuda.empty_cache()
llm_model.eval()
def retrieve(query, top_k=TOP_K):
    """Return the texts most similar to ``query``, their scores and their row ids.

    The query is encoded with ``emb_tokenizer``/``emb_model``
    (``pooler_output`` when present, masked mean pooling otherwise),
    L2-normalized, and searched against ``index``.

    Args:
        query: query string.
        top_k: maximum number of hits to return.

    Returns:
        ``(retrieved_texts, scores, ids)`` — three parallel lists, best hit
        first; ``ids`` are positions in ``texts``. Fewer than ``top_k`` items
        are returned when the index holds fewer vectors; all lists are empty
        for an empty index or ``top_k <= 0``.
    """
    if top_k <= 0 or index.ntotal == 0:
        return [], [], []
    q_enc = emb_tokenizer(query, padding=True, truncation=True, return_tensors="pt", max_length=256)
    q_enc = {k: v.to(DEVICE) for k, v in q_enc.items()}
    with torch.no_grad():
        out = emb_model(**q_enc, output_hidden_states=False, return_dict=True)
        pooled = getattr(out, "pooler_output", None)
        if pooled is None:
            pooled = mean_pooling(out.last_hidden_state, q_enc["attention_mask"])
    q_emb = pooled.cpu().numpy()
    faiss.normalize_L2(q_emb)
    D, I = index.search(q_emb, top_k)
    # FAISS pads missing results with id -1; indexing texts[-1] would silently
    # return the last document, so those slots are dropped.
    hits = [(int(idx), float(score)) for idx, score in zip(I[0], D[0]) if idx >= 0]
    ids = [idx for idx, _ in hits]
    scores = [score for _, score in hits]
    retrieved_texts = [texts[idx] for idx in ids]
    return retrieved_texts, scores, ids

def rerank(query, candidates, batch_size=RERANK_BATCH):
    """Order ``candidates`` by reranker score alone, best first.

    Every candidate is formatted as ``"Query: ...\\nCandidate: ..."`` and run
    through ``reranker_model`` in batches of ``batch_size``. Single-logit
    outputs are used as raw logits; any other output shape is scored by the
    softmax probability of class 1.

    Args:
        query: query string.
        candidates: candidate passages.
        batch_size: number of pairs per forward pass.

    Returns:
        ``(reranked_texts, reranked_scores)`` — parallel lists sorted by
        descending score.
    """
    pairs = [f"Query: {query}\nCandidate: {c}" for c in candidates]
    scores = []
    with torch.no_grad():
        for start in range(0, len(pairs), batch_size):
            chunk = pairs[start:start + batch_size]
            enc = reranker_tokenizer(chunk, padding=True, truncation=True, return_tensors="pt", max_length=512)
            enc = {name: tensor.to(DEVICE) for name, tensor in enc.items()}
            logits = reranker_model(**enc).logits
            if logits.dim() == 1:
                values = logits
            elif logits.dim() == 2 and logits.shape[1] == 1:
                values = logits.squeeze(1)
            else:
                values = F.softmax(logits, dim=1)[:, 1]
            scores.extend(values.cpu().numpy().tolist())
    scored = list(zip(candidates, scores))
    scored.sort(key=lambda item: item[1], reverse=True)
    reranked_texts = [text for text, _ in scored]
    reranked_scores = [score for _, score in scored]
    return reranked_texts, reranked_scores

def answer_query(query, retrieved_texts):
    """Generate an answer to ``query`` grounded in ``retrieved_texts``.

    The passages are numbered and placed in a prompt that ends with the
    question. The tokenizer truncates from the right at 512 tokens, which
    would cut off the question when the context is long, so the context is
    trimmed first until the whole prompt fits.

    Args:
        query: the user question.
        retrieved_texts: passages to use as context, most relevant first.

    Returns:
        The decoded text of the top beam.
    """
    max_input_tokens = 512
    context = "\n\n".join(f"[{i+1}] {t}" for i, t in enumerate(retrieved_texts))

    def build_prompt(ctx):
        return f"Use the following context to answer the question.\n\nContext:\n{ctx}\n\nQuestion: {query}\nAnswer:"

    prompt = build_prompt(context)
    n_tokens = len(llm_tokenizer(prompt)["input_ids"])
    if n_tokens > max_input_tokens:
        overflow = n_tokens - max_input_tokens
        ctx_ids = llm_tokenizer(context, add_special_tokens=False)["input_ids"]
        # Small safety margin: decoding and re-encoding a cut token sequence
        # can produce a few extra tokens at the cut point.
        keep = max(len(ctx_ids) - overflow - 8, 0)
        context = llm_tokenizer.decode(ctx_ids[:keep], skip_special_tokens=True)
        prompt = build_prompt(context)
    # truncation=True stays as a last resort (e.g. the question alone is too long).
    enc = llm_tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=max_input_tokens).to(DEVICE)
    with torch.no_grad():
        out = llm_model.generate(**enc, max_length=256, num_beams=4, early_stopping=True)
    return llm_tokenizer.decode(out[0], skip_special_tokens=True)

# Demo: FAISS retrieval, reranker-only ordering, then answer generation.
example_query = "Give me the name of the game for playing with cars, i need name of the game, not a number"
retrieved, sim_scores, ids = retrieve(example_query, top_k=TOP_K)
reranked_texts, reranked_scores = rerank(example_query, retrieved)
answer = answer_query(example_query, reranked_texts[:TOP_K])
# Print the raw FAISS ranking, then the reranked list, for comparison.
print("Retrieved (FAISS):")
for t, s in zip(retrieved, sim_scores):
    print(f"score={s:.4f}\t{t[:200]}")
print("\nReranked:")
for t, s in zip(reranked_texts[:TOP_K], reranked_scores[:TOP_K]):
    print(f"score={s:.4f}\t{t[:200]}")
print("\nAnswer:")
print(answer)

LoRA

In [None]:
import os
import math
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForCausalLM, get_linear_schedule_with_warmup
from peft import LoraConfig, get_peft_model, TaskType

# Configuration for LoRA supervised fine-tuning.
MODEL_NAME = "gpt2"
DATA_CSV = "sft_data.csv"
PROMPT_COL = "prompt"
RESPONSE_COL = "response"
OUTPUT_DIR = "lora_finetuned"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = 3
BATCH_SIZE = 8
LR = 3e-4
MAX_LENGTH = 512
WEIGHT_DECAY = 0.0
GRAD_ACCUM = 1
SEED = 42

# Seed Python, NumPy and PyTorch RNGs.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

df = pd.read_csv(DATA_CSV)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Padding needs a pad token; reuse EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Wrap the base causal LM with LoRA adapters on the listed target modules.
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["c_attn","c_proj"]
)
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()
model.to(DEVICE)

class SFTDataset(Dataset):
    """Prompt/response pairs tokenized for causal-LM supervised fine-tuning.

    Each item holds ``input_ids`` (prompt tokens, response tokens, EOS) and
    ``labels`` that equal ``input_ids`` except that every prompt position is
    set to -100, so the loss covers only the response and the EOS token.
    """

    def __init__(self, dataframe, tokenizer, prompt_col="prompt", response_col="response", max_length=512):
        """Store the tokenizer and the prompt/response columns as lists of str.

        Args:
            dataframe: table containing ``prompt_col`` and ``response_col``;
                missing values become empty strings.
            tokenizer: callable returning a mapping with ``"input_ids"``.
            prompt_col: name of the prompt column.
            response_col: name of the response column.
            max_length: maximum number of tokens per example.
        """
        self.tokenizer = tokenizer
        self.prompts = dataframe[prompt_col].fillna("").astype(str).tolist()
        self.responses = dataframe[response_col].fillna("").astype(str).tolist()
        self.max_length = max_length

    def __len__(self):
        """Number of prompt/response pairs."""
        return len(self.prompts)

    def __getitem__(self, idx):
        """Return ``{"input_ids", "labels"}`` as 1-D long tensors for example ``idx``."""
        # Prompt and response are tokenized separately and concatenated so the
        # number of masked positions is exactly the number of prompt tokens.
        # Tokenizing the joined string can merge tokens across the boundary
        # and shift the mask.
        prompt_ids = list(
            self.tokenizer(self.prompts[idx], truncation=True, max_length=self.max_length, return_tensors=None)["input_ids"]
        )
        response_ids = list(
            self.tokenizer(self.responses[idx], add_special_tokens=False, return_tensors=None)["input_ids"]
        )
        # Append EOS so the model is trained to stop after the response.
        eos_id = getattr(self.tokenizer, "eos_token_id", None)
        if eos_id is not None:
            response_ids.append(eos_id)
        # Truncating after concatenation drops the EOS when the response
        # itself does not fit.
        input_ids = (prompt_ids + response_ids)[: self.max_length]
        n_prompt = min(len(prompt_ids), len(input_ids))
        labels = [-100] * n_prompt + input_ids[n_prompt:]
        return {"input_ids": torch.tensor(input_ids, dtype=torch.long), "labels": torch.tensor(labels, dtype=torch.long)}

def collate_fn(batch):
    """Right-pad a list of examples into one batch of long tensors.

    Args:
        batch: list of dicts with 1-D ``"input_ids"`` and ``"labels"`` tensors.

    Returns:
        Dict with ``input_ids`` (padded with ``tokenizer.pad_token_id``),
        ``attention_mask`` (1 on real tokens, 0 on padding) and ``labels``
        (padded with -100), each of shape (batch size, longest sequence).
    """
    seqs = [example["input_ids"] for example in batch]
    targets = [example["labels"] for example in batch]
    lengths = [seq.size(0) for seq in seqs]
    width = max(lengths)
    pad_sequence = torch.nn.utils.rnn.pad_sequence
    padded_inputs = pad_sequence(seqs, batch_first=True, padding_value=tokenizer.pad_token_id).long()
    padded_labels = pad_sequence(targets, batch_first=True, padding_value=-100).long()
    # The mask comes from the lengths, not the pad id, because the pad token
    # may also occur as a real token (it is set to EOS when none exists).
    positions = torch.arange(width).unsqueeze(0)
    attention_mask = (positions < torch.tensor(lengths).unsqueeze(1)).long()
    return {"input_ids": padded_inputs, "attention_mask": attention_mask, "labels": padded_labels}

# Shuffle row indices with the seeded `random` module; the first 90% train, the rest validate.
train_frac = 0.9
n = len(df)
idx = list(range(n))
random.shuffle(idx)
split = int(train_frac * n)
train_idx = idx[:split]
val_idx = idx[split:]
train_df = df.iloc[train_idx].reset_index(drop=True)
val_df = df.iloc[val_idx].reset_index(drop=True)

train_dataset = SFTDataset(train_df, tokenizer, PROMPT_COL, RESPONSE_COL, max_length=MAX_LENGTH)
val_dataset = SFTDataset(val_df, tokenizer, PROMPT_COL, RESPONSE_COL, max_length=MAX_LENGTH)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Only parameters with requires_grad=True are optimized.
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=LR, weight_decay=WEIGHT_DECAY)
# One optimizer step per accumulation window, including a trailing partial
# window, so the schedule length matches the number of steps actually taken.
steps_per_epoch = math.ceil(len(train_loader) / GRAD_ACCUM)
total_steps = steps_per_epoch * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.06 * total_steps), num_training_steps=total_steps)

model.train()
global_step = 0
for epoch in range(EPOCHS):
    num_batches = len(train_loader)
    for step, batch in enumerate(train_loader):
        input_ids = batch["input_ids"].to(DEVICE)
        attention_mask = batch["attention_mask"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # Scale so gradients accumulated over GRAD_ACCUM batches average rather than sum.
        loss = outputs.loss / GRAD_ACCUM
        loss.backward()
        # Step on every full window and also on the last batch of the epoch,
        # so leftover gradients are applied instead of leaking into the next epoch.
        if (step + 1) % GRAD_ACCUM == 0 or (step + 1) == num_batches:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            global_step += 1

    # Validation: per-batch mean loss weighted by batch size.
    model.eval()
    total_loss = 0.0
    nb = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(DEVICE)
            attention_mask = batch["attention_mask"].to(DEVICE)
            labels = batch["labels"].to(DEVICE)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item() * input_ids.size(0)
            nb += input_ids.size(0)
    # 0.0 is reported when the validation split is empty.
    val_loss = total_loss / nb if nb > 0 else 0.0
    print(f"epoch {epoch + 1}/{EPOCHS} step={global_step} val_loss={val_loss:.4f}")
    model.train()

# Save the fine-tuned model and the tokenizer side by side.
os.makedirs(OUTPUT_DIR, exist_ok=True)
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)


https://www.kaggle.com/code/fuumin621/qwen2-5-lora-finetune-baseline-training

https://github.com/unslothai/unsloth

https://blog.deepschool.ru/llm/rag-ot-pervoj-versii-k-rabochemu-resheniyu/

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_DIR = "/path/to/model_folder"  # contains config.json, pytorch_model.bin / *.safetensors, and the tokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
# padding=True raises when the tokenizer has no pad token, so fall back to EOS.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# A decoder-only model continues from the last position, so pad on the left
# to keep the real tokens adjacent to the generated ones.
tokenizer.padding_side = "left"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    device_map="auto",              # spreads the model across GPU/CPU if available
    # fp16 if the model is fp16 and a GPU is present; float32 on CPU, where half precision is poorly supported
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
model.eval()

texts = ["Hello world"]
# Send inputs to the device the model was actually placed on by device_map.
enc = tokenizer(texts, return_tensors="pt", padding=True).to(model.device)
with torch.no_grad():
    out = model.generate(**enc, max_new_tokens=64, do_sample=False, pad_token_id=tokenizer.pad_token_id)
print(tokenizer.batch_decode(out, skip_special_tokens=True))