In [None]:
pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain

In [None]:

pip install -U langchain langchain-openai

In [None]:
import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = 'api_key'

In [None]:
os.environ['OPENAI_API_KEY'] = 'api_key'

In [None]:
os.environ['MISTRAL_API_KEY'] = 'api_key'

In [None]:
os.environ['HUGGINGFACEHUB_API_TOKEN'] = 'api_key'

In [None]:
os.environ['GOOGLE_API_KEY'] = 'api_key'

In [None]:
github_token = "api_key"

In [None]:
pip install PyGithub

In [None]:
pip install faiss-cpu


In [None]:
pip install faiss-cpu

## Github repo feldolgozása és indexelés

In [8]:
import requests
from github import Github
from bs4 import BeautifulSoup
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
import faiss
import numpy as np

In [9]:
#GitHub Repoból kiszedi a szükséges dolgokat (kód, dokumentáció)
def clone_repo(github_token, repo_name):
    g = Github(github_token)
    repo = g.get_repo(repo_name)
    
    contents = repo.get_contents("")
    
    files = []
    while contents:
        file_content = contents.pop(0)
        if file_content.type == "dir":
            contents.extend(repo.get_contents(file_content.path))
        else:
            if file_content.name.endswith(('.md', '.py', '.txt', '.json', '.yml', '.c', '.h', '.cpp', '.hpp')): # Ezeket a fájlokat fogadja el egyenlőre
                file_data = {
                    "file_name": file_content.name,
                    "file_content": requests.get(file_content.download_url).text
                }
                files.append(file_data)
    
    return files

In [10]:
#A Issue-k kiszedése a repo-ból
def fetch_issues(github_token, repo_name):
    g = Github(github_token)
    repo = g.get_repo(repo_name)
    
    issues = []
    for issue in repo.get_issues(state='open'):
        issue_data = {
            "file_name": f"issue_{issue.number}",
            "file_content": f"Title: {issue.title}\nDescription: {issue.body}"
        }
        issues.append(issue_data)
    
    return issues

In [11]:
#A fájlok kisebb darabokra bontása
def preprocess_files_old(files):
    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
    documents = []
    
    for file in files:
        content = file['file_content']
        
        #Markdown fájlok esetén HTML kód eltávolítása
        if file['file_name'].endswith(('.md', '.html')):
            soup = BeautifulSoup(content, 'html.parser')
            content = soup.get_text()
        
        chunks = text_splitter.split_text(content)
        
        for chunk in chunks:
            documents.append(chunk)
    
    return documents

In [12]:
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

EXTENSION_LANGUAGE_MAP = {
    '.py': Language.PYTHON,
    '.cpp': Language.CPP,
    '.c': Language.C,
    '.cs': Language.CSHARP,
    '.md': Language.MARKDOWN,
    '.html': Language.HTML,
}

def get_language_from_filename(filename):
    for ext, lang in EXTENSION_LANGUAGE_MAP.items():
        if filename.endswith(ext):
            return lang
    return None

def preprocess_files(files):
    documents = []
    
    for file in files:
        filename = file['file_name']
        content = file['file_content']

        if filename.endswith(('.md', '.html')):
            soup = BeautifulSoup(content, 'html.parser')
            content = soup.get_text()

        lang = get_language_from_filename(filename)

        if lang:
            splitter = RecursiveCharacterTextSplitter.from_language(
                language=lang, chunk_size=500, chunk_overlap=0
            )
        else:
            splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

        docs = splitter.create_documents([content])
        documents.extend(doc.page_content for doc in docs)

    return documents


In [13]:
#Embeddingek generálása  OpenAI embeddings-el és FAISS indexeléssel
def create_embeddings(documents):
    embeddings = OpenAIEmbeddings()
    
    doc_embeddings = embeddings.embed_documents(documents)
    doc_embeddings_np = np.array(doc_embeddings)
    
    index = faiss.IndexFlatL2(doc_embeddings_np.shape[1])
    index.add(doc_embeddings_np)
    
    return index, doc_embeddings_np, documents

In [None]:
#Egy konkrét repora alkalmazva a fenti funkciókat

github_token = "api_key"
repo_name = "pydantic/pydantic"
    
files = clone_repo(github_token, repo_name)
    
issues = fetch_issues(github_token, repo_name)
    
all_documents = files + issues

In [None]:
documents = preprocess_files(all_documents)
    
index, doc_embeddings_np, documents = create_embeddings(documents)

## Retrieval

In [246]:
#A kérdés (Query) feldolgozása és a legrelevánsabb dokumentumok visszaadása
def retrieve_relevant_document(query, index, documents, embeddings, k=5):

    query_embedding = embeddings.embed_query(query)
    query_embedding_np = np.array(query_embedding).reshape(1, -1)
    D, I = index.search(query_embedding_np, k)
    
    relevant_docs = [documents[i] for i in I[0]]
    return relevant_docs

In [None]:
pip install rank-bm25

In [None]:
from rank_bm25 import BM25Okapi

#Szöveget kisbetűssé alakítja és szóközök mentén feldarabolja
def simple_tokenize(text):
    return text.lower().split()

#BM25
def build_bm25_index(documents):
    tokenized_docs = [simple_tokenize(doc) for doc in documents]
    bm25 = BM25Okapi(tokenized_docs)
    return bm25, tokenized_docs

#Keresés BM25-el
def retrieve_relevant_document_bm25(query, bm25, documents, tokenized_docs, k=5):
    tokenized_query = simple_tokenize(query)
    scores = bm25.get_scores(tokenized_query)
    top_k_indices = np.argsort(scores)[::-1][:k]
    relevant_docs = [documents[i] for i in top_k_indices]
    return relevant_docs


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Hybrid keresés BM25 és embeddingek kombinálásával
def retrieve_relevant_document_hybrid(query, bm25, tokenized_docs, embedding_index, embeddings, documents, k=5, alpha=0.5):
    
    #BM25
    tokenized_query = simple_tokenize(query)
    bm25_scores = bm25.get_scores(tokenized_query)

    #Embedding
    query_embedding = embeddings.embed_query(query)
    query_embedding_np = np.array(query_embedding).reshape(1, -1)
    D, I = embedding_index.search(query_embedding_np, len(documents))
    embedding_scores = np.zeros(len(documents))
    for idx, doc_idx in enumerate(I[0]):
        embedding_scores[doc_idx] = 1.0 / (1.0 + D[0][idx])  # convert distance to similarity

    #Normalizálás
    scaler = MinMaxScaler()
    bm25_scores_norm = scaler.fit_transform(bm25_scores.reshape(-1, 1)).flatten()
    embedding_scores_norm = scaler.fit_transform(embedding_scores.reshape(-1, 1)).flatten()

    # Kombinálás
    combined_scores = alpha * bm25_scores_norm + (1 - alpha) * embedding_scores_norm
    top_k_indices = np.argsort(combined_scores)[::-1][:k]

    relevant_docs = [documents[i] for i in top_k_indices]
    return relevant_docs


In [25]:
#Példa kérdés
query = "Does this program support Python 3.14?"
relevant_docs = retrieve_relevant_document(query, index, documents, MistralAIEmbeddings(mistral_api_key=os.environ["MISTRAL_API_KEY"]))
    
print(f"5 válasz legrevelánsabb dokumentum:")
for doc in relevant_docs:
        print(f"- {doc}")



5 válasz legrevelánsabb dokumentum:
- Title: Add support for Python 3.14
Description: First 3.14 beta release is [planned on 2025-05-06](https://peps.python.org/pep-0745/#release-schedule) and PEP 649/749 is almost fully implemented.

Considering the significant changes it provides to the runtime evaluation of type hints, we should add support to 3.14 and report any bugs/issues.
- python version: 3.10.11
```
- python version: 3.11.5 | packaged by conda-forge | (main, Aug 27 2023, 03:34:09) [GCC 12.3.0]
platform: Linux-3.10.0-1160.45.1.el7.x86_64-x86_64-with-glibc2.17
related packages: typing_extensions-4.8.0
```

Update 10/03/23:  Modified the script to be able to run it as a single file
- ### Python, Pydantic & OS Version

```Text
2.11.0b2 but also at least 2.7

OS: Apple M2 Mac
```
- python version: 3.10.13 (main, Sep 11 2023, 13:44:35) [GCC 11.2.0]
                     platform: Linux-5.15.0-91-generic-x86_64-with-glibc2.35
             related packages: typing_extensions-4.10.0 myp

## Generation

In [None]:
pip install -U langchain-google-genai


In [18]:
from langchain_mistralai import ChatMistralAI
from langchain.prompts import ChatPromptTemplate

template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt

llm_mistral = ChatMistralAI(model_name="mistral-small", temperature=0)

In [106]:
llm_mistral = ChatMistralAI(model_name="mistral-small", temperature=0)

In [19]:
chain = prompt | llm_mistral

In [109]:
chain.invoke({"context":relevant_docs,"question":"Does this program support Python 3.14?"})

AIMessage(content='The context does not provide information about the version compatibility of the program with Python 3.14.', additional_kwargs={}, response_metadata={'token_usage': {'prompt_tokens': 35, 'total_tokens': 57, 'completion_tokens': 22}, 'model_name': 'mistral-small', 'model': 'mistral-small', 'finish_reason': 'stop'}, id='run-35ac8675-3036-4d3d-9dbc-8c6867d11f57-0', usage_metadata={'input_tokens': 35, 'output_tokens': 22, 'total_tokens': 57})

## RAFT

## Adathalmaz készítése a RAFT-hoz

In [114]:
from langchain_google_genai import ChatGoogleGenerativeAI
llm_gemini = ChatGoogleGenerativeAI(model="gemini-2.5-flash-preview-04-17", temperature=0)

In [130]:
import os
import random
import time
from langchain.prompts import PromptTemplate


#Kérdés-válasz párok generálása
def generate_qa(context):
    prompt = PromptTemplate(
        input_variables=["context"],
        template="You are given context from a software project such as code, documentation, or comments. Generate a useful and self-contained Q&A pair that reflects a realistic question someone might ask about how the software works, how to use it, or how its designed. Focus on generating meaningful technical questions that are relevant to developers, maintainers, or users. The format should be the following:Q: <question>\nA: <answer>, the context:\n{context}\n",
    )
    
    chain = prompt | llm_gemini

    response = chain.invoke({"context":context})

    response = response.content
    try:
        question_answer = response.strip().split("\nA: ")
        question = question_answer[0].replace("Q: ", "").strip()
        answer = question_answer[1].strip()
    except:
        question, answer = "Nem sikerült kérdést generálni", "Nem sikerült választ generálni"

    return question, answer

#A kérdés-válasz párok generálása az összes repo-ra
def create_qa_pairs(github_token, repo_list, n_pairs_per_repo=3):
    repo_qa_data = {}

    for repo_name in repo_list:
        print(f"Feldolgozás alatt: {repo_name}")

        files = clone_repo(github_token, repo_name)
        issues = fetch_issues(github_token, repo_name)

        print(f"Feldolgozva: {repo_name}")
        
        all_texts = files + issues

        
        random.shuffle(all_texts) # Randomizálás, hogy random kérdéseket generáljon, ne csak a legelső fájlakból
        
        qa_pairs = []
        
        for file_data in all_texts:
            if len(qa_pairs) >= n_pairs_per_repo: # Ha elérte a kívánt kérdés-válasz párok számát, akkor kilép
                break
            context = file_data["file_content"]

            context = context[:4000]  #4000 karakterre vágás
            
            question, answer = generate_qa(context)
            
            if question and answer:
                qa_pairs.append({
                    "question": question,
                    "answer": answer
                })
            
            time.sleep(5)
        
        print(qa_pairs)

        repo_qa_data[repo_name] = qa_pairs
    
    return repo_qa_data


In [133]:
repos = ["pydantic/pydantic", "pallets/flask", "gandalfcode/gandalf", "fmtlib/fmt", "prettytable/prettytable", "cookiecutter/cookiecutter", "git/git", "sqlite/sqlite", "bitcoin/bitcoin", "rom1v/sndcpy"]
repo_qa_data  = create_qa_pairs(github_token, repos, n_pairs_per_repo=20)

Feldolgozás alatt: pydantic/pydantic
Feldolgozva: pydantic/pydantic
Feldolgozás alatt: pallets/flask
Feldolgozva: pallets/flask
Feldolgozás alatt: gandalfcode/gandalf
Feldolgozva: gandalfcode/gandalf
[{'question': 'What is the purpose of the `soundwave-L1error.py` script and what simulation methods are being compared?', 'answer': 'This script is designed to demonstrate the performance scaling of a simulation method (likely related to gravitational or SPH forces) with respect to the number of particles (`Nhydro`). It compares two approaches: a "bruteforce" method and a "kdtree" method. The script runs simulations using each method for varying `Nhydro`, measures the execution time, and plots the results on a log-log scale alongside theoretical O(N), O(N^2), and O(N log N) scaling lines to show how the computational cost grows with the number of particles. The "bruteforce" and "kdtree" methods likely refer to different algorithms used for tasks like neighbor searching or force calculation

Error in LangChainTracer.on_chain_start callback: ValueError('I/O operation on closed file.')
Error in LangChainTracer.on_chain_start callback: ValueError('I/O operation on closed file.')
Error in LangChainTracer.on_chain_end callback: ValueError('I/O operation on closed file.')


[{'question': "Why does the `LLVMFuzzerTestOneInput` function initialize repository settings directly in memory instead of reading them from the repository's gitdir?", 'answer': 'The comment explains that this is done to avoid touching the disk. Avoiding disk access helps keep the individual fuzz-test cases as fast as possible, which is important for efficient fuzzing.'}, {'question': 'What is the purpose of the `cmd__scrap_cache_tree` test tool command?', 'answer': "The `cmd__scrap_cache_tree` command is a test tool that removes the cache tree from the current repository's index file. It reads the index, frees the in-memory `cache_tree` structure, sets the index's `cache_tree` pointer to NULL, and then writes the index back to disk. This is useful for testing scenarios where you want to ensure index operations behave correctly without relying on the cache tree optimization, or to simulate a state where the cache tree needs to be rebuilt."}, {'question': 'How does the `ll_xdl_merge` fu

Kérdések kimentése

In [134]:
import json

with open("qa_all_pairs.json", "w", encoding="utf-8") as f:
    json.dump(repo_qa_data, f, ensure_ascii=False, indent=2)


Kérdések betöltése

In [None]:
with open("qa_all_pairs.json", "r", encoding="utf-8") as f:
    qa_qa_data = json.load(f)



In [None]:
#A fájlok és issue-k feldolgozása, embeddingek generálása és indexelés
def generate_everything(repo_name, github_token):
    files = clone_repo(github_token, repo_name)
    issues = fetch_issues(github_token, repo_name)
    all_documents = files + issues
    documents = preprocess_files(all_documents)
    index, doc_embeddings_np, documents = create_embeddings(documents)
    return index, documents

In [None]:
#kérdés-válasz párok +indexek és dokumentumok összevonása
repo_data_with_index_documents = []

for repo_name, qa_pairs in repo_qa_data.items():
    index, documents = generate_everything(repo_name, github_token)
    for qa in qa_pairs:
        repo_data_with_index_documents .append({
            "question": qa["question"],
            "answer": qa["answer"],
            "index": index,
            "documents": documents
        })

In [234]:
len(repo_data_with_index_documents)

200

indexelések kimentése

In [None]:
import json
import faiss

repo_data_serializable = []
index_map = {}

for i, item in enumerate(repo_data_with_index_documents):
    index_id = i // 20
    index_filename = f"faiss_index_{index_id}.index"
    
    if index_id not in index_map:
        faiss.write_index(item["index"], index_filename)
        index_map[index_id] = index_filename

    item_copy = {
        "question": item["question"],
        "answer": item["answer"],
        "documents": item["documents"],
        "index_file": index_filename
    }
    repo_data_serializable.append(item_copy)

with open("repo_data_serializable.json", "w", encoding="utf-8") as f:
    json.dump(repo_data_serializable, f, ensure_ascii=False, indent=2)


indexelések betöltése

In [None]:
import json
import faiss

with open("repo_data_serializable.json", "r", encoding="utf-8") as f:
    repo_data_serializable = json.load(f)

index_cache = {}

repo_data_reconstructed = []
for item in repo_data_serializable:
    index_file = item["index_file"]
    if index_file not in index_cache:
        index_cache[index_file] = faiss.read_index(index_file)
    
    reconstructed_item = {
        "question": item["question"],
        "answer": item["answer"],
        "documents": item["documents"],
        "index": index_cache[index_file]
    }
    repo_data_reconstructed.append(reconstructed_item)


In [240]:
#train és test adatok szétválasztása
import random
random.seed(12) 
shuffled = repo_data_with_index_documents[:]
random.shuffle(shuffled)
split_point = int(len(repo_data_with_index_documents) * 0.9)
repo_data_train = shuffled[:split_point]
repo_data_test = shuffled[split_point:]

In [241]:
len(repo_data_train)

180

## Adathalmaz átalakítása a fine tuning-hoz

In [142]:
NUM_DISTRACTORS = 4 #Distractor dokumentumok száma
PERCENT_D_STAR_INCLUDED = 0.8 #D* arány

In [143]:
from langchain_mistralai import ChatMistralAI
from langchain.prompts import ChatPromptTemplate

template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt

llm_mistral = ChatMistralAI(model_name="mistral-small", temperature=0)

In [144]:
#A D* megtalálásához szükséges segéd függvény
#Egy llm-et kérdez meg, hogy a kérdés válasz alapján melyik dokumentumban van a válasz

from langchain_core.messages import HumanMessage

llm_mistral = ChatMistralAI(model_name="mistral-small", temperature=0)

def call_llm_to_find_doc(question, answer, candidate_docs):
    prompt = f"""You are an AI assistant helping with document understanding.

Question: "{question}"
Answer: "{answer}"

Here are some candidate document chunks (labeled D1, D2, ..., Dn):

"""
    for i, doc in enumerate(candidate_docs):
        prompt += f"D{i+1}: {doc}\n\n"

    prompt += """
Please analyze the documents above and decide which document (D1, D2, ..., Dn) most likely contains the answer to the question. 

If none of the documents are relevant or contain the answer, reply with "None".
Otherwise, reply with the label of the best matching document, such as "D2".
Only respond with "None" or a label like "D3"—no explanation.
"""

    response = llm_gemini.invoke([HumanMessage(content=prompt)])
    reply = response.content.strip()
    return reply

In [164]:
#A D* megtalálásához szükséges fő függvény
#Megkeresi a dokumentumot a kérdés-válasz alapján, amiben a válasz található
def find_D_star_document(question, answer, all_docs, embeddings, index):

    
    candidate_docs = retrieve_relevant_document(question, index, all_docs, embeddings, k=10)

    if not candidate_docs:
        print("Nem találtunk releváns dokumentumot.")
        return None

    best_doc_label = call_llm_to_find_doc(question, answer, candidate_docs)

    best_doc_label = best_doc_label.strip('"')
    if best_doc_label.lower() == "none":
        return None

    if best_doc_label.startswith("D") and best_doc_label[1:].isdigit():
        doc_index = int(best_doc_label[1:]) - 1
        if 0 <= doc_index < len(candidate_docs):
            return candidate_docs[doc_index]

    print("A LLM válasza nem értelmezhető:", best_doc_label)
    return None

In [163]:
def select_distractor_documents(question, D_star_doc, all_docs, embeddings, index, k):
    retrieved_docs = retrieve_relevant_document(question, index, all_docs, embeddings, k=k*2 + 1) #Több választ kér, hogy legyen benne Di is

    distractors = []
    for doc in retrieved_docs:
        if doc != D_star_doc and doc not in distractors: #Ellenőrzi, hogy ne legyen benne a D* dokumentum
            distractors.append(doc)
        if len(distractors) == k:
            break

    #Ha nincs még elég distractor, akkor random választ a többi dokumentumból
    attempts = 0
    max_attempts = k * 5
    while len(distractors) < k and attempts < max_attempts:
        random_doc = random.choice(all_docs)
        if random_doc != D_star_doc and random_doc not in distractors:
            distractors.append(random_doc)
        attempts += 1

    return distractors

In [147]:
#Chain-of-Thought (CoT) generálás
def generate_cot_answer(question, D_star_doc, original_answer):
    prompt = f"""Given the Question, the Context (containing the golden document with the answer), and the original concise Answer, provide a detailed reasoning process (Chain-of-Thought) that explains step-by-step how to arrive at the Answer using *only* the provided Context.

Crucially, you MUST cite the exact sentences or phrases from the Context that support your reasoning. Enclose these citations within ##begin_quote## and ##end_quote## tags. Do not add any information not present in the Context.

Finally, state the concise Answer clearly. Format your response *exactly* as:
##Reason: [Your detailed reasoning with citations like ##begin_quote## text from context ##end_quote##.]
##Answer: [The final concise answer, matching the original answer provided]

Question: {question}
Context: {D_star_doc}
Original Answer: {original_answer}
"""

    response = llm_mistral.invoke([HumanMessage(content=prompt)])
    reply = response.content.strip()
    return reply

In [247]:
#RAFT-hoz megfelelő adatok generálása
raft_training_data = []
skipped_count = 0
counter = 0

for item in repo_data_train:
    question = item['question']
    original_answer = item['answer']
    index = item['index']
    all_document_chunks = item['documents']

    #Először meg kell találni a D* dokumentumot
    D_star_doc = find_D_star_document(question, original_answer, all_document_chunks, OpenAIEmbeddings(), index)
    if not D_star_doc:
        print(f"A '{question} ' kérdés kihagyása. D* dokumentum nem található.")
        skipped_count += 1
        continue

    #CoT válasz generálása
    cot_answer_str = generate_cot_answer(question, D_star_doc, original_answer)
    if not cot_answer_str:
        print(f"A '{question} ' kérdés kihagyása.  CoT generálás sikertelen.")
        skipped_count += 1
        continue

    #Distractor dokumentumok kiválasztása
    distractor_docs = select_distractor_documents(question, D_star_doc, all_document_chunks, OpenAIEmbeddings(), index, k=NUM_DISTRACTORS)

    #A D* benne legyen-e a válaszban
    include_D_star = random.random() < PERCENT_D_STAR_INCLUDED

    context_docs_for_instance = []
    if include_D_star:
        #D* dokumentum benne van a válaszban
        context_docs_for_instance.append(D_star_doc)
        context_docs_for_instance.extend(distractor_docs)
    else:
        #Csak a distractor dokumentumokat tartalmazza
        context_docs_for_instance.extend(distractor_docs)


    random.shuffle(context_docs_for_instance) #a dokumentumok megkeverése

    context_string = "\n\n".join(context_docs_for_instance) #A kontexus létrehozása

    #fine-tuning-hez megfelelő formátum létrehozása 
    formatted_input = f"Question: {question}\n\nContext:\n{context_string}"
    formatted_output = cot_answer_str #CoT válasz

    raft_training_data.append({
        "input": formatted_input,
        "output": formatted_output
    })

    print(f"\nGenerated data for question: {question}, processed {counter} questions.")
    time.sleep(10)
    counter += 1
    if counter % 10 == 0:
        time.sleep(60)


Generated data for question: How does the `mypy` job execute the Mypy checks using `uv` and `tox`?, processed 0 questions.

Generated data for question: What is the purpose of the `encode_85` and `decode_85` functions declared in `base85.h`?, processed 1 questions.

Generated data for question: Why does `sndcpy` fail to install on Android 5.0 with the error `INSTALL_FAILED_OLDER_SDK`?, processed 2 questions.

Generated data for question: What is the purpose of the `CScriptNum10` class in the `scriptnum_tests`?, processed 3 questions.

Generated data for question: Why does this benchmark use `pydantic.TypeAdapter(list[Person])` to validate and dump data?, processed 4 questions.

Generated data for question: When using `cookiecutter`, how does the `repository.determine_repo_dir` function handle templates that are already present in the local cache directory?, processed 5 questions.

Generated data for question: Why is adding torrent verification to the binary verification script necessa

In [248]:
 len(raft_training_data)

170

In [249]:
with open("raft_training_data.json", "w", encoding="utf-8") as f:
    json.dump(raft_training_data, f, ensure_ascii=False, indent=2)

In [251]:
with open("raft_training_data.json", "r", encoding="utf-8") as f:
    raft_training_data_loaded = json.load(f)

In [None]:
raft_training_data_loaded

## Fine-tuning

In [252]:
import json
import time
from openai import OpenAI
client = OpenAI()

openai_formatted_filename = "openai_formatted_training_data.jsonl"
base_model = "gpt-3.5-turbo-1106"


#Konverzió OpenAI chat formátumra

converted_count = 0
with open(openai_formatted_filename, 'w', encoding='utf-8') as outfile:
    for raft_example in raft_training_data_loaded:
        
        user_content = raft_example.get("input")
        assistant_content = raft_example.get("output")

        if not user_content or not assistant_content:
            print(f" Az 'input' vagy 'output' üres: {raft_example}")
            continue

        openai_message = {
            "messages": [
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": assistant_content}
            ]
        }

        outfile.write(json.dumps(openai_message) + '\n')
        converted_count += 1

print(f"Konvertálva {converted_count}")

Konvertálva 170


### Openai fine-tuning

In [276]:
#Fájl feltöltése fine-tuninghoz

with open(openai_formatted_filename, "rb") as f:
    training_file = client.files.create(
        file=f,
        purpose="fine-tune"
    )
training_file_id = training_file.id

In [277]:
training_file_id

'file-XEMqQQJDh3zWmGYsewtHAv'

In [278]:
#Fine-Tuning Job

job = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    model=base_model
)


In [285]:
#A fine tuning folyamatának figyelése
job_id = job.id
job_status = client.fine_tuning.jobs.retrieve(job_id)
status = job_status.status
print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Job Status: {status}")

[2025-05-04 12:49:56] Job Status: succeeded


In [None]:
fine_tuned_model_id = job_status.fine_tuned_model

In [254]:
fine_tuned_model_id_openai= "ft:gpt-3.5-turbo-1106:velkey::BTRAj6YH"

### Mistral fine-tuning

In [253]:
fine_tuned_model_id_mistral = "ft:mistral-large-latest:5a036207:20250505:bcb27770"

## Fine-tuning + RAG

In [255]:

llm_raft = ChatMistralAI(model_name=fine_tuned_model_id_mistral, temperature=0)


template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

chain_raft = prompt | llm_raft

chain_og = prompt | llm_mistral

In [256]:
#Tesztelés egy kérdésen

sample_query = repo_data_test[0]['question']
print(f" Kérdés: {sample_query}")

index = repo_data_test[0]['index']
documents = repo_data_test[0]['documents']

sample_relevant_docs = retrieve_relevant_document(sample_query, index, documents, OpenAIEmbeddings())
sample_context_str = "\n\n".join(sample_relevant_docs)

response_raft = chain_raft.invoke({
    "context": sample_context_str,
    "question": sample_query
})


print(response_raft.content)

 Kérdés: Why does the `fs::path` wrapper disallow using `std::string` for path construction and conversion?
The `fs::path` wrapper disallows using `std::string` for path construction and conversion to avoid locale-dependent decoding and encoding on Windows, which can lead to unsafe and unpredictable behavior.


## RAG Fusion

4 kérdést gyárt a megadott kérdésből, azokból kapott válaszokból generálja a végső választ

In [None]:
from langchain.prompts import ChatPromptTemplate

template = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (4 queries):"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template)

In [98]:
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

generate_queries = (
    prompt_rag_fusion 
    | ChatMistralAI(model_name="mistral-small", temperature=0)
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

In [257]:
from langchain.load import dumps, loads

#eredmények rankolása
def reciprocal_rank_fusion(results: list[list], k=60):
    """ Reciprocal_rank_fusion that takes multiple lists of ranked documents 
        and an optional parameter k used in the RRF formula """
    
    fused_scores = {}

    for docs in results:
        for rank, doc in enumerate(docs):
            doc_str = dumps(doc)
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            previous_score = fused_scores[doc_str]
            fused_scores[doc_str] += 1 / (rank + k)

    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]
    
    time.sleep(5)
    return reranked_results



## HyDE

A nyelvi modell először generál egy lehetséges válaszdokumentumot a kérdés alapján. Ezt a feltételezett válaszdokumentumot átdolgozza embeddinggé.

Ezzel az embeddinggel keres dokumentumokat egy tudásbázisban.

A lekért dokumentumokból a modell végül valódi választ generál.

In [258]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

template = """Please write a scientific paper passage to answer the question
Question: {question}
Passage:"""
prompt_hyde = ChatPromptTemplate.from_template(template)

generate_docs_for_retrieval = (
    prompt_hyde | ChatMistralAI(temperature=0) | StrOutputParser() 
)

## Kiértékelés RAGAs-sal a teszthalmazon

In [326]:
from datasets import Dataset

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

from operator import itemgetter
from langchain_core.runnables import RunnableLambda

def evaluate_test_data(llm, retriever_type = "embedding", rag_fusion = False, hyde = False):
    questions = [item["question"] for item in repo_data_test]
    ground_truths = [item["answer"] for item in repo_data_test]
    
    answers = []
    contexts = []

    for query in questions:

        index = [item["index"] for item in repo_data_test if item["question"] == query][0]
        documents = [item["documents"] for item in repo_data_test if item["question"] == query][0]
        
        #context megszerzése a választott retriever-től függően
        context =[]
        if retriever_type == "bm25":
            bm25, tokenized_docs = build_bm25_index(documents)
            context = retrieve_relevant_document_bm25(query, bm25, documents, tokenized_docs)

        elif retriever_type == "hybrid":
            bm25, tokenized_docs = build_bm25_index(documents)
            context = retrieve_relevant_document_hybrid(query, bm25, tokenized_docs, index, OpenAIEmbeddings(), documents)
        else:
            context = retrieve_relevant_document(query, index, documents, OpenAIEmbeddings(), 10)

        #Ha a HYDE-t használjuk
        if hyde:
            retriever = RunnableLambda(lambda query: retrieve_relevant_document(query, index, documents, OpenAIEmbeddings(), 10))
            retrieval_chain = generate_docs_for_retrieval | retriever
            retrieved = retrieval_chain.invoke({"question":query})
            context = retrieved

        template = """Answer the following question based on this context:

            {context}

            Question: {question}
            """

        prompt = ChatPromptTemplate.from_template(template)

        #Ha a RAG-fusion-t használjuk
        if rag_fusion:

            retriever = RunnableLambda(lambda query: retrieve_relevant_document(query, index, documents, OpenAIEmbeddings(), 10))

            retrieval_chain_rag_fusion = (
                generate_queries
                | retriever.map()
                | reciprocal_rank_fusion
            )

            final_rag_chain = (
                {"context": retrieval_chain_rag_fusion, 
                "question": itemgetter("question")} 
                | prompt
                | llm
                | StrOutputParser()
            )
        
            answers.append(final_rag_chain.invoke({"question":query}))
        else:
            chain = prompt | llm | StrOutputParser()

            content = chain.invoke({"context": context,"question":query})
            answers.append(content)
                
        
        contexts.append(contexts)

        time.sleep(1)


    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "reference": ground_truths
    }

    dataset = Dataset.from_dict(data)

    result = evaluate(
        dataset = dataset, 
        metrics=[
            context_precision,
            context_recall,
            faithfulness,
            answer_relevancy,
        ],
        
    )

    df = result.to_pandas()

    return df

In [327]:
def show_evaluation_results(llm, retriever_type = "embedding", rag_fusion = False, hyde = False):
    df = evaluate_test_data(llm, retriever_type, rag_fusion, hyde)

    context_prediction = df["context_precision"].mean()
    context_recall = df["context_recall"].mean()
    faithfulness = df["faithfulness"].mean()
    answer_relevancy = df["answer_relevancy"].mean()

    print(f"Context Precision: {context_prediction:.4f}")
    print(f"Context Recall: {context_recall:.4f}")
    print(f"Faithfulness: {faithfulness:.4f}")
    print(f"Answer Relevancy: {answer_relevancy:.4f}")

    return df

In [None]:
def calc_evaluation_results(llm, retriever_type = "embedding", rag_fusion = False, hyde = False):
    df = evaluate_test_data(llm, retriever_type, rag_fusion, hyde)

    context_prediction = df["context_precision"].mean()
    context_recall = df["context_recall"].mean()
    faithfulness = df["faithfulness"].mean()
    answer_relevancy = df["answer_relevancy"].mean()

    return context_prediction, context_recall, faithfulness, answer_relevancy

In [None]:
import pandas as pd
from itertools import product

llm_values = [llm_mistral, llm_raft_mistral, llm_raft_openai]
retriever_types = ["embedding", "bm25", "hybrid"]
rag_fusion_options = [True, False]
hyde_options = [True, False]


combinations = list(product(llm_values, retriever_types, rag_fusion_options, hyde_options))

results = []
for llm, retriever, fusion, hyde in combinations:
    ctx_pred, ctx_recall, faithful, answer_rel = calc_evaluation_results(llm, retriever, fusion, hyde)
    results.append({
        "LLM": llm,
        "Retriever": retriever,
        "RAG Fusion": fusion,
        "HyDE": hyde,
        "Context Prediction": ctx_pred,
        "Context Recall": ctx_recall,
        "Faithfulness": faithful,
        "Answer Relevancy": answer_rel
    })




Evaluating: 100%|██████████| 80/80 [00:54<00:00,  1.46it/s]
Evaluating: 100%|██████████| 80/80 [00:59<00:00,  1.34it/s]
Evaluating: 100%|██████████| 80/80 [01:26<00:00,  1.08s/it]
Evaluating: 100%|██████████| 80/80 [00:55<00:00,  1.44it/s]
Evaluating: 100%|██████████| 80/80 [00:56<00:00,  1.41it/s]
Evaluating: 100%|██████████| 80/80 [00:47<00:00,  1.68it/s]
Evaluating:  35%|███▌      | 28/80 [00:23<00:44,  1.16it/s]Exception raised in Job[18]: OutputParserException(Invalid json output: {"statements": [{"statement": "The likely value of `project_slug` after the template expression is processed would be \"its-slugified-foobar\".","reason": "The context provides a template expression for `project_slug` that uses the slugify filter on 'It\'s slugified Foobar'. However, the exact output of the slugify filter is not specified, so we cannot definitively conclude that it would be 'its-slugified-foobar'.","verdict": 0},{"statement": "The assumption is that the slugify filter converts the input 

In [None]:
df = pd.DataFrame(results)

In [113]:
pd.set_option('display.max_colwidth', None)
df

Unnamed: 0,LLM,Retriever,RAG Fusion,HyDE,Context Prediction,Context Recall,Faithfulness,Answer Relevancy
0,client=<httpx.Client object at 0x0000017FB38F3040> async_client=<httpx.AsyncClient object at 0x0000018013516AD0> mistral_api_key=SecretStr('**********') endpoint='https://api.mistral.ai/v1' temperature=0.0 model_kwargs={},embedding,True,True,0.615397,0.675,0.574074,0.77685
1,client=<httpx.Client object at 0x0000017FB38F3040> async_client=<httpx.AsyncClient object at 0x0000018013516AD0> mistral_api_key=SecretStr('**********') endpoint='https://api.mistral.ai/v1' temperature=0.0 model_kwargs={},embedding,True,False,0.639254,0.558333,0.665139,0.766164
2,client=<httpx.Client object at 0x0000017FB38F3040> async_client=<httpx.AsyncClient object at 0x0000018013516AD0> mistral_api_key=SecretStr('**********') endpoint='https://api.mistral.ai/v1' temperature=0.0 model_kwargs={},embedding,False,True,0.576892,0.725,0.867447,0.71535
3,client=<httpx.Client object at 0x0000017FB38F3040> async_client=<httpx.AsyncClient object at 0x0000018013516AD0> mistral_api_key=SecretStr('**********') endpoint='https://api.mistral.ai/v1' temperature=0.0 model_kwargs={},embedding,False,False,0.641043,0.7,0.737698,0.773911
4,client=<httpx.Client object at 0x0000017FB38F3040> async_client=<httpx.AsyncClient object at 0x0000018013516AD0> mistral_api_key=SecretStr('**********') endpoint='https://api.mistral.ai/v1' temperature=0.0 model_kwargs={},bm25,True,True,0.569474,0.7,0.544032,0.718299
5,client=<httpx.Client object at 0x0000017FB38F3040> async_client=<httpx.AsyncClient object at 0x0000018013516AD0> mistral_api_key=SecretStr('**********') endpoint='https://api.mistral.ai/v1' temperature=0.0 model_kwargs={},bm25,True,False,0.361042,0.233333,0.260615,0.762268
6,client=<httpx.Client object at 0x0000017FB38F3040> async_client=<httpx.AsyncClient object at 0x0000018013516AD0> mistral_api_key=SecretStr('**********') endpoint='https://api.mistral.ai/v1' temperature=0.0 model_kwargs={},bm25,False,True,0.617678,0.675,0.773844,0.810178
7,client=<httpx.Client object at 0x0000017FB38F3040> async_client=<httpx.AsyncClient object at 0x0000018013516AD0> mistral_api_key=SecretStr('**********') endpoint='https://api.mistral.ai/v1' temperature=0.0 model_kwargs={},bm25,False,False,0.361042,0.233333,0.656508,0.567507
8,client=<httpx.Client object at 0x0000017FB38F3040> async_client=<httpx.AsyncClient object at 0x0000018013516AD0> mistral_api_key=SecretStr('**********') endpoint='https://api.mistral.ai/v1' temperature=0.0 model_kwargs={},hybrid,True,True,0.578968,0.725,0.587954,0.764314
9,client=<httpx.Client object at 0x0000017FB38F3040> async_client=<httpx.AsyncClient object at 0x0000018013516AD0> mistral_api_key=SecretStr('**********') endpoint='https://api.mistral.ai/v1' temperature=0.0 model_kwargs={},hybrid,True,False,0.464167,0.383333,0.352163,0.672569
