In [None]:
pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain

In [None]:

pip install -U langchain langchain-openai

In [None]:
import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = 'api_key' #kicseréltem, hogy a githubon ne legyen benne a kulcs

In [None]:
os.environ['OPENAI_API_KEY'] = 'api_key'

In [None]:
pip install PyGithub

In [None]:
pip install faiss-cpu


In [None]:
pip install faiss-cpu

## Github repo feldolgozása és indexelés

In [9]:
import requests
from github import Github
from bs4 import BeautifulSoup
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
import faiss
import numpy as np

In [10]:
#GitHub Repoból kiszedi a szükséges dolgokat (kód, dokumentáció)
def clone_repo(github_token, repo_name):
    g = Github(github_token)
    repo = g.get_repo(repo_name)
    
    contents = repo.get_contents("")
    
    files = []
    while contents:
        file_content = contents.pop(0)
        if file_content.type == "dir":
            contents.extend(repo.get_contents(file_content.path))
        else:
            if file_content.name.endswith(('.md', '.py', '.txt', '.json', '.yml', '.c', '.h', '.cpp', '.hpp')): # Ezeket a fájlokat fogadja el egyenlőre
                file_data = {
                    "file_name": file_content.name,
                    "file_content": requests.get(file_content.download_url).text
                }
                files.append(file_data)
    
    return files

In [11]:
#A Issue-k kiszedése a repo-ból
def fetch_issues(github_token, repo_name):
    g = Github(github_token)
    repo = g.get_repo(repo_name)
    
    issues = []
    for issue in repo.get_issues(state='open'):
        issue_data = {
            "file_name": f"issue_{issue.number}",
            "file_content": f"Title: {issue.title}\nDescription: {issue.body}"
        }
        issues.append(issue_data)
    
    return issues

In [12]:
#A fájlok kisebb darabokra bontása
def preprocess_files_old(files):
    text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=0)
    documents = []
    
    for file in files:
        content = file['file_content']
        
        #Markdown fájlok esetén HTML kód eltávolítása
        if file['file_name'].endswith(('.md', '.html')):
            soup = BeautifulSoup(content, 'html.parser')
            content = soup.get_text()
        
        chunks = text_splitter.split_text(content)
        
        for chunk in chunks:
            documents.append(chunk)
    
    return documents

In [13]:
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

EXTENSION_LANGUAGE_MAP = {
    '.py': Language.PYTHON,
    '.cpp': Language.CPP,
    '.c': Language.C,
    '.cs': Language.CSHARP,
    '.md': Language.MARKDOWN,
    '.html': Language.HTML,
}

def get_language_from_filename(filename):
    for ext, lang in EXTENSION_LANGUAGE_MAP.items():
        if filename.endswith(ext):
            return lang
    return None

def preprocess_files(files):
    documents = []
    
    for file in files:
        filename = file['file_name']
        content = file['file_content']

        if filename.endswith(('.md', '.html')):
            soup = BeautifulSoup(content, 'html.parser')
            content = soup.get_text()

        lang = get_language_from_filename(filename)

        if lang:
            splitter = RecursiveCharacterTextSplitter.from_language(
                language=lang, chunk_size=500, chunk_overlap=0
            )
        else:
            splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

        docs = splitter.create_documents([content])
        documents.extend(doc.page_content for doc in docs)

    return documents


In [14]:
#Embeddingek generálása  OpenAI embeddings-el és FAISS indexeléssel
def create_embeddings(documents):
    embeddings = OpenAIEmbeddings()
    
    doc_embeddings = embeddings.embed_documents(documents)
    doc_embeddings_np = np.array(doc_embeddings)
    
    index = faiss.IndexFlatL2(doc_embeddings_np.shape[1])
    index.add(doc_embeddings_np)
    
    return index, doc_embeddings_np, documents

In [None]:
#Egy konkrét repora alkalmazva a fenti funkciókat

github_token = 'api_key'
repo_name = "pydantic/pydantic"
    
files = clone_repo(github_token, repo_name)
    
issues = fetch_issues(github_token, repo_name)
    
all_documents = files + issues

In [16]:
documents = preprocess_files(all_documents)
    
index, doc_embeddings_np, documents = create_embeddings(documents)

  embeddings = OpenAIEmbeddings()


## Retrieval

In [17]:
#A kérdés (Query) feldolgozása és a legrelevánsabb dokumentumok visszaadása
def retrieve_relevant_document(query, index, documents, embeddings, k=5):
    query_embedding = embeddings.embed_query(query)
    query_embedding_np = np.array(query_embedding).reshape(1, -1)
    D, I = index.search(query_embedding_np, k)
    
    relevant_docs = [documents[i] for i in I[0]]
    return relevant_docs

In [18]:
#Példa kérdés
query = "Does this program support Python 3.14?"
relevant_docs = retrieve_relevant_document(query, index, documents, OpenAIEmbeddings())
    
print(f"5 válasz legrevelánsabb dokumentum:")
for doc in relevant_docs:
        print(f"- {doc}")

5 válasz legrevelánsabb dokumentum:
- Title: Add support for Python 3.14
Description: First 3.14 beta release is [planned on 2025-05-06](https://peps.python.org/pep-0745/#release-schedule) and PEP 649/749 is almost fully implemented.

Considering the significant changes it provides to the runtime evaluation of type hints, we should add support to 3.14 and report any bugs/issues.
- python version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]
                     platform: Linux-5.15.133.1-microsoft-standard-WSL2-x86_64-with-glibc2.35
             related packages: mypy-1.8.0 typing_extensions-4.9.0
                       commit: unknown
```
- python version: 3.12.7 (main, Oct  1 2024, 11:15:50) [GCC 14.2.1 20240910]
                     platform: Linux-6.11.5-arch1-1-x86_64-with-glibc2.40
             related packages: typing_extensions-4.12.2 fastapi-0.115.0 mypy-1.13.0 pydantic-settings-2.6.1
                       commit: unknown
```
- python version: 3.10.13 (main, Sep 11 2023

## Generation

In [19]:
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

template = """Answer the question based only on the following context:
{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'), additional_kwargs={})])

In [20]:
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [21]:
chain = prompt | llm

In [22]:
chain.invoke({"context":relevant_docs,"question":"Does this program support Python 3.14?"})

AIMessage(content='Based on the context provided, the program does not currently support Python 3.14.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 533, 'total_tokens': 552, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-BM2DKRgAVmnj4ngloqv1BNzqW7DX2', 'finish_reason': 'stop', 'logprobs': None}, id='run-dca61ff7-0238-41a0-89ec-73154f63f3cf-0', usage_metadata={'input_tokens': 533, 'output_tokens': 19, 'total_tokens': 552, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

## Kiértékelés RAGAs-sal

In [None]:
pip install ragas

In [24]:
from datasets import Dataset

questions = ["Does this program support Python 3.14?", 
             "What is the purpose of the REDIRECT_TO_V1 dictionary?",
             "What is the purpose of the is_root_model function",
            ]
ground_truths = ["No, this program supports Python 3.14. It is only planned for the future.",
                "The REDIRECT_TO_V1 dictionary automatically redirects certain utility functions from the main Pydantic namespace to their v1 implementations, generating a warning message that informs users about the change while maintaining backward compatibility. This helps manage the transition between major versions by ensuring old code continues to work while encouraging updates to newer patterns.",
                "The is_root_model function determines whether a given TypeInfo represents a RootModel subclass or the RootModel class itself. It does this by checking if the type has a base class matching the ROOT_MODEL_FULLNAME constant, which is defined as 'pydantic.root_model.RootModel'."]
answers = []
contexts = []

for query in questions:
  context = retrieve_relevant_document(query, index, documents, OpenAIEmbeddings())
  answers.append(chain.invoke({"context": context,"question":query}).content)
  contexts.append(context)

data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "reference": ground_truths
}

dataset = Dataset.from_dict(data)

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
data

{'question': ['Does this program support Python 3.14?',
  'What is the purpose of the REDIRECT_TO_V1 dictionary?',
  'What is the purpose of the is_root_model function'],
 'answer': ['Based on the provided context, the program does not currently support Python 3.14.',
  "The purpose of the REDIRECT_TO_V1 dictionary is to redirect certain objects from the 'pydantic.utils' module to the 'pydantic.v1.utils' module.",
  "The purpose of the `is_root_model` function is to convert the `RootModel` to a dictionary with the key `'root'`."],
 'contexts': [['Title: Add support for Python 3.14\nDescription: First 3.14 beta release is [planned on 2025-05-06](https://peps.python.org/pep-0745/#release-schedule) and PEP 649/749 is almost fully implemented.\n\nConsidering the significant changes it provides to the runtime evaluation of type hints, we should add support to 3.14 and report any bugs/issues.',
   'python version: 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]\r\n                     pla

In [26]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

result = evaluate(
    dataset = dataset, 
    metrics=[
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
    ],
)

df = result.to_pandas()

Evaluating: 100%|██████████| 12/12 [00:10<00:00,  1.17it/s]


In [27]:
df

Unnamed: 0,user_input,retrieved_contexts,response,reference,context_precision,context_recall,faithfulness,answer_relevancy
0,Does this program support Python 3.14?,[Title: Add support for Python 3.14\nDescripti...,"Based on the provided context, the program doe...","No, this program supports Python 3.14. It is o...",1.0,0.5,0.5,0.989978
1,What is the purpose of the REDIRECT_TO_V1 dict...,[REDIRECT_TO_V1 = {\n f'pydantic.utils:{obj...,The purpose of the REDIRECT_TO_V1 dictionary i...,The REDIRECT_TO_V1 dictionary automatically re...,1.0,0.0,1.0,1.0
2,What is the purpose of the is_root_model function,"[::: pydantic.root_model, ```\n\nThis trick is...",The purpose of the `is_root_model` function is...,The is_root_model function determines whether ...,0.0,0.0,0.5,0.963927


## RAFT

## Adathalmaz készítése a RAFT-hoz

In [68]:
repo_qa_data = {
    "pydantic/pydantic": [
        {"question": "Does this program support Python 3.14?", "answer": "No, this program supports Python 3.14. It is only planned for the future."},
        {"question": "What is the purpose of the REDIRECT_TO_V1 dictionary?", "answer": "The REDIRECT_TO_V1 dictionary automatically redirects certain utility functions from the main Pydantic namespace to their v1 implementations, generating a warning message that informs users about the change while maintaining backward compatibility. This helps manage the transition between major versions by ensuring old code continues to work while encouraging updates to newer patterns."},
        {"question": "What is the purpose of the is_root_model function?", "answer": "The is_root_model function determines whether a given TypeInfo represents a RootModel subclass or the RootModel class itself. It does this by checking if the type has a base class matching the ROOT_MODEL_FULLNAME constant, which is defined as 'pydantic.root_model.RootModel'."},
        {"question": "What exactly does that refer to that we want reduce the dependence on callables to mutate JSON schemas. What exactly does this refer to, and why is it desirable based on the information provided?", "answer": "The desire to reduce this dependence stems from the description of the existing callable annotation construct as nebulous and the stated goal to perform simple dict updates when possible instead, suggesting the current callable-based approach is considered complex or unclear compared to more direct methods"},
        {"question": "The definition for JsonValue uses a complex structure involving Union, Annotated, Tag, and a callable Discriminator (_get_type_name). How does this combination specifically ensure that only JSON-compatible types (like lists, dicts with string keys, strings, numbers, booleans, None) and nested structures of these types are validated?", "answer": "JsonValue uses a Union of basic JSON-compatible types (list, dict, str, int, float, bool, None), where each type is Annotated with a Tag (its type name). A Discriminator function (_get_type_name) identifies the input's type name. Pydantic matches this name to the Tag to select the correct validator. The list and dict types are defined recursively (list['JsonValue'], dict[str, 'JsonValue']) to handle nested structures. If the input type doesn't match any tag, validation fails. An internal annotation (_AllowAnyJson) also handles parsing if the input is a JSON string."},
    ],
    "pallets/flask": [
        {"question": "What is Flask", "answer": "Flask is a lightweight WSGI web application framework. It is designed to make getting started quick and easy, with the ability to scale up to complex applications. It began as a simple wrapper around Werkzeug and Jinja, and has become one of the most popular Python web application frameworks."},
        {"question": "The PathDispatcher example uses the shift_path_info function from wsgiref.util after identifying the target application but before calling it. What specific function does shift_path_info perform in this context?", "answer": "The document shows the PathDispatcher calling shift_path_info(environ) after retrieving the application (app) based on the path prefix and before calling app(environ, start_response). However, the document does not explicitly explain what the shift_path_info function does or why it is necessary in this specific dispatching scenario. It only demonstrates its usage within the code example."},
        {"question": "Is there a security vulnerability for Jinja2 GHSA-cpwx-vrp4-4pq7?", "answer": "No, the issue was closed"},
        {"question": "The methods do_teardown_request and do_teardown_appcontext define their exc parameter with a default value of _sentinel (imported from .sansio.scaffold). Why does the code use this specific sentinel object as the default instead of None?", "answer": "he code shows that if the exc parameter is the _sentinel object upon entering the do_teardown_request or do_teardown_appcontext methods, it retrieves the current exception information using sys.exc_info()[1]. This allows the functions to detect if an exception occurred during the request/app context lifecycle even if no exception was explicitly passed to the teardown function. However, the code itself does not explicitly explain the rationale for choosing the _sentinel object over using None as the default to distinguish between no exception passed and potentially None passed explicitly."},
        {"question": "The flash function contains a comment explaining that the original implementation using session.setdefault('_flashes', []).append(...) had issues. what was this issue, and how does the current implementation address it?", "answer": "According to the comment within the flash function, the original implementation assumed that modifying the list obtained via session.setdefault() would always update the session object itself. This assumption is incorrect for session implementations that use external storage (where the session object and its stored data might not be the same in-memory object). The current implementation addresses this by explicitly retrieving the list using session.get(_flashes, []), appending the new message to this retrieved list, and then explicitly assigning the modified list back to the session via session[_flashes] = flashes, ensuring the change is correctly persisted regardless of the session backend."},
    ],
    "gandalfcode/gandalf": [
        {"question": "The RiemannSolver constructor accepts a zeroMassFlux boolean parameter, which is later used in the ExactRiemannSolver::ComputeFluxes method. According to the comments and code within this file, what is the specific purpose of this zeroMassFlux option, and how does the code modify the flux calculation when it is enabled?", "answer": "According to comments in ComputeFluxes, the zeroMassFlux boolean option is intended for use with the Meshless finite-mass scheme of Hopkins 2015, specifically when the computational face moves with the velocity of the star region (ustar). When this option is true, the code implements the zero mass flux condition by:Setting the normal velocity component (Wface[ivx]) of the state sampled at the interface (s=0) to zero. Modifying the input face velocity (vface) by adding the star region's velocity (u, which is ustar at s=0) projected onto the original coordinate axes (u*runit[k]). The file does not provide further details on the Hopkins scheme itself or the theoretical justification for these specific modifications beyond stating they achieve zero mass flux in that context."},
        {"question": "What is the purpose of the gamma parameter in the RiemannSolver?", "answer": "Adiabatic index"},
        {"question": "Based on the parameters read from the configuration file, how does the ProcessSphParameters function determine which specific SPH kernel class (like M4Kernel, QuinticKernel, GaussianKernel, or TabulatedKernel) should be instantiated for the sph object?", "answer": "The function selects the SPH kernel class by first checking the tabulated_kernel integer parameter, and if it's zero, it then uses the kernel string parameter to choose between M4Kernel, QuinticKernel, or GaussianKernel."},
        {"question": "The PlotCommand.processCommand method includes a section described as a Hack for deleting the old colorbar when overplot is false. According to the code and comments, why is this necessary, and what specific steps does it take to remove the old colorbar?", "answer": "According to the code and comments, this hack is performed when overplot is false to remove any existing colorbar associated with the axes (ax) before drawing a new plot. It works by retrieving the colorbar object (if it exists) from an internal plotting.axesimages dictionary, explicitly deleting the colorbar's axes from the figure (fig.delaxes(cbar.ax)), and then adjusting the geometry of the main plot's axes (ax.change_geometry(...)) presumably to reclaim the space. The code doesn't explicitly state why ax.clear() alone is insufficient for this."},
        {"question": "In the neighbour finding section, why does the code calculate the angle between the source-particle vector and the source-neighbour vector instead of just selecting the closest neighbour that is also closer to the source?", "answer": "The code calculates the angle to find the neighbour that lies closest to the direct line-of-sight path between the particle and the source, rather than just the nearest particle, to build a chain for subsequent photon absorption calculations along that path."},
    ]
}

In [69]:
def generate_everything(repo_name, github_token):
    files = clone_repo(github_token, repo_name)
    issues = fetch_issues(github_token, repo_name)
    all_documents = files + issues
    documents = preprocess_files(all_documents)
    index, doc_embeddings_np, documents = create_embeddings(documents)
    return index, documents

In [70]:
repo_data_with_index_documents = []

for repo_name, qa_pairs in repo_qa_data.items():
    index, documents = generate_everything(repo_name, github_token)
    for qa in qa_pairs:
        repo_data_with_index_documents .append({
            "question": qa["question"],
            "answer": qa["answer"],
            "index": index,
            "documents": documents
        })

In [102]:
#train és test adatok szétválasztása
import random
random.seed(12) 
shuffled = repo_data_with_index_documents[:]
random.shuffle(shuffled)
split_point = int(len(repo_data_with_index_documents) * 0.9)
repo_data_train = shuffled[:split_point]
repo_data_test = shuffled[split_point:]

## Adathalmaz átalakítása a fine tuning-hoz

In [104]:
NUM_DISTRACTORS = 4 #Distractor dokumentumok száma
PERCENT_D_STAR_INCLUDED = 0.8 #D* arány

In [105]:
#A D* megtalálásához szükséges segéd függvény
#Egy llm-et kérdez meg, hogy a kérdés válasz alapján melyik dokumentumban van a válasz

from langchain_core.messages import HumanMessage

llm = ChatOpenAI(model="gpt-4", temperature=0.2)

def call_llm_to_find_doc(question, answer, candidate_docs):
    prompt = f"""You are an AI assistant helping with document understanding.

Question: "{question}"
Answer: "{answer}"

Here are some candidate document chunks (labeled D1, D2, ..., Dn):

"""
    for i, doc in enumerate(candidate_docs):
        prompt += f"D{i+1}: {doc}\n\n"

    prompt += """
Please analyze the documents above and decide which document (D1, D2, ..., Dn) most likely contains the answer to the question. 

If none of the documents are relevant or contain the answer, reply with "None".
Otherwise, reply with the label of the best matching document, such as "D2".
Only respond with "None" or a label like "D3"—no explanation.
"""

    response = llm.invoke([HumanMessage(content=prompt)])
    reply = response.content.strip()
    return reply

In [None]:
#A D* megtalálásához szükséges fő függvény
#Megkeresi a dokumentumot a kérdés-válasz alapján, amiben a válasz található
def find_D_star_document(question, answer, all_docs, embeddings, index):
    candidate_docs = retrieve_relevant_document(question, index, all_docs, embeddings, k=10)

    if not candidate_docs:
        print("Nem találtunk releváns dokumentumot.")
        return None

    best_doc_label = call_llm_to_find_doc(question, answer, candidate_docs)

    best_doc_label = best_doc_label.strip('"')
    if best_doc_label.lower() == "none":
        return None

    if best_doc_label.startswith("D") and best_doc_label[1:].isdigit():
        doc_index = int(best_doc_label[1:]) - 1
        if 0 <= doc_index < len(candidate_docs):
            return candidate_docs[doc_index]

    print("A LLM válasza nem értelmezhető:", best_doc_label)
    return None

In [107]:
def select_distractor_documents(question, D_star_doc, all_docs, embeddings, index, k):
    retrieved_docs = retrieve_relevant_document(question, index, all_docs, embeddings, k=k*2 + 1) #Több választ kér, hogy legyen benne Di is

    distractors = []
    for doc in retrieved_docs:
        if doc != D_star_doc and doc not in distractors: #Ellenőrzi, hogy ne legyen benne a D* dokumentum
            distractors.append(doc)
        if len(distractors) == k:
            break

    #Ha nincs még elég distractor, akkor random választ a többi dokumentumból
    attempts = 0
    max_attempts = k * 5
    while len(distractors) < k and attempts < max_attempts:
        random_doc = random.choice(all_docs)
        if random_doc != D_star_doc and random_doc not in distractors:
            distractors.append(random_doc)
        attempts += 1

    return distractors

In [108]:
#Chain-of-Thought (CoT) generálás
def generate_cot_answer(question, D_star_doc, original_answer):
    prompt = f"""Given the Question, the Context (containing the golden document with the answer), and the original concise Answer, provide a detailed reasoning process (Chain-of-Thought) that explains step-by-step how to arrive at the Answer using *only* the provided Context.

Crucially, you MUST cite the exact sentences or phrases from the Context that support your reasoning. Enclose these citations within ##begin_quote## and ##end_quote## tags. Do not add any information not present in the Context.

Finally, state the concise Answer clearly. Format your response *exactly* as:
##Reason: [Your detailed reasoning with citations like ##begin_quote## text from context ##end_quote##.]
##Answer: [The final concise answer, matching the original answer provided]

Question: {question}
Context: {D_star_doc}
Original Answer: {original_answer}
"""

    response = llm.invoke([HumanMessage(content=prompt)])
    reply = response.content.strip()
    return reply

In [None]:
#RAFT-hoz megfelelő adatok generálása
raft_training_data = []
skipped_count = 0

for item in repo_data_train:
    question = item['question']
    original_answer = item['answer']
    index = item['index']
    all_document_chunks = item['documents']

    #Először meg kell találni a D* dokumentumot
    D_star_doc = find_D_star_document(question, original_answer, all_document_chunks, OpenAIEmbeddings(), index)
    if not D_star_doc:
        print(f"A '{question} ' kérdés kihagyása. D* dokumentum nem található.")
        skipped_count += 1
        continue

    #CoT válasz generálása
    cot_answer_str = generate_cot_answer(question, D_star_doc, original_answer)
    if not cot_answer_str:
        print(f"A '{question} ' kérdés kihagyása.  CoT generálás sikertelen.")
        skipped_count += 1
        continue

    #Distractor dokumentumok kiválasztása
    distractor_docs = select_distractor_documents(question, D_star_doc, all_document_chunks, OpenAIEmbeddings(), index, k=NUM_DISTRACTORS)

    #A D* benne legyen-e a válaszban egy adott elemnél
    include_D_star = random.random() < PERCENT_D_STAR_INCLUDED

    context_docs_for_instance = []
    if include_D_star:
        #D* dokumentum benne van a válaszban
        context_docs_for_instance.append(D_star_doc)
        context_docs_for_instance.extend(distractor_docs)
    else:
        #Csak a distractor dokumentumokat tartalmazza
        context_docs_for_instance.extend(distractor_docs)


    random.shuffle(context_docs_for_instance) #a dokumentumok megkeverése

    context_string = "\n\n".join(context_docs_for_instance) #A kontexus létrehozása

    #fine-tuning-hez megfelelő formátum létrehozása 
    formatted_input = f"Question: {question}\n\nContext:\n{context_string}"
    formatted_output = cot_answer_str #CoT válasz

    raft_training_data.append({
        "input": formatted_input,
        "output": formatted_output
    })

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [110]:
 raft_training_data

[{'input': 'Question: In the neighbour finding section, why does the code calculate the angle between the source-particle vector and the source-neighbour vector instead of just selecting the closest neighbour that is also closer to the source?\n\nContext:\nfor (ii=0;ii<N;ii++) \t\t\t\t//Loop over all particles\r\n\t{\r\n\r\n\tfor (pp=0;pp<nos;pp++)\t\t//For each source\r\n\t\t{\r\n\t\t//For each particle that considers me a neighbour\r\n\t\tfor (jj=0;jj<ionisedsph[ii].neighstorcont;jj++)\r\n\t\t\t{\r\n\t\t\t//Work out the distances for both the test and candidate particle\r\n\t\t\tdistanceii=sqrt(pow(ionisedsph[ii].x-ionisedsph[sinkid[pp]].x,2.)+pow(ionisedsph[ii].y-ionisedsph[sinkid[pp]].y,2.)+pow(ionisedsph[ii].z-ionisedsph[sinkid[pp]].z,2.));\n\nif (angletest<ionisedsph[ii].angle[pp])\r\n\t\t\t\t\t{\r\n\t\t\t\t\tionisedsph[ii].angle[pp]=angletest;\t\t\t\t//Set new comparison angle to be that of the neighbour\r\n\t\t\t\t\tionisedsph[ii].neigh[pp]=ionisedsph[ii].neighstor[jj];\t//Writ

## Fine-tuning

In [111]:
import json
import time
from openai import OpenAI
client = OpenAI()

openai_formatted_filename = "openai_formatted_training_data.jsonl"
base_model = "gpt-3.5-turbo-0125"


#Konverzió OpenAI chat formátumra

converted_count = 0
with open(openai_formatted_filename, 'w', encoding='utf-8') as outfile:
    for raft_example in raft_training_data:
        
        user_content = raft_example.get("input")
        assistant_content = raft_example.get("output")

        if not user_content or not assistant_content:
            print(f" Az 'input' vagy 'output' üres: {raft_example}")
            continue

        openai_message = {
            "messages": [
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": assistant_content}
            ]
        }

        outfile.write(json.dumps(openai_message) + '\n')
        converted_count += 1

print(f"Konvertálva {converted_count}")

Konvertálva 8


In [83]:
#Fájl feltöltése fine-tuninghoz

with open(openai_formatted_filename, "rb") as f:
    training_file = client.files.create(
        file=f,
        purpose="fine-tune"
    )
training_file_id = training_file.id

In [84]:
#Fine-Tuning Job

job = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    model=base_model
)


In [None]:
#A fine tuning folyamatának figyelése
job_id = job.id
job_status = client.fine_tuning.jobs.retrieve(job_id)
status = job_status.status
print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Job Status: {status}")

In [96]:
fine_tuned_model_id = job_status.fine_tuned_model

## Fine-tuning + RAG

In [None]:

llm_raft = ChatOpenAI(model_name=fine_tuned_model_id, temperature=0)


template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

chain_raft = prompt | llm_raft

In [None]:
#Tesztekés egy kérdésen

sample_query = repo_data_test[0]['question']
print(f" Kérdés: {sample_query}")

sample_relevant_docs = retrieve_relevant_document(sample_query, index, documents, OpenAIEmbeddings())
sample_context_str = "\n\n".join(sample_relevant_docs)

response_raft = chain_raft.invoke({
    "context": sample_context_str,
    "question": sample_query
})


print(response_raft.content)

## Kiértékelés RAGAs-sal a teszthalmazon

In [None]:
def evaluate_test_data(chain):
    questions = [item["question"] for item in repo_data_test]
    ground_truths = [item["answer"] for item in repo_data_test]
    answers = []
    contexts = []

    for query in questions:
        context = retrieve_relevant_document(query, index, documents, OpenAIEmbeddings())
        answers.append(chain.invoke({"context": context,"question":query}).content)
        contexts.append(context)

    data = {
        "question": questions,
        "answer": answers,
        "contexts": contexts,
        "reference": ground_truths
    }

    dataset = Dataset.from_dict(data)

    result = evaluate(
        dataset = dataset, 
        metrics=[
            context_precision,
            context_recall,
            faithfulness,
            answer_relevancy,
        ],
    )

    df = result.to_pandas()

    return df

In [None]:
df_original = evaluate_test_data(chain)
df_raft = evaluate_test_data(chain_raft)

In [None]:
df_original

In [None]:
df_raft