<h1>This is a python notebook containing our implmementation of a RAG for code generation, completion, debugging and other code relatied "natural" language queries</h1>

In [2]:
!pip install chromadb sentence_transformers pandas bs4



You should consider upgrading via the 'c:\users\38641\documents\faks\5.letnik\2.semester\nlp\ul-fri-nlp-course-project-2024-2025-1-6-3-musketeers\venv\scripts\python.exe -m pip install --upgrade pip' command.


In [1]:
import chromadb
from sentence_transformers import SentenceTransformer
import pandas as pd
import time
import uuid
from bs4 import BeautifulSoup

  from .autonotebook import tqdm as notebook_tqdm


Read the Stackoverflow questions

In [None]:
data = ['python_questions0.csv']
MAX_DOCS = 500
num_retrievers = 10
num_reranked_docs = 3
df = pd.DataFrame()
for d in data:
    df = pd.concat([df, pd.read_csv(d)], ignore_index=True)

    
df = df.loc[:min(len(df), MAX_DOCS-1), ["tags", "question_title", "question_body", "answer", "question_score"]]
total_docs = len(df)
print(f"Loaded {total_docs} questions")

Loaded 500 questions


Chunk the questions and prepare them to be embedded

In [3]:
chunks = []
min_code_block = 10

for ix, content in df.iterrows():
    answer = content.loc['answer']
    tags = content.loc["tags"]
    score = content.loc["question_score"]

    question_chunk = f"{content.loc['question_title']}\n{content.loc['question_body']}".lower()
    chunks.append({"chunk": question_chunk,
                   "metadata": {"tags": tags,
                                "score": score,
                                "question": True,
                                "code": False,
                                "answer": answer.lower()
                                }})
    
    answer_chunk = str(answer).lower()
    chunks.append({"chunk": answer_chunk,
                   "metadata": {"tags": tags,
                   "score": score,
                   "question": False,
                   "code": False}
                   })

    soup = BeautifulSoup(answer, 'html.parser')
    code_blocks = [code.get_text() for code in soup.find_all('code')]
    for block in code_blocks:
        if len(block) > min_code_block and '\n' in block.strip():
            chunks.append({"chunk": block.lower(),
                           "metadata": {"tags": tags,
                                        "score": score,
                                        "question": False,
                                        "code": True}})

chunks = pd.DataFrame(chunks)
total_chunks = len(chunks)
print(f"Prepared {total_chunks} chunks.")

Prepared 1412 chunks.


Initiate the embedder model and the vector database to store embeddings

In [4]:
# Initialize Chroma client
client = chromadb.PersistentClient(path="./test_db")

collection = client.get_or_create_collection(
    name="stackoverflow_demo",
    metadata={"hnsw:space": "cosine"}
)

In [5]:
# Initialize model
model = SentenceTransformer('all-MiniLM-L6-v2')

Embed the chunks and save them into the database

In [6]:
def print_progress(current, total, start_time, operation="Processing"):
    elapsed = time.time() - start_time
    percent = current / total
    eta = (elapsed / current) * (total - current) if current > 0 else 0
    print(
        f"\r{operation}: {current}/{total} ({percent:.1%}) | "
        f"Elapsed: {elapsed:.1f}s | ETA: {eta:.1f}s",
        end="", flush=True
    )

In [7]:
BATCH_SIZE = 200
total_added = 0
start_time = time.time()

for batch_num in range(0, total_chunks, BATCH_SIZE):
    batch = chunks.iloc[batch_num:batch_num + BATCH_SIZE]
    
    documents = []
    metadatas = []
    ids = []
    embeddings = []
    
    for ix, row in batch.iterrows():
        chunk = row["chunk"]
        metadata = row["metadata"]
        documents.append(chunk)
        metadatas.append(metadata)
        ids.append(str(uuid.uuid4()))
        embeddings.append(model.encode(chunk).tolist())
    
    collection.add(
        documents=documents,
        metadatas=metadatas,
        ids=ids,
        embeddings=embeddings
    )
    total_added += len(documents)

    print_progress(min(batch_num + BATCH_SIZE, total_chunks), total_chunks, start_time)


print(f"\n\nSuccessfully added {total_added} documents")
print(f"Total documents in collection: {collection.count()}")
print(f"Total time: {time.time() - start_time:.2f} seconds")

Processing: 1412/1412 (100.0%) | Elapsed: 49.8s | ETA: 0.0s

Successfully added 1412 documents
Total documents in collection: 3824
Total time: 49.84 seconds


In [8]:
results = collection.get()
print(f"Total documents: {len(results['ids'])}")

# Inspect first few items
for i in range(min(3, len(results['ids']))):
    print(f"\nDocument {i+1}:")
    print(f"ID: {results['ids'][i]}")
    print(f"Content: {results['documents'][i][:200]}...")  # First 200 chars
    print(f"Metadata: {results['metadatas'][i]}")

Total documents: 3824

Document 1:
ID: 20eb6424-2233-47aa-9fe4-6a17e81056e4
Content: deleting dataframe row in pandas based on column value
<p>i have the following dataframe:</p>

<pre><code>             daysago  line_race rating        rw    wrating
 line_date                        ...
Metadata: {'question': True, 'answer': "<p>the given answer is correct nontheless as someone above said you can use <code>df.query('line_race != 0')</code> which depending on your problem is much faster. highly recommend.</p>", 'code': False, 'tags': 'python|pandas', 'score': 256}

Document 2:
ID: 2f563cda-e2fc-4134-b81f-18216853484e
Content: <p>the given answer is correct nontheless as someone above said you can use <code>df.query('line_race != 0')</code> which depending on your problem is much faster. highly recommend.</p>...
Metadata: {'tags': 'python|pandas', 'code': False, 'question': False, 'score': 256}

Document 3:
ID: 982cf52e-d6ee-4204-8c32-ed0525a41cb8
Content: deleting dataframe row in pand

Check the retrieval process

In [9]:
# Search for similar questions
query_text = "how to parse json in python"
query_embedding = model.encode(query_text.lower()).tolist()

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3
)

print("\nTop 3 similar questions:")
for i, (doc, meta) in enumerate(zip(results['documents'][0], results['metadatas'][0])):
    print(f"\nResult {i+1}:")
    print(f"Score: {1 - results['distances'][0][i]:.2f}")
    print(f"Content: {doc[:200]}...")
    print(f"Tags: {meta['tags']}")


Top 3 similar questions:

Result 1:
Score: 0.58
Content: import json
import pandas as pd
from pandas.io.json import json_normalize
import csv

source_file = '11april1.txt'
result_file = 'output.csv'


with open(source_file) as source:
    with open(result_f...
Tags: python|json|pandas

Result 2:
Score: 0.58
Content: import json
import pandas as pd
from pandas.io.json import json_normalize
import csv

source_file = '11april1.txt'
result_file = 'output.csv'


with open(source_file) as source:
    with open(result_f...
Tags: python|json|pandas

Result 3:
Score: 0.56
Content: import json

foo = {none: 7, 'bar': 8}
# {'bar': 8, none: 7}

foo_json = json.dumps(foo)
# '{"bar": 8, "null": 7}'

foo_prime = json.loads(foo_json)
# {'null': 7, 'bar': 8}

foo_sorted = json.dumps(fo...
Tags: python|json


### Adding a reranker

Compare the results received without reranker (above cell code) with the results gotten with a reranker (below).

First implementation of reranker is using two different cross encoder models MiniLM-L-6-v2 and bge-reranker-base

In [10]:
from sentence_transformers import CrossEncoder

# Search for similar questions and rerank them, use this example for testing
query_text = "how to parse json in python"
query_embedding = model.encode(query_text.lower()).tolist()

# use more (eg. 10-20) results for reranking
initial_results = collection.query(query_embeddings=[query_embedding], n_results=20) 

docs = initial_results['documents'][0]
distances = initial_results['distances'][0]

# prepare pairs for CrossEncoder
pairs = [(query_text, doc) for doc in docs]

# CrossEncoder MiniLM-L-6-v2 
reranker1 = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
scores1 = reranker1.predict(pairs)

# CrossEncoder bge-reranker-base
reranker2 = CrossEncoder('BAAI/bge-reranker-base')
scores2 = reranker2.predict(pairs)

# Sort documents based on reranked scores
def get_highest_scores(scores):
    list_of_documents_scores = list(zip(docs, scores))
    return sorted(list_of_documents_scores, key=lambda x: x[1], reverse=True)

scores_ranked_1 = get_highest_scores(scores1)
scores_ranked_2 = get_highest_scores(scores2)

# Check reranked results to our query
number_of_results = 1
print(f"Top {number_of_results} reranked results for MiniLM-L-6-v2:")
for i, (doc, score) in enumerate(scores_ranked_1[:number_of_results]):
    print('-' * 60)
    print(f"Result {i+1}:")
    print(f"Score: {score:.2f}")
    print(f"Content: {doc[:200]}...")
    print(f"Tags: {meta['tags']}")
    print(f"Answer: {meta.get('answer', 'N/A')}")
    print(f"Code Block: {'Yes' if meta.get('code', False) else 'No'}")

print(f"Top {number_of_results} reranked results for bge-reranker-base:")
for i, (doc, score) in enumerate(scores_ranked_2[:number_of_results]):
    print('-' * 60)
    print(f"Result {i+1}:")
    print(f"Score: {score:.2f}")
    print(f"Content: {doc[:200]}...")
    print(f"Tags: {meta['tags']}")
    print(f"Answer: {meta.get('answer', 'N/A')}")
    print(f"Code Block: {'Yes' if meta.get('code', False) else 'No'}")

Top 1 reranked results for MiniLM-L-6-v2:
------------------------------------------------------------
Result 1:
Score: 3.76
Content: json encoding issue in python
<p>i am attempting a custom encode, but get an error. the following code sample generates an error:</p>

<pre><code>#!/usr/bin/python3

import json

class contact:
  def ...
Tags: python|json
Answer: N/A
Code Block: Yes
Top 1 reranked results for bge-reranker-base:
------------------------------------------------------------
Result 1:
Score: 0.70
Content: <p>using <a href="http://docs.python-requests.org/en/master/" rel="nofollow noreferrer">python-requests</a></p>

<p>this code merges all the <code>json</code> received from the urls into one <code>fin...
Tags: python|json
Answer: N/A
Code Block: Yes


Implement another reranker: the ColBERT (Contextualized Late Interaction over BERT) using HuggingFace (not a full FAISS retrieval).

In [11]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("colbert-ir/colbertv2.0")
colbert_model = AutoModel.from_pretrained("colbert-ir/colbertv2.0").to(device)
colbert_model.eval()

@torch.no_grad()
def get_colbert_encoding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    outputs = colbert_model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :]  #use the first token's embedding
    return embeddings.cpu()

def colbert_score(query, docs):
    query_embedding = get_colbert_encoding(query)
    doc_embeddings = torch.cat([get_colbert_encoding(doc) for doc in docs], dim=0) 

    query_embedding = query_embedding.expand(doc_embeddings.size(0), -1)

    similarity_scores = F.cosine_similarity(query_embedding, doc_embeddings, dim=1)

    return similarity_scores.tolist()

# Compare ColBERT scores with cross-encoder scores on this query text
query_text = "how to parse json in python"

# Already done in previous two cells:
'''
initial_results = collection.query(query_embeddings=[model.encode(query_text.lower()).tolist()], n_results=20)
docs = initial_results['documents'][0]
distances = initial_results['distances'][0]
'''

colbert_scores = colbert_score(query_text, docs)
colbert_ranked = sorted(zip(docs, colbert_scores), key=lambda x: x[1], reverse=True)

number_of_results = 2
print('ColBert Ranked Results:')
for i, (doc, score) in enumerate(colbert_ranked[:number_of_results]):
    print('-' * 60)
    print(f"Result {i+1}:")
    print(f"Score: {score:.2f}")
    print(f"Content: {doc[:200]}...")
    print(f"Code Block: {'Yes' if meta.get('code', False) else 'No'}")

ColBert Ranked Results:
------------------------------------------------------------
Result 1:
Score: 0.73
Content: import json

foo = {none: 7, 'bar': 8}
# {'bar': 8, none: 7}

foo_json = json.dumps(foo)
# '{"bar": 8, "null": 7}'

foo_prime = json.loads(foo_json)
# {'null': 7, 'bar': 8}

foo_sorted = json.dumps(fo...
Code Block: Yes
------------------------------------------------------------
Result 2:
Score: 0.73
Content: import json

foo = {none: 7, 'bar': 8}
# {'bar': 8, none: 7}

foo_json = json.dumps(foo)
# '{"bar": 8, "null": 7}'

foo_prime = json.loads(foo_json)
# {'null': 7, 'bar': 8}

foo_sorted = json.dumps(fo...
Code Block: Yes


In [12]:
!pip install torch transformers --index-url https://download.pytorch.org/whl/cpu
! pip install accelerate

Looking in indexes: https://download.pytorch.org/whl/cpu


In [13]:
!pip install transformers accelerate



Test the whole RAG pipeline on a small LLM that runs on CPU. Compare results to plain LLM, to see if retrieval even helps.

In [None]:
# for testing only
from transformers import AutoTokenizer, AutoModelForCausalLM

class RAG:
    def __init__(self, embedder, collection, reranker, retrieve_number=3, num_reranked_docs=2, gpu_based=False, history_length=3):
        model_id = "stabilityai/stablelm-2-zephyr-1_6b" if gpu_based else "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
        self.device = "cuda" if self.gpu_based else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.llm = AutoModelForCausalLM.from_pretrained(model_id, device_map=self.device)
        self.embedder = embedder
        self.retriever = collection
        self.reranker = reranker
        self.retrieve_number = retrieve_number
        self.num_reranked_docs = num_reranked_docs
        self.gpu_based = gpu_based
        self.history = []
        self.history_length = history_length

    def generate(self, query):
        query_embedding = self.embedder.encode(query.lower()).tolist()
        results = self.retriever.query(query_embeddings=[query_embedding], n_results=self.retrieve_number)
        # reranking
        docs = results['documents'][0]
        metadatas = results['metadatas'][0]
        pairs = [(query, doc) for doc in docs]
        scores = self.reranker.predict(pairs)
        scores_ranked = sorted(zip(docs, metadatas, scores), key=lambda x: x[2], reverse=True)
        top_docs_metas = scores_ranked[:self.num_reranked_docs]
        # repackage top_docs_metas
        reranked_results = {
        "documents": [[doc for doc, meta, _ in top_docs_metas]],
        "metadatas": [[meta for doc, meta, _ in top_docs_metas]]
        }
        prompt = self.build_prompt(query, reranked_results)
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        outputs = self.llm.generate(**inputs, max_new_tokens=200)
        output = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        output = output.split("Correct answer:")[1].strip()
        self.history.append((query, output))
        self.history = self.history[-self.history_length:]
        return output

    def context_from_results(self, results):
        contexts = []
        for document, metadata in zip(results["documents"], results["metadatas"]):
            metadata = metadata[0]
            document = document[0]
            if metadata["question"]:
                contexts.append(metadata["answer"])
            else:
                contexts.append(document)
        return contexts

    def build_prompt(self, query, results):
        contexts = self.context_from_results(results)
        history_str = ""
        if self.history:
            history_str = "\n".join([f"User: {q}\nAssistant: {a}" for q, a in self.history]) + "\n"

        return f'''
            You are a helpful coding assistant specializing in Python and software engineering.
            Always include a relevant code example if appropriate.
            If you do not know the answer, say "I don't know." Do not fabricate information.
            If needed, ask the user for clarification.
            Previous conversation:
            {history_str}
            Use the following context snippets (if relevant) to answer the user's current question.
            Context snippets:
            \"\"\"\n{contexts}\"\"\"

            Current question: {query}

            Correct answer:'''.strip()

In [15]:
class BasicLLM:
    def __init__(self, gpu_based=False):
        model_id = "stabilityai/stablelm-2-zephyr-1_6b" if gpu_based else "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
        self.device = "cuda" if self.gpu_based else "cpu"
        self.llm = AutoModelForCausalLM.from_pretrained(model_id, device_map=self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.gpu_based = gpu_based

    def generate(self, query):
        inputs = self.tokenizer(query, return_tensors="pt").to(self.device)
        outputs = self.llm.generate(**inputs, max_new_tokens=200)
        output = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return output.split("?")[1]


In [16]:
# tuki nej bo vprašaj na koncu (nej bo samo en vprašaj)
query = "How to parse json in python?"

code_llm = RAG(model, collection)
answer = code_llm.generate(query)
print(f"RAG answer:\n{answer}")

print("\n")
basicLLM = BasicLLM()
basic_answer = basicLLM.generate(query)
print(f"Basic answer:\n{basic_answer}")

AttributeError: 'RAG' object has no attribute 'gpu_based'

Evaluation:
- prepare questions
- get answers
- evaluate

In [18]:
simple_questions = ["How does Python's if __name__ == '__main__': work?",
"Explain the use of *args and **kwargs in function definitions.",
"What is the difference between .loc[] and .iloc[] in Pandas?",
"How do you handle missing values in a DataFrame?",
"How do you write unit tests in Python using unittest?",
"What is the purpose of __init__.py in a Python package?",
"How does inheritance work in Python classes?",
"What is the purpose of __str__ and __repr__ methods?",
"What is a virtual environment and how do you use it?",
"What is NumPy and how is it used?",
"How can I set a fixed sized font for text in my figure using matplotlib?",
"How do you train a neural network using pyTorch?",
"How do you merge two dataframes in Pandas?",
"How do you save a sci-kit learn model?"]

simple_answers = [
    "The statement `if __name__ == '__main__':` checks whether the script is being run directly (not imported as a module). If so, the code under this block will execute.",
    "`*args` allows a function to accept any number of positional arguments, and `**kwargs` allows it to accept any number of keyword arguments.",
    "In Pandas, `.loc[]` is label-based indexing (using row and column names), while `.iloc[]` is integer position-based indexing.",
    "Missing values in a DataFrame can be handled using methods like `df.fillna()` to fill them or `df.dropna()` to remove rows or columns with missing data.",
    "To write unit tests with `unittest`, define a class that inherits from `unittest.TestCase` and write methods starting with `test_`. Then run tests using `unittest.main()`.",
    "`__init__.py` marks a directory as a Python package and can also be used to execute package initialization code.",
    "Inheritance allows a class (child) to inherit methods and properties from another class (parent) using `class Child(Parent):` syntax.",
    "`__str__` defines the human-readable string representation of an object, while `__repr__` defines the unambiguous representation used for debugging.",
    "A virtual environment is an isolated Python environment. You create it using `python -m venv env`, activate it, and install dependencies inside it.",
    "NumPy is a library for numerical computing in Python, offering fast operations on arrays and matrices, and tools for linear algebra, statistics, and more.",
    "In matplotlib, you can set a fixed-size font using: `plt.rcParams['font.size'] = 12` or by specifying `fontsize=12` in individual plot elements.",
    "In PyTorch, you define a model class, a loss function, and an optimizer, then train in a loop by doing forward pass, computing loss, backpropagation (`loss.backward()`), and `optimizer.step()`.",
    "Use `pd.merge(df1, df2, on='key')` to merge two dataframes on a common column, similar to SQL joins.",
    "You can save a scikit-learn model using `import joblib` and `joblib.dump(model, 'model.pkl')`. Load it later with `joblib.load('model.pkl')`."
]


semi_adv_instructions = ["Write code for removing all duplicate elements of a list.",
"Reverse a list in Python without using built-in methods?",
"Write a function to check if a string is a palindrome.",
"Check for duplicates in a specific column in pandas Dataframe",
"Make an HTTP GET request in Python?",
"Serialize a Python object using pickle",
"Train a linear regression model with Scikit-Learn",
"Find the maximum value in a python list",
"Write a Python code to check if a number is even or odd.",
"Write a Python program to find the intersection of two sets.",
"Implement a simple calculator in Python",
"Write a Python code to convert a list of temperatures from Celsius to Fahrenheit.",
"Save and load a NumPy array to and from a file"]

semi_adv_answers = [
    "You can remove duplicates using: `unique_list = list(set(original_list))`.",
    "You can reverse a list manually like this:\n```python\nreversed_list = [original_list[i] for i in range(len(original_list)-1, -1, -1)]\n```",
    "Here's a palindrome checker:\n```python\ndef is_palindrome(s):\n    return s == s[::-1]\n```",
    "Check duplicates in a column using:\n```python\nduplicates = df[df['column_name'].duplicated()]\n```",
    "Use the `requests` library:\n```python\nimport requests\nresponse = requests.get('https://example.com')\nprint(response.text)\n```",
    "Pickle an object with:\n```python\nimport pickle\nwith open('obj.pkl', 'wb') as f:\n    pickle.dump(obj, f)\n```",
    "Train linear regression:\n```python\nfrom sklearn.linear_model import LinearRegression\nmodel = LinearRegression()\nmodel.fit(X_train, y_train)\n```",
    "Find the max value with:\n```python\nmaximum = max(my_list)\n```",
    "Even/odd check:\n```python\ndef check_even_odd(n):\n    return 'Even' if n % 2 == 0 else 'Odd'\n```",
    "Find intersection:\n```python\nintersection = set1 & set2\n```",
    "Simple calculator:\n```python\ndef calc(a, b, op):\n    if op == '+': return a + b\n    elif op == '-': return a - b\n    elif op == '*': return a * b\n    elif op == '/': return a / b\n```",
    "Convert Celsius to Fahrenheit:\n```python\nfahrenheit = [c * 9/5 + 32 for c in celsius_list]\n```",    
    "Save/load NumPy array:\n```python\nimport numpy as np\nnp.save('array.npy', arr)\nloaded = np.load('array.npy')\n```"
]

advanced_instructions = ["Plan and provide code for a weather app. Provide an implemetation plan, design, code, instructions for hosting and anything else that might be needed.",
"Write complete code for RAG pipeline. Include unit tests for each step.",
"Can you explain backpropagation and provide a concrete teaching example?",
"How does the Cholesky decomposition work? Provide python code.",
"How would you explain RSA to a child?",
"Create me a project for my subject: Algorithms and data structures. Make it worth 3ECTS.",
"Using python, how can I fill my VIRT memory on a Linux machine? What about SHR?"]

advanced_answers = [
    "To build a weather app:\n1. **Plan**: Fetch real-time weather data using an API like OpenWeatherMap.\n2. **Design**: UI with city input, forecast display, and icons for weather types.\n3. **Code**: Use Python (Flask or FastAPI) for backend, and HTML/CSS/JavaScript or Streamlit for frontend.\n4. **Instructions**: Host on Heroku or Render. Set up API keys securely. Use requests for API calls.\n5. **Extra**: Add caching, error handling, unit tests, and optional location auto-detection.",
    "A basic RAG pipeline includes: document chunking, embedding generation (e.g., with SentenceTransformers), vector store retrieval (e.g., FAISS), and LLM integration.\nUnit tests should cover:\n- Chunking edge cases\n- Embedding shape and type checks\n- Retrieval relevance accuracy\n- Response generation fidelity using mocked LLM output.",
    "Backpropagation is how neural networks learn: it computes gradients of the loss with respect to weights by applying the chain rule layer-by-layer backward.\nExample: For a 2-layer MLP, manually compute derivatives of the loss (e.g., MSE) with respect to weights and biases using a small input/output example.",
    "Cholesky decomposition factorizes a symmetric positive-definite matrix `A` into `L * L.T`, where `L` is lower triangular.\nExample in Python:\n```python\nimport numpy as np\nA = np.array([[4, 2], [2, 3]])\nL = np.linalg.cholesky(A)\n```",
    "RSA explained for a child:\nImagine a locked mailbox. Anyone can drop a letter in (encrypt with your public key), but only you have the key to open it (your private key).",
    "A 3 ECTS project idea:\n**Title**: 'Visual Algorithm Simulator'\n**Scope**: Implement and visualize sorting/searching algorithms, graph traversal, and data structures using Python + Tkinter or web (e.g., React + Flask).\n**Deliverables**: Code, report, performance benchmarks, usage instructions.\n**Learning Outcomes**: Algorithm analysis, data structure design, visualization, testing.",    
    "In Python on Linux:\n- **To fill VIRT memory** (virtual address space): allocate large unused arrays, e.g., `big = bytearray(10**9)`.\n- **To fill SHR (shared memory)**: spawn multiprocessing workers or load shared libraries heavily reused across processes."
]

debugging_questions = [
"""fix my code:
import asyncio
import aiohttp

async def fetch_data():
    response = aiohttp.ClientSession().get('https://example.com')
    data = await response.text()
    print(data)

asyncio.run(fetch_data())""",
"""
fix my code:
df = pd.DataFrame({'A': [1, 2, 3]})
df[df['A'] > 1]['A'] = 0
print(df)""",
"""fix my code:
def append_item(item, lst=[]):
    lst.append(item)
    return lst

print(append_item(1))
print(append_item(2))
""",
"""fix my code:
def outer():
    x = 10
    def inner():
        print(x)
        x += 1
    inner()

outer()
""",
"""
fix my error: RuntimeError: This event loop is already running
for code:
import asyncio

async def say_hello():
    await asyncio.sleep(1)
    return "Hello"

def main():
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(say_hello())
    print(result)

main()
""",
"""
fix my error: AttributeError: 'Person' object has no attribute 'gender'
for code:
class Person:
    __slots__ = ['name', 'age']

    def __init__(self, name, age):
        self.name = name
        self.age = age

p = Person("Alice", 30)
p.gender = "female"
"""]

debugging_answers = [
    # 1. Async/Aiohttp fix
    """import asyncio
import aiohttp

async def fetch_data():
    async with aiohttp.ClientSession() as session:
        async with session.get('https://example.com') as response:
            data = await response.text()
            print(data)

asyncio.run(fetch_data())""",

    # 2. Pandas chained assignment fix
    """import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3]})
df.loc[df['A'] > 1, 'A'] = 0
print(df)""",

    # 3. Mutable default argument fix
    """def append_item(item, lst=None):
    if lst is None:
        lst = []
    lst.append(item)
    return lst

print(append_item(1))
print(append_item(2))""",

    # 4. UnboundLocalError in closure fix
    """def outer():
    x = 10
    def inner():
        nonlocal x
        print(x)
        x += 1
    inner()

outer()""",

    # 5. Event loop already running fix (for notebook environments)
    """import asyncio

async def say_hello():
    await asyncio.sleep(1)
    return "Hello"

# For environments like Jupyter:
import nest_asyncio
nest_asyncio.apply()

async def main():
    result = await say_hello()
    print(result)

asyncio.run(main())""",

    # 6. __slots__ attribute error fix
    """class Person:
    __slots__ = ['name', 'age', 'gender']

    def __init__(self, name, age):
        self.name = name
        self.age = age

p = Person("Alice", 30)
p.gender = "female"
"""
]


In [None]:
# RAG
simple_responses = [code_llm.generate(query) for query in simple_questions]
semi_adv_responses = [code_llm.generate(query) for query in semi_adv_instructions]
advanced_responses = [code_llm.generate(query) for query in advanced_instructions]

# Basic LLM
simple_responses_basic = [basicLLM.generate(query) for query in simple_questions]
semi_adv_responses_basic = [basicLLM.generate(query) for query in semi_adv_instructions]
advanced_responses_basic = [basicLLM.generate(query) for query in advanced_instructions]


results_dict = {
    "RAG": {
        "simple": [
            {"question": q, "ground_truth": gt, "generated_answer": ans}
            for q, gt, ans in zip(simple_questions, simple_answers, simple_responses)
        ],
        "semi_advanced": [
            {"question": q, "ground_truth": gt, "generated_answer": ans}
            for q, gt, ans in zip(semi_adv_instructions, semi_adv_answers, semi_adv_responses)
        ],
        "advanced": [
            {"question": q, "ground_truth": gt, "generated_answer": ans}
            for q, gt, ans in zip(advanced_instructions, advanced_answers, advanced_responses)
        ]
    },
    "BasicLLM": {
        "simple": [
            {"question": q, "ground_truth": gt, "generated_answer": ans}
            for q, gt, ans in zip(simple_questions, simple_answers, simple_responses_basic)
        ],
        "semi_advanced": [
            {"question": q, "ground_truth": gt, "generated_answer": ans}
            for q, gt, ans in zip(semi_adv_instructions, semi_adv_answers, semi_adv_responses_basic)
        ],
        "advanced": [
            {"question": q, "ground_truth": gt, "generated_answer": ans}
            for q, gt, ans in zip(advanced_instructions, advanced_answers, advanced_responses_basic)
        ]
    }
}

SUFFIX = f"max-{MAX_DOCS}_ndocs-{retrieve_number}_ntop-{num_reranked_docs}"
# Save to json
with open(f"outputs_rag_basic_{SUFFIX}.json", "w", encoding="utf-8") as file:
    json.dump(results_dict, file, indent=4, ensure_ascii=False)


"""
reference_answers = {
    model_type: {
        category: [item["ground_truth"] for item in items]
        for category, items in results.items()
    }
    for model_type, results in results_dict.items()
}

generated_answers = {
    model_type: {
        category: [item["generated_answer"] for item in items]
        for category, items in results.items()
    }
    for model_type, results in results_dict.items()
}
"""


def rouge(predictions, references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    for pred, ref in zip(predictions, references):
        scores = scorer.score(ref, pred)
        yield scores['rouge1'].fmeasure, scores['rougeL'].fmeasure

def bert(predictions, references):
    P, R, F1 = bert_score(predictions, references, lang='en', rescale_with_baseline=True)
    return P.mean().item(), R.mean().item(), F1.mean().item()

def evaluation(predictions, references):
    rouge_scores = list(rouge(predictions, references))
    # mean rouge results
    rouge1_f1 = np.mean([score[0] for score in rouge_scores])
    rougeL_f1 = np.mean([score[1] for score in rouge_scores])
    
    bert_scores = bert(predictions, references)
    bert_precision, bert_recall, bert_f1 = bert_scores
    
    # evaluate on all questions
    
    return {
        "rouge1": rouge1_f1,
        "rougeL": rougeL_f1,
        "bert_precision": bert_precision,
        "bert_recall": bert_recall,
        "bert_f1": bert_f1
    }

evaluation_results = {}
for model_type, categories in results_dict.items():
    model_evaluation = {}
    for category, items in categories.items():
        references = [item["ground_truth"] for item in items]
        predictions = [preprocess(item["generated_answer"]) for item in items]
        model_evaluation[category] = evaluation(predictions, references)
    evaluation_results[model_type] = model_evaluation

# save evaluation results
with open(f"evaluation_results_rag_basic_{SUFFIX}.json", "w", encoding="utf-8") as file:
    json.dump(evaluation_results, file, indent=4, ensure_ascii=False)

total_evaluation_results = {}
for model_type, categories in evaluation_results.items():
    for category, items in categories.items():
        if category not in total_evaluation_results:
            total_evaluation_results[category] = {
                "rouge1": 0.0,
                "rougeL": 0.0,
                "bert_precision": 0.0,
                "bert_recall": 0.0,
                "bert_f1": 0.0
            }
        for key, value in items.items():
            total_evaluation_results[category][key] += value / len(evaluation_results)

# save evaluation results
with open(f"total_evaluation_results_{SUFFIX}.json", "w", encoding="utf-8") as file:
    json.dump(total_evaluation_results, file, indent=4, ensure_ascii=False)


In [19]:
simple_responses = [code_llm.generate(query) for query in simple_questions]
for ans, gt in zip(simple_responses, simple_answers):
    print("RAG output:")
    print(ans)
    print("---------------------")
    print("ChatGPT (GPT-4 turbo) output:")
    print(gt)
    print("----------------------------")
    print("----------------------------")
    print("----------------------------")
semi_adv_responses = [code_llm.generate(query) for query in semi_adv_instructions]
for ans, gt in zip(semi_adv_responses, semi_adv_answers):
    print("RAG output:")
    print(ans)
    print("---------------------")
    print("ChatGPT (GPT-4 turbo) output:")
    print(gt)
    print("----------------------------")
    print("----------------------------")
    print("----------------------------")
advanced_responses = [code_llm.generate(query) for query in advanced_instructions]
for ans, gt in zip(advanced_responses, advanced_answers):
    print("RAG output:")
    print(ans)
    print("---------------------")
    print("ChatGPT (GPT-4 turbo) output:")
    print(gt)
    print("----------------------------")
    print("----------------------------")
    print("----------------------------")

NameError: name 'code_llm' is not defined