In [1]:
%%capture
# Upgrade transformers and accelerate inside the kernel's environment (-U);
# the %%capture cell magic above keeps pip's output out of the notebook.
# NOTE(review): versions are not pinned, so a re-run may pull different releases.
%pip install -U transformers accelerate

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextStreamer
import torch


# Local Kaggle copy of the Llama 3.2 3B Instruct checkpoint.
base_model = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"

tokenizer = AutoTokenizer.from_pretrained(base_model)

# The Llama architecture is implemented inside transformers itself, so
# trust_remote_code is not required. Leaving it off means no Python file that
# happens to sit next to the checkpoint can be executed at load time.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,  # half precision weights
    device_map="auto",          # let accelerate place the weights
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Give the tokenizer a pad token when the checkpoint does not define one,
# reusing its end-of-sequence token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
# Mirror the tokenizer's pad id on the model config instead of copying
# model.config.eos_token_id: on Llama 3.x instruct checkpoints that field may
# be a list of several stop ids, while a pad id must be a single integer and
# should agree with the one the tokenizer pads with.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

In [4]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
    device_map="auto",
    torch_dtype=torch.float16,
)

In [5]:
# Smoke test: a single user turn, rendered with the model's chat template.
messages = [
    {"role": "user", "content": "Who is Vincent van Gogh?"},
]

# Keep the rendered prompt as text and append the assistant header so the
# model answers instead of continuing the user turn.
prompt = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

# Sampled generation, capped at 120 new tokens.
outputs = pipe(prompt, do_sample=True, max_new_tokens=120)

print(outputs[0]["generated_text"])

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Who is Vincent van Gogh?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Vincent van Gogh (1853-1890) was a Dutch post-impressionist painter, widely considered one of the greatest painters in history. He is famous for his bold and expressive works, which often explored themes of nature, emotion, and the human condition.

**Early Life and Career**

Van Gogh was born in Groot-Zundert, Netherlands, to a Protestant pastor's family. He was the eldest of six children, and his early life was marked by a sense of isolation and loneliness. He struggled with mental health issues, including depression and anxiety, throughout his life.




In [6]:
import pandas as pd
import os

# Workbook that holds the buggy snippets (replace with your own file path)
file_path = '/kaggle/input/sample6/single_line_bugs.xlsx'
data = pd.read_excel(file_path)

# Keep only the populated cells of the 'Buggy Code' column
buggy_code_column = data.loc[:, "Buggy Code"].dropna()

def extract_snippets_with_start_end(code_column, start_delim="SNIPPET_START", end_delim="SNIPPET_END"):
    """Collect the code snippets enclosed by start/end marker rows.

    Parameters
    ----------
    code_column : iterable
        Rows of the code column, one source line per item. Items that are not
        strings are converted with ``str`` before being inspected.
    start_delim : str
        Text whose presence in a row opens a new snippet.
    end_delim : str
        Text whose presence in a row closes the snippet being built.

    Returns
    -------
    list of str
        One newline-joined string per non-empty, properly closed snippet, in
        the order encountered. Rows outside a start/end pair are ignored, and
        a snippet that is opened but never closed is dropped.
    """
    all_snippets = []
    current_snippet = []
    inside_snippet = False  # True only between a start row and its end row
    for line in code_column:
        text = line if isinstance(line, str) else str(line)
        if start_delim in text:
            current_snippet = []  # Start a new snippet
            inside_snippet = True
        elif end_delim in text:
            # Save only snippets that were actually opened; a stray end row
            # must not re-save the previous snippet or any loose rows.
            if inside_snippet and current_snippet:
                all_snippets.append("\n".join(current_snippet))
            current_snippet = []
            inside_snippet = False
        elif inside_snippet:
            # Trim trailing whitespace only, so leading indentation survives.
            current_snippet.append(text.rstrip())
    return all_snippets

# Split the column into individual snippets using the default START/END markers
extracted_snippets = extract_snippets_with_start_end(code_column=buggy_code_column)

def query_model_for_snippet(snippet):
    """Ask the model to point out the buggy line of one code snippet.

    The snippet is embedded in a system + user conversation, rendered to text
    with the tokenizer's chat template, and passed to the module-level
    ``pipe``. Returns the ``generated_text`` field of the first pipeline
    result as a string.
    """
    system_turn = {"role": "system", "content": "You are an Analyst who always extracts one buggy line in the provided Python code."}
    user_turn = {"role": "user", "content": f"Please analyze the following code and locate the buggy line. Only output the buggy line and nothing else under the text 'Buggy Line:':\n\n```python\n{snippet}\n```"}

    # Render the conversation as plain text, ending with the assistant header.
    chat_prompt = tokenizer.apply_chat_template(
        [system_turn, user_turn],
        add_generation_prompt=True,
        tokenize=False,
    )

    # Sampled decoding, capped at 120 new tokens.
    generations = pipe(chat_prompt, do_sample=True, max_new_tokens=120)
    first_generation = generations[0]
    return first_generation["generated_text"]
   
def _extract_buggy_line(response, marker="Buggy Line:", fence="```"):
    """Pull the answer that follows the ``marker`` line out of a model response.

    The response echoes the prompt before the completion, so only the text
    after the last ``<|end_header_id|>`` tag is scanned (the whole response
    when the tag is absent). Capture starts after the first line beginning
    with ``marker`` and stops at the closing code fence, or, for an unfenced
    answer, at the first blank line after the answer. An answer written on
    the marker line itself is returned directly.

    Returns the captured line(s) joined with newlines, or an empty string
    when no answer is found.
    """
    completion = response.rsplit("<|end_header_id|>", 1)[-1]
    captured = []
    capturing = False
    fence_open = False
    for raw_line in completion.split("\n"):
        text = raw_line.strip()
        if not capturing:
            if text.startswith(marker):
                capturing = True
                rest = text[len(marker):].strip()
                if rest.startswith(fence):
                    fence_open = True  # fence opened on the marker line
                elif rest:
                    # Inline answer, possibly wrapped in single backticks.
                    captured.append(rest.strip("`").strip())
                    break
            continue
        if not text:
            # Blank lines are padding; after an unfenced answer they end it.
            if captured and not fence_open:
                break
            continue
        if text.startswith(fence):
            if fence_open or captured:
                break  # closing fence: the answer is complete
            fence_open = True  # opening fence such as ```python
            continue
        if text.endswith(fence):
            # Closing fence glued to the end of the answer line.
            captured.append(text.replace(fence, "").strip())
            break
        captured.append(text)
    return "\n".join(captured)


results = []
for i, snippet in enumerate(extracted_snippets, start=1):
    print(f"Processing Snippet {i}...")

    # Query the model for the response
    response = query_model_for_snippet(snippet)

    # Debugging: Print the raw model response
    print(f"Raw Model Response for Snippet {i}:\n{response}\n{'-' * 50}")

    # Extract the buggy line from the response
    filtered_response = _extract_buggy_line(response)

    # Append results only if a buggy line was found
    if filtered_response:
        results.append({"Snippet": snippet, "Buggy Line": filtered_response})
        print(f"Filtered Buggy Line(s) for Snippet {i}:\n{filtered_response}\n{'-' * 50}")
    else:
        print(f"No buggy line found for Snippet {i}\n{'-' * 50}")

# Save the results to an Excel file. The columns are named explicitly so the
# sheet still has its header row when no buggy line was extracted at all.
results_df = pd.DataFrame(results, columns=["Snippet", "Buggy Line"])
output_path = "/kaggle/working/Buggy_Code_Model_Responses.xlsx"
results_df.to_excel(output_path, index=False, sheet_name="Buggy Lines")

print(f"Filtered buggy lines saved to: {output_path}")

Processing Snippet 1...
Raw Model Response for Snippet 1:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an Analyst who always extracts one buggy line in the provided Python code.<|eot_id|><|start_header_id|>user<|end_header_id|>

Please analyze the following code and locate the buggy line. Only output the buggy line and nothing else under the text 'Buggy Line:':

```python
def bitcount(n):
    count = 0
    while n > 0:
        n &= n - 1
        count += 1
    return count
```<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Buggy Line:
```python
n &= n - 1
```
--------------------------------------------------
Filtered Buggy Line(s) for Snippet 1:
n &= n - 1
--------------------------------------------------
Processing Snippet 2...
Raw Model Response for Snippet 2:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an Analyst who always extracts one buggy line in the provided Python code.<|eot_id|><|start_header_id|>user<|end_header_id|

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Raw Model Response for Snippet 9:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an Analyst who always extracts one buggy line in the provided Python code.<|eot_id|><|start_header_id|>user<|end_header_id|>

Please analyze the following code and locate the buggy line. Only output the buggy line and nothing else under the text 'Buggy Line:':

```python
from collections import deque as Queue
def breadth_first_search(startnode, goalnode):
    queue = None
    queue.append(startnode)
    nodesseen = set()
    nodesseen.add(startnode)
    while queue:
        node = queue.popleft()
        if node is goalnode:
            return True
        else:
            queue.extend(node for node in node.successors if node not in nodesseen)
            nodesseen.update(node.successors)
    return False
```<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Buggy Line:
```
queue = None
```
--------------------------------------------------
Filtered Buggy Line(s) for Snippet 9:

In [7]:
import smtplib
from email.message import EmailMessage

import os
from getpass import getpass

# Email Configuration
sender_email = "siddharthyadav555@gmail.com"  # Replace with your email
# The password must never be stored in the notebook: it is read from the
# GMAIL_APP_PASSWORD environment variable, falling back to a hidden prompt.
# SECURITY: an app password was previously committed in this cell; revoke it
# in the Google account settings and generate a new one.
sender_password = os.environ.get("GMAIL_APP_PASSWORD") or getpass("Gmail app password: ")
recipient_email = "siddharthyadav555@gmail.com"  # Replace with the recipient's email
file_path = "/kaggle/working/Buggy_Code_Model_Responses.xlsx"  # Replace with your file path

# Create the Email
msg = EmailMessage()
msg["Subject"] = "Buggy Code Model Responses"
msg["From"] = sender_email
msg["To"] = recipient_email
msg.set_content("Attached is the file containing the Buggy Code Model Responses.")

# Attach the file and send. When the file is missing the email is skipped;
# exit() is avoided because it would shut down the notebook kernel.
try:
    with open(file_path, "rb") as f:
        file_data = f.read()
except FileNotFoundError:
    print(f"File not found: {file_path}")
else:
    file_name = os.path.basename(file_path)
    msg.add_attachment(
        file_data,
        maintype="application",
        subtype="vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        filename=file_name,
    )

    # Send the Email
    try:
        with smtplib.SMTP_SSL("smtp.gmail.com", 465) as smtp:  # Gmail's SMTP server
            smtp.login(sender_email, sender_password)
            smtp.send_message(msg)
        print(f"Email sent successfully to {recipient_email}")
    except Exception as e:
        # Best effort: report the failure without aborting the notebook run.
        print(f"Failed to send email: {e}")

Email sent successfully to siddharthyadav555@gmail.com


In [8]:
pip install emails

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting emails
  Downloading emails-0.6-py2.py3-none-any.whl.metadata (2.1 kB)
Collecting cssutils (from emails)
  Downloading cssutils-2.11.1-py3-none-any.whl.metadata (8.7 kB)
Collecting chardet (from emails)
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Collecting premailer (from emails)
  Downloading premailer-3.10.0-py2.py3-none-any.whl.metadata (15 kB)
Collecting cssselect (from premailer->emails)
  Downloading cssselect-1.2.0-py2.py3-none-any.whl.metadata (2.2 kB)
Downloading emails-0.6-py2.py3-none-any.whl (56 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.2/56.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cssutils-2.11.1-py3-none-any.whl (385 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m385.7/385.7 kB[0m [31m17.0 MB/s