In [None]:
# !pip install -q transformers accelerate bitsandbytes langchain langgraph
# !pip install -q datasets

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Fail fast when no CUDA device is visible; otherwise report the name of device 0.
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
else:
    raise SystemError("GPU not found. Go to Runtime > Change runtime type > GPU")

In [None]:
from google.colab import userdata
from huggingface_hub import login

# Log in to the Hugging Face Hub with the token read from Colab `userdata`.
login(token=userdata.get('HUGGINGFACE_HUB_TOKEN'))


In [None]:
model_id = "bigcode/starcoderbase-3b"

# Quantization settings handed to `from_pretrained`: 4-bit loading, the "nf4"
# (NormalFloat4) quant type, double quantization switched on, and bfloat16 as
# the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# The quantization config and `device_map="auto"` are both applied at load time.
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", quantization_config=bnb_config
)

print("Model loaded in 4-bit successfully!")


In [None]:
# Smoke test: sample a short continuation of a function header to confirm the
# quantized model generates text end to end.
# The encoded prompt is moved to the model's own device (not a hard-coded
# "cuda"), which is what the later generation cells in this notebook do.
inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    temperature=0.7,
    top_p=0.95,
    # Set the padding id explicitly, matching the later generation cells.
    pad_token_id=tokenizer.eos_token_id,
)

# Decode the first (only) returned sequence without special tokens.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:

!pip install -qU langchain-community langchain-core langchain-huggingface

In [None]:
# NOTE(review): this install line is identical to the one in the previous cell;
# the cell looks redundant and can probably be deleted.
!pip install -qU langchain-community langchain-core langchain-huggingface

In [None]:
#### TESTING REPL #####
# Smoke test: execute one trivial statement through PythonREPL and show what
# `run` hands back.

print("\n--- Starting the Simple REPL Test ---")

from langchain_experimental.utilities import PythonREPL

print("Creating PythonREPL instance...")
python_repl = PythonREPL()

code_to_run = "print(1+1)"
print("Executing code: '" + code_to_run + "'")

result = python_repl.run(code_to_run)

# Header, result and closing rule, one per line.
print("\n--- Output from PythonREPL ---", result, "------------------------------", sep="\n")

In [None]:

!pip install -qU langchain

from langchain.tools import Tool
from langchain_experimental.utilities import PythonREPL

python_repl_utility = PythonREPL()

repl_tool = Tool(
    name="python_repl",
    description="A Python REPL. Use this to execute python code. The input should be a valid python command. If you want to see the output of a value, you should print it out with `print(...)`.",
    func=python_repl_utility.run
)

print("✅ Successfully wrapped the Python REPL in a LangChain Tool.")
print(f"\nTool Name: {repl_tool.name}")
print(f"Tool Description: {repl_tool.description}")

print("\n--- Testing the new Tool object ---")
test_result = repl_tool.run("print(5 * 5)")
print(f"Result of 'repl_tool.run(\"print(5 * 5)\")':\n{test_result}")
print("---------------------------------")

In [None]:

print("--- Preparing a prompt to test StarCoder's tool-using ability ---")

question = "Calculate 25 raised to the power of 0.5 and tell me the result."

# Prompt = tool description + the required Action / Action Input format + the task.
prompt_template = f"""
You are a helpful assistant that can use tools. You have access to the following tool:

Tool Name: {repl_tool.name}
Tool Description: {repl_tool.description}

When you need to use a tool, you MUST respond in the following format:

Action: [the name of the tool, e.g., python_repl]
Action Input: [the input to the tool, e.g., print(2 + 2)]

Here is your task:
{question}
"""

print(
    "--- This is the full prompt the model will see ---",
    prompt_template,
    "-------------------------------------------------",
    sep="\n",
)

inputs = tokenizer(prompt_template, return_tensors="pt").to(model.device)

# Sampled generation of up to 100 new tokens.
outputs = model.generate(
    **inputs,
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=100,
    do_sample=True,
    top_p=0.95,
    temperature=0.2,
)

# Decode the first returned sequence and show it.
print("\n--- Raw output generated by StarCoder ---")

response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response_text, "-----------------------------------------", sep="\n")


# The check is a plain substring test: both fragments must occur in the decoded text.
print("\n--- Verification ---")
if all(fragment in response_text for fragment in ("Action: python_repl", "25 ** 0.5")):
    print("✅ SUCCESS! The model generated the correct Action and Action Input.")
    print("This confirms that we can successfully prompt StarCoder to use our tool.")
else:
    print("❌ FAILED. The model did not generate the expected output format.")
    print("You might need to adjust the prompt or the generation parameters (like temperature).")

In [None]:

!pip install -qU datasets langchain langchain-experimental

from datasets import load_dataset
from langchain.tools import Tool
from langchain_experimental.utilities import PythonREPL
import re

python_repl_utility = PythonREPL()
repl_tool = Tool(
    name="python_repl",
    description="A Python REPL. Use this to execute python code. The input should be a valid python command. If you want to see the output of a value, you should print it out with `print(...)`.",
    func=python_repl_utility.run
)

print("✅ Setup complete. REPL tool and datasets library are ready.")

print("\n--- Loading a problem from the HumanEval benchmark ---")

# Load the whole benchmark, then pick one record from its "test" split.
humaneval_dataset = load_dataset("openai_humaneval")

# Problem: "has_close_elements" - Check if any two elements in a list are closer than a threshold.
problem_index = 0
problem = humaneval_dataset["test"][problem_index]

# 'prompt' is the function signature + docstring given to the model;
# 'test' is the unit-test code used to verify a solution.
humaneval_prompt, humaneval_test = problem["prompt"], problem["test"]

print(f"Loaded HumanEval Problem #{problem_index}: {problem['entry_point']}")
print("\n--- Function Stub and Docstring (Given to Model) ---", humaneval_prompt, sep="\n")
print("\n--- Official Unit Test (Used for Verification) ---", humaneval_test, sep="\n")


print("\n--- Prompting the base StarCoder model to solve the problem ---")

prompt_template = f"""
You are an expert Python programmer. Your task is to complete the following function based on the docstring.
You must respond with a plan and the code to execute in the python_repl tool.

Use the following format:
Action: python_repl
Action Input: [your completed python code]

Here is the function to complete:
{humaneval_prompt}
"""

# Prepare the prompt for the model
inputs = tokenizer(prompt_template, return_tensors="pt").to(model.device)

# Generate the model's response.
# `temperature` only takes effect when sampling is enabled, so the previous
# `temperature=0.2` without `do_sample=True` was ignored. Greedy decoding is
# requested explicitly instead, which keeps the output deterministic.
outputs = model.generate(
    **inputs,
    max_new_tokens=256,  # Give it enough room for a full function
    do_sample=False,
    pad_token_id=tokenizer.eos_token_id
)

# For a causal LM the returned sequence starts with the prompt tokens. Drop them
# so `model_response` holds only newly generated text; otherwise the prompt's
# own "Action Input: [your completed python code]" example is what any later
# "Action Input:" search finds first.
prompt_token_count = inputs["input_ids"].shape[1]
model_response = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)
print("\n--- Model's Raw Generated Response ---")
print(model_response)

print("\n--- Evaluating the model's solution ---")

# The decoded response may begin with the echoed prompt, and the prompt's own
# format example contains "Action Input:". Strip that prefix (when present) so
# the regexes below can only match text the model actually generated.
if model_response.startswith(prompt_template):
    generated_text = model_response[len(prompt_template):]
else:
    generated_text = model_response

# Use regex to find and extract only the Python code from the model's response.
# Prefer a fenced ```python block after "Action Input:".
code_match = re.search(r"Action Input:\s*```python\n(.*?)\n```", generated_text, re.DOTALL)
if not code_match:
    # A more lenient search if the above fails: everything after the marker.
    code_match = re.search(r"Action Input:\s*(.*)", generated_text, re.DOTALL)

# Printed as the last statement of the test program. It can only show up in the
# REPL output when every earlier statement (including the asserts) ran cleanly.
PASS_MARKER = "__HUMANEVAL_ALL_TESTS_PASSED__"

if code_match:
    generated_code = code_match.group(1).strip()
    print("\n--- Extracted Code to be Tested ---")
    print(generated_code)

    # The HumanEval test code only *defines* `check(candidate)`; it has to be
    # called with the entry-point function or no assertion ever executes.
    full_test_code = "\n\n".join([
        generated_code,
        humaneval_test,
        f"check({problem['entry_point']})",
        f"print({PASS_MARKER!r})",
    ])

    print("\n--- Executing Full Test Code in REPL ---")
    print(full_test_code)
    print("----------------------------------------")

    # Run the combined code using our REPL tool.
    # The REPL utility reports a failing run through its return value rather
    # than by raising, so the verdict is based on the success marker instead of
    # on the absence of one particular error name.
    try:
        result = str(repl_tool.run(full_test_code))
        print(f"\nREPL Output: {result}")
        if PASS_MARKER in result:
            print("\n[VERDICT] ✅ PASS: The generated code ran successfully and passed the unit test.")
        elif "AssertionError" in result:
            print("\n[VERDICT] ❌ FAIL: The generated code failed the unit test (AssertionError).")
        else:
            print("\n[VERDICT] ❌ FAIL: The generated code produced an error during execution.")

    except Exception as e:
        # If the tool call itself raises.
        print(f"\nAn error occurred during execution: {e}")
        print("\n[VERDICT] ❌ FAIL: The generated code produced an error during execution.")

else:
    print("\n[VERDICT] ❌ FAIL: Could not parse the generated code from the model's response. The model did not follow the required format.")

In [None]:
##### ERROR-CAUSING CODE - IGNORE THIS CELL ############
# Attempt to solve one HumanEval problem through `create_python_agent`, then run
# the agent's answer plus the problem's unit-test code in a PythonREPLTool.
# The author marked this cell as failing; the notes below are review hints only.

from datasets import load_dataset
from langchain_experimental.tools.python.tool import PythonREPLTool
from langchain.llms import HuggingFacePipeline
from langchain_experimental.agents.agent_toolkits.python.base import create_python_agent
from transformers import TextGenerationPipeline
# NOTE(review): `re` is imported but not referenced anywhere in this cell.
import re
python_repl_tool = PythonREPLTool()

# Wrap the already-loaded `model` / `tokenizer` (from earlier cells) in a
# text-generation pipeline, then in a LangChain LLM object.
hf_pipeline = TextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",  # explicitly set task
    temperature=0.2,
    max_new_tokens=256
)

llm = HuggingFacePipeline(pipeline=hf_pipeline)

agent = create_python_agent(
    llm=llm,
    tool=python_repl_tool,
    verbose=True
)

humaneval_dataset = load_dataset("openai_humaneval")
problem_index = 0  # choose any index
problem = humaneval_dataset["test"][problem_index]

humaneval_prompt = problem["prompt"]
humaneval_test = problem["test"]
entry_point = problem["entry_point"]

print(f"Loaded HumanEval Problem #{problem_index}: {entry_point}")
print("\n--- Function Stub ---")
print(humaneval_prompt)
print("\n--- Unit Test ---")
print(humaneval_test)

prompt_template = f"""
You are an expert Python programmer.
Your task is to complete the following function.
ONLY output the Python code. Do NOT include explanations, thoughts, or 'Final Answer' text.
Do not include markdown or ```python fences.
Here is the function to complete:

{humaneval_prompt}
"""

# NOTE(review): `handle_parsing_errors` is passed to `run()` here rather than
# when the agent is created — confirm this is where the executor reads it; this
# may be related to the error the author saw.
model_response = agent.run(input=prompt_template, handle_parsing_errors=True)

print("\n--- Model Generated Code ---")
print(model_response)

# Drop every line that starts with "Final Answer" (after stripping whitespace).
lines = model_response.splitlines()
generated_code = "\n".join(line for line in lines if not line.strip().startswith("Final Answer"))

print("\n--- Cleaned Generated Code ---")
print(generated_code)
# ==============================================================================
# NOTE(review): `entry_point` is read above but never used when building the test
# program — presumably the test code needs to be invoked on it; TODO confirm.
full_test_code = generated_code + "\n\n" + humaneval_test

print("\n--- Executing Combined Code (Generated + Unit Test) ---")
print("----------------------------------------")

# Run in REPL and check results
# NOTE(review): only the substring "AssertionError" in the output marks a failure;
# every other returned text is reported as PASS — confirm this is intended.
try:
    result = python_repl_tool.run(full_test_code)
    print(f"\nREPL Output:\n{result}")
    if "AssertionError" in result:
        print("\n[VERDICT] ❌ FAIL: The generated code failed the unit test (AssertionError).")
    else:
        print("\n[VERDICT] ✅ PASS: The generated code passed the unit test.")
except Exception as e:
    print(f"\nAn error occurred during execution: {e}")
    print("\n[VERDICT] ❌ FAIL: The generated code produced an error.")


In [None]:
from datasets import load_dataset
from langchain_experimental.tools.python.tool import PythonREPLTool
# Same import path as the agent cell at the end of the notebook, which uses the
# `langchain_huggingface` package.
from langchain_huggingface import HuggingFacePipeline
from transformers import TextGenerationPipeline
import re


python_repl_tool = PythonREPLTool()

# Wrap the already-loaded `model` / `tokenizer` in a LangChain LLM object.
hf_pipeline = TextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    max_new_tokens=256
)

llm = HuggingFacePipeline(pipeline=hf_pipeline)

humaneval_dataset = load_dataset("openai_humaneval")
problem_index = 0
problem = humaneval_dataset["test"][problem_index]

prompt = problem["prompt"]
unit_test = problem["test"]
entry_point = problem["entry_point"]

print(f"Loaded HumanEval problem #{problem_index}: {entry_point}\n")
print("--- Function Stub ---")
print(prompt)


clean_prompt = f"""
You are an expert Python programmer.
ONLY output valid Python code for the following function.
Do NOT include explanations, thoughts, markdown, or Final Answer text.

Function to complete:

{prompt}
"""


model_response = llm.invoke(clean_prompt)

# Depending on the wrapper, the returned text may or may not start with the
# echoed prompt; remove it when present so only generated text remains.
if model_response.startswith(clean_prompt):
    completion = model_response[len(clean_prompt):]
else:
    completion = model_response

# If the model restated the whole function, test that text on its own. Otherwise
# treat the output as a continuation of the stub and prepend the stub, keeping
# the completion's leading indentation intact.
if re.search(rf"^def\s+{re.escape(entry_point)}\s*\(", completion, re.MULTILINE):
    generated_code = completion.strip()
else:
    generated_code = prompt + completion

print("\n--- Generated Code ---")
print(generated_code)

# Printed as the last statement of the test program; it can only appear in the
# REPL output if everything before it, asserts included, ran without raising.
PASS_MARKER = "__HUMANEVAL_ALL_TESTS_PASSED__"

# The HumanEval test code only defines `check(candidate)`; call it explicitly,
# otherwise no assertion is ever executed.
full_code = "\n\n".join([
    generated_code,
    unit_test,
    f"check({entry_point})",
    f"print({PASS_MARKER!r})",
])

try:
    result = str(python_repl_tool.run(full_code))
    print("\n--- REPL Output ---")
    print(result)
    # The REPL reports failures through its return value, so judge by the
    # success marker instead of by the absence of "AssertionError".
    if PASS_MARKER in result:
        print("\n[VERDICT] ✅ PASS")
    else:
        print("\n[VERDICT] ❌ FAIL")
except Exception as e:
    print(f"\nError during execution: {e}")
    print("\n[VERDICT] ❌ FAIL")


In [None]:
import re

from datasets import load_dataset
from langchain_experimental.tools.python.tool import PythonREPLTool
# Same import path as the agent cell at the end of the notebook, which uses the
# `langchain_huggingface` package.
from langchain_huggingface import HuggingFacePipeline
from transformers import TextGenerationPipeline

python_repl_tool = PythonREPLTool()
hf_pipeline = TextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    temperature=0.2,
    max_new_tokens=256
)

llm = HuggingFacePipeline(pipeline=hf_pipeline)

humaneval_dataset = load_dataset("openai_humaneval")
test_set = humaneval_dataset["test"]


# Number of problems to evaluate; None means the whole test split.
N = 5
if N is not None:
    test_set = test_set.select(range(N))


test_set_list = [dict(row) for row in test_set]

# Printed as the last statement of each test program; it can only appear in the
# REPL output if everything before it, asserts included, ran without raising.
PASS_MARKER = "__HUMANEVAL_ALL_TESTS_PASSED__"

# One (entry_point, passed, detail) tuple per problem. This list was previously
# never created in this cell, so the first `results.append` raised NameError.
results = []

for idx, problem in enumerate(test_set_list):
    prompt = problem["prompt"]
    unit_test = problem["test"]
    entry_point = problem["entry_point"]
    print(f"\n=== Problem #{idx}: {entry_point} ===\n")

    clean_prompt = f"""
You are an expert Python programmer.
ONLY output valid Python code for the following function.
Do NOT include explanations, thoughts, markdown, or Final Answer text.

Function to complete:

{prompt}
"""

    # Generate code
    try:
        model_response = llm.invoke(clean_prompt)  # returns string
    except Exception as e:
        print(f"Error during generation: {e}")
        results.append((entry_point, False, "Generation error"))
        continue

    # Depending on the wrapper, the returned text may or may not start with the
    # echoed prompt; remove it when present so only generated text remains.
    if model_response.startswith(clean_prompt):
        completion = model_response[len(clean_prompt):]
    else:
        completion = model_response

    # If the model restated the whole function, test that text on its own.
    # Otherwise treat the output as a continuation of the stub and prepend the
    # stub, keeping the completion's leading indentation intact.
    if re.search(rf"^def\s+{re.escape(entry_point)}\s*\(", completion, re.MULTILINE):
        generated_code = completion.strip()
    else:
        generated_code = prompt + completion

    print("\n--- Generated Code ---")
    print(generated_code)

    # Combine generated code with the unit test. The HumanEval test code only
    # defines `check(candidate)`, so it is called explicitly here.
    full_code = "\n\n".join([
        generated_code,
        unit_test,
        f"check({entry_point})",
        f"print({PASS_MARKER!r})",
    ])

    # Run in Python REPL. Failures come back through the return value, so the
    # verdict is based on the success marker, not on the absence of
    # "AssertionError".
    try:
        output = str(python_repl_tool.run(full_code))
        passed = PASS_MARKER in output
        results.append((entry_point, passed, output))
        status = "✅ PASS" if passed else "❌ FAIL"
        print(f"\n{status}\n")
    except Exception as e:
        results.append((entry_point, False, f"Execution error: {e}"))
        print(f"\n❌ FAIL: Execution error: {e}\n")

print("\n=== HumanEval Summary ===\n")
for entry_point, passed, _ in results:
    status = "✅ PASS" if passed else "❌ FAIL"
    print(f"{entry_point}: {status}")

# Total passed
total_passed = sum(1 for _, passed, _ in results if passed)
print(f"\nTotal Passed: {total_passed} / {len(results)}")


In [None]:
#error...

# ==============================================================================
# Step 1: Install All Necessary Libraries
# ==============================================================================
# !pip install -qU torch transformers bitsandbytes accelerate
# !pip install -qU langchain langchain-experimental langgraph datasets langchain-huggingface

# print("✅ All libraries installed.")


# ==============================================================================
# Step 2: All Necessary Imports
# ==============================================================================
import operator
import re
from typing import TypedDict, Annotated, Sequence

# For model loading
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline

# For the agent components
from langchain.tools import Tool
from langchain_experimental.utilities import PythonREPL
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage
from langchain_huggingface import HuggingFacePipeline

# For the agent graph and evaluation
from langgraph.graph import StateGraph, END
from langgraph.prebuilt import ToolNode
from datasets import load_dataset

print("✅ All modules imported.")


# ==============================================================================
# Step 3: Load Model, Tokenizer, and Create the REPL Tool
# ==============================================================================
# print("\n--- Loading base model and tokenizer ---")
# model_id = "bigcode/starcoderbase-3b"
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True, bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
# )
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(
#     model_id, quantization_config=bnb_config, device_map="auto"
# )
# print("✅ Model and tokenizer are ready.")

# # --- Create the REPL Tool ---
# python_repl_utility = PythonREPL()
# repl_tool = Tool(
#     name="python_repl",
#     description="A Python REPL. Use this to execute python code. The input should be a valid python command. If you want to see the output of a value, you should print it out with `print(...)`.",
#     func=python_repl_utility.run
# )
# print("✅ Python REPL tool is ready.")


# ==============================================================================
# Step 4: Build the Complete A.C.E. Agent (The Reusable Problem Solver)
# ==============================================================================
print("\n--- Building the autonomous A.C.E. agent ---")

# --- Create LangChain Pipeline and bind the tool ---
# This cell's import section brings in `pipeline` but not
# `TextGenerationPipeline`; import it here so the cell does not rely on an
# earlier cell having imported it.
from transformers import TextGenerationPipeline

# `model` and `tokenizer` come from the model-loading cell (the copy of that
# code in this cell is commented out).
hf_pipeline = TextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",  # explicitly set
    temperature=0.2,
    pad_token_id=tokenizer.eos_token_id,
    max_new_tokens=256
)

llm = HuggingFacePipeline(pipeline=hf_pipeline)

# --- Define Prompt, State, Nodes, and Logic ---
# Prompt template with a single `{input}` placeholder for the user's task text.
agent_prompt = PromptTemplate.from_template("""
You are A.C.E., an expert Python programming agent. Your goal is to solve the user's task.
You MUST call the python_repl tool with the complete, runnable Python code to solve the problem.
The code you provide should include the function definition and the necessary checks or assertions.

User's Task: {input}
""")
# NOTE(review): `llm_with_tools` is not assigned in any visible cell (only `llm`
# is created above), so this line raises NameError unless the name is defined
# elsewhere — TODO define it before this point.
agent_chain = agent_prompt | llm_with_tools

class AgentState(TypedDict):
    """State schema handed to `StateGraph`: a single `messages` sequence."""
    # Annotated with `operator.add`; presumably the graph uses it to merge each
    # node's returned messages into the existing sequence — TODO confirm against
    # the LangGraph documentation.
    messages: Annotated[Sequence[BaseMessage], operator.add]

def call_model(state):
    """Run the agent chain on the newest message and return its reply.

    Args:
        state: Mapping with a 'messages' sequence; the last item's `.content`
            is passed to the chain as its "input" value.

    Returns:
        A dict whose 'messages' key holds a one-element list containing the
        chain's response.
    """
    latest_message = state['messages'][-1]
    reply = agent_chain.invoke({"input": latest_message.content})
    return {"messages": [reply]}

# Graph node built from the REPL tool.
# NOTE(review): the `repl_tool` construction in this cell is commented out, so
# this line relies on `repl_tool` created in an earlier cell.
tool_node = ToolNode([repl_tool])

def should_continue(state):
    """Pick the next graph branch from the newest message.

    Args:
        state: Mapping with a 'messages' sequence.

    Returns:
        "continue" when the last message has a truthy `tool_calls` attribute,
        "end" otherwise (attribute missing or empty).
    """
    last_message = state['messages'][-1]
    # A missing attribute and an empty value are treated the same way.
    if getattr(last_message, "tool_calls", None):
        return "continue"
    return "end"

# --- Assemble and Compile the Agent Graph ---
# Two nodes are registered: "agent" (call_model) and "action" (tool_node).
# Execution starts at "agent".
workflow = StateGraph(AgentState)
workflow.add_node("agent", call_model)
workflow.add_node("action", tool_node)
workflow.set_entry_point("agent")
# should_continue's return value selects the next hop:
# "continue" -> "action", "end" -> END.
workflow.add_conditional_edges("agent", should_continue, {"continue": "action", "end": END})
# After "action", control always goes back to "agent", so END is only reachable
# from the "agent" node.
workflow.add_edge('action', 'agent')
app = workflow.compile()

print("✅ A.C.E. agent has been built and compiled.")


# ==============================================================================
# Step 5: Run the HumanEval Loop Using the A.C.E. Agent
# ==============================================================================
print("\n--- Starting HumanEval evaluation with the A.C.E. Agent ---")

humaneval_dataset = load_dataset("openai_humaneval")
test_set = humaneval_dataset["test"]

# --- Configuration ---
results = []
N = 5  # Set how many problems to evaluate
if N is not None:
    test_set = test_set.select(range(N))

# Output shapes treated as a failed tool run: text starting with an exception
# repr such as `AssertionError()` / `NameError(...)` (how the REPL utility is
# documented to report a raised exception), or a message starting with "Error:".
REPL_ERROR_PATTERN = re.compile(r"\s*(?:Error:|\w*(?:Error|Exception)\()")

# --- The Evaluation Loop ---
for idx, problem in enumerate(test_set):
    prompt = problem["prompt"]
    unit_test = problem["test"]
    entry_point = problem["entry_point"]
    print(f"\n=== Problem #{idx}: {entry_point} ===\n")

    # Define the full task for the agent.
    # We tell the agent to complete the function AND ensure it passes the
    # provided test. The HumanEval test code only defines `check(candidate)`,
    # so the task also asks for that function to be called.
    full_task = f"""
Complete the following Python function:
{prompt}

Your solution must pass this unit test:
{unit_test}

Please provide the complete, runnable code including the function and the test in the python_repl tool.
The test only defines check(candidate); finish your code by calling check({entry_point}).
"""
    # Create the initial input for the agent
    inputs = {"messages": [HumanMessage(content=full_task)]}

    # --- Invoke the AGENT, not the LLM directly ---
    try:
        final_state = app.invoke(inputs)

        # The graph can only finish from the "agent" node, so the last message
        # is the agent's text, not the REPL result. Collect the tool messages
        # and judge the most recent one instead.
        tool_outputs = [
            str(message.content)
            for message in final_state['messages']
            if getattr(message, "type", None) == "tool"
        ]

        if tool_outputs:
            final_tool_output = tool_outputs[-1]
            passed = (
                "AssertionError" not in final_tool_output
                and REPL_ERROR_PATTERN.match(final_tool_output) is None
            )
        else:
            # No code was executed at all, so nothing was verified.
            final_tool_output = "No python_repl call was made by the agent."
            passed = False

        results.append((entry_point, passed, final_tool_output))
        status = "✅ PASS" if passed else "❌ FAIL"
        print(f"\n--- Agent's Final Output ---\n{final_tool_output}")
        print(f"\n{status}\n")

    except Exception as e:
        results.append((entry_point, False, f"Agent invocation error: {e}"))
        print(f"\n❌ FAIL: Agent invocation error: {e}\n")

# ==============================================================================
# Step 6: Summary Report
# ==============================================================================
print("\n=== HumanEval Summary (Agent Performance) ===\n")
# One line per evaluated problem; the third tuple field (detail text) is not shown.
for entry_point, passed, _ in results:
    status = "✅ PASS" if passed else "❌ FAIL"
    print(f"{entry_point}: {status}")

# Calculate Pass@1
total_passed = sum(1 for _, passed, _ in results if passed)
print(f"\nTotal Passed: {total_passed} / {len(results)}")
if results:
    pass_rate = (total_passed / len(results)) * 100
    print(f"Pass@1 Rate: {pass_rate:.2f}%")
else:
    # Nothing was evaluated (e.g. N = 0): report that instead of dividing by zero.
    pass_rate = None
    print("Pass@1 Rate: n/a (no problems were evaluated)")