## Important Dependencies with their required versions

In [None]:
# ! pip install SQLAlchemy==2.0.26
# ! pip install langchain==0.1.6
# ! pip install langchain-community==0.0.19
# ! pip install langchain-core==0.1.22
# ! pip install chromadb
# ! pip install text_generation
# ! pip install sentence-transformers
# ! pip install python-dotenv
# ! pip install transformers
# ! pip install torch
# ! pip install langchain_community
# ! pip install pandas
# ! pip install astunparse
# ! pip install faiss-cpu
# ! pip install flask_cors
# ! pip install openai
# ! pip install pandas
# ! pip install sentence-transformers
# ! pip install datasets
# ! pip install langchain_openai

### LLAMA INDEX

In [None]:
# %pip install llama-index-llms-huggingface
# %pip install llama-index-llms-huggingface-api
# %pip install "transformers[torch]" "huggingface_hub[inference]"
# %pip install llama-index
# %pip install llama-index-llms-text-generation-inference

In [1]:
from llama_index.core import PromptTemplate

# Zero-shot prompt template for unit-test generation.
# Placeholders filled at format time: {description} and {code}.
# NOTE(review): this template is defined but not used anywhere in this cell.
# NOTE(review): the prompt text contains the typo "acheive"; it is left untouched here
# because editing the text changes what the model receives.
GenerateTestTemplate = """You are a python expert and your task is: Given the following description and python code:
Description:
{description}
Code:
'''python
{code}'''
Generate a class that contains at least 7 unit tests (where each test has only one assertion) written in python that acheive high coverage to find bugs, runtime errors or logical errors in the code according to the description and include any required imports.
Make sure to include the unit test call unittest.main() to run the tests. Do not include any import for the code under test.
The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```python" and "```" respectively:"""


# Few-shot prompt template for unit-test generation.
# Placeholders filled at format time: {code}, {description} and {test_cases_of_few_shot}.
# The literal markers **TESTMETHODUNDERTEST** and **TEST_CASES_WITH_UNDERSTANDABLE_NAMES**
# are not format placeholders; the prompt asks the model to substitute them itself.
# NOTE(review): the prompt text has typos ("descrition", "to to") and an inconsistent
# list (the first "Replace" line has no "1-" prefix while the second starts with "2-").
# They are left untouched here because editing the text changes what the model receives.
Gen_UnitTest_with_FewShots_template = """You are python unit tester, Write at least 7 unit tests for a method under test. You follow my rules and orders and if you do not know the answer, don't make things UP!
I am going to give you a method under test as well as its description and you are going to follow the criteria that I give to you in the generation.
Criteria:
Write 7 test cases that capture the intent of the user and create asserts that match descrition.
Each test generated contains only one assertion.
Complete the unittest code till "unittest.main()" is called. Think before you end the response. I do not want any incomplete code.
Do not include import for the method under test in the unit tests.
Run the tests and see if they match the description. If not, change the assertions to match the description.

Method under test:
{code}

Description:
{description}

I am going to add similar functions and their corresponding test cases that you may need to use in your tests. You can use them in your tests.
{test_cases_of_few_shot}

I am going to to give you a template for your output where:
Replace **TESTMETHODUNDERTEST** with the right name for the class.
2- Replace **TEST_CASES_WITH_UNDERSTANDABLE_NAMES** with the test cases that you generated.
My template is:
```python
import unittest

class **TESTMETHODUNDERTEST**(unittest.TestCase):
    **TEST_CASES_WITH_UNDERSTANDABLE_NAMES**

if __name__ == '__main__':
    unittest.main()
```
Evaluate your generated test cases and change the assertions if needed to comply with the description. Do not make things up!
"""

def addMixtralTokens(template):
    """Wrap a prompt in the Mixtral-Instruct special tokens.

    The Mixtral instruct format is ``<s> [INST] instruction [/INST]``; the
    end-of-sequence token ``</s>`` belongs after a *model answer*, not inside
    the instruction. The previous implementation emitted ``</s>`` before
    ``[/INST]``, which places an end-of-sequence marker in the middle of the
    user turn.

    Args:
        template: The prompt text (str) to wrap.

    Returns:
        The prompt surrounded by ``<s> [INST] `` and `` [/INST]``.
    """
    return "<s> [INST] " + template + " [/INST]"


# Build the few-shot prompt template with the Mixtral instruction tokens applied.
qa_template = PromptTemplate(
    addMixtralTokens(Gen_UnitTest_with_FewShots_template)
)

# Fill the template with a small smoke-test example (no few-shot examples supplied).
prompt = qa_template.format(
    description="Write a function that takes a list of integers and returns the sum of the list.",
    code="def sum_list(lst):\n    return sum(lst)",
    test_cases_of_few_shot="",
)
# Uncomment to inspect the rendered prompt:
# print(prompt)

In [4]:
# Hard-coded example of a test-repair prompt for `sort_third`: it bundles the task
# description, the code under test, previously generated unit tests, and the failure
# feedback from running them, then asks for revised tests only.
# NOTE: this rebinds the `prompt` name that the previous cell also assigns.
# NOTE(review): code sections inside the prompt are fenced with ''' rather than
# markdown backticks, and the description spells "indicies"; left as-is because the
# text is what the model receives.
prompt = ("""You are a Python expert, and your task is to debug and improve the following Python unitTest code based on the given description, code snippet, and unit tests feedback:

*Description:*
    This function takes a list l and returns a list l' such that
    l' is identical to l in the indicies that are not divisible by three, while its values at the indicies that are divisible by three are equal
    to the values of the corresponding indicies of l, but sorted.
    >>> sort_third([1, 2, 3])
    [1, 2, 3]
    >>> sort_third([5, 6, 3, 4, 8, 9, 2])
    [2, 6, 3, 4, 8, 9, 5]

*Given Code under Test:*
'''python



def sort_third(l: list):
    l = list(l)
    l[::3] = sorted(l[::3])
    return l'''
You previously generated the following code as unit tests:

*Unit Tests:*
'''python

import unittest

class TestSortThird(unittest.TestCase):
    def test_sort_third_no_sorting(self):
        l = [1, 2, 3]
        self.assertEqual(sort_third(l), l)

    def test_sort_third_sorting_every_third(self):
        l = [5, 6, 3, 4, 8, 9, 2]
        sorted_l = [2, 6, 3, 4, 8, 3, 5]
        self.assertEqual(sort_third(l), sorted_l)

    def test_sort_third_sorting_first_third(self):
        l = [5, 6, 3, 4, 8, 9, 2]
        sorted_l = [5, 2, 3, 4, 8, 9, 6]
        self.assertNotEqual(sort_third(l), sorted_l)

    def test_sort_third_sorting_second_third(self):
        l = [5, 6, 3, 4, 8, 9, 2]
        sorted_l = [5, 6, 2, 4, 8, 9, 3]
        self.assertNotEqual(sort_third(l), sorted_l)

    def test_sort_third_empty_list(self):
        l = []
        self.assertEqual(sort_third(l), l)

    def test_sort_third_single_element_list(self):
        l = [5]
        self.assertEqual(sort_third(l), l)

    def test_sort_third_two_element_list(self):
        l = [5, 6]
        self.assertEqual(sort_third(l), l)

if __name__ == '__main__':
    unittest.main()
'''

After running the code with these tests, you received the following feedback based on the test output:
*Feedback:*
FAIL: test_sort_third_sorting_every_third (__main__.TestSortThird.test_sort_third_sorting_every_third)
Traceback (most recent call last):
  File "<string>", line 24, in test_sort_third_sorting_every_third
AssertionError: Lists differ: [2, 6, 3, 4, 8, 9, 5] != [2, 6, 3, 4, 8, 3, 5]

First differing element 5:
9
3

- [2, 6, 3, 4, 8, 9, 5]
?                 ^

+ [2, 6, 3, 4, 8, 3, 5]
?                 ^




Your goal is to revise the tests based on the feedback. Ensure to:

Address all highlighted bugs in the feedback.
Modify only the unit tests
Do not include new imports for the tests.
Preserve all existing functionality not related to the bugs.

Return your revised unit tests as only one formatted markdown code snippet without further explanation, surrounded by triple backticks and the word 'python'.
""")

In [None]:
import os

from huggingface_hub import InferenceClient

# Read the Hugging Face token from the environment instead of embedding it in the
# notebook. HUGGINGFACEHUB_API_TOKEN is the variable name already used elsewhere in
# this notebook. The token that was previously hardcoded here has been exposed and
# should be revoked.
# If the variable is unset, `token` is None and huggingface_hub decides how to
# authenticate (per its docs it falls back to a locally saved token, if any).
client = InferenceClient(
    model="Qwen/Qwen2-7B-Instruct",
    token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"),
)


def inferFun(prompt):
    """Send ``prompt`` as a single user message and return the model's reply text.

    The request is made with ``stream=False``, so ``chat_completion`` returns one
    completion object rather than an iterator of chunks. The reply text is read
    from ``choices[0].message.content``. The previous version looped over the
    result and read ``.delta.content``, which only exists on streamed chunks.

    Args:
        prompt: The full prompt text (str) sent as the user message.

    Returns:
        The generated text, or an empty string if the reply has no content.
    """
    completion = client.chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=20_000,
        stream=False,
        temperature=0.1,
        # top_p = 0.6,
    )
    # `content` may be None for an empty reply; always hand back a string.
    return completion.choices[0].message.content or ""

# Send the current `prompt` to the model; as the last expression of the cell, the returned text is shown as the cell output.
inferFun(prompt)

In [5]:
from llama_index.core import PromptTemplate

# Zero-shot prompt template for unit-test generation.
# Placeholders filled at format time: {description} and {code}.
# NOTE(review): this cell repeats definitions that already appear in an earlier cell of
# this notebook, and this template is not used anywhere in this cell.
# NOTE(review): the prompt text contains the typo "acheive"; it is left untouched here
# because editing the text changes what the model receives.
GenerateTestTemplate = """You are a python expert and your task is: Given the following description and python code:
Description:
{description}
Code:
'''python
{code}'''
Generate a class that contains at least 7 unit tests (where each test has only one assertion) written in python that acheive high coverage to find bugs, runtime errors or logical errors in the code according to the description and include any required imports.
Make sure to include the unit test call unittest.main() to run the tests. Do not include any import for the code under test.
The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```python" and "```" respectively:"""


# Few-shot prompt template for unit-test generation.
# Placeholders filled at format time: {code}, {description} and {test_cases_of_few_shot}.
# The literal markers **TESTMETHODUNDERTEST** and **TEST_CASES_WITH_UNDERSTANDABLE_NAMES**
# are not format placeholders; the prompt asks the model to substitute them itself.
# NOTE(review): the prompt text has typos ("descrition", "to to") and an inconsistent
# list (the first "Replace" line has no "1-" prefix while the second starts with "2-").
# They are left untouched here because editing the text changes what the model receives.
Gen_UnitTest_with_FewShots_template = """You are python unit tester, Write at least 7 unit tests for a method under test. You follow my rules and orders and if you do not know the answer, don't make things UP!
I am going to give you a method under test as well as its description and you are going to follow the criteria that I give to you in the generation.
Criteria:
Write 7 test cases that capture the intent of the user and create asserts that match descrition.
Each test generated contains only one assertion.
Complete the unittest code till "unittest.main()" is called. Think before you end the response. I do not want any incomplete code.
Do not include import for the method under test in the unit tests.
Run the tests and see if they match the description. If not, change the assertions to match the description.

Method under test:
{code}

Description:
{description}

I am going to add similar functions and their corresponding test cases that you may need to use in your tests. You can use them in your tests.
{test_cases_of_few_shot}

I am going to to give you a template for your output where:
Replace **TESTMETHODUNDERTEST** with the right name for the class.
2- Replace **TEST_CASES_WITH_UNDERSTANDABLE_NAMES** with the test cases that you generated.
My template is:
```python
import unittest

class **TESTMETHODUNDERTEST**(unittest.TestCase):
    **TEST_CASES_WITH_UNDERSTANDABLE_NAMES**

if __name__ == '__main__':
    unittest.main()
```
Evaluate your generated test cases and change the assertions if needed to comply with the description. Do not make things up!
"""

def addMixtralTokens(template):
    """Wrap a prompt in the Mixtral-Instruct special tokens.

    The Mixtral instruct format is ``<s> [INST] instruction [/INST]``; the
    end-of-sequence token ``</s>`` belongs after a *model answer*, not inside
    the instruction. The previous implementation emitted ``</s>`` before
    ``[/INST]``, which places an end-of-sequence marker in the middle of the
    user turn.

    Args:
        template: The prompt text (str) to wrap.

    Returns:
        The prompt surrounded by ``<s> [INST] `` and `` [/INST]``.
    """
    return "<s> [INST] " + template + " [/INST]"


# Build the few-shot prompt template with the Mixtral instruction tokens applied.
qa_template = PromptTemplate(
    addMixtralTokens(Gen_UnitTest_with_FewShots_template)
)

# Fill the template with a small smoke-test example (no few-shot examples supplied)
# and print the rendered prompt for inspection.
prompt = qa_template.format(
    description="Write a function that takes a list of integers and returns the sum of the list.",
    code="def sum_list(lst):\n    return sum(lst)",
    test_cases_of_few_shot="",
)
print(prompt)

<s> [INST] You are python unit tester, Write at least 7 unit tests for a method under test. You follow my rules and orders and if you do not know the answer, don't make things UP!
I am going to give you a method under test as well as its description and you are going to follow the criteria that I give to you in the generation.
Criteria:
Write 7 test cases that capture the intent of the user and create asserts that match descrition.
Each test generated contains only one assertion.
Complete the unittest code till "unittest.main()" is called. Think before you end the response. I do not want any incomplete code.
Do not include import for the method under test in the unit tests.
Run the tests and see if they match the description. If not, change the assertions to match the description.

Method under test:
def sum_list(lst):
    return sum(lst)

Description:
Write a function that takes a list of integers and returns the sum of the list.

I am going to add similar functions and their correspo

In [None]:
# import os
# from typing import List, Optional

# from llama_index.llms.huggingface import HuggingFaceLLM
# from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

# URL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# remotely_run = HuggingFaceInferenceAPI(model_name=URL, token=os.getenv("HUGGINGFACEHUB_API_TOKEN"), max_tokens=29000, temperature=0.5)

# completion_response = remotely_run.complete(prompt)
# print(completion_response)

## Imports, Setup, and Agent Initialization

In [1]:
# auto reload instead of having to restart each time
%load_ext autoreload
%autoreload 2

In [1]:
from Configuration import *
from Benchmark.TestGenerator import *
from Benchmark.TestFix import *
from Benchmark.DecisionMaker import *
from Benchmark.BugFixBench import *
from Benchmark.BugFix import *

# llm_arb, chat_model_arb = InitializeModelArbiter(os.environ["HUGGINGFACEHUB_API_TOKEN"],repo_id='mistralai/Mixtral-8x7B-Instruct-v0.1')
# with open(r"Datasets/atcoder_problem_test_cases_description.jsonl", "r") as f:
#     data = f.read()
#     jsonObj = json.loads(data)
# jsonObj=pd.read_json(path_or_buf="Datasets/atcoder_problem_test_cases_description.jsonl")
# jsonObj=pd.read_json(path_or_buf="Datasets/cleanedhumaneval.json")
# Dataset handed to the generator. HEval_JsonObj is not defined in this notebook; it
# arrives through one of the star imports above (presumably the HumanEval data — TODO confirm).
jsonObj = HEval_JsonObj
# Build the unit-test generator from the generation chain, the `db` object, the dataset
# and this notebook's global namespace (GenUnitTestChain and db also come from the star imports).
testGenerator = TestGenerator(GenUnitTestChain, db, jsonObj, globals())
# testGenerator.isHumanEval = True
# testRegenerator = TestFix(UnitTestFeedbackChain,True,globals())
# judgeGenerator = DecisionMaker(judgeChain,globals())
# bugFixGenerator = BugFix(bugFixChain,True,globals())
# bugFixGeneratorben = BugFixBench(bugFixChainben,True,globals())


False




## Generate Unit Tests from Code

In [4]:
# Run test generation with arguments (110, 164). The saved output starts at
# "Running Test Case  110", so these look like example-index bounds — TODO confirm
# whether the end index is inclusive.
testGenerator.generate(110, 164)

Running Test Case  110


Test example 110 failed

Number of Ran Tests :  7
Number of failed Tests :  1
Number of error Tests :  0
Number of Succeeded Test :  6
Running Test Case  111
Test example 111 succeeded

Number of Ran Tests :  6
Number of Succeeded Test :  6
Running Test Case  112
Test example 112 failed

Number of Ran Tests :  5
Number of failed Tests :  2
Number of error Tests :  0
Number of Succeeded Test :  3
Running Test Case  113
Test example 113 failed

Number of Ran Tests :  4
Number of failed Tests :  2
Number of error Tests :  0
Number of Succeeded Test :  2
Running Test Case  114
Test example 114 failed

Number of Ran Tests :  6
Number of failed Tests :  2
Number of error Tests :  1
Number of Succeeded Test :  3
Running Test Case  115
Test example 115 succeeded

Number of Ran Tests :  3
Number of Succeeded Test :  3
Running Test Case  116
Test example 116 failed

Number of Ran Tests :  5
Number of failed Tests :  3
Number of error Tests :  0
Number of Succeeded Test :  2
Running Test Case  11

### Test Fix

In [2]:
# Build the test-repair driver from the feedback chain and this notebook's globals.
# NOTE(review): the meaning of the positional `True` flag is not visible here — see the TestFix class.
testRegenerator = TestFix(UnitTestFeedbackChain,True,globals())

In [3]:
# judgeGenerator = DecisionMaker(judgeChain,globals())
# Run the test-repair pass with argument 30; the saved output begins at "Running Example  30",
# so this is presumably the starting example index — TODO confirm.
testRegenerator.generate(30)

********************************************************************
Running Example  30 

Example 30  has already passed
********************************************************************
Running Example  31 

Test example 31 succeeded

Number of Ran Tests :  10
Number of Succeeded Test :  10
Number of Succeeded Test :  0
********************************************************************
Running Example  32 

Test example 32 failed

Number of Ran Tests :  6
Number of failed Tests :  1
Number of error Tests :  0
Number of Succeeded Test :  5
********************************************************************
Running Example  33 

Example 33  has already passed
********************************************************************
Running Example  34 

Test example 34 failed

Number of Ran Tests :  6
Number of failed Tests :  0
Number of error Tests :  1
Number of Succeeded Test :  5
********************************************************************
Running Example  35 

Test examp

## Judge

In [2]:
# Build the judge driver from the judge chain and this notebook's globals, then run it.
# NOTE(review): the saved output of this cell ends with
# "TypeError: argument of type 'NoneType' is not iterable"; the failing code is inside
# the project modules, not in this cell, so the cause cannot be determined from here.
judgeGenerator = DecisionMaker(judgeChain,globals())
judgeGenerator.generate()

Directory JudgeOutput/ created successfully!
File JudgeOutput/JudgmentLogs.json created successfully!
Running Test Case  0
Example 0  has already passed
Running Test Case  1
generatedJudgement:  <s> [INST] Given the Python code below, its description, and an error-producing test case with the associated error message, determine the cause of the error. Use the information provided to decide whether the error is due to a bug in the Python code itself or in the test case. Provide a reasoned conclusion.

- Code Under Test:



def is_prime(n):
    if n < 2:
        return False
    for k in range(2, n - 1):
        if n % k == 0:
            return False
    return True

- Description of Functionality:
    Return true if a given number is prime, and false otherwise.
    >>> is_prime(6)
    False
    >>> is_prime(101)
    True
    >>> is_prime(11)
    True
    >>> is_prime(13441)
    True
    >>> is_prime(61)
    True
    >>> is_prime(4)
    False
    >>> is_prime(1)
    False

- Error-Induc

TypeError: argument of type 'NoneType' is not iterable

## BugFix

In [None]:
# Construct the bug-fix driver in this cell so it runs on a fresh kernel: the only
# other construction of `bugFixGenerator` in this notebook is commented out in the
# setup cell. The arguments mirror that commented-out line; `BugFix` and `bugFixChain`
# are expected to come from the star imports in the setup cell — TODO confirm.
bugFixGenerator = BugFix(bugFixChain,True,globals())
bugFixGenerator.generate()

True
File BugFixOutput/Cases.json created successfully!
File BugFixOutput/RunningLogs.json created successfully!
Running Example  0 

Test Case 0 Didn't Run Due to Incomplete Response

Test example 0 failed

Number of Ran Tests :  3
Number of failed Tests :  0
Number of error Tests :  0
Number of Succeeded Test :  3
Running Example  1 

Test Case 1 Didn't Run Due to Incomplete Response

Test example 1 failed

Number of Ran Tests :  4
Number of failed Tests :  0
Number of error Tests :  0
Number of Succeeded Test :  4
Running Example  2 

Example 2  has already passed
Running Example  3 

Example 3  has already passed
Running Example  4 

Test Case 4 Didn't Run Due to Incomplete Response

Test example 4 failed

Number of Ran Tests :  7
Number of failed Tests :  0
Number of error Tests :  0
Number of Succeeded Test :  7
Running Example  5 

Test Case 5 Didn't Run Due to Incomplete Response

Test example 5 failed

Number of Ran Tests :  7
Number of failed Tests :  0
Number of error Tests 

## BugFixBench

In [3]:
# Construct the benchmark bug-fix driver in this cell so it runs on a fresh kernel: the
# only other construction of `bugFixGeneratorben` in this notebook is commented out in
# the setup cell. The arguments mirror that commented-out line; `BugFixBench` and
# `bugFixChainben` are expected to come from the star imports in the setup cell — TODO confirm.
bugFixGeneratorben = BugFixBench(bugFixChainben,True,globals())
bugFixGeneratorben.generate()

Running Example  0 

Test example 0 failed due to syntax or indentation or timeout

Number of Ran Tests :  0
Number of failed Tests :  0
Number of error Tests :  1
Number of Succeeded Test :  0
Running Example  1 

Test example 1 succeeded

Number of Ran Tests :  7
Number of Succeeded Test :  7
Number of Succeeded Test :  0
Running Example  2 

Test example 2 succeeded

Number of Ran Tests :  7
Number of Succeeded Test :  7
Number of Succeeded Test :  0
Running Example  3 

Test example 3 succeeded

Number of Ran Tests :  7
Number of Succeeded Test :  7
Number of Succeeded Test :  0
Running Example  4 

Test example 4 failed

Number of Ran Tests :  7
Number of failed Tests :  0
Number of error Tests :  6
Number of Succeeded Test :  1
Running Example  5 

Test example 5 failed

Number of Ran Tests :  6
Number of failed Tests :  0
Number of error Tests :  4
Number of Succeeded Test :  2
Running Example  6 

Test example 6 succeeded

Number of Ran Tests :  11
Number of Succeeded Test :  1

In [None]:
import openai
# from langchain_openai import OpenAI
from langchain.prompts import BaseChatPromptTemplate, ChatPromptTemplate
from langchain import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv

# Load environment variables from a local .env file (presumably where the OpenAI
# API key is kept — TODO confirm).
load_dotenv()

# Two-variable prompt: the text to translate and the target language.
prompt = PromptTemplate(
    input_variables=["input", "output_language"],
    template="How to say {input} in {output_language}:\n",
    verbose=False,
)

# Chat model with default model name, used only by the LLMChain wrapper below.
llm = ChatOpenAI(temperature=0)

# Legacy-style chain (prompt + model); it is built here but not invoked in this cell.
llm_chain = LLMChain(prompt=prompt, llm=llm)

# Rebind `llm` to an explicitly named model for the pipe-style chain.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

chain = prompt | llm

# Last expression of the cell, so the model reply is displayed as the cell output.
chain.invoke({"input": "I love programming.", "output_language": "German"})