In [165]:
import json
import openai
import time
import logging
from datetime import datetime
import os
import tiktoken
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge

In [166]:
# SECURITY: never paste an API key into a notebook cell. A key that has ever
# been saved in this file must be treated as leaked: revoke it in the OpenAI
# dashboard and issue a new one.
#
# The key is read from the OPENAI_API_KEY environment variable, which has to be
# exported before the Jupyter server is started. Failing here, with an
# explanation, is clearer than a KeyError or an authentication error later on.
api_key = os.environ.get('OPENAI_API_KEY')
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the shell that launches Jupyter "
        "(for example `export OPENAI_API_KEY=...`) and restart the kernel."
    )

In [167]:
class PromptTemplate:
    """Builds the system/user message pair for one tool-planning query.

    The system message is assembled once, at construction time, from the
    instructions, the tool catalogue read from ``tools_file`` and two worked
    examples. The user message is the query itself.

    Args:
        instructions: Free text placed at the very top of the system message.
        examples: Sequence holding at least two worked examples; only
            ``examples[0]`` and ``examples[1]`` are used. Strings are inserted
            verbatim, anything else is serialised as JSON.
        query: The user query, returned unchanged as the user message.
        tools_file: Path of the JSON file describing the available tools.

    Raises:
        IndexError: If ``examples`` holds fewer than two items.
    """

    def __init__(self, instructions, examples, query, tools_file='tools.json'):
        self.instructions = instructions
        self.tools_file = tools_file
        self.tools = self._load_tools()
        self.examples = examples
        self.query = query
        self.system_message = self._make_system_message()

    def _load_tools(self):
        """Return the parsed tool catalogue, or ``{}`` when it cannot be read.

        The empty-dict fallback is kept so a prompt can still be built, but the
        problem is logged: a prompt without tools is almost never intended.
        """
        try:
            # JSON is read as UTF-8 explicitly so the result does not depend on
            # the platform's default text encoding.
            with open(self.tools_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (FileNotFoundError, json.JSONDecodeError) as exc:
            logging.warning(
                "Could not load tools from %r (%s); the prompt will contain no tools.",
                self.tools_file, exc,
            )
            return {}

    @staticmethod
    def _to_json(value):
        """Render ``value`` as JSON text; strings are assumed to be pre-formatted."""
        if isinstance(value, str):
            return value
        # default=str keeps objects that JSON cannot encode from raising, as
        # plain string interpolation never did.
        return json.dumps(value, ensure_ascii=False, default=str)

    def _make_system_message(self):
        """Assemble the system message from instructions, tools and examples."""
        # The prompt announces JSON and asks the model to answer in JSON, so the
        # tools and examples are serialised as JSON rather than interpolated as
        # Python literals (single quotes, True/False/None).
        tools_json = self._to_json(self.tools)
        example_1 = self._to_json(self.examples[0])
        example_2 = self._to_json(self.examples[1])
        system_message = f'''{self.instructions}
        
        Tools (in JSON format):
        {tools_json}
        
        Given input as a query, generate output as a list of the sub-questions and answers at each step. Also output a JSON as shown in examples.
        Do not mention the sub-questions in the response. Carefully note the order in which arguments need to be given inside tools.
        Example 1:
        {example_1}
        
        Example 2 (when a tool requires output of a previous tool):
        {example_2}
        '''
        return system_message

    def get(self):
        """Return ``{"system_message": ..., "user_message": ...}`` for this query."""
        return {
            "system_message": self.system_message,
            "user_message": self.query
        }

In [168]:
class GPT4LLMHandler:
    """Small wrapper around the OpenAI chat-completions API with usage accounting.

    Each call to :meth:`generate_response` sends one system message and one user
    message, requests a JSON-object reply, and reports locally counted token
    usage, an estimated cost and the latency alongside the model output.

    Args:
        api_key: OpenAI API key. When ``None`` (the default) the
            ``OPENAI_API_KEY`` environment variable is used.
        model: Model name passed to the API.
        temperature: Sampling temperature passed to the API.
        max_tokens: Upper bound on generated tokens passed to the API.
    """

    def __init__(self, api_key=None, model="gpt-4o-2024-08-06", temperature=1, max_tokens=5000):
        # The environment is consulted here, at call time. Reading os.environ in
        # the signature would evaluate once, when the class statement runs, and
        # raise KeyError there if the variable is not set.
        if api_key is None:
            api_key = os.environ.get('OPENAI_API_KEY')
        openai.api_key = api_key
        # The key is handed to the client explicitly; a client built without it
        # falls back to the environment variable and would ignore the argument.
        self.client = openai.OpenAI(api_key=api_key)
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens
        # Flat USD rate applied to input and output tokens alike.
        # NOTE(review): providers usually price output tokens differently from
        # input tokens; confirm this figure against the current price list.
        self.price_per_1k_tokens = 0.0025

    def _count_tokens(self, text):
        """Return the number of ``o200k_base`` tokens in ``text``."""
        encoding = tiktoken.get_encoding('o200k_base')
        num_tokens = len(encoding.encode(text))
        return num_tokens

    def log(self, user_message, output, input_tokens, output_tokens, total_cost, start_time, end_time):
        """Record one completed request through the standard ``logging`` module."""
        logging.getLogger(__name__).info(
            "model=%s latency=%.2fs input_tokens=%d output_tokens=%d cost=$%.6f query=%r response=%r",
            self.model, end_time - start_time, input_tokens, output_tokens,
            total_cost, user_message, output,
        )

    def generate_response(self, system_message, user_message):
        """Send one system/user message pair and return the reply with usage data.

        Args:
            system_message: Text of the system message.
            user_message: Text of the user message.

        Returns:
            A dict with the keys ``full`` (the raw API response), ``response``
            (the reply text), ``input_tokens``, ``output_tokens``,
            ``total_tokens``, ``total_cost``, ``response_time`` (seconds) and
            ``request_time`` (local timestamp taken when the dict is built).

        Raises:
            Exception: Whatever the API client raises is logged and re-raised
                unchanged; no partial result is returned for a failed request.
        """
        start_time = time.time()
        input_tokens = self._count_tokens(system_message) + self._count_tokens(user_message)

        # Only the network call is guarded. There is nothing to report when it
        # fails, so the error is logged with its traceback and propagated.
        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {
                        "role": "system",
                        "content": [{"type": "text", "text": system_message}],
                    },
                    {
                        "role": "user",
                        "content": [{"type": "text", "text": user_message}],
                    },
                ],
                temperature=self.temperature,
                max_tokens=self.max_tokens,
                top_p=1,
                frequency_penalty=0,
                presence_penalty=0,
                response_format={"type": "json_object"},
            )
        except Exception:
            logging.getLogger(__name__).exception("OpenAI chat completion request failed")
            raise

        end_time = time.time()

        output = response.choices[0].message.content
        output_tokens = self._count_tokens(output)
        total_tokens = input_tokens + output_tokens
        total_cost = (total_tokens / 1000) * self.price_per_1k_tokens

        self.log(user_message, output, input_tokens, output_tokens, total_cost, start_time, end_time)

        return {
            "full": response,
            "response": output,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "total_tokens": total_tokens,
            "total_cost": total_cost,
            "response_time": end_time - start_time,
            "request_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }


In [169]:
instructions = '''You are a query solver. You will be given tools. Using those tools, you have to solve the query. 
To solve the query, at each point, you have to ask sub-questions. These sub-questions are "What is the next tool to use, its arguments and argument values?". Look at answers to the previous sub-questions, which will give you context of how the current set of tools have been chosen so far. Compare this to the greater context, which is, how to solve the next question.

Some important points :
1) Tool description and argument description are very important. Read them to understand what exactly a certain tool generates as output or what inputs a tool can get.
2) Output of tools in the previous step is input to tool for current statement. Compare descriptions, types and examples to get a huge clue.
3) Always check if authentication tools like "who_am_i", "team_id", "get_sprint_id", etc. are needed at any point.
4) Take care of "type" argument in "works_list" is issue, ticket or task are explicitly mentioned.
5) You can access the output of the ith task using "$$PREV[i]". i starts from 0.
6) Stop once you feel the task is complete and no further tools are needed to solve the query.
7) To answer the query, you are only allowed to use the tools that we have provided.
8) If the question is simply unsolvable using any tool we have, only return {"tools: []}'''

In [170]:
# Worked example 1: a single tool call whose arguments are all literal values.
_blocker_tickets_plan = {
    "tools": [
        {
            "tool_name": "works_list",
            "arguments": [
                {"argument_name": "ticket_severity", "argument_value": ["blocker"]},
                {"argument_name": "type", "argument_value": ["ticket"]},
            ],
        },
    ],
}

# Worked example 2: two chained calls; the second refers to the output of the
# first through the "$$PREV[0]" placeholder.
_globex_high_tickets_plan = {
    "tools": [
        {
            "tool_name": "search_object_by_name",
            "arguments": [
                {"argument_name": "query", "argument_value": "Globex"},
            ],
        },
        {
            "tool_name": "works_list",
            "arguments": [
                {"argument_name": "created_by", "argument_value": "$$PREV[0]"},
                {"argument_name": "ticket_severity", "argument_value": ["high"]},
                {"argument_name": "type", "argument_value": ["ticket"]},
            ],
        },
    ],
}

# Order matters: the prompt presents index 0 as "Example 1" and index 1 as
# "Example 2".
examples = [_blocker_tickets_plan, _globex_high_tickets_plan]

In [171]:
# Single handler instance used for every query below, built with
# GPT4LLMHandler's default model, temperature and token limit.
llm = GPT4LLMHandler()

In [172]:
def evaluate_bleu_rouge(llm_response, ground_truth):
    """Score a generated string against its reference with BLEU and ROUGE.

    BLEU is computed on whitespace-split tokens with the ground truth as the
    only reference; ROUGE receives the two raw strings and is asked for
    averaged scores.

    Args:
        llm_response: The candidate text produced by the model.
        ground_truth: The reference text to compare against.

    Returns:
        A ``(bleu_score, rouge_scores)`` tuple holding the ``sentence_bleu``
        result and the ``Rouge.get_scores(..., avg=True)`` result.
    """
    reference_tokens = ground_truth.split()
    candidate_tokens = llm_response.split()

    # sentence_bleu expects a list of references, hence the extra list level.
    bleu = sentence_bleu([reference_tokens], candidate_tokens)
    rouge_result = Rouge().get_scores(llm_response, ground_truth, avg=True)
    return bleu, rouge_result

In [173]:
# Evaluation set; the loop below reads a 'query' and a 'solution' from each
# record. The encoding is given explicitly so that parsing does not depend on
# the platform's default text encoding.
with open('data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [174]:
# One row per evaluated query: [llm_value, ground_truth, bleu, rouge].
# Later cells index these rows by position, so the layout must not change.
scores = []
for record in data:
    prompt = PromptTemplate(instructions, examples, record['query'])
    messages = prompt.get()
    response = llm.generate_response(messages['system_message'], messages['user_message'])

    # Re-serialising the parsed reply normalises the model's whitespace.
    llm_value = json.dumps(json.loads(response['response'])['tools'], sort_keys=False)

    # The reference goes through the same json.dumps call as the model output,
    # so both sides are formatted identically. Building it from the Python repr
    # with quote/`True` substitutions mishandled False, None, apostrophes and
    # non-ASCII text.
    solution = record['solution']
    if isinstance(solution, str):
        # presumably already JSON-like text; keep the historical normalisation
        # for this case - verify against data.json
        ground_truth = solution.replace("'", '"').replace('True', 'true')
    else:
        ground_truth = json.dumps(solution, sort_keys=False)

    bleu, rouge = evaluate_bleu_rouge(llm_value, ground_truth)
    scores.append([llm_value, ground_truth, bleu, rouge])

In [178]:
# Mean BLEU over every evaluated query (position 2 of each score row).
# The accumulator has its own name so the builtin `sum` is not shadowed, and
# the divisor follows the number of rows instead of assuming exactly 20.
bleu_total = 0
for row in scores:
    bleu_total += row[2]
# An empty run reports 0.0 rather than dividing by zero.
bleu = bleu_total / len(scores) if scores else 0.0

In [179]:
# Mean ROUGE-L F1 over every evaluated query (position 3 of each score row
# holds the ROUGE result). The divisor follows the number of rows instead of
# assuming exactly 20.
rouge_l_total = 0
for row in scores:
    rouge_l_total += row[3]['rouge-l']['f']
# An empty run reports 0.0 rather than dividing by zero.
rl = rouge_l_total / len(scores) if scores else 0.0

In [180]:
# Report the two aggregate metrics, one per line.
for label, value in (('Bleu score:', bleu), ('Rouge-L F1 score:', rl)):
    print(label, value)

Bleu score: 0.7566116249817973
Rouge-L F1 score: 0.9406528772091137
