In [6]:
import ast # create json object from literal
import re # regular expression string search

import nltk
import numpy as np
import openai
import pandas as pd
import tiktoken # open ai tokenizer
from nltk.translate.bleu_score import sentence_bleu
from scipy.stats import t as t_test
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential
)

In [3]:
class dictionary(dict):
    """A ``dict`` subclass that adds an explicit ``add`` helper.

    Construction accepts the same arguments as ``dict`` (none, a mapping, an
    iterable of pairs, and/or keyword arguments), so ``dictionary()`` still
    creates an empty mapping.
    """

    def __init__(self, *args, **kwargs):
        # Delegate to dict so optional initial contents are actually stored.
        super().__init__(*args, **kwargs)

    def add(self, key, value):
        """Store ``value`` under ``key``, replacing any existing entry."""
        self[key] = value

def count_tokens(Text,MODEL):
    """Return how many tokens ``Text`` encodes to under the tiktoken encoding for ``MODEL``."""
    return len(tiktoken.encoding_for_model(MODEL).encode(Text))

def return_tokens(Text,MODEL):
    """Return the token sequence produced by encoding ``Text`` with the tiktoken encoding for ``MODEL``."""
    return tiktoken.encoding_for_model(MODEL).encode(Text)

def read_and_clean(file):
    """Read a script file and return its text lower-cased with comments removed.

    Processing steps, in order:
      1. Drop blank (or whitespace-only) lines and lines whose first
         non-blank characters are ``//``.
      2. Lower-case every remaining line and join them back together
         (each kept line retains its own trailing newline).
      3. Remove every ``/* ... */`` block comment, including ones that span
         several lines.

    Args:
        file: Path of the file to read.

    Returns:
        The cleaned text as a single string.
    """
    with open(file) as source:
        kept_lines = [
            line.lower()
            for line in source
            # lstrip() so that indented `//` comments are dropped as well
            if line.strip() and not line.lstrip().startswith("//")
        ]
    joined = ''.join(kept_lines)

    # Block comments may span lines, so they are stripped from the joined text.
    # A single non-greedy substitution removes each comment in one pass.
    return re.sub(r'/\*.*?\*/', '', joined, flags=re.DOTALL)

def read_file(filename):
    """Return the full contents of ``filename`` as one string."""
    with open(filename) as handle:
        return handle.read()

def make_prompts(Code_files):
    """Pack the cleaned contents of several files into token-bounded prompts.

    Each file is cleaned with ``read_and_clean`` and split on ``;``. The
    resulting chunks are appended (each followed by a newline; the ``;``
    itself is not restored) to the current prompt until the running token
    count exceeds ``PROMPT_LENGTH``, at which point a new prompt is started,
    unless the preceding position is reported by ``split_index`` as one that
    must not be followed by a split.

    The running token count is carried across files, because chunks from the
    next file keep being appended to the prompt that is currently open.

    Relies on the notebook globals ``MODEL`` and ``PROMPT_LENGTH``.

    NOTE(review): ``split_index`` derives its indices from newline-separated
    lines, while positions here count ``;``-separated chunks; confirm the two
    numberings are meant to line up.

    Args:
        Code_files: List of file paths to process, in order.

    Returns:
        List of prompt strings; always contains at least one (possibly
        empty) string.
    """
    sized_prompts = [""]
    running_total = 0  # tokens accumulated in the prompt currently being filled

    for file in Code_files:
        sections = read_and_clean(file).split(";")
        protected = set(split_index(file))  # set gives O(1) membership checks

        for position, chunk in enumerate(sections):
            chunk_tokens = count_tokens(chunk, MODEL)
            running_total += chunk_tokens

            split_allowed = (position - 1) not in protected
            if running_total > PROMPT_LENGTH and split_allowed:
                # Start a fresh prompt; this chunk becomes its first entry.
                sized_prompts.append("")
                running_total = chunk_tokens

            sized_prompts[-1] += chunk + "\n"

    return sized_prompts

@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(6),
)
def completion_with_backoff(**kwargs):
    """Call ``openai.ChatCompletion.create`` with the given keyword arguments.

    The call is wrapped by tenacity's ``retry``: a failed attempt is retried
    after a randomised exponential wait (``min=1``, ``max=60``), and retrying
    stops after 6 attempts.
    """
    response = openai.ChatCompletion.create(**kwargs)
    return response

In [10]:
# Token threshold used by make_prompts when deciding where to start a new prompt.
PROMPT_LENGTH = 1200

# GPT_AUTH.txt is read as one "NAME value" pair per line, separated by whitespace.
params = dictionary()
for line_number, line in enumerate(read_file("GPT_AUTH.txt").splitlines(), start=1):
    fields = line.split()
    if not fields:
        # Blank line, e.g. the empty entry produced by a trailing newline.
        continue
    if len(fields) < 2:
        # Report the position only; the line content may be sensitive.
        raise ValueError(
            f"GPT_AUTH.txt line {line_number}: expected 'NAME value' separated by whitespace"
        )
    params.add(fields[0], fields[1])

AZURE_MODEL_NAME = params["AZURE_MODEL_NAME"] # name of our deployment at oai.azure.com
MODEL = params["MODEL"] # actual name of the gpt model
openai.api_base = params["OPENAI_API_BASE"]
openai.api_key = params["OPENAI_API_KEY"]
openai.api_version = params["OPENAI_API_VERSION"]
openai.api_type = params["API_TYPE"]

In [27]:
def split_index(file):
    """Return the line indices that lie inside outermost ``for`` ... ``next`` loops.

    The file is cleaned with ``read_and_clean`` and split on newlines. Only
    lines that start with ``next``, or start with ``for`` and contain
    `` = ``, are examined. Nesting depth is tracked; when an outermost loop
    closes on line ``c`` after opening on line ``s``, the indices
    ``s .. c - 1`` are added to the result (the closing line itself is not).

    Keywords are matched as whole words, so identifiers that merely contain
    ``for`` or ``next`` (``format``, ``forecast_i``, ``nextrow``) do not
    count. A line holding both keywords leaves the depth unchanged. A
    ``next`` with no open loop is ignored, so it cannot desynchronise the
    loops that follow it.

    NOTE(review): lines starting with ``for`` but lacking `` = `` are not
    treated as loop openers, yet a matching closer starting with ``next``
    would still be counted; verify against the scripts being processed.

    Args:
        file: Path of a single source file.

    Returns:
        List of int line indices (0-based, in the cleaned text) in
        ascending order.
    """
    lines = read_and_clean(file).split("\n")

    opener = re.compile(r"\bfor\b")
    closer = re.compile(r"\bnext\b")

    indicies = []
    depth = 0           # number of currently open loops
    outer_start = None  # line index where the current outermost loop opened

    for number, line in enumerate(lines):
        stripped = line.strip()
        is_candidate = stripped.startswith("next") or (
            stripped.startswith("for") and " = " in line
        )
        if not is_candidate:
            continue

        opens = opener.search(line) is not None
        closes = closer.search(line) is not None

        if opens and closes:
            # Both keywords on one line: leave the nesting depth untouched.
            continue
        if opens:
            if depth == 0:
                outer_start = number
            depth += 1
        elif closes:
            if depth == 0:
                # Unmatched closer: nothing is open, so there is nothing to close.
                continue
            depth -= 1
            if depth == 0:
                indicies.extend(range(outer_start, number))

    return indicies

In [None]:
# Source script(s) to process; make_prompts takes a list of file paths.
Code_files = ["BradySales_DATA.txt"]

# example_input_SQL = read_file("Examples/SqlClassCompressed.txt")
# accepted_output_SQL = read_file("Examples/SqlClassExample.txt")

# Split the cleaned source into a list of prompt-sized text chunks.
t = make_prompts(Code_files)

In [None]:
# Select the second chunk (index 1) returned by make_prompts for the request below.
# Raises IndexError if make_prompts produced only one chunk.
Chunk = t[1]

In [None]:
# Ask the model for a detailed description of `Chunk`, using one worked example
# (a user/assistant message pair) placed before the real request.
# NOTE(review): example_code, example_translation and logit_bias are not defined
# in any cell visible here -- confirm they are set before this cell is run.
response_2 = completion_with_backoff(
    engine = AZURE_MODEL_NAME,
    messages = [
            # system message: sets the task for the model
            {"role": "system", "content" : "You are giving step by step low level detail of Qlikview codes so that developers will not have to reference the source code after reading your response."},
            # worked example: input code followed by the expected description
            {"role": "user", "content": f"Be extremely specific about any conditions that are in the code. \n code:\n```\n {example_code} \n```\n"},
            {"role": "assistant", "content" : f"{example_translation}"},
            # the actual request for the selected chunk
            {"role": "user", "content": f"Be extremely specific about any conditions that are in the code.  \n code:\n```\n {Chunk} \n```\n"}
                ], 

    # max_tokens = 2000
    # Completion budget: 16000 minus the token counts of the chunk and the example pair.
    # The system message and the instruction wording are not included in this sum.
    max_tokens = 16000 - (count_tokens(Chunk, MODEL)  + count_tokens(example_code,MODEL) + count_tokens(example_translation,MODEL)),
    temperature = 1.0, #introducing a higher degree of randomness into the output, used with the n parameter to aid in getting consistently good outputs for us
    logit_bias = logit_bias,
    # request three candidate completions
    n = 3
)

In [None]:
# Ask the model to regenerate code from a summary, as a single user message.
# NOTE(review): `summary` and `example_columns` are not defined in any cell visible
# here -- confirm they are set before this cell is run.
# NOTE(review): this prompt refers to SAS, whereas the system message in the
# response_2 request refers to Qlikview -- confirm which language is intended.
response_3 = completion_with_backoff(
    engine = AZURE_MODEL_NAME,
    messages = [
            {"role": "user", "content": f"Use the following summary of SAS code to generate a script of SAS code WITHOUT COMMENTS IN IT. Return only the code. For reference, this code does the following:{(', ').join(example_columns)}. \n summary:\n```\n {summary} \n```\n"},
                ],
    # completion budget: the prompt chunk threshold plus 300 extra tokens
    max_tokens = PROMPT_LENGTH+300,
    # temperature 0 with a single completion requested
    temperature = 0,
    n = 1
)

In [None]:
# Score the regenerated code against the reference text with sentence-level BLEU.
# Both sides are lower-cased and tokenised the same way, so a difference in
# letter case alone cannot lower the score.
# sentence_bleu comes from nltk.translate.bleu_score (imported in the imports cell).
reference = nltk.wordpunct_tokenize(read_file("Candidate.txt").lower())
candidate = nltk.wordpunct_tokenize(response_3["choices"][0]["message"]["content"].lower())
score = sentence_bleu([reference], candidate)
score

In [None]:
# TODO:
# - Grab the chosen summary and write it to a file.
# - Maintain a running summary up to the current chunk (make sure the number of
#   tokens in it plus the number of tokens in the prompt stays below 16k).
# - In `messages`, pass the summaries of the most recent chunks.
# - Clean out the comments in the SAS file.
# - Reduce n in the summarization step.

# Open question: what is the relationship between the number n needed for a good
# summary and the quality of the examples used in the prompts?
#   - Thought: if we have an app that gets used by devs, we could collect ground
#     truth over time.
#   - Should n be a parameter that people can change?

In [None]:
# Look up the tiktoken encoding for MODEL and display the raw bytes behind
# token id 52340.
encoding = tiktoken.encoding_for_model(MODEL)
encoding.decode_single_token_bytes(52340)