In [1]:
import vertexai # to interact with googles code chatbot ai
from vertexai.preview.language_models import CodeChatModel, ChatModel, InputOutputTextPair, ChatMessage,CodeGenerationModel, TextGenerationModel
import google
import openai
import openai.error


import syfco # to convert between different form
import bosy
from bosy import BosyError
import strix

import benchmark
from benchmark import Benchmark, build_prompt

import verify # used to verify verilog solutions
import json
import os

# both for handling async stuff
import asyncio

from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env.

import csv # to write the benchmark results to a csv file
from utils import *
from prompting import *
import time

In [2]:
def generate_module_signature(bm: Benchmark, params=None, module_name="fsm"):
    """Render the Verilog module header for a benchmark's specification.

    The port names are obtained from syfco for the specification file
    referenced by ``bm.specification``. Every input becomes an ``input``
    port and every output an ``output reg`` port.

    Args:
        bm: Benchmark whose specification file is read.
        params: Parameter overrides handed to syfco; when ``None`` the
            benchmark's own ``generate_params`` are used.
        module_name: Name placed after the ``module`` keyword.

    Returns:
        A string of the form ``module <name> (input a, ..., output reg x, ...)``.
    """
    effective_params = bm.generate_params if params is None else params
    tlsf = read_file(bm.specification)
    input_names = syfco.inputs(tlsf, overwrite_params=effective_params)
    output_names = syfco.outputs(tlsf, overwrite_params=effective_params)

    ports = []
    for name in input_names:
        ports.append("input " + name)
    for name in output_names:
        ports.append("output reg " + name)

    port_list = ", ".join(ports)
    return f"module {module_name} ({port_list})"


Used packages:

- `google-cloud-aiplatform` (provides the `vertexai` module)
- `docker`

In [3]:
# Google Vertex AI (PaLM) models used by the chat-based runners below.
vertexai.init(location="us-central1", project="rg-finkbeiner-30141001")
chat_model = ChatModel.from_pretrained("chat-bison")
code_model = CodeChatModel.from_pretrained("codechat-bison")

# Sampling settings passed along with every `send_message` call.
parameters = dict(temperature=0.5, max_output_tokens=1024)

# The OpenAI key comes from the environment (load_dotenv() above reads .env).
openai.api_key = os.getenv("OPENAI_KEY")
def openai_complete(messages, model = "gpt-3.5-turbo-16k", retry_n = 10, retry_delay = 60, **kwargs):
    """Call the OpenAI chat completion endpoint, retrying on transient errors.

    Args:
        messages: Chat messages passed to ``openai.ChatCompletion.create``.
        model: Name of the OpenAI chat model.
        retry_n: Maximum number of attempts. With ``retry_n <= 0`` no request
            is made and the function raises immediately.
        retry_delay: Seconds to wait between two attempts.
        **kwargs: Forwarded unchanged to ``openai.ChatCompletion.create``
            (e.g. ``n``, ``temperature``, ``stop``).

    Returns:
        The completion object returned by the OpenAI client.

    Raises:
        openai.error.RateLimitError: When every attempt failed with one of the
            retryable errors. The last underlying error is attached as
            ``__cause__``.
    """
    last_error = None
    for attempt in range(retry_n):
        try:
            return openai.ChatCompletion.create(
                model=model,
                messages=messages,
                **kwargs
            )
        except (openai.error.RateLimitError,
                openai.error.APIError,
                openai.error.APIConnectionError,
                openai.error.ServiceUnavailableError,
                openai.error.Timeout,
                openai.error.TryAgain) as err: # OpenAI API seems to be quite unstable
            last_error = err
            # Only wait when another attempt follows; sleeping after the final
            # failure would just delay the error by `retry_delay` seconds.
            if attempt + 1 < retry_n:
                time.sleep(retry_delay)
    raise openai.error.RateLimitError(
        f"OpenAI request still failing after {retry_n} attempt(s)"
    ) from last_error

In [4]:
# Benchmark descriptions are stored as JSON lists; each entry is wrapped in a
# Benchmark object. The second constructor argument differs per set
# ("../../verilog/" for the first set, empty for the strix/bosy sets).
benchmarks = [
    Benchmark(entry, "../../verilog/")
    for entry in json.loads(read_file("benchmarks.json"))
]
benchmarks_strix = [
    Benchmark(entry, "")
    for entry in json.loads(read_file("benchmarks_strix.json"))
]
benchmarks_bosy = [
    Benchmark(entry, "")
    for entry in json.loads(read_file("benchmarks_bosy.json"))
]

In [7]:
def find_best_openai_old(bm : Benchmark, mode = "self", template=PromptOpenAI):
    """Find the largest parameter value for which the model yields verified code.

    Starting from ``bm.generate_params`` the single benchmark parameter is
    doubled after every verified solution, and each verified solution is added
    to the prompt as a further example. The search stops at the first
    parameter value for which none of the sampled completions verifies.

    Args:
        bm: Benchmark to solve; only a single parameter is supported.
        mode: Example mode forwarded to ``build_prompt``.
        template: Prompt template forwarded to ``build_prompt``.

    Returns:
        The last parameter value that was solved (``0`` if none was), or the
        string ``"AI_RATELIMIT"`` when the OpenAI API kept failing.
    """
    class _RateLimited(Exception):
        """Signals that the OpenAI retries are exhausted."""

    prompt_builder = build_prompt(bm=bm, mode=mode, template=template, return_raw=True)
    spec = read_file(bm.specification)

    maxval = 0
    def complete(messages, current_params):
      """Return the first sampled completion that verifies for `current_params`."""
      count = 0
      while True:
        try:
          completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-16k",
            messages=messages,
            temperature=0.5,
            stop = "endmodule",
            presence_penalty = -0.5,
            n = 3 # best of 3
          )
          break
        except (openai.error.RateLimitError,
                openai.error.APIError,
                openai.error.APIConnectionError,
                openai.error.ServiceUnavailableError,
                openai.error.Timeout,
                openai.error.TryAgain): # OpenAI API seems to be quite unstable
          print("trying again " + str(count))
          count = count + 1
          if count > 5:
            print(bm)
            print("wtf")
            # Previously the string "AI_RATELIMIT" was returned here and then
            # treated by the caller as verified Verilog code.
            raise _RateLimited()
          time.sleep(60)

      for choice in completion.choices:
        response = choice.message.content
        code = extract_normalized_verilog_code(response, bm.name)
        if code is None:
          continue
        # Verify against the parameters the prompt was built for, not against
        # the benchmark's base parameters.
        res = verify.verify_code(bm.specification, code, overwrite_params=current_params)
        if res ==  verify.ReturnCode.SUCCESS:
          return code
      raise Exception("no sampled completion verified")

    params = bm.generate_params
    while True:
      try:
        PARAMS = join_params(params)
        SPEC = syfco.convert(spec, "ltl", overwrite_params=params)
        code = complete(prompt_builder.build_prompt({
          "PARAMS": PARAMS,
          "SPEC": SPEC
        }), params)
        maxval = next(iter(params.values())) # we only ever use a single parameter, this extracts the current value
        key = next(iter(params.keys()))
        params = {key: maxval * 2}
        prompt_builder.add_example(replacements = {
          "IMPL": code,
          "PARAMS": PARAMS,
          "SPEC": SPEC
        })
      except _RateLimited:
        return "AI_RATELIMIT"
      except Exception:
        # Any failure (no verified completion, tool error, ...) ends the
        # search. KeyboardInterrupt/SystemExit are deliberately not caught.
        return maxval

def improve_iteratively(response, bm):
  """Evaluate a model response and pick the follow-up prompt for the next round.

  Args:
    response: Raw text returned by the model.
    bm: Benchmark used to extract and verify the code.

  Returns:
    A tuple ``(payload, result_name)``. ``result_name`` is ``"NO_CODE"`` when
    no code could be extracted, otherwise the name of the verification result.
    ``payload`` is the extracted code on ``"SUCCESS"``, a follow-up prompt for
    the results that have one, and ``None`` for every other result.
  """
  code = extract_normalized_verilog_code(response, bm.name)
  if code is None:
    res = "NO_CODE"
  else:
    res = bm.verify(impl=code).name

  if res == "SUCCESS":
    return (code, res)
  if res == "NO_CODE":
    return ("This doesn't seem like a complete verilog module. Could you please give me the entire module?", res)
  if res == "ERROR_CONVERT_TO_VERILOG":
    return ("This code isn't syntactically valid verilog code. Could you please fix the syntax errors?", res)
  if res == "ERROR_COMBINE_AIGER":
    # The signature is only generated when it is actually needed.
    signature = generate_module_signature(bm)
    return (f"Could you please make sure to use the module signature `{signature}` in your code. Please also try to think about your code again using the new signature. Make sure it matches the specification!", res)
  if res == "FALSE_RESULT":
    return ("The code doesn't satisfy the specification. Please think extensively about how you need to change your code to satisfy the specification. If necessary, rewrite the module from ground up", res)
  return (None, res)


def run_single_iterative(bm, type, max_rounds=3):
  """Solve a benchmark with the PaLM chat model, letting it repair its answer.

  The model's answer is evaluated with ``improve_iteratively``. As long as
  the result has a follow-up prompt, that prompt is sent to the same chat and
  the new answer is evaluated again, for at most ``max_rounds`` evaluations.

  Args:
    bm: Benchmark to solve.
    type: Example mode forwarded to ``build_prompt``.
    max_rounds: Maximum number of answers that are evaluated.

  Returns:
    The result name of the last evaluated answer (``"SUCCESS"`` as soon as
    one answer verifies), or ``None`` if ``max_rounds`` is not positive.
  """
  # bm.build_prompt might do some heavy work like synthesizing examples with bosy
  prompt, examples = build_prompt(bm, params=bm.generate_params, template=PromptPALM, mode=type)
  chat = chat_model.start_chat(examples=examples)
  response = chat.send_message(prompt, **parameters)
  res = None
  for round_no in range(max_rounds):
    (follow_up, res) = improve_iteratively(response.text, bm)
    # improve_iteratively reports "nothing more to ask" through a None
    # prompt; the result name itself is never None.
    if res == "SUCCESS" or follow_up is None:
      return res
    # Do not request another answer after the final evaluation: it would
    # never be checked.
    if round_no + 1 < max_rounds:
      response = chat.send_message(follow_up, **parameters)
  return res

def run_single_explicit_examples(bm, type):
    """Run one benchmark on the PaLM chat model with the no-specification prompt.

    Args:
        bm: Benchmark to solve.
        type: Example mode forwarded to ``build_prompt``.

    Returns:
        ``"NO_CODE"`` if no code could be extracted from the answer,
        otherwise the name of the verification result.
    """
    label = bm.name + "/" + type
    print("Starting benchmark " + label)
    # bm.build_prompt might do some heavy work like synthesizing examples with bosy
    prompt, examples = build_prompt(
        bm,
        params=bm.generate_params,
        template=NoSpecificationPromptPALM,
        mode=type,
    )
    print("Finished generating prompt for " + label)

    session = chat_model.start_chat(examples=examples)
    answer = session.send_message(prompt, **parameters)
    code = extract_normalized_verilog_code(answer.text, bm.name)
    if code is not None:
        return bm.verify(impl=code).name
    return "NO_CODE"

def run_single_best_k(k):
  """Create a PaLM chat runner that samples up to `k` independent answers.

  The returned function starts a fresh chat per attempt and stops at the first
  answer that verifies. Otherwise it reports ``"FALSE_RESULT"`` if any attempt
  produced it, else the result name of the last attempt that contained code,
  else ``"NO_CODE"``.
  """
  def run_single (bm, type):
    # bm.build_prompt might do some heavy work like synthesizing examples with bosy
    prompt, examples = build_prompt(bm, params=bm.generate_params, template=PromptPALM, mode=type)
    best = "NO_CODE"
    attempts_left = k
    while attempts_left > 0:
      attempts_left -= 1
      answer = chat_model.start_chat(examples=examples).send_message(prompt, **parameters)
      code = extract_normalized_verilog_code(answer.text, bm.name)
      if code is None:
        continue

      outcome = bm.verify(impl=code)
      if outcome == verify.ReturnCode.SUCCESS:
        return outcome.name
      # A FALSE_RESULT always wins; other results only replace non-FALSE_RESULT ones.
      if outcome == verify.ReturnCode.FALSE_RESULT or best != "FALSE_RESULT":
        best = outcome.name
    return best
  return run_single

def run_single_template(template=DefaultPromptTemplate):
  """Create a PaLM chat runner that builds its prompt from `template`.

  The returned function sends a single prompt (no chat examples) and returns
  ``"NO_CODE"`` (after printing the raw answer) when no code can be extracted,
  otherwise the name of the verification result.
  """
  def run_single(bm, type):
    prompt = build_prompt(bm, params=bm.generate_params, template=template, mode=type)
    answer = chat_model.start_chat().send_message(prompt, **parameters)
    code = extract_normalized_verilog_code(answer.text, bm.name)
    if code is not None:
      return bm.verify(impl=code).name
    # Show the raw answer so a failed extraction can be inspected.
    print(answer.text)
    return "NO_CODE"
  return run_single

def run_single_codechat(bm, type, template=DefaultPromptTemplate):
  """Run one benchmark on the PaLM code-chat model.

  Args:
    bm: Benchmark to solve.
    type: Example mode forwarded to ``build_prompt``.
    template: Prompt template forwarded to ``build_prompt``.

  Returns:
    ``"NO_CODE"`` if no code could be extracted from the answer, otherwise
    the name of the verification result.
  """
  print("Starting benchmark " + bm.name + "/" + type)
  # bm.build_prompt might do some heavy work like synthesizing examples with bosy
  prompt = build_prompt(bm, params=bm.generate_params, template=template, mode=type)
  answer = code_model.start_chat().send_message(prompt, **parameters)
  code = extract_normalized_verilog_code(answer.text, bm.name)
  if code is not None:
    return bm.verify(impl=code).name
  return "NO_CODE"
    
def run_single_openai(bm, type):
  """Run one benchmark on the OpenAI chat model with the no-specification prompt.

  Args:
    bm: Benchmark to solve.
    type: Example mode forwarded to ``build_prompt``.

  Returns:
    ``"AI_RATELIMIT"`` if ``openai_complete`` gave up, ``"NO_CODE"`` if no
    code could be extracted from the first choice, otherwise the name of the
    verification result.
  """
  print("Starting benchmark " + bm.name + "/" + type)
  messages = build_prompt(
    bm,
    params=bm.generate_params,
    template=NoSpecificationPromptOpenAI,
    mode=type,
  )
  print(messages)

  try:
    completion = openai_complete(messages=messages)
  except openai.error.RateLimitError:
    return "AI_RATELIMIT"

  # Only the first choice is considered here.
  answer = completion.choices[0].message.content
  print(answer)
  code = extract_normalized_verilog_code(answer, bm.name)
  if code is not None:
    return bm.verify(impl=code, overwrite_params=bm.generate_params).name
  return "NO_CODE"

def run_single_openai_best_k(k):
  """Create an OpenAI runner that requests `k` choices in a single completion.

  The returned function stops at the first choice that verifies. Otherwise it
  reports ``"FALSE_RESULT"`` if any choice produced it, else the result name
  of the last choice that contained code, else ``"NO_CODE"``. It returns
  ``"AI_RATELIMIT"`` when ``openai_complete`` gave up.
  """
  def run_single (bm, type):
    print("Starting benchmark " + bm.name + "/" + type)
    messages = build_prompt(bm, params=bm.generate_params, template=PromptOpenAI, mode=type)

    try:
      completion = openai_complete(messages=messages, n = k)
    except openai.error.RateLimitError:
      return "AI_RATELIMIT"

    best = "NO_CODE"
    print(completion)
    for candidate in completion.choices:
      code = extract_normalized_verilog_code(candidate.message.content, bm.name)
      if code is None:
        continue
      outcome = bm.verify(impl=code)
      if outcome == verify.ReturnCode.SUCCESS:
        return outcome.name
      # A FALSE_RESULT always wins; other results only replace non-FALSE_RESULT ones.
      if outcome == verify.ReturnCode.FALSE_RESULT or best != "FALSE_RESULT":
        best = outcome.name
    return best
  return run_single

def find_best_openai(bm : Benchmark, type : str):
    """Find the largest parameter value for which the OpenAI model yields verified code.

    The conversation starts with the prompt built for the benchmark. After
    every verified answer the parameter is doubled, the accepted answer and
    the prompt for the next parameter value are appended to the conversation,
    and the model is asked again (five choices per request).

    Args:
        bm: Benchmark to solve; the first entry of ``bm.generate_params`` is
            the parameter that gets scaled.
        type: Example mode forwarded to ``build_prompt``.

    Returns:
        The last verified parameter value as a string (``"0"`` if none was
        verified), or ``"AI_RATELIMIT"`` when ``openai_complete`` gave up.
    """
    # NOTE(review): the alternative template name used when this is False
    # (`PromptOpenAINoSpecification`) does not appear elsewhere in this file;
    # confirm it exists in `prompting` before flipping the switch.
    INCLUDE_SPECS = True
    # prompt_builder = build_prompt(bm, mode=type,template=template,return_raw=True)

    parname, curval = list(bm.generate_params.items())[0]

    messages = build_prompt(bm, mode=type,template=PromptOpenAI)
    print("Starting benchmark: " + bm.name)

    bestval = 0
    code = None
    params = {parname : curval}
    while bestval < 1000:
      try:
        print("Current val = " + str(curval))
        completion = openai_complete(messages=messages, n=5)

        accepted = None
        for choice in completion.choices:
          candidate = extract_normalized_verilog_code(choice.message.content, bm.name)
          #print(candidate)
          if candidate is None:
            continue
          code = candidate
          if bm.verify(impl=candidate, overwrite_params=params) == verify.ReturnCode.SUCCESS:
            accepted = choice
            break

        if accepted is None: # we got wrong code :(
          return str(bestval)
        bestval = curval
        curval = curval * 2

        params = {parname : curval}
        messages.append(accepted.message.to_dict())
        messages.append(
          build_prompt(bm, mode=type,template=PromptOpenAI if INCLUDE_SPECS else PromptOpenAINoSpecification, params=params)[-1]
        )


      except openai.error.RateLimitError: # OpenAI API seems to be quite unstable
        return "AI_RATELIMIT"
      except (openai.InvalidRequestError):
        if len(messages) <= 2:
          # Nothing but the system instruction and one message is left, so
          # trimming further cannot help; stop instead of retrying forever.
          return str(bestval)
        # Remove the oldest message while keeping the system instruction.
        # (The previous slice `messages[2:1]` was always empty and dropped
        # the whole conversation.)
        messages = messages[0:1] + messages[2:]
    if code:
      print(code)
    return str(bestval)


In [9]:
async def run_benchmarks(benchmarks, file, example_types = ["self", "bosy", "strix", "none"], run_single=run_single_openai, delay_per_benchmark=300):
  """Run every benchmark with every example type and write the results to a CSV file.

  The blocking ``run_single`` calls are executed in the event loop's default
  executor so that all example types of a benchmark run in parallel. The
  i-th benchmark is started ``delay_per_benchmark * i`` seconds after the
  call to stay within the API quota.

  Args:
    benchmarks: Benchmarks to run; each needs a ``name`` attribute.
    file: Path of the CSV file to (over)write.
    example_types: Example modes; one CSV column per entry.
    run_single: Callable ``(bm, type) -> result`` executed per cell.
    delay_per_benchmark: Seconds by which consecutive benchmarks are staggered.

  The CSV has a ``benchmark`` column followed by one column per example
  type. Timeouts, BoSy errors and AI service errors are recorded as
  ``"TIMEOUT"``, ``"BOSY_ERROR"`` and ``"AI_ERROR"``; other exceptions
  propagate.
  """
  columns = list(example_types)

  # get the running loop to be able to run the requests in parallel
  loop = asyncio.get_running_loop()

  async def _run_single(bm, type):
    try:
      return await loop.run_in_executor(None, run_single, bm, type)
    except TimeoutError:
      return "TIMEOUT"
    except BosyError:
      return "BOSY_ERROR"
    except (google.api_core.exceptions.InternalServerError,
          openai.InvalidRequestError):
      return "AI_ERROR"

  # The context manager closes the file even if a benchmark raises.
  with open(file, 'w', newline='') as f:
    # setting up csv writing
    csvwriter = csv.DictWriter(f, fieldnames=["benchmark"] + columns, dialect='unix', quoting=csv.QUOTE_NONE)
    csvwriter.writeheader()

    async def run_single_benchmark(bm, i = 0):
      print(i)
      await asyncio.sleep(delay_per_benchmark * i)  # needed because of quota
      async_res = await asyncio.gather(
        *[_run_single(bm, type) for type in columns]
      )
      result = {
        "benchmark": bm.name
      }
      result.update(zip(columns, async_res))
      csvwriter.writerow(result)
      # Flush per row so finished results survive a crash of a later benchmark.
      f.flush()

    await asyncio.gather(
      *[run_single_benchmark(bm, i) for i, bm in enumerate(benchmarks)]
    )

await run_benchmarks(benchmarks=benchmarks, run_single=find_best_openai, example_types=["self", "bosy", "strix", "none"],file = "results/human_highest_3.csv")

0
1
2
3
4
Starting benchmark: detector
Current val = 8
Starting benchmark: detector
Current val = 8
Starting benchmark: detector
Current val = 8
Starting benchmark: detector
Current val = 8
Current val = 16
Starting benchmark: mux
Current val = 16
Starting benchmark: mux
Current val = 16
Starting benchmark: mux
Current val = 16
Starting benchmark: mux
Current val = 16
Current val = 32
Current val = 64
Current val = 128
Current val = 256
Current val = 256
Current val = 32
Current val = 64
Current val = 128
Current val = 256
Current val = 256
Starting benchmark: full_arbiter
Current val = 4
Starting benchmark: full_arbiter
Current val = 4
Starting benchmark: full_arbiter
Current val = 4
Starting benchmark: full_arbiter
Current val = 4
Current val = 8
Starting benchmark: simple_arbiter
Current val = 8
Starting benchmark: simple_arbiter
Current val = 8
Starting benchmark: simple_arbiter
Current val = 8
Starting benchmark: simple_arbiter
Current val = 8
Current val = 16
Starting benchmark: 

In [9]:

# Scratch check of the tool wrappers on the detector specification:
# a BoSy solution rendered as dot for n=2 ...
dot = bosy.synthesize(
    read_file("detector.tlsf"),
    target="dot",
    overwrite_params={"n": 2},
)
print(dot)
# ... and the LTL formula produced by syfco for n=4.
ltl = syfco.convert(
    read_file("detector.tlsf"),
    format="ltl",
    overwrite_params={"n": 4},
)
print(ltl)

digraph graphname {
	_init [style="invis"];
	_init -> s0[label=""];
	s0[shape=rectangle,label="s0"];
	s1[shape=rectangle,label="s1"];
	s0 -> s0 [label="(¬(r_1 ∧ ¬r_0) ∧ r_1 ∧ r_0) / g"];
	s0 -> s0 [label="(¬(r_1 ∧ ¬r_0) ∧ ¬(r_1 ∧ r_0)) / "];
	s0 -> s1 [label="(r_1 ∧ ¬r_0 ∧ ¬(r_1 ∧ r_0)) / "];
	s1 -> s0 [label="(¬r_1 ∧ r_0) / g"];
	s1 -> s1 [label="(¬(¬r_1 ∧ r_0) ∧ r_0) / g"];
	s1 -> s1 [label="(¬(¬r_1 ∧ r_0) ∧ ¬r_0) / "];
}

G (F r_0) && G (F r_1) && G (F r_2) && G (F r_3) <-> G (F g)


In [None]:

# Scratch demo: build a prompt for the detector with n=4, using the
# hand-written n=2 implementation as the single example.
contents = read_file("detector.tlsf")
ltl = syfco.convert(contents, "ltl", overwrite_params={"n": 4})
bosy_f = syfco.convert(contents, "bosy", overwrite_params={"n": 4})
bosy_impl = bosy.synthesize(bosy_f)

template = PromptTemplate()
template.add_example(
    {
        "SPEC": syfco.convert(contents, "ltl", overwrite_params={"n": 2}),
        "IMPL": read_file("../../verilog/detector/detector_2.vl"),
        "PARAMS": "n=2",
    }
)
prompt = template.build_prompt(
    {"SPEC": syfco.convert(contents, "ltl", overwrite_params={"n": 4})}
)
print(prompt)

You are an expert in writing correct verilog code, that fulfill certain formal properties specified in LTL.
Here is an example for n=2. It satisfies the LTL specification G (F r_0) && G (F r_1) <-> G (F g):
module detector(
  input [1:0] r,
  input clk,
  output reg g
);
  reg [1:0] state;
  initial state = '0;
  always @(posedge clk) begin
    state = state | r;
    g = 0;
    if(state == '1) begin
      g = 1;
      state = '0;
    end
  end
endmodule

Please write a Verilog module fulfilling the following expectations. Make sure the code is fully synthesizable.:
G (F r_0) && G (F r_1) && G (F r_2) && G (F r_3) <-> G (F g)
