In [1]:
import argparse
import pprint
import os
import re
from tqdm import tqdm
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from human_eval.data import write_jsonl, read_problems, stream_jsonl

In [2]:
def extract_text(prompt, remove_lines=True):
    """Extract the natural-language description from a docstring-style prompt.

    The description is the text between the first docstring opener (triple
    double quotes, or triple single quotes when no double-quoted opener
    exists) and the first doctest marker ``>>>`` that follows it.

    Fallbacks for prompts that do not have both delimiters:
      * no ``>>>`` after the opener: stop at the last matching closing
        triple quote, or at the end of the prompt if there is none;
      * no docstring opener at all: start from the beginning of the prompt.

    Args:
        prompt: Source text containing a function signature and docstring.
        remove_lines: When True, newlines are replaced by spaces before the
            whitespace normalisation. NOTE: the normalisation below collapses
            every whitespace run (newlines included) into one space, so the
            result is currently the same for True and False.

    Returns:
        The extracted text with whitespace runs collapsed to single spaces
        and leading/trailing whitespace stripped. May be an empty string.
    """
    doctest_marker = '>>>'

    # Locate the docstring opener. str.find returns -1 when the token is
    # missing, so the sentinel must be checked before doing arithmetic on it.
    start_idx = 0
    closing = None
    for quote in ('"""', "'''"):
        pos = prompt.find(quote)
        if pos != -1:
            start_idx = pos + len(quote)
            closing = quote
            break

    # Search for the end marker only after the opener.
    end_idx = prompt.find(doctest_marker, start_idx)
    if end_idx == -1 and closing is not None:
        # No doctest examples: stop at the last closing quote so that the
        # quote characters themselves do not leak into the description.
        end_idx = prompt.rfind(closing, start_idx)
    if end_idx == -1:
        end_idx = len(prompt)

    output = prompt[start_idx:end_idx]
    if remove_lines:
        output = output.replace('\n', ' ')
    output = re.sub(r"\s+", " ", output).strip()

    return output


# Instruction-style prompt template. The single `{}` placeholder is meant to be
# filled via str.format with the problem description (presumably the output of
# extract_text -- it is not referenced by any other cell visible here).
INSTRUCTION = """Below is an instruction that describes a task. Write a response that appropriately completes the request.


### Instruction:
Create a Python script for this problem:
{}

### Response:"""

In [4]:
# --- Run configuration ------------------------------------------------------
# Options are declared argparse-style so the same cell can be driven from a
# command line; inside the notebook only the defaults are used.
parser = argparse.ArgumentParser()

parser.add_argument('--model', type=str, default='Salesforce/codet5p-2b', help="")
parser.add_argument('--output_path', type=str, help="")
parser.add_argument('--start_index', type=int, default=0, help="")
parser.add_argument('--end_index', type=int, default=164, help="")
parser.add_argument('--temperature', type=float, default=0.2, help="")
parser.add_argument('--N', type=int, default=200, help="")
parser.add_argument('--max_len', type=int, default=800, help="")
parser.add_argument('--decoding_style', type=str, default='sampling', help="")
parser.add_argument('--num_seqs_per_iter', type=int, default=4, help='')
parser.add_argument('--overwrite', action='store_true', help='')

# parse_known_args ignores unrecognised argv entries instead of exiting
# (presumably needed because the notebook kernel adds its own arguments).
args, _ = parser.parse_known_args()

argsdict = vars(args)
print(pprint.pformat(argsdict))

# Strings that mark the start of a new top-level statement in generated code
# (not used by the cells visible in this part of the notebook).
STOP_SEQS = ['\nclass', '\ndef', '\n#', '\nif', '\nprint']

# --- Device selection -------------------------------------------------------
# The notebook was written against GPU index 2. Keep using it when it exists,
# but fall back to GPU 0 on hosts with fewer GPUs rather than failing with an
# invalid device ordinal, and to the CPU when CUDA is unavailable.
if not torch.cuda.is_available():
    device = torch.device("cpu")
elif torch.cuda.device_count() > 2:
    device = torch.device("cuda:2")
else:
    device = torch.device("cuda:0")

# --- Benchmark problems -----------------------------------------------------
problems = read_problems()

# Sort the task ids so that start_index/end_index select a stable slice.
task_ids = sorted(problems.keys())[args.start_index: args.end_index]
prompts = [problems[task_id]['prompt'] for task_id in task_ids]
num_samples = len(prompts)
print("Number of samples: {}".format(num_samples))

# --- Tokenizer and model ----------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(args.model)

model = AutoModelForSeq2SeqLM.from_pretrained(
    args.model,
    trust_remote_code=True,  # False for 220m and 770m models
    # Half precision is only used on CUDA; on the CPU fallback the weights are
    # loaded in float32 because half-precision CPU kernels are incomplete.
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
    low_cpu_mem_usage=True,
)
model.eval()
model.to(device)

# for larger LLMs such as 2B, 6B, and 16B, we need to pass the text prompt to the decoder
prompt_to_decoder = any(size in args.model for size in ['2b', '6b', '16b'])

{'N': 200,
 'decoding_style': 'sampling',
 'end_index': 164,
 'max_len': 800,
 'model': 'Salesforce/codet5p-2b',
 'num_seqs_per_iter': 4,
 'output_path': None,
 'overwrite': False,
 'start_index': 0,
 'temperature': 0.2}
Number of samples: 164


In [1]:
# Hand-written test prompt: a function signature followed by a docstring that
# holds only doctest examples (no natural-language description).
prompt = (
    'def fizz_buzz(n: int):\n'
    '    """>>> fizz_buzz(50)\n'
    '    0\n'
    '    >>> fizz_buzz(78)\n'
    '    2\n'
    '    >>> fizz_buzz(79)\n'
    '    3\n'
    '    """'
)

In [2]:
# Build the encoder-side and decoder-side batches. Both contain the same raw
# prompt, so the two tokenizations below produce identical token ids.
prompt_batch = [prompt]
prompt_batch_decoder = [prompt]

# Encoder input, truncated to args.max_len tokens and moved to `device`.
encoding = tokenizer(
    prompt_batch,
    return_tensors="pt",
    truncation=True,
    max_length=args.max_len,
).to(device)

# Decoder input, tokenized with the same settings.
encoding_decoder = tokenizer(
    prompt_batch_decoder,
    return_tensors="pt",
    truncation=True,
    max_length=args.max_len,
).to(device)

NameError: name 'tokenizer' is not defined

In [18]:
# Sample two completions for the prompt. The tokenized prompt is also passed
# as decoder_input_ids, so the decoder continues from the prompt text.
gen_tokens = model.generate(
    **encoding,
    decoder_input_ids=encoding_decoder['input_ids'],
    do_sample=True,
    # Use the parsed --temperature option (default 0.2) instead of repeating
    # the literal, so the setting from the configuration cell takes effect.
    temperature=args.temperature,
    top_p=0.95,
    max_length=args.max_len,
    num_return_sequences=2,
    decoder_start_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    # Without an explicit pad token, generate() logs a notice and falls back
    # to the EOS id; passing that same id here keeps the behaviour.
    pad_token_id=tokenizer.eos_token_id,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [19]:
# Turn each generated id sequence back into text, dropping special tokens.
gen_seqs = [
    tokenizer.decode(token_ids, skip_special_tokens=True)
    for token_ids in gen_tokens
]

In [20]:
# Print every decoded completion. Iterating over the list itself avoids
# repeating the sample count here and cannot index past the end of gen_seqs.
for gen_seq in gen_seqs:
    print(gen_seq)

def fizz_buzz(n: int):
    """return the number of times the number 7 appears in integers less than n which are divisible by 11 or 13
    >>> fizz_buzz(50)
    0
    >>> fizz_buzz(78)
    2
    >>> fizz_buzz(79)
    3
    """
    count = 0
    for i in range(n):
        if i % 11 == 0 or i % 13 == 0:
            count += 1
    return count


if __name__ == "__main__":
    import doctest

    doctest.testmod()
    print(fizz_buzz(50))
    print(fizz_buzz(78))
    print(fizz_buzz(79))
    print(fizz_buzz(100))
    print(fizz_buzz(101))
    print(fizz_buzz(200))
    print(fizz_buzz(201))
    print(fizz_buzz(300))
    print(fizz_buzz(301))
    print(fizz_buzz(400))
    print(fizz_buzz(401))
    print(fizz_buzz(500))
    print(fizz_buzz(501))
    print(fizz_buzz(600))
    print(fizz_buzz(601))
    print(fizz_buzz(700))
    print(fizz_buzz(701))
    print(fizz_buzz(800))
    print(fizz_buzz(801))
    print(fizz_buzz(900))
    print(fizz_buzz(901))
    print(fizz_buzz(1000))
    print(fizz_bu