# Упражнение

Провести эксперимент с генерацией кода с помощью модели InCoder. Исследовать влияние описания задачи, наличия юнит-тестов и т.д. на качество сгенерированного кода.

In [None]:
# код доработан из этих исходников: https://github.com/dpfried/incoder/blob/main/example_usage.py

from typing import List

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Special tokens used by the helpers below.
# BOS is stripped from the start of decoded model output in `_generate`.
BOS = "<|endoftext|>"
# EOM marks the end of a single generated infill; `infill` searches for it.
EOM = "<|endofmask|>"

# Run configuration.
# BIG_MODEL: True selects "facebook/incoder-6B", False selects "facebook/incoder-1B".
BIG_MODEL = False
# CUDA: when True, the model and the tokenized inputs are moved to the GPU.
CUDA = True
# VERBOSE: when True, `infill` prints retry and debug information.
VERBOSE = False

if BIG_MODEL:
    model_name = "facebook/incoder-6B"

    if CUDA:
        # Request the "float16" revision and load it as torch.float16.
        model_args = dict(
            revision="float16",
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
        )
    else:
        model_args = dict(
            low_cpu_mem_usage=True,
        )
else:
    # The 1B checkpoint is loaded with the library defaults.
    model_name = "facebook/incoder-1B"
    model_args = {}

# Load the model and tokenizer by name; the prints only report progress.
print("loading model")
model = AutoModelForCausalLM.from_pretrained(model_name, **model_args)
print("loading tokenizer")
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("loading complete")

if CUDA:
    # Convert the weights to half precision and move the model to the GPU.
    model = model.half().cuda()

def _make_sentinel(i):
    """Return the mask sentinel token for infill slot `i`.

    The sentinel marks (1) the location in the document where an infill is
    to be inserted and (2) the point at which generation of that infill starts.
    """
    return "<|mask:{}|>".format(i)

def _build_prompt(parts: List[str], extra_sentinel: bool):
    """Join the document parts into one prompt, separated by mask sentinels.

    parts: the pieces of the document surrounding the locations to infill.
    extra_sentinel: when true, a sentinel is also appended after the last part.

    A single-element list is returned unchanged (no sentinels are added).
    """
    if len(parts) == 1:
        return parts[0]

    last_ix = len(parts) - 1
    pieces = []
    for ix, segment in enumerate(parts):
        pieces.append(segment)
        # Every part except the last is followed by its sentinel; the last
        # part gets one only when an extra sentinel was requested.
        if extra_sentinel or ix != last_ix:
            pieces.append(_make_sentinel(ix))

    return "".join(pieces)

def _verbose_output(prompt=None, parts=None, infills=None, text=None):
    """Print a debug dump of one infilling run.

    The values must be passed in explicitly: they are local to the caller and
    cannot be resolved as free variables from inside this function.

    prompt: the prompt string to show under "generated text:".
    parts: the document parts to show under "parts:".
    infills: the generated infills to show under "infills:".
    text: the completed document to show under "restitched text:".

    Every argument defaults to None so that a zero-argument call still runs
    (it then prints None for each section).
    """
    sections = (
        ("generated text:", prompt),
        ("parts:", parts),
        ("infills:", infills),
        ("restitched text:", text),
    )
    # Each section is a title line, the value, and a blank separator line.
    for title, value in sections:
        print(title)
        print(value)
        print()

def _generate(input: str, max_to_generate: int=128, temperature: float=0.2):
    """
    Complete the prefix `input` left to right by sampling from the model.

    input: the prompt text to continue.
    max_to_generate: number of new tokens allowed on top of the prompt length.
    temperature: sampling temperature passed to `model.generate`.

    Returns the decoded sequence produced by the model, with a leading BOS
    token removed when present.
    """
    encoded = tokenizer(input, return_tensors="pt")
    token_ids = encoded.input_ids.cuda() if CUDA else encoded.input_ids

    # The generation limit counts the prompt tokens plus the requested new ones.
    prompt_length = token_ids.flatten().size(0)
    max_length = max_to_generate + prompt_length
    context_window_size = 2048

    if max_length > context_window_size:
        print(f"warning: max_length {max_length} is greater than the context window {context_window_size}")

    sampling_args = dict(
        do_sample=True,
        top_p=0.95,
        temperature=temperature,
        max_length=max_length,
    )
    with torch.no_grad():
        generated = model.generate(input_ids=token_ids, **sampling_args)

    # clean_up_tokenization_spaces=False keeps spaces before punctuation,
    # e.g. "from ." must not become "from."
    decoded = tokenizer.decode(generated.flatten(), clean_up_tokenization_spaces=False)

    return decoded[len(BOS):] if decoded.startswith(BOS) else decoded

def infill(parts: List[str], max_to_generate: int=128, temperature: float=0.2, extra_sentinel: bool=True,
           max_retries: int=1):
    """
    Generate infills to complete a partial document, e.g.
    [A C E] -> [A B C D E], where B and D are infills that have been generated.

    parts: List[str]. list of parts of the document. One string will be
            inserted in between each element, i.e. infilling N-1 locations for a list
            of length N.
    max_to_generate: int. maximum number of tokens to generate. Keep in mind
            that the model context size is 2048.
    temperature: float. temperature parameter for sampling.
    extra_sentinel: bool. we recommend setting this to True, as it makes it
            easier for the model to end generated infills. See the footnote in
            section 2.2 of the InCoder paper for details.
    max_retries: int. if > 1, use rejection sampling to keep sampling infills until
            all infills sample a completion token. Values below 1 are treated
            as 1, so at least one attempt is always made.

    returns a dictionary containing the following:
        text:  str, the completed document (with infills inserted)
        parts:  List[str], length N. Same as passed to the method
        infills:  List[str], length N-1. The list of infills generated
        retries_attempted:  number of retries used (if max_retries > 1)
    """
    assert isinstance(parts, list)

    # Without at least one pass through the loop nothing is generated and an
    # empty document would be returned.
    max_retries = max(1, max_retries)

    retries_attempted = 0
    done = False

    while (not done) and (retries_attempted < max_retries):
        retries_attempted += 1

        if VERBOSE:
            print(f"retry {retries_attempted}")

        # Every attempt starts again from the bare document.
        prompt = _build_prompt(parts, extra_sentinel)
        infills = []
        complete = []
        done = True

        for sentinel_ix, part in enumerate(parts[:-1]):
            complete.append(part)
            # Repeating the sentinel after the document asks the model to
            # generate the content for that slot.
            prompt += _make_sentinel(sentinel_ix)

            # `_generate` returns prompt + continuation; keep the continuation only.
            completion = _generate(prompt, max_to_generate, temperature)[len(prompt):]

            # Keep what precedes the first EOM; without an EOM the whole
            # completion is used and the attempt is marked as incomplete.
            infilled, eom_found, _ = completion.partition(EOM)
            if not eom_found:
                if VERBOSE:
                    print(f"warning: {EOM} not found")

                done = False

            infills.append(infilled)
            complete.append(infilled)
            # The prompt for the next slot contains this infill closed by EOM.
            prompt += infilled + EOM

        complete.append(parts[-1])

    text = ''.join(complete)

    if VERBOSE:
        # Printed here, after `text` exists, because these values are local to this function.
        print("generated text:")
        print(prompt)
        print()
        print("parts:")
        print(parts)
        print()
        print("infills:")
        print(infills)
        print()
        print("restitched text:")
        print(text)
        print()

    return {
        'text': text, # str, the completed document (with infills inserted)
        'parts': parts, # List[str], length N. Same as passed to the method
        'infills': infills, # List[str], length N-1. The list of infills generated
        'retries_attempted': retries_attempted, # number of retries used (if max_retries > 1)
    }

In [None]:
def docstring_to_code(example, max_to_generate=128, temperature=0.2):
    """Fill every `<insert>` placeholder in `example` and return the completed text.

    example: template text in which each "<insert>" marks a location to infill.
    max_to_generate, temperature: forwarded unchanged to `infill`.
    """
    segments = example.split("<insert>")
    return infill(segments, max_to_generate=max_to_generate, temperature=temperature)["text"]

## Простые примеры

In [2]:
# Prompt 1: a one-line docstring only; both the signature and the body are infilled.
example = '''\
def <insert>
    """ Sums numbers """
    <insert>
<|/ file |>'''
print(docstring_to_code(example))
print()

# Prompt 2: the docstring adds an "Args" section that names *args.
example = '''\
def <insert>
    """ Sums numbers
    Args:
        *args
    """
    <insert>
<|/ file |>'''
print(docstring_to_code(example))
print()

# Prompt 3: the docstring adds doctest-style examples that call f(...) with
# three arguments and with one argument.
example = '''\
def <insert>
    """ Sums numbers
    Examples:
        >>> print(f(1, 2, 3))
        6

        >>> print(f(1))
        1
    """
    <insert>
<|/ file |>'''
print(docstring_to_code(example))
print()

# Prompt 4: like prompt 3, but the second example calls f() with no arguments.
example = '''\
def <insert>
    """ Sums numbers
    Examples:
        >>> print(f(1, 2, 3))
        6

        >>> print(f())
        0
    """
    <insert>
<|/ file |>'''
print(docstring_to_code(example))
NameError: name 'docstring_to_code' is not defined

Результаты:

```
def sum_numbers(numbers):
    """ Sums numbers """
    total = 0
    for number in numbers:
        total += number
    return total
<|/ file |>

def sum(*args):
    """ Sums numbers
    Args:
        *args
    """
    total = 0
    for x in args:
        total += x
    return total
<|/ file |>

def f(*args):
    """ Sums numbers
    Examples:
        >>> print(f(1, 2, 3))
        6

        >>> print(f(1))
        1
    """
    return sum(args)

def g(*args):
    """ Sums numbers
    Examples:
        >>> print(f(1, 2, 3))
        6

        >>> print(f(1))
        1
    """
    total = 0
    for arg in args:
        total += arg
    return total
<|/ file |>

def f():
    """ Sums numbers
    Examples:
        >>> print(f(1, 2, 3))
        6

        >>> print(f())
        0
    """
    sum = 0
    for num in args:
        sum += num
    return sum
<|/ file |>
```

То есть с помощью docstring получилось "заставить" модель сгенерировать код с заданным названием функции (последний пример), а также с заданным названием и "типом" аргумента (например, использовать позиционные аргументы `*args`, как во втором примере). Иногда генерировались сразу две функции (третий пример).

## Классические алгоритмы

In [None]:
# Prompt 1: a one-line docstring only.
example = '''\
def <insert>
    """ Checks if number is prime """
    <insert>
<|/ file |>'''
print(docstring_to_code(example))
print()

# Prompt 2: a doctest-style example in which the function name is itself an
# <insert> placeholder (three infill locations). The example expects True
# for 1, which is mathematically incorrect.
example = '''\
def <insert>
    """ Checks if number is prime

    >>> print(<insert>(1))
    True
    """
    <insert>
<|/ file |>'''
print(docstring_to_code(example))
print()

# Prompt 3: the same incorrect example, with the function called `f`.
example = '''\
def <insert>
    """ Checks if number is prime

    >>> print(f(1))
    True
    """
    <insert>
<|/ file |>'''
print(docstring_to_code(example))
print()

# Prompt 4: the same incorrect example, with the function called `prime`.
example = '''\
def <insert>
    """ Checks if number is prime

    >>> print(prime(1))
    True
    """
    <insert>
<|/ file |>'''
print(docstring_to_code(example))

```
def is_prime(number):
    """ Checks if number is prime """

    if number < 2:
        return False

    for i in range(2, number):
        if number % i == 0:
            return False
    return True
<|/ file |>

def is_prime(number):
    """ Checks if number is prime

    >>> print(is_prime(1))
    True
    """

    return number > 1 and number % 2 == 0
<|/ file |>

def prime(n):
    """ Checks if number is prime

    >>> print(f(1))
    True
    """

    if n < 2:
        return False

    if n % 2 == 0:
        return False

    for i in range(3, int(n**0.5)+1, 2):
        if n % i == 0:
            return False

    return True
<|/ file |>

def prime(n):
    """ Checks if number is prime

    >>> print(prime(1))
    True
    """

    if n < 2:
        return False

    if n % 2 == 0:
        return False

    for i in range(3, int(n ** 0.5) + 1, 2):
        if n % i == 0:
            return False

    return True
<|/ file |>
```

Очевидно, простые алгоритмы не составляют большой проблемы. Кажется, что на неверные с математической точки зрения примеры модель не обращает внимания и даже не использует предложенное в них название. А если не подсказывать желаемое название в примере, генерируется нечто странное.

## Прочие тесты

In [None]:
# The number printed before each result labels the prompt it belongs to.

# Prompt 1: one-line description mentioning "the given GitHub repo".
example1 = '''\
def <insert>
    """ Get all contributors of the given GitHub repo """
    <insert>
<|/ file |>'''
print(1)
print(docstring_to_code(example1))
print()

# Prompt 2: the same description without the word "given".
example2 = '''\
def <insert>
    """ Get all contributors of the GitHub repo """
    <insert>
<|/ file |>'''
print(2)
print(docstring_to_code(example2))
print()

# Prompt 3: the description adds "by URL".
example3 = '''\
def <insert>
    """ Get all contributors of the GitHub repo by URL """
    <insert>
<|/ file |>'''
print(3)
print(docstring_to_code(example3))
print()

# Prompt 4: adds a doctest-style example that calls get_contributors(...).
example4 = '''\
def <insert>
    """
    Get all contributors of the GitHub repo

    >>> print(get_contributors("konygin/course_ml4se"))
    ["konygin"]
    """
    <insert>
<|/ file |>'''
print(4)
print(docstring_to_code(example4))
print()

# Prompt 5: like prompt 4, but the example calls contributors(...).
example5 = '''\
def <insert>
    """
    Get all contributors of the GitHub repo

    >>> print(contributors("konygin/course_ml4se"))
    ["konygin"]
    """
    <insert>
<|/ file |>'''
print(5)
print(docstring_to_code(example5))
print()

# Prompt 6: the description adds "using token"; the example call passes no token.
example6 = '''\
def <insert>
    """
    Get all contributors of the GitHub repo using token

    >>> print(contributors("konygin/course_ml4se"))
    ["konygin"]
    """
    <insert>
<|/ file |>'''
print(6)
print(docstring_to_code(example6))
print()

# Prompt 7: the description does not mention a token; the example call passes
# `token` as a second argument.
example7 = '''\
def <insert>
    """
    Get all contributors of the GitHub repo

    >>> print(contributors("konygin/course_ml4se", token))
    ["konygin"]
    """
    <insert>
<|/ file |>'''
print(7)
print(docstring_to_code(example7))
print()

# Prompt 8: like prompt 7, but the description asks for the contributors' logins.
example8 = '''\
def <insert>
    """
    Get all contributors' logins of the GitHub repo

    >>> print(contributors("konygin/course_ml4se", token))
    ["konygin"]
    """
    <insert>
<|/ file |>'''
print(8)
print(docstring_to_code(example8))
print()

# Prompt 9: one-line description asking for logins "using token", with no example.
example9 = '''\
def <insert>
    """ Get all contributors' logins of the GitHub repo using token """
    <insert>
<|/ file |>'''
print(9)
print(docstring_to_code(example9))

```
1
def contributors(repo):
    """ Get all contributors of the given GitHub repo """
    contributors = []
    contributors = contributors + repo.contributors
    return contributors
<|/ file |>

2
def contributors(repo):
    """ Get all contributors of the GitHub repo """
    contributors = repo.contributors()
    return contributors
<|/ file |>

3
def contributors(repo_url):
    """ Get all contributors of the GitHub repo by URL """
    contributors = requests.get(
        repo_url,
        headers={"Accept": "application/vnd.github.v3+json"}
    ).json()
    return contributors
<|/ file |>

4
def contributors(repo):
    """
    Get all contributors of the GitHub repo

    >>> print(get_contributors("konygin/course_ml4se"))
    ["konygin"]
    """
    contributors = []
    contributors += get_contributors(repo)
    return contributors
<|/ file |>

5
def contributors(repo):
    """
    Get all contributors of the GitHub repo

    >>> print(contributors("konygin/course_ml4se"))
    ["konygin"]
    """
    contributors = []
    contributors += get_contributors(repo)
    return contributors
<|/ file |>

6
def contributors(repo):
    """
    Get all contributors of the GitHub repo using token

    >>> print(contributors("konygin/course_ml4se"))
    ["konygin"]
    """
    github_client = Github(token)
    return github_client.user.get_repos(repo)
<|/ file |>

7
def contributors(repo: str, token: str) -> list:
    """
    Get all contributors of the GitHub repo

    >>> print(contributors("konygin/course_ml4se", token))
    ["konygin"]
    """
    contributors = requests.get(f"https://api.github.com/repos/{repo}/contributors", headers={"Authorization": f"token {token}"}).json()
    return contributors
<|/ file |>

8
def contributors(repo: str, token: str = None) -> List[str]:
    """
    Get all contributors' logins of the GitHub repo

    >>> print(contributors("konygin/course_ml4se", token))
    ["konygin"]
    """
    contributors = []
    github = Github(token)
    for contributor in github.get_user().get_repos(repo=repo).get_contributors():
        contributors.append(contributor.login)
    return contributors
<|/ file |>

9
def github_contributors(token):
    """ Get all contributors' logins of the GitHub repo using token """
    contributors = []
    contributors = contributors + github_contributors(token)
    return contributors
<|/ file |>
```

Результаты получились достаточно реалистичными, но, как кажется, ни один из них не дал правильного ответа. Некоторые варианты пытаются использовать библиотеку для GitHub (кажется, одноимённую — `github`), некоторые — `requests`, а некоторые вызывают неопределённые функции (или вообще скатываются в какую-то рекурсию, как в последнем примере).

P.S. Самый перспективный из приведённых вариантов "запроса" — номер 8. Периодически он действительно генерирует фрагмент кода, который правильно работает:

```
def contributors(repo, token):
    """
    Get all contributors' logins of the GitHub repo

    >>> print(contributors("konygin/course_ml4se", token))
    ["konygin"]
    """
    contributors = []
    github = Github(token)
    for contributor in github.get_repo(repo).get_contributors():
        contributors.append(contributor.login)
    return contributors
<|/ file |>
```

# В качестве вывода

`InCoder` действительно показывает хорошие результаты. Само собой, на простых примерах проблем практически нет. На более крупных задачах примеры и описание играют заметную роль. Можно повлиять на фрагмент кода упоминанием важных слов (например, URL, token). При этом модель может игнорировать некорректные примеры (хотя сложно сказать, хорошо это или плохо). Код при этом далеко не всегда рабочий, его нужно дорабатывать. В частности, может не хватать импортов.