In [None]:
!pip uninstall -q bitsandbytes
!pip install -U -q bitsandbytes

In [None]:
import gc
import re

import pandas as pd
import torch
from google.colab import drive
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:
# Mount Google Drive at /content/drive; the dataset and output paths used below live under it.
drive.mount('/content/drive')

I'm going to define some basic parameters that will be the same for all models, so that they are conveniently kept in one place.

Also, I'm loading the models with quantization, as it lets them fit into memory better and doesn't affect performance much.

In [None]:
# Name of the target device (CUDA GPU).
device = 'cuda'
# bitsandbytes settings requesting 8-bit loading; handed to the model loader in the main loop.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)

In [None]:
# Generation settings shared by every model; unpacked as keyword arguments into `model.generate(...)`.
parameteres = dict(
    max_new_tokens=130,
    repetition_penalty=1.20,
    top_k=50,
    top_p=0.1,
    temperature=0.2,
    do_sample=True,
)

Tiny StarCoder is fast and literally "tiny" enough to run as is, without quantization.

In [None]:
# Checkpoints to evaluate (names under the `bigcode/` hub namespace), each with a flag
# saying whether it should be loaded with quantization.
# You might want to comment out the biggest model, as it doesn't fit into memory together with the other models.
models = {
    'tiny_starcoder_py': dict(quantization=False),
    'starcoder2-3b': dict(quantization=True),
    'starcoder2-7b': dict(quantization=True),
    'starcoder2-15b': dict(quantization=True),
}

In [None]:
# Set your path to the dataset with the extracted code.
# The columns read later in this notebook are 'prefix', 'suffix' and 'filename'.
dataset_path = "/content/drive/MyDrive/JB2024/PythonDatasetLine.csv"
df = pd.read_csv(dataset_path)
df.head(3) # quick visual check that the file loaded as expected

In [None]:
# Set your path for the extended dataset (extracted code + generated completions).
# generate_code_snippet rewrites this CSV after every processed row.
save_path = "/content/drive/MyDrive/JB2024/PythonDatasetLineExtended.csv"

Here I wrote helper functions that convert the code from the dataset into the format the model expects. StarCoder-based models (and some others) use `<fim_*>` keywords to mark the different parts of the code, so here we format the prompt with them and then extract the needed parts from the generated text.

In [None]:
def format_prompt(prefix, suffix):
    """Build a fill-in-the-middle prompt from the code before and after the gap.

    Args:
        prefix: Code preceding the part to be generated. ``None``/NaN is treated as empty.
        suffix: Code following the part to be generated. ``None``/NaN is treated as empty.

    Returns:
        The string ``<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>``.
    """
    # pandas reads empty CSV cells as NaN by default; without this guard an empty
    # prefix/suffix would be rendered as the literal text "nan" inside the prompt.
    prefix = "" if pd.isna(prefix) else prefix
    suffix = "" if pd.isna(suffix) else suffix
    return f"<fim_prefix>{prefix}<fim_suffix>{suffix}<fim_middle>"

def extract_fim_parts(text):
    """Split decoded fill-in-the-middle text into its prefix, middle and suffix.

    The prefix is the text between ``<fim_prefix>`` and ``<fim_suffix>``, the suffix
    the text between ``<fim_suffix>`` and ``<fim_middle>``, and the middle everything
    after ``<fim_middle>`` up to ``<file_sep>``, ``<|endoftext|>`` or the end of the
    text. Each part is whitespace-stripped; a part whose markers are not found is
    returned as an empty string. A short preview of every part is printed.

    Args:
        text: Decoded model output that still contains the ``<fim_*>`` markers.

    Returns:
        A ``(prefix, middle, suffix)`` tuple of strings. If anything raises while
        extracting, the error is printed and ``("", "", "")`` is returned.
    """
    # Searched in this order: prefix, suffix, middle.
    patterns = (
        r'<fim_prefix>(.*?)<fim_suffix>',
        r'<fim_suffix>(.*?)<fim_middle>',
        r'<fim_middle>(.*?)(<file_sep>|<\|endoftext\|>|$)',
    )
    try:
        pieces = []
        for pattern in patterns:
            found = re.search(pattern, text, re.DOTALL)
            pieces.append(found.group(1).strip() if found else "")
        prefix_text, suffix_text, middle_text = pieces

        # some debugging information: at most the first 100 characters of each part
        print("EXTRACTED PARTS:")
        previews = (
            ("Prefix:", prefix_text),
            ("Middle:", middle_text),
            ("Suffix:", suffix_text),
        )
        for label, piece in previews:
            tail = "..." if len(piece) > 100 else ""
            print(label, piece[:100], tail)

        return prefix_text, middle_text, suffix_text
    except Exception as e:
        print(f"Error extracting parts: {e}")
        return "", "", ""

Two main functions:


1.   First initializes model and tokenizer and returns them
2.   Second generates code for each middle part with the given parameters and saves it to CSV (this is essential, as there were times when an error meant my generated code wasn't saved). By the way, you can switch the verbose output on and off in this one



In [None]:
def initialize_model_and_tokenizer(model_name, config, quantization_config=None, device='cpu'):
    """Load a ``bigcode/<model_name>`` checkpoint together with its tokenizer.

    Args:
        model_name: Checkpoint name; ``bigcode/`` is prepended to form the hub id.
        config: Per-model settings dict. Only the ``'quantization'`` key is read
            (treated as ``False`` when missing).
        quantization_config: Quantization settings passed to ``from_pretrained``
            when the model is flagged for quantization.
        device: Device the model is moved to when it is loaded without quantization.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    checkpoint = f'bigcode/{model_name}'
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    wants_quantization = config.get('quantization', False)
    if wants_quantization and quantization_config is None:
        # Previously this case loaded an unquantized model and ignored `device`,
        # leaving the model wherever the loader put it. Fall back to the plain path
        # so that `device` is honoured, and say so.
        print(f"No quantization_config given for {model_name}; loading it without quantization on {device}")
        wants_quantization = False

    if wants_quantization:
        # NOTE(review): no explicit .to(device) here, same as before - the quantized
        # loader is expected to take care of device placement; confirm.
        model = AutoModelForCausalLM.from_pretrained(checkpoint, quantization_config=quantization_config)
    else:
        model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

    return model, tokenizer

def generate_code_snippet(model, tokenizer, dataset, output_column, device='cpu', verbose=True):
    """Generate a fill-in-the-middle completion for every row of ``dataset``.

    For each row a prompt is built from ``row['prefix']`` and ``row['suffix']``,
    a completion is generated with the notebook-level ``parameteres`` settings, and
    the extracted middle part is written into ``dataset[output_column]`` in place.
    The whole frame is written to the notebook-level ``save_path`` after every row,
    so results produced so far are kept if a later row fails.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer exposing ``encode``, ``decode`` and ``eos_token_id``.
        dataset: DataFrame with 'prefix', 'suffix' and 'filename' columns; modified in place.
        output_column: Name of the column that receives the generated middle part.
        device: Device the encoded prompt is moved to before generation.
        verbose: When True, print the generated middle part and a colored preview
            for every row. The printing done by this function is skipped when False.

    Returns:
        None.
    """
    for index, row in dataset.iterrows():
        prompt = format_prompt(row['prefix'], row['suffix'])
        inputs = tokenizer.encode(prompt, return_tensors="pt").to(device)

        outputs = model.generate(
            inputs,
            pad_token_id=tokenizer.eos_token_id,
            # single unpadded prompt, so every position is attended to
            attention_mask=torch.ones_like(inputs),
            **parameteres
        )

        # The decoded text is parsed by its <fim_*> markers, so decode() is called
        # with its defaults here.
        prefix, middle, suffix = extract_fim_parts(tokenizer.decode(outputs[0]))

        if verbose:
            print(f"MIDDLE PART: {middle}")

            # \033[90m ... \033[00m wraps the text in ANSI escape codes (grey, then reset)
            index_str = f"\033[90m{index + 1}\033[00m"
            middle_colored = f"\033[90m{middle}\033[00m"
            filename_colored = f"\033[90m{row['filename']}\033[00m"
            # show only the 250 characters of context on either side of the completion
            print(f"Index: {index_str}\n"
                  f"Code:\n{prefix[-250:].lstrip()}{middle_colored}{suffix[:250].rstrip()}\n"
                  f"Filename: {filename_colored}\n")

        dataset.at[index, output_column] = middle
        # saved after every row on purpose: keeps finished completions if a later row fails
        dataset.to_csv(save_path, index=False)

Finally, the main loop: generating completions with each model for each middle part.

In [None]:
# Run every configured model over the dataset, one model at a time.
for model_name, config in models.items():
    gen_column = 'gen_' + model_name.replace('-', '_')
    df[gen_column] = ''  # init new column for each model

    # init model and tokenizer
    model, tokenizer = initialize_model_and_tokenizer(model_name, config, quantization_config, device=device)

    # generate code and update the column for each model
    generate_code_snippet(model, tokenizer, df, output_column=gen_column, device=device)

    # Drop the finished model before the next one is loaded; otherwise the old model
    # stays referenced while the new one is being created and both occupy GPU memory.
    del model, tokenizer
    gc.collect()
    torch.cuda.empty_cache()

Some service code for terminating runtime in order not to waste resources

In [None]:
# Release the Colab runtime once everything above has finished, so it does not keep consuming resources.
from google.colab import runtime
runtime.unassign()

To sum up, we defined the parameters, chose the models we want to use, wrote functions to prepare the prompt and work with the data, initialized the models (and tokenizers), and finally generated the completions. After that, we will analyze them.