In [1]:

import logging
from pathlib import Path

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
from accelerate import Accelerator
import tensorflow as tf



2023-07-12 12:37:47.267065: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
from datasets import load_dataset

In [4]:
from torch.utils.data.dataloader import DataLoader

In [5]:
from torch.utils.data import IterableDataset

In [6]:
class ConstantLengthDataset(IterableDataset):
    def __init__(self, tokenizer, dataset, field, seq_length=1024, num_of_sequences=1024, chars_per_token=3.6):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.bos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.input_characters = seq_length * chars_per_token * num_of_sequences
        self.field=field

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.input_characters:
                    break
                try:
                    buffer.append(next(iterator)[self.field])
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    more_examples = False
                    break
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    yield torch.tensor(input_ids)

In [7]:
def create_dataloader(args,tokenizer):
    data_files  = {"test":args['test_bed_name']}
    valid_data = load_dataset(args['data_path'], data_files=data_files, split="test")
    valid_dataset = ConstantLengthDataset(tokenizer, valid_data, args['field'], seq_length=args['seq_length'])
    eval_dataloader = DataLoader(valid_dataset, batch_size=args['batch_size'])
    return  eval_dataloader

In [8]:
def evaluate(args,model,eval_dataloader):
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch, labels=batch)
        loss = outputs.loss.repeat(args['batch_size'])
        losses.append(accelerator.gather(loss))

        if args['max_eval_steps'] > 0 and step >= args['max_eval_steps']:
            break
    loss = torch.mean(torch.cat(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = float("inf")
    return loss.item(), perplexity.item()




In [9]:
def param_default():
    model_name = 'codeparrot-small' #<-- Scope
    test_bed_name='code_completion_dataset_3k_deduped.json'
    semeru_datases_path= '/workspaces/code-rationales/'
    data_path = Path(semeru_datases_path+'datax/' + model_name + '/')
    data_path= semeru_datases_path+'semeru-datasets/semeru/galeras/galeras_se_tasks_dataset_3k_deduplicated'
    #data_path_raw = Path('../athena-datasets/' + corpus + '/raw/')
    #tokenizer_path = Path('../tokenizer/')
    return {
        'out_processed' : '/datasets/out_processed/',
        'checkpoint_file': Path(semeru_datases_path+'data/codeparrot-small/checkpoints/checkpoint-29000'), #Model
        'output_results' : 'results/' ,
        'seed': 1,
        'data_path': data_path,
        'test_bed_name':test_bed_name,
        'seq_length': 64,
        'batch_size': 2,
        'field': "random_cut",
        'max_eval_steps':-1
    }

In [10]:
#device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [11]:
device

device(type='cuda', index=0)

In [12]:
torch.cuda.memory_allocated()

0

In [None]:
# Setup Accelerator
accelerator = Accelerator()
params = param_default()
# Parse configuration
set_seed(params['seed'])

# Logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
)

# Load model and tokenizer
checkpoint = params['checkpoint_file']
model = AutoModelForCausalLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = model.to( device ) #WARNING, Verify the device before assigning to memory

# Load dataset and dataloader
valid_dataset, eval_dataloader = create_dataloader(params,tokenizer)

# Prepare everything with our `accelerator`.
model, valid_dataset, eval_dataloader = accelerator.prepare(model, valid_dataset, eval_dataloader)

# Evaluate and save the last checkpoint
logger.info("Evaluating and saving model after training")
eval_loss, perplexity = evaluate(params, model, eval_dataloader)
logger.info(f"loss/eval: {eval_loss}, perplexity: {perplexity}")

## Naive test for code completion

In [35]:
#device ="cpu"

In [43]:
prompt ="def duntion_test():"
prompt="def test_frequency_condition_alone(self):\n        prev_hour = timezone.now() - timedelta(hours=1)"
params = param_default()

#torch.manual_seed(0)
model = AutoModelForCausalLM.from_pretrained(params['checkpoint_file'])
tokenizer = AutoTokenizer.from_pretrained(params['checkpoint_file'])
model = model.to( device ) #WARNING, Verify the device before assigning to memory
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
outputs = model.generate(input_ids, do_sample=True, max_length=128)

tokenizer.batch_decode(outputs, skip_special_tokens=True)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['def test_frequency_condition_alone(self):\n        prev_hour = timezone.now() - timedelta(hours=1)\n        now = timezone.now() - timedelta(days=5, hours=30)\n        self.assertTrue(datetime.time())\n\n        now = timezone.now()\n        now1 = datetime.datetime.now().replace(hour=now)\n        with patch_timestamp(dt=datetime.now())\n\n        expected_before = date(1970, 1, 1)\n\n        # get all the microseconds given an exception with one day\n        self.assertEqual(now, datetime.now() - timedelta(']

## Outcome generation & Levenshtein evaluation

In [13]:
## This iterator is NOT working for batches > 1!!

class ConstantTokenLengthDataset(IterableDataset):
    def __init__(self, tokenizer, dataset, field, num_of_tokens=64, num_of_sequences=1024):
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.num_of_tokens = min(num_of_tokens, tokenizer.model_max_length)
        self.field=field
        self.input_char = int(self.num_of_tokens*3.6)
        self.num_of_sequences=num_of_sequences
        self.prompts=[]

    def __iter__(self):  
        for i, buffer in enumerate(self.dataset):
            size = min(len(buffer[self.field]),self.input_char)
            input = buffer[self.field][:size]
            self.prompts.append(input)
            if i > self.num_of_sequences:
                break
        tokenized_inputs = self.tokenizer(self.prompts, max_length= self.num_of_tokens, padding=True, truncation=True, return_tensors="pt")["input_ids"]
        for tokenized_input in tokenized_inputs:
            yield torch.tensor(tokenized_input)

In [14]:
def create_dataloader(args,tokenizer):
    data_files  = {"test":args['test_bed_name']}
    valid_data = load_dataset(args['data_path'], data_files=data_files, split="test")
    valid_dataset = ConstantTokenLengthDataset(tokenizer, valid_data, args['field'], num_of_tokens=args['seq_length'])
    eval_dataloader = DataLoader(valid_dataset, batch_size=1)
    return  valid_dataset, eval_dataloader

In [23]:
def generate_outcomes(args,model,eval_dataloader,valid_data):
    model.eval()
    results = []
    for step, inputs in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model.generate(inputs, do_sample=True, max_length=128,  pad_token_id=tokenizer.eos_token_id)
            outcome = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        prompt=valid_data.prompts[step]
        result = {"prompt": prompt, "outcome":outcome}
        results.append(result)
        if args['max_eval_steps'] > 0 and step >= args['max_eval_steps']:
            break
    return results


In [16]:
# Setup Accelerator
accelerator = Accelerator()
params = param_default()
# Parse configuration
set_seed(params['seed'])

# Logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
)

# Load model and tokenizer
checkpoint = params['checkpoint_file']
model = AutoModelForCausalLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.pad_token = tokenizer.eos_token
model = model.to( device ) #WARNING, Verify the device before assigning to memory

# Load dataset and dataloader
valid_dataset, eval_dataloader = create_dataloader(params,tokenizer)

# Prepare everything with our `accelerator`.
model, valid_dataset, eval_dataloader = accelerator.prepare(model, valid_dataset, eval_dataloader)

# Evaluate and save the last checkpoint
logger.info("Evaluating and saving model after training")
outcomes = generate_outcomes(params, model, eval_dataloader,valid_dataset)
logger.info(f"outomces: {len(outcomes)}")

07/12/2023 12:38:14 - INFO - __main__ - Evaluating and saving model after training
  yield torch.tensor(tokenized_input)
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_to

In [17]:
outcomes

[{'prompt': 'def test_frequency_condition_alone(self):\n        prev_hour = timezone.now() - timedelta(hours=1)\n        group = None\n        for i in range(5):\n            group = self.store_event(\n                project_id=self.project.id, da',
  'outcome': ['def test_frequency_condition_alone(self):\n        prev_hour = timezone.now() - timedelta(hours=1)\n        group = None\n        for i in range(5):\n            group = self.store_event(\n                project_id=self.project.id, da"""\n# coding=utf-8\n"""\nTests for IRMA and ILS data from the JOS.\n"""\n\nimport os\n\nfrom.utils import MANDATORY_TYPE_SERVER\nimport hashlib\n\n\ndef get_service_name(name):\n    """\n    Return a']},
 {'prompt': 'def test_expanding(data):\n    modin_series, _ = create_',
  'outcome': ['def test_expanding(data):\n    modin_series, _ = create_/licenses/LICENSE-2.0.0.5"\n"""\nModule with the Joselia July 2016-2020 Phonon\n\nimport numpy import (u"\nfrom numpy import cext.models import (\n    

In [18]:
import pandas as pd
import textdistance

In [19]:
levenshtein_similarity = textdistance.levenshtein


In [25]:
lev_calc = [levenshtein_similarity.normalized_similarity(x["prompt"].strip(), x["outcome"][0].strip() ) for x in outcomes]


In [26]:
df = pd.DataFrame(outcomes)
df = df.assign(lev_sim=lev_calc)

Unnamed: 0,prompt,outcome,lev_sim
0,def test_frequency_condition_alone(self):\n ...,[def test_frequency_condition_alone(self):\n ...,0.552885
1,"def test_expanding(data):\n modin_series, _...","[def test_expanding(data):\n modin_series, ...",0.210728
2,def setup_method(self):\n self.df = Dat...,[def setup_method(self):\n self.df = Da...,0.383292
3,def test_chaining_upgraded_chords_mixed_canvas...,[def test_chaining_upgraded_chords_mixed_canva...,0.498915
4,"def _pad_spatial_dims(x, x_shape, padding):\n ...","[def _pad_spatial_dims(x, x_shape, padding):\n...",0.513575


In [28]:
df.describe()

Unnamed: 0,lev_sim
count,1026.0
mean,0.419745
std,0.120513
min,0.062696
25%,0.34242
50%,0.456175
75%,0.501191
max,0.718213
