In [1]:
import os
from langchain import HuggingFaceHub, PromptTemplate, LLMChain
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig,CodeGenTokenizer,CodeGenConfig, CodeGenForCausalLM,CodeLlamaTokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import dotenv_values
dotenv_values = dotenv_values()
hf_key = dotenv_values['HUGGINGFACEHUB_API_TOKEN']

In [3]:
llm = HuggingFaceHub(repo_id = 'Salesforce/codegen-6B-mono', 
                     model_kwargs = {
                         "temperature" : 1,
                         #"max_length" : 500,
                     })



In [18]:
template = '''def sum_two_numbers(num1 : int, num2 : int)->int:
    """Given two numbers, return the sum of them."""
'''

In [5]:
template

'def sum_two_numbers(num1 : int, num2 : int)->int:\n    """Given two numbers, return the sum of them."""\n'

In [6]:
llm.invoke(template, )

'    return num1+num2\n\ndef add_numbers'

In [3]:
checkpoint = "codellama/CodeLlama-34b-Python-hf"
device = 'cuda' if torch.cuda.is_available() else 'cpu'
#configuration = CodeGenConfig(token = hf_key, trust_remote_code = True)

# Initializing a model (with random weights) from the configuration
#model = CodeGenForCausalLM(configuration).to(device)

model = AutoModelForCausalLM.from_pretrained(checkpoint,trust_remote_code=True, load_in_4bit = True,token = hf_key, device_map = "auto")


config.json: 100%|██████████| 589/589 [00:00<00:00, 1.87MB/s]
model.safetensors.index.json: 100%|██████████| 31.4k/31.4k [00:00<00:00, 39.1MB/s]
model-00001-of-00003.safetensors: 100%|██████████| 9.95G/9.95G [01:33<00:00, 107MB/s] 
model-00002-of-00003.safetensors: 100%|██████████| 9.90G/9.90G [01:33<00:00, 106MB/s]
model-00003-of-00003.safetensors: 100%|██████████| 6.18G/6.18G [01:00<00:00, 102MB/s] 
Downloading shards: 100%|██████████| 3/3 [04:07<00:00, 82.42s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:09<00:00,  3.19s/it]
generation_config.json: 100%|██████████| 116/116 [00:00<00:00, 555kB/s]


In [4]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code = True, token = hf_key)

tokenizer_config.json: 100%|██████████| 749/749 [00:00<00:00, 3.40MB/s]
tokenizer.model: 100%|██████████| 500k/500k [00:00<00:00, 20.4MB/s]
tokenizer.json: 100%|██████████| 1.84M/1.84M [00:00<00:00, 4.10MB/s]
special_tokens_map.json: 100%|██████████| 411/411 [00:00<00:00, 1.07MB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
text = '''
Please complete this function
def detect_xss(http_get_request: str) -> bool: \n
""" Check if in the given http_get_request there is an XSS exploit. \n
    Consider also the possible evansions that an attacker can perform. \n
""" \n
'''

# payload = http_get_request.split('?')[1]
# parameters = list(payload.split('&'))
# couples_dict = dict(map(lambda x: x.split('='), parameters))

In [16]:
inputs = tokenizer.encode(text, return_tensors="pt").to(device)

In [17]:
completion = model.generate(inputs, 
                            max_new_tokens = 500,
                    do_sample =True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [18]:
#,truncate_before_pattern=[r"\n\n^#", "^'''", "\n\n\n"]
output = tokenizer.decode(completion[0])
print(output)

<s> 
Please complete this function
def detect_xss(http_get_request: str) -> bool: 

""" Check if in the given http_get_request there is an XSS exploit. 

    Consider also the possible evansions that an attacker can perform. 

""" 

# Please enter you code here

</s>


In [54]:
tokenizer.eos_token_id

50256

In [53]:
completion

tensor([[ 4299,  2160,    62, 11545,    62,    77, 17024,     7, 22510,    16,
          1058,   493,    11,   997,    17,  1058,   493,     8,  3784,   600,
            25,   198, 50284, 37811, 15056,   734,  3146,    11,  1441,   262,
          2160,   286,   606,   526, 15931,   198, 50284,  7783,   997,    16,
          1343,   997,    17,   198,   198,  4299, 29162,    62, 11545,    62,
            77, 17024,     7, 22510,    16,  1058,   493,    11,   997,    17,
          1058,   493,     8,  3784,   600,    25,   198, 50284, 37811, 15056,
           734,  3146,    11,  1441,   262,  1720,   286,   606,   526, 15931,
           198, 50284,  7783,   997,    16,  1635,   997,    17,   198,   198,
          4299, 14083,    62, 11545,    62,    77, 17024,     7, 22510,    16,
          1058,   493,    11,   997,    17,  1058,   493,     8,  3784,   600,
            25,   198, 50284, 37811, 15056,   734,  3146,    11,  1441,   262,
         23611,  1153,   286,   606,   526, 15931,  

In [21]:

tokenizer.special_tokens_map

{'bos_token': '<|endoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|endoftext|>',
 'additional_special_tokens': ['<|endoftext|>',
  '<fim-prefix>',
  '<fim-middle>',
  '<fim-suffix>',
  '<fim-pad>']}

In [26]:
"<|endoftext|>" in output

False