In [1]:
import torch as t
from transformers import AutoModelForCausalLM, AutoTokenizer
import transformers

In [2]:
cache_dir ="../datax/models"

In [3]:
device = "cuda:0" if t.cuda.is_available() else "cpu"


In [4]:
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", cache_dir=cache_dir,torch_dtype=t.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", cache_dir=cache_dir)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
prompt = "My favourite condiment is"

In [None]:
model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
model.to(device)

In [None]:
generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
tokenizer.batch_decode(generated_ids)[0]

In [5]:
prompt = '''Just answer yes or not, is this code vulnerable?
	dev = usb_get_intfdata(interface);
	if (!dev) {
		retval = -ENODEV;
		goto exit;
	}

	/* increment our usage count for the device */
	kref_get(&dev->kref);

	/* save our object in the file's private structure */
	mutex_lock(&dev->io_mutex);
	file->private_data = dev;
	mutex_unlock(&dev->io_mutex);'''

In [6]:
model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
model.to(device)

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0): MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
      (1): MistralDecoderLayer(
        (self

In [7]:
generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
tokenizer.batch_decode(generated_ids)[0]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


"<s> Just answer yes or not, is this code vulnerable?\n\tdev = usb_get_intfdata(interface);\n\tif (!dev) {\n\t\tretval = -ENODEV;\n\t\tgoto exit;\n\t}\n\n\t/* increment our usage count for the device */\n\tkref_get(&dev->kref);\n\n\t/* save our object in the file's private structure */\n\tmutex_lock(&dev->io_mutex);\n\tfile->private_data = dev;\n\tmutex_unlock(&dev->io_mutex);\n\n\nA) yes , if we're lucky, it will eventually free dev , if not, leaked\nB) no , usb_get_intfdata(interface) might return not NULL in case the private field of dev and interface has the same value</s>"

In [None]:
import pandas as pd

In [None]:
val_data = pd.read_csv("../datax/big-vul/test.csv")

In [None]:
result_df = val_data[val_data['vul_func_with_fix'] != val_data['func_before']]

In [None]:
result_df.shape

In [None]:
result_df["Complexity"].unique()

In [None]:
result_df = result_df[result_df['Complexity']== 'High']

In [None]:
result_df['Score'].describe()

In [None]:
v_code = result_df["func_before"].tolist()

In [None]:
code = result_df["vul_func_with_fix"].tolist()

In [None]:
def buld_initial_prompt(vulnerable, non_vulnerable):
    prompt = [{"role": "user", "content":''''Your a vulnerability detector. This snippet has a high severity: "{0}" is the following code vulnerable as well?: {1} '''.format(vulnerable,non_vulnerable)}]
    return prompt

In [None]:
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    model_kwargs={"torch_dtype": torch.float16, "load_in_4bit": True},
    tokenizer = tokenizer
    
)

In [None]:
messages = buld_initial_prompt(v_code[3], code[3])
prompt = pipeline.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model.to(device)
outputs = pipeline(prompt, max_new_tokens=250, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)


In [None]:
print(outputs[0]["generated_text"])

In [8]:
prompt = "[INST] Explain what a Mixture of Experts is in less than 100 words. [/INST]"
inputs = tokenizer(prompt, return_tensors="pt").to(0)

output = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(output[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[INST] Explain what a Mixture of Experts is in less than 100 words. [/INST]

[INST] Explain what a Mixture of Experts is in less than 100 words. [/INST]

[INST] Explain what a Mixture of Experts is in less than 100


In [None]:
prompt

In [None]:
generated_ids = model.generate(**inputs, max_new_tokens=50, do_sample=True)
tokenizer.decode(generated_ids)[0]["generated_text"]