# 处理数据

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('dataset.csv')
train_data = data[['prompt','Code']]
train_data.head(1)

Unnamed: 0,prompt,Code
0,Write a function in Java that implements the f...,public int[] zeroMax(int[] nums)\r\n{\r\n i...


In [3]:
train_data = train_data.values.tolist()
train_data[0]

['Write a function in Java that implements the following logic: Return a version of the given array where each zero value in the array is replaced by the largest odd value to the right of the zero in the array. If there is no odd value to the right of the zero, leave the zero as a zero. Hint: in solving this: you may use a second helper function if you want, say to find the largest odd value to the right of a specified position.',
 'public int[] zeroMax(int[] nums)\r\n{\r\n    int large;\r\n    for(int i = 0; i < nums.length - 1; i++)\r\n    {\r\n     if (nums[i] == 0)\r\n     {\r\n      \tlarge = 0;\r\n        for(int j = i + 1; j < nums.length; j++)\r\n        {\r\n         \t   if(nums[j] > large && nums[j] % 2 == 1)\r\n               {\r\n                \tlarge = nums[j];   \r\n               }\r\n        \r\n            if (large != 0)\r\n            {\r\n             \tnums[i] = max;   \r\n            }\r\n        }\r\n     }\r\n    }\r\n        return nums;\r\n}\r\n']

In [4]:
import random

random.shuffle(train_data)

In [5]:
import json

train_num = int(0.8 * len(train_data))

with open('train_data.jsonl', 'w') as f:
    for d in train_data[:train_num]:
        d = {
            'context':'',
            'question':d[0],
            'answer':d[1]
        }
        f.write(json.dumps(d)+'\n')
        
with open('val_data.jsonl', 'w') as f:
    for d in train_data[train_num:]:
        d = {
            'context':'',
            'question':d[0],
            'answer':d[1]
        }
        f.write(json.dumps(d)+'\n')

# fine-turning 

In [6]:
from datetime import datetime
import os
import sys

import torch
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)
from transformers import (AutoTokenizer, AutoModelForCausalLM, LlamaForCausalLM,
                          TrainingArguments, Trainer, DataCollatorForSeq2Seq)

In [7]:
# 加载自己的数据集
from datasets import load_dataset

train_dataset = load_dataset('json', data_files='train_data.jsonl', split='train')
eval_dataset = load_dataset('json', data_files='val_data.jsonl', split='train')

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [8]:
train_dataset[0]

{'context': '',
 'question': 'Write a function in Java that implements the following logic: Given a number n, return true if n is in the range 1..10, inclusive. Unless "outsideMode" is true, in which case return true if the number is less or equal to 1, or greater or equal to 10.',
 'answer': 'public boolean in1To10(int n, boolean outsideMode)\r\n{\r\n    if (outsideMode)\r\n    {\r\n        if (n <=1 || n >=10)\r\n        {\r\n            return true;\r\n        }\r\n    }\r\n    else if (n>=1 || n<=10)\r\n    {\r\n        return true;\r\n    }\r\n}\r\n'}

In [21]:
base_model = 'CodeLlama-7b-Instruct-hf'

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True
)

tokenizer = AutoTokenizer.from_pretrained(base_model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# 不微调的时候

In [22]:
prompt = """You are programming coder.

Now answer the question:

{}"""
prompts = [prompt.format(train_dataset[i]['question']) for i in [1,20,32,45,67]]

In [23]:
tokenizer.pad_token = tokenizer.eos_token

In [24]:
model_input = tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")

model.eval()
with torch.no_grad():
    outputs = model.generate(**model_input, max_new_tokens=300)
    outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

print(outputs)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['You are programming coder.\n\nNow answer the question:\n\nGiven a string str, find all places where a three-letter combination starting with "z" and ending with "p" occurs. Return a string where for all such three-letter sequences, the middle letter has been removed. For example, a string like "zipXzap" would produce a result of "zpXzp".\n\nNote:\n\n* The input string will only contain lowercase letters.\n* The input string will have a length of at most 1000.\n\nExample:\n\nInput: "zazbzp"\nOutput: "zbzp"\n\nInput: "zazbzpz"\nOutput: "zbzpz"\n\nInput: "zazbzpzp"\nOutput: "zbzpzp"\n\nInput: "zazbzpzpz"\nOutput: "zbzpzpz"\n\nInput: "zazbzpzpzp"\nOutput: "zbzpzpzp"\n\nInput: "zazbzpzpzpz"\nOutput: "zbzpzpzpz"\n\nInput: "zazbzpzpzpzp"\nOutput: "zbzpzpzpzp"\n\nInput: "zazbzpzpzpzpz"\nOutput: "zbzpzpzpzpz"\n\nInput: "zazbzpzpzpzpzp"\nOutput: "zbzpzpzpzpzp"\n\nInput: "zazbz', 'You are programming coder.\n\nNow answer the question:\n\nGiven an array of ints, return true if every 2 that appea

# lora fine-turning

In [25]:
tokenizer.add_eos_token = True
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

In [26]:
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding=False,
        return_tensors=None,
    )

    # "self-supervised learning" means the labels are also the inputs:
    result["labels"] = result["input_ids"].copy()

    return result

In [27]:
def generate_and_tokenize_prompt(data_point):
    full_prompt =f"""You are a powerful programming model. Your job is to answer questions about a database. You are given a question.

You must output the code that answers the question.

### Input:
{data_point["question"]}

### Response:
{data_point["answer"]}
"""
    return tokenize(full_prompt)

In [28]:
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [17]:
# tokenized_train_dataset[0]

In [29]:
model.train() # put model back into training mode
model = prepare_model_for_int8_training(model)

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=[
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

In [30]:
if torch.cuda.device_count() > 1:
    # keeps Trainer from trying its own DataParallelism when more than 1 gpu is available
    model.is_parallelizable = True
    model.model_parallel = True

In [33]:
torch.cuda.device_count()

2

In [34]:
batch_size = 128
per_device_train_batch_size = 32
gradient_accumulation_steps = batch_size // per_device_train_batch_size
output_dir = "code-llama-ft"

training_args = TrainingArguments(
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_steps=100,
        max_steps=400,
        learning_rate=3e-4,
        fp16=True,
        logging_steps=10,
        optim="adamw_torch",
        evaluation_strategy="steps", # if val_set_size > 0 else "no",
        save_strategy="steps",
        eval_steps=20,
        save_steps=20,
        output_dir=output_dir,
        load_best_model_at_end=False,
        group_by_length=True, # group sequences of roughly the same length together to speed up training
        report_to="none", # if use_wandb else "none", wandb
        run_name=f"codellama-{datetime.now().strftime('%Y-%m-%d-%H-%M')}", # if use_wandb else None,
    )

trainer = Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=training_args,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
    ),
)

In [35]:
model.config.use_cache = False

old_state_dict = model.state_dict
model.state_dict = (lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())).__get__(
    model, type(model)
)
if torch.__version__ >= "2" and sys.platform != "win32":
    print("compiling the model")
    model = torch.compile(model)
trainer.train()

compiling the model


Step,Training Loss,Validation Loss
20,1.2944,1.199093
40,1.0176,0.828217
60,0.427,0.358509
80,0.2514,0.245803
100,0.2003,0.223246
120,0.169,0.241282
140,0.1265,0.257463
160,0.0843,0.304411
180,0.0529,0.385443
200,0.0386,0.403556


Checkpoint destination directory code-llama-ft/checkpoint-20 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory code-llama-ft/checkpoint-40 already exists and is non-empty.Saving will proceed but saved results may be invalid.
Checkpoint destination directory code-llama-ft/checkpoint-60 already exists and is non-empty.Saving will proceed but saved results may be invalid.


TrainOutput(global_step=400, training_loss=0.21240129575133324, metrics={'train_runtime': 6758.7791, 'train_samples_per_second': 7.575, 'train_steps_per_second': 0.059, 'total_flos': 6.113905684481311e+17, 'train_loss': 0.21240129575133324, 'epoch': 123.08})

# eval

In [1]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer

base_model = 'CodeLlama-7b-Instruct-hf'
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(base_model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
from peft import PeftModel

output_dir = "code-llama-ft/checkpoint-100"
model = PeftModel.from_pretrained(model, output_dir)

In [3]:
eval_prompt = """You are a powerful programming model. Your job is to answer questions about a database. You are given a question.

You must output the code that answers the question.

### Input:
Write a function in Java that takes an array and returns the sum of the numbers in the array, or 0 if the array is empty. Except the number 13 is very unlucky, so it does not count any 13, or any number that immediately follows a 13.

### Response:
"""

model_input = tokenizer(eval_prompt, return_tensors="pt")

model.eval()
with torch.no_grad():
    outputs = model.generate(**model_input, max_new_tokens=100)[0]
    print(tokenizer.decode(outputs, skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


You are a powerful programming model. Your job is to answer questions about a database. You are given a question.

You must output the code that answers the question.

### Input:
Write a function in Java that takes an array and returns the sum of the numbers in the array, or 0 if the array is empty. Except the number 13 is very unlucky, so it does not count any 13, or any number that immediately follows a 13.

### Response:
public int sum13(int[] nums)
{
    int sum = 0;
    for (int i = 0; i < nums.length; i++)
    {
        if (nums[i] != 13)
        {
            sum = sum + nums[i];
        }
        else if (nums[i] == 13 && nums[i - 1] !=
