In [1]:
#加载基础模型

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer

base_model = 'CodeLlama-7b-Instruct-hf'
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(base_model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
#微调权重合并

from peft import PeftModel

output_dir = "code-llama-ft/checkpoint-100"
model = PeftModel.from_pretrained(model, output_dir)

In [3]:
#数据读取

import time
import pandas as pd
import torch

dataset=pd.read_csv('testdata.csv')
print(dataset.head())

torch.cuda.empty_cache()

                                                Code  \
0  public int countClumps(int[] nums)\n{\n    int...   
1  public int countClumps(int[] nums)\n{\n    int...   
2  public int countClumps(int[] nums)\n{\n    int...   
3  public int countClumps(int[] nums)\n{\n    int...   
4  public int countClumps(int[] nums)\n{\n    int...   

                                              prompt  
0  Say that a "clump" in an array is a series of ...  
1  Say that a "clump" in an array is a series of ...  
2  Say that a "clump" in an array is a series of ...  
3  Say that a "clump" in an array is a series of ...  
4  Say that a "clump" in an array is a series of ...  


In [4]:
system = "You'll be writing code that mimics this student's style using java."

PROMPT = f"<<SYS>>\\n{system}\\n<</SYS>>\\n\\n"

generate_codes=[]

for i in range(50):
    qus=dataset.loc[i,'prompt']
    PROMPT +=f"[INST] {qus.strip()} [/INST]"

    input_ids = tokenizer(PROMPT, return_tensors="pt")
    PROMPT = f"<<SYS>>\\n{system}\\n<</SYS>>\\n\\n"
    start_time = time.time()
    generated_ids = model.generate(**input_ids,max_new_tokens=200)[0]

    end_time = time.time()
    elapsed_time = end_time - start_time

    item = tokenizer.decode(generated_ids, skip_special_tokens = True)
    generate_codes.append(item)

print(generate_codes[0])
print(f"Time taken: {elapsed_time:.2f} seconds")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

<<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] Say that a "clump" in an array is a series of 2 or more adjacent elements of the same value. Return the number of clumps in the given array. [/INST]  
public int countClumps(int[] nums)
{
    int count = 0;
    for (int i = 0; i < nums.length - 1; i++)
    {
        if (nums[i] == nums[i + 1])
        {
            int j = i + 1;
            while (j < nums.length && nums[j] == nums[i])
            {
                j++;
            }
            if (j > i + 1)
            {
                count++;
            }
            i = j;
        }
    }
    return count;
}

Time taken: 23.87 seconds


In [38]:
gen=pd.DataFrame(generate_codes)
gen.to_csv('res_c.csv')

In [74]:
import numpy as np

generate_codes=pd.read_csv('res_c.csv')
generate_codes=np.array(generate_codes)

list_g=[]

for i in range(len(generate_codes)):
    list_g.append(generate_codes[i][1])

generate_codes=list_g

In [7]:
#用对应的真实代码进行对比

ground_truth_codes=[]
for i in range(50):
    ground_truth_codes.append(dataset.loc[i,'Code'])
print(len(ground_truth_codes))

50


In [9]:
#计算CodeBLUE

from codebleu import calc_codebleu

def compute_codebleu(ground_truth_codes, generated_codes):
    params='0.25,0.25,0.25,0.25'
    lang='java'
    codebleu_score= calc_codebleu(
                        references=ground_truth_codes, predictions=generated_codes, lang=lang)
    return codebleu_score

In [40]:
results = {}
codebleu_score= compute_codebleu(ground_truth_codes, generate_codes)

print(generate_codes[0])

results['codebleu'] = codebleu_score

public int countClumps(int[] nums)
{
    int count = 0;
    for (int i = 0; i < nums.length - 1; i++)
    {
        if (nums[i] == nums[i + 1])
        {
            int j = i + 1;
            while (j < nums.length && nums[j] == nums[i])
            {
                j++;
            }
            if (j > i + 1)
            {
                count++;
            }
            i = j;
        }
    }
    return count;
}



In [41]:
print(results['codebleu'])

{'codebleu': 0.34902171883423516, 'ngram_match_score': 0.17159966359265413, 'weighted_ngram_match_score': 0.17713737697463697, 'syntax_match_score': 0.49407114624505927, 'dataflow_match_score': 0.5532786885245902}


In [22]:
code1="public int countClumps(int[] nums)\n{\nint count = 0;\nfor (int i = 0; i < nums.length - 1; i++)\n{\nif (nums[i] == nums[i + 1])\n{\nint j = i + 1;\n while (j < nums.length && nums[j] == nums[i]) \n{\nj++;\n}\n if (j > i + 1){\n count++;\n}\n i = j;}\n}\n return count;}"
code2=ground_truth_codes[0]
code1=[code1]
code2=[code2]
print(code1)
print(code2)
codebleu_score2= compute_codebleu(code1, code2)
print(codebleu_score2)

['public int countClumps(int[] nums)\n{\nint count = 0;\nfor (int i = 0; i < nums.length - 1; i++)\n{\nif (nums[i] == nums[i + 1])\n{\nint j = i + 1;\n while (j < nums.length && nums[j] == nums[i]) \n{\nj++;\n}\n if (j > i + 1){\n count++;\n}\n i = j;}\n}\n return count;}']
['public int countClumps(int[] nums)\n{\n    int clump = 0;\n    int n = 0;\n    for (; n < nums.length; n++)\n    {\n        if (nums[n+1] == n && n+1 < nums.length)\n        {\n            clump++;\n        }\n        else if (nums[n-1] == n && n-1 >= 0)\n        {\n            clump++;\n        }\n    }\n    return clump;\n}\n']
{'codebleu': 0.3635227338783046, 'ngram_match_score': 0.13458672780127023, 'weighted_ngram_match_score': 0.1491338373415776, 'syntax_match_score': 0.5370370370370371, 'dataflow_match_score': 0.6333333333333333}


In [68]:
#计算Dist-N
import nltk
#nltk.download('punkt')

import abc

class Metric():
    """
    Defines a text quality metric.
    """

    def get_name(self):
        return self.name

    @abc.abstractmethod
    def compute_metric(self, texts):
        pass

class Distinct_N(Metric):

    def __init__(self, n):
        self.n = n
        self.name = f'Distinct_{n}'

    def compute_metric(self, texts):
        return self._distinct_ngrams(texts, self.n)

    def _distinct_ngrams(self, texts, n):
        total = 0.0
        for t in texts:
            tokens = nltk.tokenize.word_tokenize(t)
            print(t)
            n_distinct = len(set(ngrams(tokens, n)))
            print(n_distinct)
            total += n_distinct/ len(tokens)

        return total / len(texts)

In [71]:
nltk.download('punkt')

## compute diversity
metrics = {'dist_1': Distinct_N(1), 
            'dist_2': Distinct_N(2), 
            'dist_3': Distinct_N(3),
}
for i, (name, metric) in enumerate(metrics.items()):
    metric_result = metric.compute_metric(generate_codes)
    results[name] = metric_result
    print(metric_result)

[nltk_data] Downloading package punkt to /root/nltk_data...


KeyboardInterrupt: 

In [72]:
#使用总结模块对大模型进行提示

system = "You'll be writing code that mimics this student's style using java."

PROMPT = f"<<SYS>>\\n{system}\\n<</SYS>>\\n\\n"

generate_codes=[]

summary_prompt="Here's your last summary of student programming habits:"
summary=''

for i in range(50):
    qus=dataset.loc[i,'prompt']
    PROMPT +=f"[INST] {qus.strip()}{summary_prompt.strip()}{summary.strip()}[/INST]"

    input_ids = tokenizer(PROMPT, return_tensors="pt")
    start_time = time.time()
    generated_ids = model.generate(**input_ids,max_new_tokens=200)[0]
    end_time = time.time()
    elapsed_time = end_time - start_time
    item = tokenizer.decode(generated_ids, skip_special_tokens = True)
    generate_codes.append(item)
    
    PROMPT = f"<<SYS>>\\n{system}\\n<</SYS>>\\n\\n"
    PROMPT +=f"[INST] {item.strip()}{summary_prompt.strip()}{summary.strip()}"
    PROMPT += f"Summarize the programming style of students in less than 3 words.[/INST]"
    
    input_ids = tokenizer(PROMPT, return_tensors="pt")
    start_time = time.time()
    generated_ids = model.generate(**input_ids,max_new_tokens=200)[0]
    end_time = time.time()
    elapsed_time = end_time - start_time
    summary = tokenizer.decode(generated_ids, skip_special_tokens = True)
    print(summary)
    
    PROMPT = f"<<SYS>>\\n{system}\\n<</SYS>>\\n\\n"

print(generate_codes[0])
print(f"Time taken: {elapsed_time:.2f} seconds")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] <<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] Say that a "clump" in an array is a series of 2 or more adjacent elements of the same value. Return the number of clumps in the given array.Here's your last summary of student programming habits:[/INST]  
public int countClumps(int[] nums)
{
    int count = 0;
    for (int i = 0; i < nums.length - 1; i++)
    {
        if (nums[i] == nums[i + 1])
        {
            int j = i + 1;
            while (j < nums.length && nums[j] == nums[i])
            {
                j++;
            }
            if (j > i + 1)
            {
                count++;
            }
            i = j;
        }
    }
    return count;
}Here's your last summary of student programming habits:Summarize the programming style of students in less than 3 words.[/INST]  Focuses on code, not concepts.


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] <<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] Say that a "clump" in an array is a series of 2 or more adjacent elements of the same value. Return the number of clumps in the given array.Here's your last summary of student programming habits:<<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] <<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] Say that a "clump" in an array is a series of 2 or more adjacent elements of the same value. Return the number of clumps in the given array.Here's your last summary of student programming habits:[/INST]  
public int countClumps(int[] nums)
{
    int count = 0;
    for (int i = 0; i < nums.length - 1; i++)
    {
        if (nums[i] == nums[i + 1])
        {
            int j = i + 1;
            while (j < nums.len

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] <<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] Say that a "clump" in an array is a series of 2 or more adjacent elements of the same value. Return the number of clumps in the given array.Here's your last summary of student programming habits:<<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] <<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] Say that a "clump" in an array is a series of 2 or more adjacent elements of the same value. Return the number of clumps in the given array.Here's your last summary of student programming habits:<<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] <<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] Say that a "clump" in an arr

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] <<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] Say that a "clump" in an array is a series of 2 or more adjacent elements of the same value. Return the number of clumps in the given array.Here's your last summary of student programming habits:<<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] <<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] Say that a "clump" in an array is a series of 2 or more adjacent elements of the same value. Return the number of clumps in the given array.Here's your last summary of student programming habits:<<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] <<SYS>>\nYou'll be writing code that mimics this student's style using java.\n<</SYS>>\n\n[INST] Say that a "clump" in an arr

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 7.27 GiB (GPU 0; 23.65 GiB total capacity; 11.17 GiB already allocated; 4.49 GiB free; 18.69 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [76]:
def calculate_diversity(text):
    # 将文本转换为字符列表
    chars = list(text)
    
    # 计算字符总数和唯一字符数
    num_chars = len(chars)
    unique_chars = set(chars)
    num_unique_chars = len(unique_chars)
    
    # 计算多样性指数
    diversity = num_unique_chars / num_chars
    
    return diversity


sum_gen=0
list_gen=[]
sum_act=0
list_act=[]

sum_p=0
p=0
list_p=[]

for i in range(50):
    score_gen=calculate_diversity(generate_codes[i])
    list_gen.append(score_gen)
    sum_gen=sum_gen+score_gen
    
    score_act=calculate_diversity(ground_truth_codes[i])
    list_act.append(score_act)
    sum_act=sum_act+score_act
    
    p=score_gen/score_act
    sum_p=sum_p+p
    list_p.append(p)
    
    
sum_gen=sum_gen/50
sum_act=sum_act/50

print('generate_score=',sum_gen)
print('actual_score=',sum_act)
print('p=',sum_p/50)


print('delta_p=',abs(1-sum_p/50))

generate_score= 0.13052532603481937
actual_score= 0.12385099950064055
p= 1.1312681839754704
delta_p= 0.13126818397547035
