In [1]:
import pandas as pd
from pathlib import Path

# 你的路径
train_path = Path('python/python/final/jsonl/train/')
test_path = Path('python/python/final/jsonl/test/')
valid_path = Path('python/python/final/jsonl/valid/')

# 列名
columns_long_list = ['repo', 'path', 'url', 'code', 
                     'code_tokens', 'docstring', 'docstring_tokens', 
                     'language', 'partition']

# 函数来加载 .jsonl.gz 文件到 DataFrame
def jsonl_list_to_dataframe(file_list, columns=columns_long_list):
    """Load a list of jsonl.gz files into a pandas DataFrame."""
    return pd.concat([pd.read_json(f, 
                                   orient='records', 
                                   compression='gzip',
                                   lines=True)[columns] 
                      for f in file_list], sort=False)

# 查找所有 .gz 文件
train_files = sorted(train_path.glob('**/*.gz'))
test_files = sorted(test_path.glob('**/*.gz'))
valid_files = sorted(valid_path.glob('**/*.gz'))

# 加载训练集、测试集、验证集
train_df = jsonl_list_to_dataframe(train_files)
test_df = jsonl_list_to_dataframe(test_files)
valid_df = jsonl_list_to_dataframe(valid_files)

# 筛选包含 `if` 条件的代码
def filter_if_statements(df):
    """Filter DataFrame to only include samples with 'if' statements in the code."""
    return df[df['code'].str.contains(r'\bif\b')]

# 筛选出包含 `if` 语句的代码
train_if_df = filter_if_statements(train_df)

# 保留训练集前 150,000 条作为预训练数据集
pretrain_data = train_if_df.iloc[:150000]

# 后 50,000 条作为微调数据集
finetune_data = train_if_df.iloc[150000:200000]

# 将微调数据集划分为训练、验证、测试集
# 80% 作为微调训练数据，10% 作为验证集，10% 作为测试集
train_finetune_data = finetune_data.iloc[:40000]  # 80% 训练集
valid_finetune_data = finetune_data.iloc[40000:45000]  # 10% 验证集
test_finetune_data = finetune_data.iloc[45000:]  # 10% 测试集

# 检查数据集划分
print("Pretrain Data: ", pretrain_data.shape)
print("Finetune Train Data: ", train_finetune_data.shape)
print("Finetune Valid Data: ", valid_finetune_data.shape)
print("Finetune Test Data: ", test_finetune_data.shape)


Pretrain Data:  (150000, 9)
Finetune Train Data:  (40000, 9)
Finetune Valid Data:  (5000, 9)
Finetune Test Data:  (5000, 9)


In [2]:
print("Train files:", train_files)
print("Test files:", test_files)
print("Valid files:", valid_files)


Train files: [WindowsPath('python/python/final/jsonl/train/python_train_0.jsonl.gz'), WindowsPath('python/python/final/jsonl/train/python_train_1.jsonl.gz'), WindowsPath('python/python/final/jsonl/train/python_train_10.jsonl.gz'), WindowsPath('python/python/final/jsonl/train/python_train_11.jsonl.gz'), WindowsPath('python/python/final/jsonl/train/python_train_12.jsonl.gz'), WindowsPath('python/python/final/jsonl/train/python_train_13.jsonl.gz'), WindowsPath('python/python/final/jsonl/train/python_train_2.jsonl.gz'), WindowsPath('python/python/final/jsonl/train/python_train_3.jsonl.gz'), WindowsPath('python/python/final/jsonl/train/python_train_4.jsonl.gz'), WindowsPath('python/python/final/jsonl/train/python_train_5.jsonl.gz'), WindowsPath('python/python/final/jsonl/train/python_train_6.jsonl.gz'), WindowsPath('python/python/final/jsonl/train/python_train_7.jsonl.gz'), WindowsPath('python/python/final/jsonl/train/python_train_8.jsonl.gz'), WindowsPath('python/python/final/jsonl/train/p

In [3]:
test_finetune_data

Unnamed: 0,repo,path,url,code,code_tokens,docstring,docstring_tokens,language,partition
18237,sffjunkie/astral,src/astral.py,https://github.com/sffjunkie/astral/blob/b0aa6...,"def _get_elevation(self, location):\n ""...","[def, _get_elevation, (, self, ,, location, ),...",Query the elevation information with the latit...,"[Query, the, elevation, information, with, the...",python,train
18239,sffjunkie/astral,src/astral.py,https://github.com/sffjunkie/astral/blob/b0aa6...,"def dawn_utc(self, date, latitude, longitude, ...","[def, dawn_utc, (, self, ,, date, ,, latitude,...",Calculate dawn time in the UTC timezone.\n\n ...,"[Calculate, dawn, time, in, the, UTC, timezone...",python,train
18240,sffjunkie/astral,src/astral.py,https://github.com/sffjunkie/astral/blob/b0aa6...,"def sunrise_utc(self, date, latitude, longitud...","[def, sunrise_utc, (, self, ,, date, ,, latitu...",Calculate sunrise time in the UTC timezone.\n\...,"[Calculate, sunrise, time, in, the, UTC, timez...",python,train
18241,sffjunkie/astral,src/astral.py,https://github.com/sffjunkie/astral/blob/b0aa6...,"def solar_noon_utc(self, date, longitude):\n ...","[def, solar_noon_utc, (, self, ,, date, ,, lon...",Calculate solar noon time in the UTC timezone....,"[Calculate, solar, noon, time, in, the, UTC, t...",python,train
18242,sffjunkie/astral,src/astral.py,https://github.com/sffjunkie/astral/blob/b0aa6...,"def sunset_utc(self, date, latitude, longitude...","[def, sunset_utc, (, self, ,, date, ,, latitud...",Calculate sunset time in the UTC timezone.\n\n...,"[Calculate, sunset, time, in, the, UTC, timezo...",python,train
...,...,...,...,...,...,...,...,...,...
25974,bitcaster-io/bitcaster,src/bitcaster/config/environ.py,https://github.com/bitcaster-io/bitcaster/blob...,"def get_value(self, var, cast=None, default=en...","[def, get_value, (, self, ,, var, ,, cast, =, ...",Return value for given environment variable.\n...,"[Return, value, for, given, environment, varia...",python,train
25975,bitcaster-io/bitcaster,src/bitcaster/utils/reflect.py,https://github.com/bitcaster-io/bitcaster/blob...,"def fqn(o):\n """"""Returns the fully qualifie...","[def, fqn, (, o, ), :, parts, =, [, ], if, isi...",Returns the fully qualified class name of an o...,"[Returns, the, fully, qualified, class, name, ...",python,train
25976,bitcaster-io/bitcaster,src/bitcaster/otp.py,https://github.com/bitcaster-io/bitcaster/blob...,"def get_otp(self, message_list):\n """"""\...","[def, get_otp, (, self, ,, message_list, ), :,...",Generates a url-safe base64 encoded encypted m...,"[Generates, a, url, -, safe, base64, encoded, ...",python,train
25977,bitcaster-io/bitcaster,src/bitcaster/otp.py,https://github.com/bitcaster-io/bitcaster/blob...,"def validate(self, cipher_text, max_timedelta=...","[def, validate, (, self, ,, cipher_text, ,, ma...",Will decrypt the url safe base64 encoded crypt...,"[Will, decrypt, the, url, safe, base64, encode...",python,train


In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# 加载分词器和模型
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm  # 导入 tqdm 进度条

# 自定义数据集类
class CodeDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        code = self.data.iloc[idx]['code']
        inputs = self.tokenizer(code, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
        return inputs['input_ids'].squeeze(), inputs['attention_mask'].squeeze()


# 手动定义一个掩码 token
mask_token_id = tokenizer.convert_tokens_to_ids('<extra_id_0>')

# 使用 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 数据加载
train_dataset = CodeDataset(pretrain_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# 训练模型
model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
epochs = 3

for epoch in range(epochs):
    epoch_loss = 0
    # 使用 tqdm 显示进度条
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        input_ids, attention_mask = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        # 随机遮蔽部分 token
        mask_idx = torch.rand(input_ids.shape).to(input_ids.device) < 0.15  # 随机屏蔽 15% 的 token
        input_ids[mask_idx] = mask_token_id

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()

    print(f"Epoch {epoch + 1} completed. Loss: {epoch_loss/len(train_loader)}")

# 保存预训练模型
model.save_pretrained('pretrained_if_model')
tokenizer.save_pretrained('pretrained_if_tokenizer')


Epoch 1/3:   0%|          | 0/18750 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Epoch 1/3: 100%|██████████| 18750/18750 [1:05:31<00:00,  4.77it/s]


Epoch 1 completed. Loss: 0.26970973925014335


Epoch 2/3: 100%|██████████| 18750/18750 [1:04:54<00:00,  4.81it/s]


Epoch 2 completed. Loss: 0.1827282580723365


Epoch 3/3: 100%|██████████| 18750/18750 [1:04:51<00:00,  4.82it/s]

Epoch 3 completed. Loss: 0.181082252411445





('pretrained_if_tokenizer\\tokenizer_config.json',
 'pretrained_if_tokenizer\\special_tokens_map.json',
 'pretrained_if_tokenizer\\spiece.model',
 'pretrained_if_tokenizer\\added_tokens.json')

In [7]:
# 加载微调数据集
finetune_dataset = CodeDataset(train_finetune_data, tokenizer)
finetune_loader = DataLoader(finetune_dataset, batch_size=8, shuffle=True)

# 训练模型
model.train()
for epoch in range(epochs):
    epoch_loss = 0
    for batch in tqdm(finetune_loader, desc=f"Fine-tuning Epoch {epoch+1}/{epochs}"):
        input_ids, attention_mask = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        # 随机遮蔽部分 token
        mask_idx = torch.rand(input_ids.shape).to(input_ids.device) < 0.15  # 随机屏蔽 15% 的 token
        input_ids[mask_idx] = mask_token_id

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(f"Fine-tuning Epoch {epoch + 1} completed. Loss: {epoch_loss/len(finetune_loader)}")


Fine-tuning Epoch 1/3: 100%|██████████| 5000/5000 [17:18<00:00,  4.81it/s]


Fine-tuning Epoch 1 completed. Loss: 0.1785067981272936


Fine-tuning Epoch 2/3: 100%|██████████| 5000/5000 [17:27<00:00,  4.77it/s]


Fine-tuning Epoch 2 completed. Loss: 0.17839170552529396


Fine-tuning Epoch 3/3: 100%|██████████| 5000/5000 [17:19<00:00,  4.81it/s]

Fine-tuning Epoch 3 completed. Loss: 0.17838816022798418





In [8]:
# 验证模型
valid_dataset = CodeDataset(valid_finetune_data, tokenizer)
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=False)

model.eval()
val_loss = 0
with torch.no_grad():
    for batch in tqdm(valid_loader, desc="Validating"):
        input_ids, attention_mask = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        val_loss += outputs.loss.item()

print(f"Validation Loss: {val_loss / len(valid_loader)}")


Validating: 100%|██████████| 625/625 [00:42<00:00, 14.83it/s]

Validation Loss: 0.07536070138423238





In [9]:
# 测试模型
test_dataset = CodeDataset(test_finetune_data, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

model.eval()
test_loss = 0
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        input_ids, attention_mask = batch
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        test_loss += outputs.loss.item()

print(f"Test Loss: {test_loss / len(test_loader)}")


Testing: 100%|██████████| 625/625 [00:41<00:00, 14.94it/s]

Test Loss: 0.07977731139548123





In [10]:
# 保存微调模型
model.save_pretrained('finetuned_if_model')
tokenizer.save_pretrained('finetuned_if_tokenizer')


('finetuned_if_tokenizer\\tokenizer_config.json',
 'finetuned_if_tokenizer\\special_tokens_map.json',
 'finetuned_if_tokenizer\\spiece.model',
 'finetuned_if_tokenizer\\added_tokens.json')

In [11]:
# 随机选择一条原始代码样本
sample_idx = torch.randint(0, len(test_finetune_data), (1,)).item()
original_code = test_finetune_data.iloc[sample_idx]['code']

# 对原始代码进行分词并通过模型生成预测
inputs = tokenizer(original_code, return_tensors='pt', truncation=True, padding='max_length', max_length=512)
inputs = {key: value.to(device) for key, value in inputs.items()}
model.eval()

with torch.no_grad():
    outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=512)
    
# 将模型生成的预测结果解码
predicted_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

# 输出原始代码与模型生成的代码
print("Original Code:\n", original_code)
print("\nPredicted Code:\n", predicted_code)


Original Code:
 def start(self, initial_delay=0):
        """Wait for push updates from device.

        Will throw NoAsyncListenerError if no listner has been set.
        """
        if self.listener is None:
            raise exceptions.NoAsyncListenerError
        elif self._future is not None:
            return None

        # Always start with 0 to trigger an immediate response for the
        # first request
        self._atv.playstatus_revision = 0

        # This for some reason fails on travis but not in other places.
        # Why is that (same python version)?
        # pylint: disable=deprecated-method
        self._future = asyncio.ensure_future(
            self._poller(initial_delay), loop=self._loop)
        return self._future

Predicted Code:
 def start(self, initial_delay=0): """Wait for push updates from device. Will throw NoAsyncListenerError if no listner has been set. """ if self.listener is None: raise exceptions.NoAsyncListenerError elif self._future is not N

In [18]:
import pandas as pd
import torch
from tqdm import tqdm

# 加载提供的测试集
provided_test_data = pd.read_csv('sample.csv')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# 定义生成 CSV 文件的函数
def generate_testset_csv(test_data, input_col, target_col, csv_filename):
    results = []
    test_data = test_data.reset_index(drop=True)
    
    # 自定义数据集类
    class CodeDataset(Dataset):
        def __init__(self, data, tokenizer, max_length=512):
            self.data = data
            self.tokenizer = tokenizer
            self.max_length = max_length

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            code = self.data.iloc[idx][input_col]
            inputs = self.tokenizer(code, return_tensors='pt', padding='max_length', truncation=True, max_length=self.max_length)
            return inputs['input_ids'].squeeze(), inputs['attention_mask'].squeeze()

    test_dataset = CodeDataset(test_data, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

    model.eval()
    if_token_id = tokenizer.convert_tokens_to_ids("if")
    
    with torch.no_grad():
        for i, batch in enumerate(tqdm(test_loader, desc=f"Processing {csv_filename}")):
            input_ids, attention_mask = batch
            input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)

            # 获取真实输入代码和期望的 if 条件
            original_code = test_data.iloc[i][input_col]
            expected_if_condition = "if" in test_data.iloc[i][target_col]

            # 生成预测代码
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=512, output_scores=True, return_dict_in_generate=True)
            predicted_code = tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)

            # 检查预测结果是否包含 if 条件
            predicted_if_condition = "if" in predicted_code

            # 使用 logits 计算最高“if”置信度的概率得分
            scores = outputs.scores  # 生成过程中的 logits
            max_if_prob = 0
            for score in scores:
                probs = torch.softmax(score, dim=-1)
                if_prob = probs[0, if_token_id].item()
                max_if_prob = max(max_if_prob, if_prob)  # 使用最高的“if”概率

            # 转换得分为0-100范围
            prediction_score = max_if_prob * 100

            # 记录是否预测正确
            is_correct = expected_if_condition == predicted_if_condition

            # 将每条记录存储到 results 列表
            results.append({
                "Input provided to the model": original_code,
                "Whether the prediction is correct (true/false)": is_correct,
                "Expected if condition": expected_if_condition,
                "Predicted if condition": predicted_if_condition,
                "Prediction score (0-100)": prediction_score
            })

    # 转换为 DataFrame 并保存为 CSV 文件
    results_df = pd.DataFrame(results)
    results_df.to_csv(csv_filename, index=False)



# 使用提供的测试集生成 provided-testset.csv
generate_testset_csv(provided_test_data, 'input_method', 'target_block', 'provided-testset.csv')


Processing provided-testset.csv: 100%|██████████| 30/30 [01:30<00:00,  3.01s/it]


In [19]:
generate_testset_csv(test_finetune_data, 'code', 'code', 'generated-testset.csv')


Processing generated-testset.csv: 100%|██████████| 5000/5000 [2:37:29<00:00,  1.89s/it]  
