In [None]:
# Directory that is walked to collect .ets files (passed to find_ets_files below).
repository = "OHApps"

In [None]:
import os
import json

def find_ets_files(directory):
    """Recursively collect every ``.ets`` file under ``directory``.

    Args:
        directory: Root directory to walk.

    Returns:
        A list of ``{'file_path': str, 'content': str}`` dicts, one per file
        whose name ends with ``.ets``. Directories and file names are visited
        in sorted order, so the result order is deterministic and any
        position-based split of the list is reproducible across machines.
    """
    ets_files = []
    for root, dirs, files in os.walk(directory):
        # os.walk yields entries in arbitrary filesystem order. Sorting
        # ``dirs`` in place also fixes the order subdirectories are descended.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.ets'):
                continue
            full_path = os.path.join(root, file)
            # errors='ignore' drops undecodable bytes instead of raising, so
            # files that are not valid UTF-8 are still collected (lossily).
            with open(full_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read()
            ets_files.append({'file_path': full_path, 'content': content})
    return ets_files

def save_as_jsonl(data, output_file):
    """Write each item of ``data`` to ``output_file`` as one JSON document per line.

    The file is overwritten and encoded as UTF-8; non-ASCII characters are
    written verbatim rather than escaped.
    """
    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in data
        )

In [None]:
# Collect every .ets file under `repository` and persist the records as JSON Lines.
output_file = 'ets_files.jsonl'
ets_files = find_ets_files(repository)
save_as_jsonl(ets_files, output_file)
print(f"Saved {len(ets_files)} .ets file paths and contents to {output_file}")

### 清洗.test.ets

In [None]:
# Drop every record whose path ends with '.test.ets' and save the remainder.
# ets_files.jsonl is written as UTF-8 with non-ASCII characters left unescaped,
# so it must be read back as UTF-8 explicitly; the platform default encoding
# can raise UnicodeDecodeError or silently corrupt the text.
with open("ets_files.jsonl", "r", encoding="utf-8") as f:
    ets_files = [json.loads(line) for line in f]

valid_ets_files = [
    file for file in ets_files
    if not file['file_path'].endswith('.test.ets')
]

save_as_jsonl(valid_ets_files, "valid_ets_files.jsonl")


### 清洗 content（raw_data）为空的数据

In [None]:
# Remove records whose 'content' is empty, then rewrite the file in place.
# The kept lines are collected into a new list: calling list.remove() on the
# list being iterated shifts the remaining items, so the loop skips the element
# right after each removed one and consecutive empty records would survive.
with open("valid_ets_files.jsonl", 'r', encoding='utf-8') as f:
    data = [line for line in f if json.loads(line)['content'] != '']

with open("valid_ets_files.jsonl", 'w', encoding='utf-8') as f:
    f.writelines(data)

### 选出评测集

In [None]:
# Use the first 10% of the records as the evaluation set and the rest for training.
with open("valid_ets_files.jsonl", 'r', encoding='utf-8') as f:
    data = f.readlines()

split_index = len(data) // 10

# Each output is opened once in 'w' mode (not 'a'), so re-running the cell
# replaces the split instead of appending duplicate evaluation records.
with open("test.jsonl", 'w', encoding='utf-8') as f:
    f.writelines(data[:split_index])

with open("train.jsonl", 'w', encoding='utf-8') as f:
    f.writelines(data[split_index:])

        

In [None]:
# Evaluation records are cut into two parts taken from the file content:
# `prev` (everything before the cut) and `target` (everything from the cut on).
import re
import random

with open("test.jsonl", 'r', encoding='utf-8') as f:
    data = f.readlines()

# The output is opened once in 'w' mode (not 'a'), so re-running the cell
# regenerates the file instead of appending another batch of samples to it.
# NOTE(review): no random seed is set in this cell, so the cut positions differ
# between runs.
with open("test_split.jsonl", 'w', encoding='utf-8') as out:
    for line in data:
        d = json.loads(line)
        raw_data = d['content']
        lines = raw_data.split('\n')
        # One sample per full 100 lines: longer files contribute more samples,
        # files shorter than 100 lines contribute none.
        n = len(lines) // 100

        for i in range(n):
            # Cut at the start of a randomly chosen line; the lower bound of 8
            # keeps at least 8 lines in `prev`.
            idx = random.randint(8, len(lines) - 1)
            prev = '\n'.join(lines[:idx])
            target = '\n'.join(lines[idx:])
            out.write(json.dumps({
                "file_path": d['file_path'],
                "prev": prev,
                "target": target
            }, ensure_ascii=False) + '\n')


In [None]:
import json
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class ArkTSDataset(Dataset):
    """Token-level dataset of fixed-length, overlapping chunks of ArkTS sources.

    Each input record's content is wrapped in BEGIN/END marker comments,
    tokenized, and cut into chunks of exactly ``max_length`` token ids
    (BOS + window + EOS, right-padded with the pad id). The decoded text of
    every chunk is also written to ``output_jsonl_file``.

    NOTE(review): a second class with the same name is defined later in this
    file and shadows this one once its cell has run.
    """

    def __init__(self, input_jsonl_file, output_jsonl_file, tokenizer, max_length=1024, overlap=128):
        """Read ``input_jsonl_file``, chunk every record and save the chunks.

        Args:
            input_jsonl_file: JSON Lines file whose records have ``content``
                and ``file_path`` keys.
            output_jsonl_file: Destination JSON Lines file (overwritten); one
                record per chunk, with the chunk decoded back to text.
            tokenizer: Object providing ``encode``/``decode`` and the
                ``bos_token_id``/``eos_token_id``/``pad_token_id`` attributes.
            max_length: Length of every chunk in token ids.
            overlap: Controls the stride between chunk starts
                (``max_length - overlap``).

        Any exception raised while processing is reported with ``print`` and
        not re-raised, so the dataset may end up empty or partially filled.
        """
        self.examples = []
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.overlap = overlap

        try:
            with open(input_jsonl_file, 'r', encoding='utf-8') as infile, \
                 open(output_jsonl_file, 'w', encoding='utf-8') as outfile:
                for line in infile:
                    data = json.loads(line)
                    content = data['content']
                    file_path = data['file_path']
                    chunks = self.process_content(content, file_path)

                    for chunk in chunks:
                        new_data = {
                            'file_path': file_path,
                            'content': self.tokenizer.decode(chunk)
                        }
                        # Keep non-ASCII text readable in the UTF-8 output,
                        # consistent with save_as_jsonl in this notebook.
                        json.dump(new_data, outfile, ensure_ascii=False)
                        outfile.write('\n')

                    self.examples.extend(chunks)
            print(f"Processed data saved to {output_jsonl_file}")
        except Exception as e:
            print(f"Error processing file: {e}")

    def process_content(self, content, file_path):
        """Tokenize ``content`` and split it into padded, overlapping chunks.

        Args:
            content: Source text of one file.
            file_path: Path of the file (not used by this implementation).

        Returns:
            A list of chunks, each a list of exactly ``max_length`` token ids.

        Raises:
            ValueError: If ``max_length`` leaves no room for content or
                ``overlap`` is not smaller than ``max_length``.
        """
        window = self.max_length - 2  # two positions are reserved for BOS and EOS
        stride = self.max_length - self.overlap
        if window <= 0 or stride <= 0:
            raise ValueError(
                f"max_length ({self.max_length}) must be > 2 and "
                f"overlap ({self.overlap}) must be < max_length"
            )
        # Never step further than one window; otherwise the tokens between two
        # consecutive windows would be dropped (happens when overlap < 2).
        stride = min(stride, window)

        # Wrap the source in marker comments identifying it as ArkTS code.
        content_with_markers = f"// BEGIN <arkts>\n{content}\n// END <arkts>"
        tokens = self.tokenizer.encode(content_with_markers, add_special_tokens=False)
        chunks = []

        for start in range(0, len(tokens), stride):
            chunk = (
                [self.tokenizer.bos_token_id]
                + tokens[start:start + window]
                + [self.tokenizer.eos_token_id]
            )
            padding = self.max_length - len(chunk)
            if padding > 0:
                chunk = chunk + [self.tokenizer.pad_token_id] * padding
            chunks.append(chunk)

            if start + window >= len(tokens):
                # This window already reached the last token; a further window
                # would only repeat tokens that are part of this chunk.
                break

        return chunks

    def __len__(self):
        """Return the number of chunks collected from all input records."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return chunk ``idx`` as ``{"input_ids": tensor}``."""
        return {"input_ids": torch.tensor(self.examples[idx])}

# Setup and usage
import os
# NOTE(review): routes HTTP(S) traffic through a proxy on 127.0.0.1:15777 — machine-specific; confirm a proxy is listening there before running elsewhere.
os.environ["http_proxy"] = "http://127.0.0.1:15777"
os.environ["https_proxy"] = "http://127.0.0.1:15777"

# NOTE(review): trust_remote_code=True allows custom code from the model repository to run locally (per the transformers docs) — only use with sources you trust.
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", cache_dir="./model", trust_remote_code=True)

# Build the dataset; constructing it also processes the input and writes the new jsonl file.
input_file = 'valid_ets_files.jsonl'
output_file = 'processed_ets_files.jsonl'
dataset = ArkTSDataset(input_file, output_file, tokenizer)
print(f"Total number of chunks: {len(dataset)}")

# To inspect the processed data:
with open(output_file, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i < 5:  # print only the first 5 records as a sample
            print(json.loads(line))
        else:
            break

In [None]:
import json
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer

class ArkTSDataset(Dataset):
    """Text-level dataset of overlapping ArkTS chunks with a per-chunk header.

    Each record is prefixed with a ``<｜arkts｜>`` / ``File:`` header, tokenized
    and cut into overlapping windows. Every window is decoded back to text and
    wrapped in ``<｜chunk_start｜>`` / ``<｜chunk_end｜>``; windows after the
    first get the header prepended again. The chunk texts are kept in memory
    and also written to ``output_jsonl_file``.

    NOTE(review): this redefines a class of the same name declared earlier in
    this file; whichever cell ran last wins.
    """

    def __init__(self, input_jsonl_file, output_jsonl_file, tokenizer, max_length=1024, overlap=128):
        """Read ``input_jsonl_file``, chunk every record and save the chunks.

        Args:
            input_jsonl_file: JSON Lines file whose records have ``content``
                and ``file_path`` keys.
            output_jsonl_file: Destination JSON Lines file (overwritten); one
                record per chunk.
            tokenizer: Object providing ``encode`` and ``decode``.
            max_length: Token budget per chunk; the content window is
                ``max_length - 2`` tokens.
            overlap: Controls the stride between window starts
                (``max_length - overlap``).

        Raises:
            ValueError: Propagated from ``process_content`` for an invalid
                ``max_length``/``overlap`` combination or an over-long header.
        """
        self.examples = []
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.overlap = overlap

        with open(input_jsonl_file, 'r', encoding='utf-8') as infile, \
             open(output_jsonl_file, 'w', encoding='utf-8') as outfile:
            for line in infile:
                data = json.loads(line)
                content = data['content']
                file_path = data['file_path']
                chunks = self.process_content(content, file_path)

                # Persist the processed chunks to the new jsonl file.
                for chunk in chunks:
                    new_data = {
                        'file_path': file_path,
                        'content': chunk
                    }
                    # Keep the full-width marker characters and any non-ASCII
                    # source text readable in the UTF-8 output.
                    json.dump(new_data, outfile, ensure_ascii=False)
                    outfile.write('\n')

                self.examples.extend(chunks)
        print(f"Processed data saved to {output_jsonl_file}")

    def process_content(self, content, file_path):
        """Split one file into header-prefixed, marker-wrapped text chunks.

        Args:
            content: Source text of the file.
            file_path: Path written into the ``File:`` header line.

        Returns:
            A list of strings, each of the form
            ``<｜chunk_start｜>...<｜chunk_end｜>``.

        Raises:
            ValueError: If ``overlap`` is not smaller than ``max_length`` or
                the tokenized header does not fit into one window.
        """
        header = f"<｜arkts｜>\nFile: {file_path}\n"
        # The header is identical for every chunk, so it is encoded only once.
        header_tokens = self.tokenizer.encode(header, add_special_tokens=False)
        content_with_header = header + content
        tokens = self.tokenizer.encode(content_with_header, add_special_tokens=False)

        # Two positions are reserved per chunk (presumably for the chunk
        # start/end markers — TODO confirm).
        window = self.max_length - 2
        stride = self.max_length - self.overlap
        # Tokens left for content in a chunk that carries a re-added header.
        tail_budget = window - len(header_tokens)
        if stride <= 0:
            raise ValueError(
                f"overlap ({self.overlap}) must be smaller than max_length ({self.max_length})"
            )
        if tail_budget <= 0:
            raise ValueError(
                f"header for {file_path!r} needs {len(header_tokens)} tokens and "
                f"does not fit into max_length={self.max_length}"
            )
        # Never step further than one window; otherwise the tokens between two
        # consecutive windows would be dropped (happens when overlap < 2).
        stride = min(stride, window)

        chunks = []
        for start in range(0, len(tokens), stride):
            chunk = tokens[start:start + window]

            # Every chunk but the first gets the header again; only the last
            # ``tail_budget`` tokens of the window are kept to make room for it.
            if start != 0:
                chunk = header_tokens + chunk[-tail_budget:]

            chunk_text = self.tokenizer.decode(chunk)
            chunk_text = f"<｜chunk_start｜>{chunk_text}<｜chunk_end｜>"
            chunks.append(chunk_text)

            if start + window >= len(tokens):
                # This window already reached the last token; a further window
                # would only repeat tokens that are part of this chunk.
                break

        return chunks

    def __len__(self):
        """Return the number of chunks collected from all input records."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Return chunk ``idx`` as ``{"content": text}``."""
        return {"content": self.examples[idx]}

# Setup and usage
tokenizer = AutoTokenizer.from_pretrained('deepseek-ai/DeepSeek-Coder-V2-Lite-Base')
# Register the marker strings that process_content inserts as special tokens of the tokenizer.
new_tokens = ["<｜arkts｜>", "<｜chunk_start｜>", "<｜chunk_end｜>"]
tokenizer.add_special_tokens({'additional_special_tokens': new_tokens})

# Build the dataset; constructing it also processes the input and writes the new jsonl file.
# NOTE(review): the input is valid_ets_files.jsonl, which an earlier cell also splits into
# test.jsonl / train.jsonl. If this output is used for training it still contains the
# evaluation records — confirm whether train.jsonl was intended here.
input_file = 'valid_ets_files.jsonl'
output_file = 'processed_ets_train_files.jsonl'
dataset = ArkTSDataset(input_file, output_file, tokenizer)
print(f"Total number of chunks: {len(dataset)}")

# To inspect the processed data:
with open(output_file, 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i < 5:  # print only the first 5 records as a sample
            print(json.loads(line))
        else:
            break

In [None]:
def load_checkpoint(checkpoint_path):
    """Load a checkpoint with ``torch.load`` and print two of its entries.

    The values stored under ``'optimizer_state_dict'`` and
    ``'model_state_dict'`` are printed in that order; nothing is returned.
    """
    # torch.load relies on pickle-based deserialization; only load checkpoint
    # files that come from a trusted source.
    state = torch.load(checkpoint_path)
    for section in ('optimizer_state_dict', 'model_state_dict'):
        print(state[section])

# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
checkpoint_path = '/home/CangjieLLM/src/code/train/checkpoints/temp_checkpoint.pt'
load_checkpoint(checkpoint_path)

In [None]:
# Spot check: print the 'content' of the record at index 7 (the 8th line) of the processed file.
filename = 'processed_ets_files.jsonl'
with open(filename, 'r', encoding='utf-8') as f:
    data = f.readlines()  # the whole file is read into memory
    for i, line in enumerate(data):
        if i == 7:
            d = json.loads(line)
            print(d['content'])
            break  # nothing is printed if the file has fewer than 8 lines