In [1]:
import json

def jsonl_to_txt(jsonl_file, txt_file):
    """
    Extract the 'correct' field of each JSONL record into a labelled text file.

    Every non-blank input line is parsed as JSON. For each JSON object that
    has a 'correct' key, the output receives one line '正确形式：<value>'
    followed by an empty line. Lines that are not valid JSON are reported on
    stdout and skipped; valid JSON that is not an object, or that lacks the
    'correct' key, is skipped silently.

    Failures are reported by printing a message, never by raising:
    a missing input file gets its own message, and any other error
    (including a failure to open the output path) is printed with the
    underlying error text.

    Args:
        jsonl_file (str): Path to input JSONL file
        txt_file (str): Path to output TXT file

    Returns:
        None
    """
    try:
        formatted_lines = []

        # Only the read is guarded by FileNotFoundError: a missing directory
        # on the *output* path raises the same exception type and must not be
        # reported as a missing input file.
        try:
            with open(jsonl_file, 'r', encoding='utf-8') as f:
                for line in f:
                    payload = line.strip()
                    # Skip empty lines
                    if not payload:
                        continue

                    # Parse each line as a JSON object
                    try:
                        data = json.loads(payload)
                    except json.JSONDecodeError:
                        print(f"警告：跳过无效的 JSON 行: {payload}")
                        continue

                    if isinstance(data, dict) and 'correct' in data:
                        formatted_lines.append(f"正确形式：{data['correct']}")
                        formatted_lines.append("")  # Blank separator line
        except FileNotFoundError:
            print(f"错误：找不到输入文件 {jsonl_file}")
            return

        # Write only after the whole input was read, so a read failure never
        # leaves a partially written output file behind.
        with open(txt_file, 'w', encoding='utf-8') as f:
            f.write('\n'.join(formatted_lines))

        print(f"转换完成！结果已保存到：{txt_file}")

    except Exception as e:
        print(f"处理过程中发生错误：{str(e)}")

# Example usage: convert the JSONL file into the labelled text file.
jsonl_file = "output(1).jsonl"  # Path to the input JSONL file
txt_file = "output(1).txt"     # Path to the output TXT file
jsonl_to_txt(jsonl_file, txt_file)

转换完成！结果已保存到：output(1).txt


In [5]:
def process_text_file(input_file, output_file):
    """
    Wrap every non-empty line of a text file in <p>...</p> tags.

    For each input line:
    1. Surrounding whitespace is removed
    2. A leading '正确形式：' label is removed, if present
    3. The remaining text, if non-empty, is written as '<p>text</p>'

    Empty lines (and lines consisting only of the label) produce no output.
    The text is inserted as-is; no HTML escaping is performed.

    Failures are reported by printing a message, never by raising:
    a missing input file gets its own message, and any other error
    (including a failure to open the output path) is printed with the
    underlying error text.

    Args:
        input_file (str): Path to input text file
        output_file (str): Path to output text file; may be the same path as
            input_file, because the input is read completely before the
            output is opened

    Returns:
        None
    """
    label = '正确形式：'

    try:
        # Only the read is guarded by FileNotFoundError: a missing directory
        # on the *output* path raises the same exception type and must not be
        # reported as a missing input file.
        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                lines = f.readlines()
        except FileNotFoundError:
            print(f"Error: Could not find input file {input_file}")
            return

        processed_lines = []
        for line in lines:
            # Strip first so an indented label is still recognised.
            text = line.strip()
            if text.startswith(label):
                text = text[len(label):].strip()

            # Only non-empty lines become paragraphs
            if text:
                processed_lines.append(f'<p>{text}</p>\n')

        # Write to output file
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(processed_lines)

        print(f"Successfully processed file and saved to {output_file}")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Example usage
# NOTE: input_file and output_file are the same path, so the file is
# overwritten in place. Re-running this cell feeds the already tagged lines
# back in and wraps them in a second <p>...</p> pair; use a distinct output
# path if the cell must be safe to re-run.
if __name__ == "__main__":
    input_file = "output(1).txt"    # Replace with your input file path
    output_file = "output(1).txt"  # Replace with your output file path
    process_text_file(input_file, output_file)

Successfully processed file and saved to output(1).txt


In [7]:
import json

def convert_to_jsonl(input_file, output_file):
    """
    Convert a text file of wrong/correct line pairs to JSONL format.

    A valid pair is a line starting with '错误文本：' immediately followed by
    a line starting with '正确文本：'. Each pair becomes one JSON object
    {"wrong": ..., "correct": ...} on its own output line, with the labels
    and surrounding whitespace removed. Blank lines are skipped. Any other
    non-blank line that does not start a valid pair is reported on stdout
    with its 1-based line number and skipped.

    Failures are reported by printing a message, never by raising:
    a missing input file gets its own message, and any other error
    (including a failure to open the output path) is printed with the
    underlying error text.

    Args:
        input_file (str): Path to input text file
        output_file (str): Path to output JSONL file

    Returns:
        None
    """
    wrong_label = '错误文本：'
    correct_label = '正确文本：'

    try:
        # Only the read is guarded by FileNotFoundError: a missing directory
        # on the *output* path raises the same exception type and must not be
        # reported as a missing input file.
        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                lines = f.readlines()
        except FileNotFoundError:
            print(f"Error: Could not find input file {input_file}")
            return

        jsonl_records = []
        i = 0

        while i < len(lines):
            wrong_line = lines[i].strip()

            # Skip empty lines
            if not wrong_line:
                i += 1
                continue

            # The partner must be the very next line; past the end of the
            # file an empty string stands in so the pair check simply fails.
            correct_line = lines[i + 1].strip() if i + 1 < len(lines) else ""

            if wrong_line.startswith(wrong_label) and correct_line.startswith(correct_label):
                jsonl_records.append({
                    'wrong': wrong_line[len(wrong_label):].strip(),
                    'correct': correct_line[len(correct_label):].strip()
                })
                # Move past both lines of the pair
                i += 2
            else:
                print(f"Warning: Invalid pair found at line {i+1}")
                # Advance one line only, so the next line can still start a pair
                i += 1

        # Write to JSONL file; ensure_ascii=False keeps the Chinese text readable
        with open(output_file, 'w', encoding='utf-8') as f:
            for record in jsonl_records:
                f.write(json.dumps(record, ensure_ascii=False) + '\n')

        print(f"Successfully converted to JSONL format and saved to {output_file}")

    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Example usage: turn the wrong/correct text pairs into a JSONL file.
if __name__ == "__main__":
    input_file = "output(1)2.txt"     # Replace with your input file path
    output_file = "output(1)2.jsonl" # Replace with your output file path
    convert_to_jsonl(input_file, output_file)

Successfully converted to JSONL format and saved to output(1)2.jsonl
