In [27]:
import pandas as pd
import re

# Clean the scraped articles: strip boilerplate (datelines, reporter credits,
# publication stamps), drop unusable rows, and write the result to a new CSV.

# Minimum number of CJK ideographs an article body must contain to be kept.
MIN_CHINESE_CHARS = 100

# Load the raw data; the file must provide 'Title' and 'Content' columns.
df = pd.read_csv('data_xjp.csv')

# Boilerplate fragments to delete from the article text. The patterns are
# applied one after another in list order, so later patterns see the output
# of earlier ones.
patterns_to_remove = [
    r"新华社.*?电",
    r"（记者.*?）",
    r"本报.*?电  （记者.*?）",
    # NOTE: this entry previously lacked its trailing comma, so Python's implicit
    # string-literal concatenation silently merged it with the next pattern and
    # neither of them was ever applied on its own.
    r'《\s*人民日报\s*》（\s*\d{4}年\d{2}月\d{2}日\s*\d{2}\s*版\s*）',
    r"（记者.*?、.*?）",
    r"新华社快讯：",
    r'《.*》（\s*\d{4}年\d{2}月\d{2}日\s*\d{2}\s*版\s*）',
    r'（）',
    r'本报(.+?)(\d+月\d+日)电',
    r"\(.*?\)",  # remove all content within (ASCII) parentheses
]

# Compile once instead of re-parsing every pattern for every pass over the column.
_compiled_patterns = [re.compile(pattern) for pattern in patterns_to_remove]


def _strip_boilerplate(text):
    """Return `text` with every pattern in `patterns_to_remove` deleted, in list order."""
    for compiled in _compiled_patterns:
        text = compiled.sub('', text)
    return text


# Remove rows with empty 'Title' or 'Content'. The explicit copy lets the
# column assignment below operate on an independent frame.
df = df.dropna(subset=['Title', 'Content']).copy()

# Remove the specified patterns from the content. Values are coerced to str so
# that a non-text cell (e.g. a number parsed by read_csv) cannot crash re.sub.
df['Content'] = df['Content'].astype(str).apply(_strip_boilerplate)

# Remove rows whose content is identical to the title
df = df[df['Title'] != df['Content']]

# Remove rows where 'Content' contains fewer than MIN_CHINESE_CHARS Chinese characters.
# Copy after filtering so the assignment below does not raise SettingWithCopyWarning.
chinese_char_count = df['Content'].str.count(r'[\u4e00-\u9fff]')
df = df[chinese_char_count >= MIN_CHINESE_CHARS].copy()

# Turn every remaining space into a line break (literal replacement, not a regex).
df['Content'] = df['Content'].str.replace(' ', '\n', regex=False)

# Save the cleaned data to a new CSV file
df.to_csv('cleaned_file9.csv', index=False)


In [31]:
import pandas as pd
import json

# Convert the cleaned articles into instruction-style JSON records
# (keys: 'instruction', 'input', 'output').

# Read back the cleaned CSV
df = pd.read_csv('cleaned_file9.csv')

# Build the export frame in one step:
#   - every space in 'Content' becomes a line break,
#   - a constant 'input' column is added (the fixed label below, not an empty string),
#   - columns are put in the order Title, input, Content.
export_columns = ['Title', 'input', 'Content']
df = df.assign(
    Content=df['Content'].str.replace(' ', '\n'),
    input='政治写作',
)[export_columns]

# One dict per row, with the columns renamed to the output keys
key_names = {'Title': 'instruction', 'Content': 'output'}
data = df.rename(columns=key_names).to_dict(orient='records')

# Dump the records as human-readable UTF-8 JSON (non-ASCII characters kept as-is)
with open('output.json', 'w', encoding='utf-8') as json_file:
    json.dump(data, json_file, ensure_ascii=False, indent=2)
