In [1]:
import os
import re
import joblib
from pyhanlp import *
import unicodedata
from datetime import datetime

In [None]:
# Only articles whose "source" field is one of these names are kept.
sources = ['新浪财经']

# Regular expressions that pull individual fields out of one raw line of a
# news file.  Each pattern anchors on the key that follows the wanted field,
# so it relies on the keys appearing in this exact order within the line.
source_pattern = r'"source": "(.*?)", "url"'
time_pattern = r'"pub_time": "(.*?)", "content"'
headline_pattern = r'"headline": "(.*?)", "author"'
content_pattern = r'"content": "(.*?)", "md5_doc_id"'

# Accumulator for the processed articles.
data = []

def remove_non_utf8(text):
    """Return ``text`` with every character that cannot be UTF-8 encoded removed.

    A Python ``str`` can hold code points (for example lone surrogates) that
    have no UTF-8 representation.  Encoding with ``errors='ignore'`` drops all
    such characters in a single pass; everything else round-trips unchanged.

    Args:
        text: The string to sanitise.

    Returns:
        A copy of ``text`` containing only UTF-8 encodable characters.
    """
    # Encoding a str can only fail with UnicodeEncodeError, and decoding the
    # bytes we just produced cannot fail at all, so catching
    # UnicodeDecodeError around this round trip would never trigger.
    return text.encode('utf-8', errors='ignore').decode('utf-8')

# Fullwidth characters and, position by position, their halfwidth equivalents.
# The two strings must stay the same length for str.maketrans.
_FULLWIDTH_CHARS = '０１２３４５６７８９ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ，．？！＠＃＄％＾＆＊（）＿＋－＝｛｝［］｜＼：；＇＂＜＞／〜'
_HALFWIDTH_CHARS = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz,.?!@#$%^&*()_+-={}[]|\\:;\'"<>/~'

# Built once at definition time instead of on every call.
_FULLWIDTH_TO_HALFWIDTH = str.maketrans(_FULLWIDTH_CHARS, _HALFWIDTH_CHARS)


def fullwidth_to_halfwidth(text):
    """Replace fullwidth digits, Latin letters and punctuation with halfwidth ones.

    Only the characters listed in ``_FULLWIDTH_CHARS`` are converted; every
    other character (including CJK text) is returned unchanged.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with each listed fullwidth character replaced by its
        halfwidth counterpart.
    """
    return text.translate(_FULLWIDTH_TO_HALFWIDTH)

PerceptronLexicalAnalyzer = JClass('com.hankcs.hanlp.model.perceptron.PerceptronLexicalAnalyzer')
analyzer = PerceptronLexicalAnalyzer()
# Tokens whose "word/label" text contains any of these label suffixes are
# discarded after analysis.
drop_list = ['/介词', '/代词', '/副词', '/助词', '/方位词', '/时间词', '/数词', '/连词', '/量词', '/副形词', '/标点符号', '/名语素']
# Articles published before this instant are skipped.
time_split = datetime.strptime('2019-5-1 00:00:00', '%Y-%m-%d %H:%M:%S')


def _tokenize(text):
    """Analyse ``text`` and return the words whose labels are not in ``drop_list``.

    The analyzer output is rendered to a string and split on spaces; each
    piece is expected to look like ``word/label``.  Pieces without a ``/`` and
    pieces containing any ``drop_list`` entry are removed.  For the rest, the
    text before the first ``/`` is kept, with any ``[`` characters stripped.
    """
    tagged = analyzer.analyze(text).translateLabels().toString().split(' ')
    return [
        word.split('/')[0].replace('[', '')
        for word in tagged
        if '/' in word and not any(drop in word for drop in drop_list)
    ]


for file in os.listdir('./CN_news/'):
    print('Reading file ', file)
    # The handle gets its own name so it is not confused with the article
    # text, which is stored in ``content`` below.
    with open('./CN_news/' + file, 'r', encoding='utf-8') as news_file:
        lines = news_file.readlines()
    for line in lines:
        # Keep only lines that come from one of the wanted sources.
        source_match = re.search(source_pattern, line)
        if not source_match or source_match.group(1) not in sources:
            continue
        source = source_match.group(1)

        # Keep only lines with a publication time at or after time_split.
        time_match = re.search(time_pattern, line)
        if not time_match:
            continue
        # Turn an ISO-like "YYYY-MM-DDTHH:MM:SS...Z" value into the first 19
        # characters of "YYYY-MM-DD HH:MM:SS" before parsing.
        time = time_match.group(1).replace('T', ' ').replace('Z', ' ').strip()[:19]
        time = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
        if time_split > time:
            continue

        # Strip ideographic spaces, literal "\n" sequences and plain spaces,
        # then cut everything from the trailing author statement onward.
        content_match = re.search(content_pattern, line)
        if not content_match:
            continue
        content = content_match.group(1).replace('\u3000', '').replace('\\n', '').replace(' ', '').split('本版作者声明')[0]
        # Bodies shorter than 50 characters (including empty ones) are skipped.
        if len(content) < 50:
            continue

        # Skip reader notices and advertisements, identified by their headline.
        headline_match = re.search(headline_pattern, line)
        if not headline_match:
            continue
        headline = headline_match.group(1).replace(' ', '')
        if '敬告读者：' in headline or '(广告)' in headline:
            continue

        merged = headline + ',' + content
        merged = fullwidth_to_halfwidth(merged)

        try:
            merged = _tokenize(merged)
        except Exception:
            # Retry once after sanitising the text.  Catching Exception rather
            # than using a bare ``except`` lets KeyboardInterrupt/SystemExit
            # stop the run instead of triggering a retry.  A failure of the
            # retry itself propagates.
            merged = _tokenize(remove_non_utf8(merged))

        data.append({'source' : source, 'time' : time, 'news' : merged})

        # NOTE(review): the list is written to disk only at the moment it
        # holds exactly 100000 records.  Records appended afterwards are not
        # persisted, and nothing is written if fewer than 100000 records are
        # collected.  Confirm this is the intended checkpoint behaviour.
        if len(data) == 100000:
            joblib.dump(data, './FinanceNews.list')
            print('======================')

Reading file  277.json
Reading file  145.json
Reading file  160.json
Reading file  114.json
Reading file  64.json
Reading file  259.json
Reading file  232.json
Reading file  239.json
Reading file  231.json
Reading file  35.json
