In [154]:
import os
import pandas as pd
import numpy as np
import collections
import json
from string import punctuation
import jieba
import re
import nltk
from nltk.stem import WordNetLemmatizer

In [198]:
class Build_dict():
    """Base class for building a word-frequency dictionary from JSON corpus files.

    Pipeline (driven by the subclasses' ``process``):
    ``read_json`` -> ``split_corpus`` -> ``build_dict`` -> ``write_dict``.

    Attributes set here:
        dir: directory scanned for ``*.json`` files (subclasses override it).
        context: paragraph texts collected by ``read_json``.
        total_words: flat token list filled by ``split_corpus``.
        word_dict: ``(word, count)`` pairs produced by ``build_dict``.
        stopwords: lines of the stop-word file, loaded by ``get_stopwords``.
    """

    def __init__(self):
        self.dict = {}
        self.dir = ''
        self.context = []
        self.total_words = []
        self.word_dict = {}
        self.get_stopwords()

    def get_stopwords(self):
        """Load the stop-word list (one entry per line) from the working directory."""
        with open('ingetrated_stopwords.txt', 'r', encoding='utf8') as f:
            self.stopwords = f.read().split('\n')

    def read_json(self):
        """Collect every paragraph ``context`` from all ``*.json`` files in ``self.dir``.

        Each file is expected to look like
        ``{"data": [{"paragraphs": [{"context": ...}, ...]}, ...]}``.

        The contexts of *all* files and *all* entries of ``data`` are accumulated.
        ``self.context`` is reset first, so calling this method again does not
        duplicate paragraphs.
        """
        collected = []
        # Sorted so the paragraph order does not depend on the OS listing order.
        for file in sorted(os.listdir(self.dir)):
            if not file.endswith('.json'):
                continue
            file_path = os.path.join(self.dir, file)
            with open(file_path, 'r', encoding='utf8') as f:
                data = json.load(f)
            # Walk every article, not only the first one, so multi-article
            # files contribute their whole text.
            for article in data['data']:
                collected.extend(item['context'] for item in article['paragraphs'])
            print(file + '已被读取。')
        self.context = collected

    def split_corpus(self):
        """Tokenisation hook; subclasses provide the real implementation."""
        print('分词工作具体处理')

    def build_dict(self):
        """Count ``self.total_words`` and store ``(word, count)`` pairs, most frequent first."""
        self.word_dict = collections.Counter(self.total_words).most_common()
        print('完成词典构建')

    def write_dict(self, path, data):
        """Write ``data`` to ``path`` as one ``"<word> <count>"`` line per item.

        Args:
            path: output file path (UTF-8, overwritten if it exists).
            data: iterable of ``(word, count)`` pairs.
        """
        with open(path, 'w', encoding='utf8') as f:
            for item in data:
                f.write(item[0] + ' ' + str(item[1]) + '\n')
            print('成功保存文件')
                     
        
class Build_Squad(Build_dict):
    """Dictionary builder for the English corpus stored under the ``squad`` directory."""

    def __init__(self):
        super(Build_Squad, self).__init__()
        self.dir = 'squad'

    def split_corpus(self):
        """Tokenise ``self.context`` into lemmatised words appended to ``self.total_words``.

        Per paragraph: digit runs are replaced by a marker, the text is
        lower-cased and split on whitespace, surrounding punctuation is stripped
        from each token, empty tokens and stop words are dropped, and the
        remaining tokens are lemmatised with WordNet.
        """
        lemmatizer = WordNetLemmatizer()
        punctuations = punctuation + '~！？，。：；’‘“”【】、|（）——…… \n\t\r'
        # Set lookup instead of scanning the stop-word list for every token.
        stopwords = set(self.stopwords)
        for sentence in self.context:
            # Replace digit runs with a special marker.
            # NOTE(review): the marker is lower-cased and its angle brackets are
            # stripped below, so a standalone number ends up as the token 'num'.
            sentence = re.sub(r'\d+', '<NUM>', sentence)
            # split() without arguments also breaks on newlines/tabs and never
            # yields empty strings for repeated spaces.
            for raw in sentence.lower().split():
                word = raw.strip(punctuations)
                # Skip tokens that were pure punctuation, and test the stop-word
                # list against both forms so that "the," is filtered like "the".
                if not word or raw in stopwords or word in stopwords:
                    continue
                self.total_words.append(lemmatizer.lemmatize(word))
        print('完成分词')

    def process(self):
        """Run the full pipeline and write the dictionary to ``<dir>.txt``."""
        self.read_json()
        self.split_corpus()
        self.build_dict()
        self.write_dict(self.dir + '.txt', self.word_dict)
        
class Build_dureader(Build_dict):
    """Dictionary builder for the Chinese corpus stored under the ``dureader`` directory."""

    def __init__(self):
        super(Build_dureader, self).__init__()
        self.dir = 'dureader'

    def split_corpus(self):
        """Segment ``self.context`` with jieba and append the tokens to ``self.total_words``.

        Per paragraph: digit runs are replaced by a marker, the text is segmented
        with ``jieba.cut``, surrounding whitespace and punctuation are stripped
        from each token, and empty tokens and stop words are dropped.
        """
        punctuations = punctuation + '~！？，。：；’‘“”【】、|（）——…… \n\t\r'
        # Set lookup instead of scanning the stop-word list for every token.
        stopwords = set(self.stopwords)
        for sentence in self.context:
            # Replace digit runs with a special marker.
            # NOTE(review): how jieba segments '<NUM>' is not visible here; the
            # angle brackets are in the strip set, so verify the resulting token.
            sentence = re.sub(r'\d+', '<NUM>', sentence)
            for raw in jieba.cut(sentence):
                # Clean once and filter on the cleaned value, so the emptiness
                # check and the emitted token can never disagree.
                word = raw.strip().strip(punctuations)
                if not word or raw in stopwords or word in stopwords:
                    continue
                self.total_words.append(word)
        print('完成分词')

    def process(self):
        """Run the full pipeline and write the dictionary to ``<dir>.txt``."""
        self.read_json()
        self.split_corpus()
        self.build_dict()
        self.write_dict(self.dir + '.txt', self.word_dict)
    

In [27]:
# Run the DuReader pipeline (read JSON -> segment -> count -> write).
# Needs 'ingetrated_stopwords.txt' in the working directory and a 'dureader'
# folder of *.json files; the result is written to 'dureader.txt'.
a = Build_dureader()
a.process()

dev.json已被读取。
train.json已被读取。
完成分词
完成词典构建
成功保存文件


In [199]:
# Run the SQuAD pipeline (read JSON -> tokenise/lemmatise -> count -> write).
# Needs 'ingetrated_stopwords.txt' in the working directory and a 'squad'
# folder of *.json files; the result is written to 'squad.txt'.
b = Build_Squad()
b.process()

dev-v2.0.json已被读取。
train-v2.0.json已被读取。
完成分词
完成词典构建
成功保存文件
