In [31]:
import json
import jsonpath
import jieba
from collections import Counter
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
# Fetch the NLTK data this notebook relies on:
#   - 'wordnet' backs WordNetLemmatizer;
#   - nltk.word_tokenize additionally needs the Punkt tokenizer models, which
#     are packaged as 'punkt' in older NLTK releases and 'punkt_tab' in newer
#     ones. Both ids are requested so the notebook survives a fresh environment.
# NOTE(review): with default arguments nltk.download is expected to report an
# unknown or already-installed package and return rather than raise — confirm
# against the installed NLTK version.
for nltk_resource in ('wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package wordnet to /Users/solo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [32]:
def get_json_to_dict(path):
    """Read a JSON file and return the decoded Python object.

    Args:
        path: Path of the JSON file to load.

    Returns:
        Whatever ``json`` decodes from the file (a dict when the top-level
        JSON value is an object).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without it open() falls back to the locale's
    # preferred encoding, which garbles or rejects non-ASCII (e.g. Chinese)
    # text on systems whose locale is not UTF-8.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

def to_file(path_name, content):
    """Write (token, count) pairs to a text file, one ``token count`` per line.

    Args:
        path_name: Destination file path; an existing file is overwritten.
        content: Iterable of 2-item pairs, e.g. the list returned by
            ``Counter.most_common()``. Pairs are written in iteration order.

    Raises:
        OSError: If the file cannot be opened for writing.
        ValueError / TypeError: If an item of ``content`` is not a 2-item pair.
    """
    # Build every line before opening the file so malformed input does not
    # leave a truncated file behind; a list of lines also avoids the repeated
    # string concatenation of growing text.
    lines = ['{} {}\n'.format(token, count) for token, count in content]
    # Explicit UTF-8 so non-ASCII tokens can be written regardless of the
    # platform's locale encoding.
    with open(path_name, 'w', encoding='utf-8') as f:
        f.writelines(lines)

def get_json_value(json_data, key_name):
    """Collect every value stored under ``key_name`` at any depth of ``json_data``.

    Args:
        json_data: Decoded JSON structure (nested dicts / lists) to search.
        key_name: Key to look up; it is inserted into the recursive-descent
            JSONPath expression ``$..<key_name>``.

    Returns:
        A list of the matching values, or an empty list when nothing matches.
    """
    matches = jsonpath.jsonpath(json_data, '$..{key_name}'.format(key_name=key_name))
    # jsonpath.jsonpath signals "no match" with False instead of an empty list;
    # normalise that so callers can always iterate over the result.
    return matches if matches else []

def cn_cut_sentences_list(sentences_list):
    """Segment Chinese sentences into words with jieba.

    Args:
        sentences_list: Iterable of sentence strings.

    Returns:
        A single flat list containing the tokens of every sentence, in the
        order the sentences were given.
    """
    # cut_all=False asks jieba for its default (non "full mode") segmentation.
    return [
        token
        for text in sentences_list
        for token in jieba.lcut(text, cut_all=False)
    ]


def en_cut_sentences_list(sentences_list):
    """Tokenize English sentences and lemmatize each lower-cased token.

    Each sentence is split with ``nltk.word_tokenize``; every token is then
    lower-cased and passed through ``WordNetLemmatizer.lemmatize`` (this is
    lemmatization, not stemming).

    Args:
        sentences_list: Iterable of sentence strings.

    Returns:
        A single flat list of the lemmatized tokens of all sentences, in
        input order.
    """
    wordnet_lemmatizer = WordNetLemmatizer()  # one instance shared by all sentences
    return [
        wordnet_lemmatizer.lemmatize(token.lower())
        for text in sentences_list
        for token in nltk.word_tokenize(text)
    ]

def du_reader_process(path, output_path='du_reader_context.txt'):
    """Count word frequencies over the "context" fields of a DuReader JSON file.

    The file is loaded, every value stored under a ``context`` key is
    segmented with ``cn_cut_sentences_list``, and the resulting
    (word, count) pairs are written, most frequent first, via ``to_file``.

    Args:
        path: Path of the DuReader-style JSON file.
        output_path: Where to write the frequency table. Defaults to
            ``'du_reader_context.txt'`` in the current working directory.
    """
    du_reader_dict = get_json_to_dict(path)
    # `or []`: the lookup may yield a falsy "no match" result (the underlying
    # jsonpath call returns False then); treat that as "no sentences".
    du_reader_context = get_json_value(du_reader_dict, 'context') or []

    context_word_list = cn_cut_sentences_list(du_reader_context)
    context_word_freq = Counter(context_word_list).most_common()
    to_file(output_path, context_word_freq)

def squad_process(path, output_path='squad_context.txt'):
    """Count word frequencies over the "context" fields of a SQuAD JSON file.

    The file is loaded, every value stored under a ``context`` key is
    tokenized and lemmatized with ``en_cut_sentences_list``, and the resulting
    (word, count) pairs are written, most frequent first, via ``to_file``.

    Args:
        path: Path of the SQuAD-style JSON file.
        output_path: Where to write the frequency table. Defaults to
            ``'squad_context.txt'`` in the current working directory.
    """
    squad_dict = get_json_to_dict(path)
    # `or []`: the lookup may yield a falsy "no match" result (the underlying
    # jsonpath call returns False then); treat that as "no sentences".
    squad_context = get_json_value(squad_dict, 'context') or []

    context_word_list = en_cut_sentences_list(squad_context)
    context_word_freq = Counter(context_word_list).most_common()
    to_file(output_path, context_word_freq)

In [33]:
# Locations of the input JSON files that are passed to du_reader_process and
# squad_process further down (nothing is loaded in this cell).
# NOTE(review): these are absolute paths on the author's machine; change them
# (or derive them from a configurable data directory) before running elsewhere.
du_reader_json_path = r'/Users/solo/学习/nlp学习/基于大规模预训练模型的机器阅读理解/week1/datas/demo/demo_dev.json'
squad_json_path = r'/Users/solo/学习/nlp学习/基于大规模预训练模型的机器阅读理解/week1/datas/train-v2.0.json'

In [34]:
# Build the word-frequency tables. Each call writes its result to a text file
# in the current working directory and returns nothing.
du_reader_process(path=du_reader_json_path)  # Chinese contexts, segmented with jieba
squad_process(path=squad_json_path)          # English contexts, tokenized with NLTK