In [1]:
import os
import re
import sys

import MeCab

In [2]:
nm = MeCab.Tagger('-Owakati')

# txtファイルを全部読み込んで分かち書きやクリーニングをしていく
files = [f for f in os.listdir('./dataset/raw_datas') if f.split('.')[1] == 'txt']
authors = {'dazai': [], 'mori': [], 'akutagawa': []}
cleaned_data = []

In [3]:
def clean(raw_text):
    """
    ルビや入力者注を削除して一文毎に分割,分かち書き
    """
    text = raw_text.replace('\n', '').replace('\u3000', '')
    text = re.sub('[［《]', '／', text)
    text = re.sub('[］》]', '＼', text)
    text = re.sub('／[^＼]*?＼', '', text)
    text = text.replace('。', '。\n')
    text = text.replace('「', '')
    text = text.replace('」', '\n').split('\n')

    return [nm.parse(t).split(' ') for t in text if t]

for f in files:
        name = f.split('.')[0]
        print(name)
        with open('./dataset/raw_datas/'+str(f), 'r') as f:
            clean_text = clean(f.read())
            authors[name] = clean_text
            cleaned_data += clean_text

akutagawa
mori
dazai


In [4]:
def make_stopdic(lines):
    """
    空白区切りの文の集まりのテキストのリストからストップワードの辞書を作成する。
    """
    calc_words = {}
    for line in lines:
        for word in line:
            if word in calc_words:
                calc_words[str(word)] += 1
            else:
                calc_words[str(word)] = 1
    sorted_stop = sorted(calc_words.items(), key=lambda x:x[1], reverse=True)
    print("sorted_stop:"+str(len(sorted_stop)))
    freq_num = int(len(sorted_stop)*0.03)
    n = 0
    stop_words = []
    print("freq_num:"+str(freq_num))
    for data in sorted_stop:
        stop_words.append(str(data[0]))
        #print('High frequency word: ',str(data[0]), str(data[1]))
        n += 1
        if n > freq_num:
            break
    # 作成したストップワードの辞書の保存
    with open('./origin_stopwords.txt', 'w') as f:
        f.write('\n'.join(stop_words))

make_stopdic(cleaned_data)
print("cleaned_data:"+str(len(cleaned_data)))

sorted_stop:13928
freq_num:417
cleaned_data:8876


In [5]:
# ストップワードリストの作成
stop_data = {}
for f in files:
    with open('./dataset/raw_datas/'+str(f)) as f1:
        print(f)
        nm = MeCab.Tagger('-Owakati')
        text = nm.parse(f1.read()).split(' ')
        for word in text:
            if word in stop_data:
                stop_data[str(word)] += 1
            else:
                stop_data[str(word)] = 1

akutagawa.txt
mori.txt
dazai.txt


In [6]:
sorted_stop = sorted(stop_data.items(), key=lambda x:x[1], reverse=True)

print(len(sorted_stop))

freq_num = int(len(sorted_stop)*0.03)
n = 0
stop_words = []
print(freq_num)
for data in sorted_stop:
    stop_words.append(str(data[0]))
    print('High frequency word: ',str(data[0]), str(data[1]))
    n+= 1
    if n > freq_num:
        break

# 作成したストップワードの辞書の保存
with open('./origin_stopwords.txt', 'w') as f:
    f.write('\n'.join(stop_words))

14900
447
High frequency word:  、 13365
High frequency word:  の 8869
High frequency word:  。 8102
High frequency word:  に 6747
High frequency word:  は 6410
High frequency word:  て 5906
High frequency word:  を 5388
High frequency word:  た 5329
High frequency word:  《 4561
High frequency word:  》 4556
High frequency word:  が 3447
High frequency word:  と 3082
High frequency word:  で 2866
High frequency word:  し 2176
High frequency word:  も 2105
High frequency word:  　 1802
High frequency word:  」 1614
High frequency word:  「 1575
High frequency word:  ない 1194
High frequency word:  な 1170
High frequency word:  い 1092
High frequency word:  だ 1076
High frequency word:  か 1067
High frequency word:  ある 938
High frequency word:  から 894
High frequency word:  へ 842
High frequency word:  う 832
High frequency word:  まし 727
High frequency word:  いる 712
High frequency word:  その 693
High frequency word:  ます 663
High frequency word:  人 653
High frequency word:  こと 642
High frequency word:  ん 608
High f

In [7]:
def stopword_bydic(text):
    """
    辞書によるストップワードの除去
    """
    # 読み込むストップワード辞書の指定。
    with open('./origin_stopwords.txt') as f:
        data = f.read()
        stopwords = data.split('\n')
    lines = []
    for line in text:
        words = []
        for word in line:
            if word not in stopwords:
                words.append(word)    
        lines.append(words)
    return lines

In [8]:
sum_lines = len(stopword_bydic(cleaned_data))
sum_words = []
for line in stopword_bydic(cleaned_data):
    for word in line:
        if word not in sum_words:
            sum_words.append(word)
print('sum_lines: ', sum_lines)
print('sum_words: ', len(sum_words))
with open('origin_words.txt', 'w') as f:
    f.write('\n'.join(sum_words))
cleaned_data = []
for author, data in authors.items():
    data = stopword_bydic(data)
    for line in data:
        #if len(line) > 2:
        cleaned_data.append(author+','+' '.join(line))

sum_lines:  8876
sum_words:  13489


In [9]:
# 作成したコーパスの保存
with open('./dataset/corpus.csv', 'w') as f:
    f.write(''.join(cleaned_data))