## Web Novelのタイトルデータを保存する

`./ndata` ディレクトリがない場合は，事前に `mkdir ndata` で作成してください（このノートブックが保存するpickleファイルはすべてこのディレクトリに出力されます）。

In [58]:
import json
import requests
import gzip
import time
import re
from datetime import datetime
import string
from pprint import pprint
import pandas as pd
import pickle
from sudachipy import tokenizer 
from sudachipy import dictionary
# Build one SudachiPy tokenizer from the default dictionary; it is reused for
# every title in the tokenizing cell further down.
tokenizer_obj = dictionary.Dictionary().create()
# Split mode handed to tokenizer_obj.tokenize() when the titles are segmented.
mode = tokenizer.Tokenizer.SplitMode.C

In [11]:
# Directory prefix used for every pickle this notebook reads or writes.
data_path = './ndata/'

## Narou APIのテスト

In [6]:
# Smoke test of the Narou novel API: request a single record (lim=1) as JSON.
url = "http://api.syosetu.com/novelapi/api/?out=json&lim=1&gzip=5"

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of as a confusing
# gzip/JSON failure a few lines below.
response = requests.get(url, timeout=30)
response.raise_for_status()

# The body itself is a gzip payload, so the raw bytes are decompressed by hand
# before being decoded. (Setting response.encoding would only affect
# response.text, which is never used.)
res_content = gzip.decompress(response.content).decode("utf-8")
response_json = json.loads(res_content)

pprint(response_json)

[{'allcount': 699662},
 {'all_hyoka_cnt': 11,
  'all_point': 92,
  'biggenre': 2,
  'daily_point': 2,
  'end': 1,
  'fav_novel_cnt': 78,
  'general_all_no': 156,
  'general_firstup': '2019-05-15 23:29:44',
  'general_lastup': '2019-12-28 13:27:49',
  'genre': 201,
  'gensaku': '',
  'global_point': 248,
  'impression_cnt': 29,
  'isbl': 0,
  'isgl': 0,
  'isr15': 1,
  'isstop': 0,
  'istenni': 1,
  'istensei': 1,
  'iszankoku': 1,
  'kaiwaritu': 25,
  'keyword': 'R15 残酷な描写あり 異世界転生 異世界転移 異能力バトル 冒険 近未来 人工知能 女主人公 異星 チート 新世界 ドラゴン '
             '異世界 不死身 魔法 魔術師 ナノテクノロジー 相棒',
  'length': 517191,
  'monthly_point': 86,
  'ncode': 'N9318FM',
  'novel_type': 1,
  'novelupdated_at': '2019-12-28 13:27:49',
  'pc_or_k': 2,
  'quarter_point': 208,
  'review_cnt': 0,
  'sasie_cnt': 13,
  'story': '退屈な地球を自ら飛び出した少女・荒野拓美。数千年に渡る放浪の末に\n'
           '辿り着いた新世界で、相棒であるナノテクノロジーＡＩ・タカネと共に\n'
           '新たな人生に挑む！そんな彼女の切り札は、どんな損傷も一瞬で修復し\n'
           'さらに喰らった生き物の特性を自分のものにするチートな体。\n'
           '地味だけど便利なこの能力と創意工夫

## Ncodeを生成する

In [13]:
def get_ncode_all(limit='N9999A'):
    '''
    Generate Narou ncodes in order, starting from 'n0000a'.

    Each code is 'n' + a zero-padded 4-digit number + a letter suffix. The
    number runs 0000-9999 for one suffix before moving to the next suffix.
    Single-letter suffixes ('a'..'z') come first, followed by two-letter
    suffixes, ending with 'n9999zz'.

    Parameters
    ----------
    limit : str or None
        Last ncode to yield (compared case-insensitively). The default stops
        after 'n9999a', i.e. 10000 codes. Pass None to run to 'n9999zz'.

    Yields
    ------
    str
        The next ncode, in lower case.
    '''
    letters = string.ascii_lowercase
    stop_code = limit.upper() if limit is not None else None

    # second_idx == 0 means "single-letter suffix"; values >= 1 append
    # letters[second_idx] as a second letter.
    # NOTE(review): the two-letter ordering (first letter varies fastest, second
    # letter is never 'a') is kept from the original implementation - confirm it
    # against the real Narou ncode sequence before raising `limit` past 'z'.
    for second_idx in range(len(letters)):
        for first_letter in letters:
            if second_idx == 0:
                suffix = first_letter
            else:
                suffix = first_letter + letters[second_idx]

            # Every suffix restarts at 0000, matching the very first code 'n0000a'.
            for number in range(10000):
                ncode = 'n' + str(number).zfill(4) + suffix
                yield ncode

                # Stop right after the requested upper bound has been produced.
                if ncode.upper() == stop_code:
                    return

In [21]:
narou_title_dic = dict()

# Reuse the titles saved by an earlier run when the pickle exists; otherwise
# start from the empty dictionary created above.
# NOTE: pickle.load can execute arbitrary code - only load files written by
# this notebook.
try:
    # Pickle data is binary, so the file has to be opened with "rb". In text
    # mode pickle.load always fails and the cache would never be used.
    with open(data_path+"narou_title_dic.pkl", "rb") as f:
        narou_title_dic = pickle.load(f)
        print('作成された辞書を使用します．')
# Only a missing file means "no cache yet". Any other error (for example a
# corrupt pickle) is left to propagate rather than silently discarding the data.
except FileNotFoundError:
    print('新規辞書データを作成し，使用します．')

新規辞書データを作成し，使用します．


In [23]:
%%time

count = 0
for narou_ncode in get_ncode_all():
    
    url_before = "http://api.syosetu.com/novelapi/api/?out=json&gzip=5&ncode="
    url = url_before + narou_ncode
    
    if narou_ncode not in narou_title_dic:
        time.sleep(1.09)
        s = requests.Session()
        response = s.get(url)
        response.encoding = 'gzip'
        r = response.content
        res_content = gzip.decompress(r).decode("utf-8")
        response_json = json.loads(res_content)

        if response_json[0]['allcount'] == 1:
            narou_title_dic[str(narou_ncode)] = response_json[1]['title']
#             print(narou_title_dic[narou_ncode])
        else:
            narou_title_dic[str(narou_ncode)] = '0'

    
    if count % 1000 == 0:
        print("now : ",datetime.now().strftime("%Y/%m/%d %H:%M:%S")," count : ",count)
        
    count += 1

now :  2019/12/28 20:25:48  count :  0
now :  2019/12/28 20:31:29  count :  1000
now :  2019/12/28 20:38:10  count :  2000
now :  2019/12/28 20:45:00  count :  3000
now :  2019/12/28 20:53:22  count :  4000
now :  2019/12/28 20:59:14  count :  5000
now :  2019/12/28 21:04:58  count :  6000
now :  2019/12/28 21:10:41  count :  7000
now :  2019/12/28 21:16:22  count :  8000
now :  2019/12/28 21:22:03  count :  9000
CPU times: user 52.6 s, sys: 8.52 s, total: 1min 1s
Wall time: 1h 2min 7s


In [24]:
# Number of ncodes recorded so far (real titles and '0' placeholders alike).
len(narou_title_dic)

10000

## Dictとして保存する

In [26]:
# Persist the ncode -> title dictionary as a pickle under data_path.
with open(data_path + "narou_title_dic.pkl", "wb") as pkl_file:
    pickle.dump(narou_title_dic, pkl_file)

## 数値のkeyにした場合（keyを文字列に変換してからDataFrameにする）

In [135]:
%%time

narou_title_dic_s = dict()

for k,v in narou_title_dic.items():
    narou_title_dic_s[str(k)] = v

CPU times: user 6.14 ms, sys: 1.19 ms, total: 7.33 ms
Wall time: 8.09 ms


In [136]:
# Wide one-row frame: every ncode becomes a column, and the single row is
# labelled "nTitle".
df = pd.DataFrame(data=narou_title_dic_s, index=["nTitle"])

In [137]:
# Long layout: one row per ncode with a single "nTitle" column. The frame is
# rebuilt from the dictionary rather than written as `df = df.T`, because a
# self-referential transpose flips back to the wide layout when the cell is
# executed a second time.
df = pd.DataFrame(narou_title_dic_s, index=["nTitle"]).T

In [138]:
# Peek at the first five rows (index = ncode, column = nTitle).
df.head(5)

Unnamed: 0,nTitle
n0000a,0
n0001a,0
n0002a,0
n0003a,0
n0004a,0


In [139]:
# Keep only ncodes that have a real title. The crawl cell stores the *string*
# '0' as its "no novel" placeholder, which a comparison against the integer 0
# never matches. The integer form is excluded as well, in case a dictionary
# saved with a numeric placeholder is loaded.
df_title = df[~df['nTitle'].isin([0, '0'])]

In [140]:
# Summary of the remaining titles: count, number of unique titles, most frequent one.
df_title.describe()

Unnamed: 0,nTitle
count,4241
unique,4167
top,赤い糸
freq,4


## DataFrameとしてTitleのみを保存する

In [141]:
# Persist the filtered title DataFrame as a pickle under data_path.
with open(data_path + "narou_title_pd.pkl", "wb") as pkl_file:
    pickle.dump(df_title, pkl_file)

## SudachiPyで分かち書き

In [142]:
%%time
title_wakachi_word_set = set()
df_title_words_list = []
df_title_list = df_title['nTitle'].to_list()

CPU times: user 1.79 ms, sys: 393 µs, total: 2.19 ms
Wall time: 2.89 ms


In [143]:
%%time

for df_title in df_title_list:
    # 前処理
    df_title =re.sub(r'[!-~]', "", df_title)#半角記号,数字,英字
    df_title =re.sub(r'[︰-＠]', "", df_title)#全角記号
    df_title =re.sub(r'[「」【】『』（）]', "", df_title)
    df_title =re.sub(r'[〜。、]', "", df_title)#句読点
    df_title =re.sub(r'\u3000', "", df_title)#全角スペース
    df_title =re.sub(r' ', "", df_title)#半角スペース
    df_title =re.sub('\n', "", df_title)#改行文字
    
    
    wlist = [m.surface() for m in tokenizer_obj.tokenize(df_title, mode)]
    for word in wlist:
        if word != '':
            title_wakachi_word_set.add(word)
    if wlist != []:
        df_title_words_list.append(wlist)


CPU times: user 8.45 s, sys: 227 ms, total: 8.67 s
Wall time: 10.1 s


In [144]:
# Number of titles that produced at least one token.
len(df_title_words_list)

3936

## 語彙Setを保存する

In [145]:
# Persist the vocabulary set (unique token surface forms) as a pickle under data_path.
with open(data_path + "narou_title_vocab.pkl", "wb") as pkl_file:
    pickle.dump(title_wakachi_word_set, pkl_file)

## タイトルを分かち書きしたもの(list)を保存する

In [146]:
# Persist the per-title token lists as a pickle under data_path.
with open(data_path + "narou_title_words_list.pkl", "wb") as pkl_file:
    pickle.dump(df_title_words_list, pkl_file)