In [1]:
import datetime
import gzip
# mecab-python3 installed from pip
import MeCab
import polars as pl
import urllib.request

In [2]:
# なろうAPIを叩く
# APIDOC: https://dev.syosetu.com/man/api/
url = "https://api.syosetu.com/novelapi/api/?biggenre=2&out=json"

query_params = {
    # 大ジャンル指定
    "biggenre": 2,
    # gzipレベル
    "gzip": 5,
    # 出力形式
    "out": "json",
    # 過去7日間の更新のもののみ
    "lastupdate": "thisweek",
    # ピックアップ作品のみ
    "ispickup": 1,
    # R15作品は除外
    "notr15": 1,
    # 評価が高い順
    "order": "hyoka",
    # 100件
    "lim": 100
}

req = urllib.request.Request(
    "{}?{}".format(
        url,
        urllib.parse.urlencode(query_params),
    )
)

In [3]:
# API Request
with urllib.request.urlopen(req) as res:
    decoded_res = gzip.GzipFile(fileobj=res)
    body = decoded_res.read()

In [4]:
# save json to body_responces
body_json = body.decode("utf-8")
now = datetime.datetime.now()

path = "body_responses/body_json_{}.json".format(now.strftime("%Y%m%d%H%M%S"))
with open(path, mode="w") as file:
    file.write(body_json)

In [5]:
# Read JSON as polar dataframe
# 1行目は件数なので除外
df_body = pl.read_json(path).with_row_count("row_number").filter(pl.col("row_number") > 1).sort("global_point", reverse=True)
df_body.head()

row_number,allcount,title,ncode,userid,writer,story,biggenre,genre,gensaku,keyword,general_firstup,general_lastup,novel_type,end,general_all_no,length,time,isstop,isr15,isbl,isgl,iszankoku,istensei,istenni,pc_or_k,global_point,daily_point,weekly_point,monthly_point,quarter_point,yearly_point,fav_novel_cnt,impression_cnt,review_cnt,all_point,all_hyoka_cnt,sasie_cnt,kaiwaritu,novelupdated_at,updated_at
u32,i64,str,str,i64,str,str,i64,i64,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str
2,,"""【連載版】双子の姉が神子とし...","""N7146EG""",64980,"""池中織奈""","""【書籍5巻＆コミック3巻20...",2,201,"""""","""神子 捨てられ 姉 獣人 双...","""2017-09-20 19:...","""2023-02-14 20:...",1,1,385,825836,1652,1,0,0,0,0,0,0,2,116400,10,60,186,760,4790,45573,758,3,25254,2924,0,31,"""2023-02-14 20:...","""2023-02-14 21:..."
3,,"""異世界からの企業進出！？転職...","""N1881DN""",305917,"""七士七海""","""　この度めでたく皆様の応援も...",2,202,"""""","""異世界転移 日常 青春 冒険...","""2016-09-09 22:...","""2023-02-14 23:...",1,1,632,3523954,7048,0,0,0,0,0,0,1,2,68760,8,94,426,1270,6432,20720,1640,4,27320,2923,0,35,"""2023-02-14 23:...","""2023-02-14 23:..."
4,,"""弓と剣""","""N3017BO""",317870,"""淳A""","""ヴィジャヤン伯爵家三男サダは...",2,201,"""""","""""","""2013-03-14 11:...","""2023-02-14 01:...",1,1,476,2192630,4386,0,0,0,0,0,0,0,2,65009,48,156,276,1980,3502,18901,2370,10,27207,2956,0,32,"""2023-02-14 01:...","""2023-02-14 20:..."
5,,"""優しい家族と、たくさんのもふ...","""N3782FU""",1749196,"""ありぽん""","""　＊ツギクルブックス様より、...",2,201,"""""","""異世界転生 ファンタジー 異...","""2019-10-07 20:...","""2023-02-12 12:...",1,1,635,1812704,3626,0,0,0,0,0,1,0,2,41317,18,102,340,1218,5038,12777,1240,1,15763,1729,0,48,"""2023-02-12 19:...","""2023-02-14 20:..."
6,,"""シーフな魔術師""","""N5035L""",81670,"""極楽とんぼ""","""盗賊《シーフ》ギルドで働いて...",2,201,"""""","""魔術師 シーフ ファンタジー...","""2010-05-19 21:...","""2023-02-14 12:...",1,1,835,1675177,3351,0,0,0,0,0,0,0,2,21392,6,38,150,490,2404,6350,1538,1,8692,984,0,20,"""2023-02-14 12:...","""2023-02-14 18:..."


In [6]:
titles = df_body.select("title").get_column("title").to_list()
stories = df_body.select("story").get_column("story").to_list()

In [17]:
# titleを形態素解析する
tagger = MeCab.Tagger("-Owakati")

titles_node = tagger.parseToNode("".join(titles))
stories_node = tagger.parseToNode("".join(stories))

In [18]:
print(titles_node)

<Swig Object of type 'MeCab::Node *' at 0x10de1d330>


In [19]:
node_values = []
while titles_node:
    node_values.append(titles_node.feature.split(","))
    titles_node = titles_node.next

In [21]:
print(node_values[12])

['名詞', '普通名詞', '助数詞可能', '*', '*', '*', 'ガツ', '月', '月', 'ガツ', '月', 'ガツ', '漢', '*', '*', 'ツ促', '基本形', 'ガツ', 'ガツ', 'ガツ', 'ガツ', '*', '*', '1', 'C3', '*']
