In [1]:
import datetime
import gzip
# mecab-python3 installed from pip
import MeCab
import polars as pl
import urllib.request

In [2]:
# なろうAPIを叩く
# APIDOC: https://dev.syosetu.com/man/api/
url = "https://api.syosetu.com/novelapi/api/?biggenre=2&out=json"

query_params = {
    # 大ジャンル指定
    "biggenre": 2,
    # gzipレベル
    "gzip": 5,
    # 出力形式
    "out": "json",
    # 過去7日間の更新のもののみ
    "lastupdate": "thisweek",
    # ピックアップ作品のみ
    "ispickup": 1,
    # R15作品は除外
    "notr15": 1,
    # 評価が高い順
    "order": "hyoka",
    # 100件
    "lim": 100
}

req = urllib.request.Request(
    "{}?{}".format(
        url,
        urllib.parse.urlencode(query_params),
    )
)

In [3]:
# API Request
with urllib.request.urlopen(req) as res:
    decoded_res = gzip.GzipFile(fileobj=res)
    body = decoded_res.read()

In [4]:
# save json to body_responces
body_json = body.decode("utf-8")
now = datetime.datetime.now()

path = "body_responses/body_json_{}.json".format(now.strftime("%Y%m%d%H%M%S"))
with open(path, mode="w") as file:
    file.write(body_json)

In [5]:
# Read JSON as polar dataframe
# 1行目は件数なので除外
df_body = pl.read_json(path).with_row_count("row_number").filter(pl.col("row_number") > 1).sort("global_point", reverse=True)
df_body.head()

row_number,allcount,title,ncode,userid,writer,story,biggenre,genre,gensaku,keyword,general_firstup,general_lastup,novel_type,end,general_all_no,length,time,isstop,isr15,isbl,isgl,iszankoku,istensei,istenni,pc_or_k,global_point,daily_point,weekly_point,monthly_point,quarter_point,yearly_point,fav_novel_cnt,impression_cnt,review_cnt,all_point,all_hyoka_cnt,sasie_cnt,kaiwaritu,novelupdated_at,updated_at
u32,i64,str,str,i64,str,str,i64,i64,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str
2,,"""【連載版】双子の姉が神子とし...","""N7146EG""",64980,"""池中織奈""","""【書籍5巻＆コミック3巻20...",2,201,"""""","""神子 捨てられ 姉 獣人 双...","""2017-09-20 19:...","""2023-02-14 20:...",1,1,385,825836,1652,0,0,0,0,0,0,0,2,116418,20,82,194,778,4790,45570,758,3,25278,2926,0,31,"""2023-02-14 20:...","""2023-02-16 18:..."
3,,"""どうやら私の身体は完全無敵の...","""N3881DN""",899307,"""ちゃつふさ""","""【第5回ネット小説大賞受賞】...",2,201,"""""","""異世界転生 オリジナル戦記 ...","""2016-09-13 20:...","""2023-01-28 11:...",1,1,224,1046942,2094,0,0,0,0,0,1,0,2,96389,14,86,342,1086,6014,35794,1104,8,24801,2721,2,46,"""2023-02-16 00:...","""2023-02-16 18:..."
4,,"""大賢者の愛弟子　〜防御魔法の...","""N2159DD""",604944,"""ナカノムラアヤスケ""","""数ある魔法の中で初心者向けと...",2,201,"""""","""青春 ラブコメ 防御魔法 転...","""2016-02-14 11:...","""2023-02-13 12:...",1,1,156,433836,868,0,0,0,0,0,0,0,2,92607,14,166,1582,2282,4834,33523,895,4,25561,2768,0,40,"""2023-02-13 12:...","""2023-02-16 20:..."
5,,"""【書籍版発売中】回復術士だと...","""N2614HF""",428774,"""雪車町地蔵""","""【第１０回ネット小説大賞　金...",2,201,"""""","""残酷な描写あり オリジナル戦...","""2021-09-17 17:...","""2023-02-16 21:...",1,1,78,220584,442,0,0,0,0,1,0,0,2,68922,32,312,1532,2014,7604,8799,225,5,51324,5517,0,35,"""2023-02-16 21:...","""2023-02-16 21:..."
6,,"""異世界からの企業進出！？転職...","""N1881DN""",305917,"""七士七海""","""　この度めでたく皆様の応援も...",2,202,"""""","""異世界転移 日常 青春 冒険...","""2016-09-09 22:...","""2023-02-14 23:...",1,1,632,3524184,7049,0,0,0,0,0,0,1,2,68778,6,96,440,1266,6432,20723,1648,4,27332,2924,0,35,"""2023-02-16 22:...","""2023-02-16 22:..."


In [6]:
titles = df_body.select("title").get_column("title").to_list()
stories = df_body.select("story").get_column("story").to_list()

In [7]:
# titleを形態素解析する
tagger = MeCab.Tagger("-Owakati")

titles_node = tagger.parseToNode("".join(titles))
stories_node = tagger.parseToNode("".join(stories))

In [35]:
node_values = []
temp_t = titles_node
while temp_t:
    parsed_nodes = temp_t.feature.split(",")
    parsed_nodes.append(temp_t.posid)

    node_values.append([ parsed_nodes[0], parsed_nodes[1] ])
    temp_t = temp_t.next

In [36]:
print(node_values[22])

['名詞', '普通名詞']


In [41]:
# create polars dataframe
data = node_values

df_nodes = pl.DataFrame(
    data,
    ["品詞", "p"],
)

df_nodes
df_nodes.groupby(["品詞", "p"]).agg([
    pl.col("品詞").count().alias("C"),
    pl.col("p").count().alias("PC"),
])

品詞,p,C,PC
str,str,u32,u32
"""記号""","""文字""",8,8
"""感動詞""","""フィラー""",5,5
"""接続詞""","""*""",65,65
"""助詞""","""接続助詞""",654,654
"""連体詞""","""*""",162,162
"""形容詞""","""非自立可能""",76,76
"""補助記号""","""一般""",364,364
"""名詞""","""普通名詞""",4501,4501
"""動詞""","""一般""",1052,1052
"""助詞""","""副助詞""",189,189
