In [1]:
import datetime
import gzip
# mecab-python3 installed from pip
import MeCab
import polars as pl
import urllib.request

In [2]:
# なろうAPIを叩く
# APIDOC: https://dev.syosetu.com/man/api/
url = "https://api.syosetu.com/novelapi/api/?biggenre=2&out=json"

query_params = {
    # 大ジャンル指定
    "biggenre": 2,
    # gzipレベル
    "gzip": 5,
    # 出力形式
    "out": "json",
    # 過去7日間の更新のもののみ
    # "lastupdate": "thisweek",
    # ピックアップ作品のみ
    "ispickup": 1,
    # R15作品は除外
    "notr15": 1,
    # 評価が高い順
    "order": "hyoka",
    # 100件
    "lim": 500
}

req = urllib.request.Request(
    "{}?{}".format(
        url,
        urllib.parse.urlencode(query_params),
    )
)

In [3]:
# API Request
with urllib.request.urlopen(req) as res:
    decoded_res = gzip.GzipFile(fileobj=res)
    body = decoded_res.read()

In [4]:
# save json to body_responces
body_json = body.decode("utf-8")
now = datetime.datetime.now()

path = "body_responses/body_json_{}.json".format(now.strftime("%Y%m%d%H%M%S"))
with open(path, mode="w") as file:
    file.write(body_json)

In [5]:
# Read JSON as polar dataframe
# 1行目は件数なので除外
df_body = pl.read_json(path).with_row_count("row_number").filter(pl.col("row_number") > 1).sort("global_point", reverse=True)
len(df_body)

119

In [6]:
titles = df_body.select("title").get_column("title").to_list()
stories = df_body.select("story").get_column("story").to_list()

In [7]:
titles[0]

'【書籍化＆コミカライズ】勇者パーティーを追放された俺だが、俺から巣立ってくれたようで嬉しい。……なので大聖女、お前に追って来られては困るのだが？'

In [8]:
# titleを形態素解析する
tagger = MeCab.Tagger("-Owakati")
tagger2 = MeCab.Tagger("-Owakati")

titles_node = tagger.parseToNode("".join(titles))
stories_node = tagger2.parseToNode("".join(stories))

In [9]:
titles_node_values = []
stories_node_values = []

temp_t = titles_node
while temp_t:
    parsed_nodes = [temp_t.surface]
    parsed_nodes.extend(temp_t.feature.split(","))
    parsed_nodes.append(temp_t.posid)

    titles_node_values.append([
        parsed_nodes[0],
        parsed_nodes[1],
        parsed_nodes[2],
        parsed_nodes[3],
        parsed_nodes[4],
        parsed_nodes[5],
        parsed_nodes[6],
        parsed_nodes[7]
    ])

    temp_t = temp_t.next
    
temp_t = stories_node
while temp_t:
    parsed_nodes = [temp_t.surface]
    parsed_nodes.extend(temp_t.feature.split(","))
    parsed_nodes.append(temp_t.posid)

    stories_node_values.append([
        parsed_nodes[0],
        parsed_nodes[1],
        parsed_nodes[2],
        parsed_nodes[3],
        parsed_nodes[4],
        parsed_nodes[5],
        parsed_nodes[6],
        parsed_nodes[7]
    ])

    temp_t = temp_t.next

In [10]:
titles_node_values[1]

['【', '補助記号', '括弧開', '*', '*', '*', '*', '']

In [11]:
stories_node_values[1]

['【', '補助記号', '括弧開', '*', '*', '*', '*', '']

In [17]:
# create polars dataframe
data = titles_node_values
cols = ["surface", "品詞", "p", "a", "b", "c", "d", "e"]
df_nodes = pl.DataFrame(
    data,
    cols,
)

df_nodes.filter((pl.col("品詞") == "名詞") | (pl.col("品詞") == "形容詞")).groupby(["surface", "品詞"]).agg([
    pl.col("surface").count().alias("c")
]).filter(pl.col("c") >= 2).sort("c", reverse=True)

surface,品詞,c
str,str,u32
"""世界""","""名詞""",55
"""転生""","""名詞""",21
"""魔法""","""名詞""",14
"""最強""","""名詞""",13
"""召喚""","""名詞""",11
"""勇者""","""名詞""",10
"""冒険""","""名詞""",10
"""スキル""","""名詞""",9
"""生活""","""名詞""",8
"""ライフ""","""名詞""",8


In [13]:
# create polars dataframe
data = stories_node_values
cols = ["surface", "品詞", "p", "a", "b", "c", "d", "e"]
df_nodes = pl.DataFrame(
    data,
    cols,
)

df_nodes.filter((pl.col("品詞") == "名詞") | (pl.col("品詞") == "形容詞")).groupby(["surface", "品詞"]).agg([
    pl.col("surface").count().alias("c")
]).filter(pl.col("c") >= 2).sort("c", reverse=True)

surface,品詞,c
str,str,u32
"""世界""","""名詞""",188
"""こと""","""名詞""",85
"""転生""","""名詞""",64
"""一""","""名詞""",62
"""冒険""","""名詞""",53
"""ない""","""形容詞""",49
"""年""","""名詞""",49
"""魔法""","""名詞""",49
"""物語""","""名詞""",48
"""スキル""","""名詞""",45
