In [1]:
import datetime
import gzip
# mecab-python3 installed from pip
import MeCab
import polars as pl
import urllib.request

In [2]:
# なろうAPIを叩く
# APIDOC: https://dev.syosetu.com/man/api/
url = "https://api.syosetu.com/novelapi/api/?biggenre=2&out=json"

query_params = {
    # 大ジャンル指定
    "biggenre": 2,
    # gzipレベル
    "gzip": 5,
    # 出力形式
    "out": "json",
    # 過去7日間の更新のもののみ
    "lastupdate": "thisweek",
    # ピックアップ作品のみ
    "ispickup": 1,
    # R15作品は除外
    "notr15": 1,
    # 評価が高い順
    "order": "hyoka",
    # 100件
    "lim": 100
}

req = urllib.request.Request(
    "{}?{}".format(
        url,
        urllib.parse.urlencode(query_params),
    )
)

In [3]:
# API Request
with urllib.request.urlopen(req) as res:
    decoded_res = gzip.GzipFile(fileobj=res)
    body = decoded_res.read()

In [4]:
# save json to body_responces
body_json = body.decode("utf-8")
now = datetime.datetime.now()

path = "body_responses/body_json_{}.json".format(now.strftime("%Y%m%d%H%M%S"))
with open(path, mode="w") as file:
    file.write(body_json)

In [5]:
# Read JSON as polar dataframe
# 1行目は件数なので除外
df_body = pl.read_json(path).with_row_count("row_number").filter(pl.col("row_number") > 1).sort("global_point", reverse=True)
df_body.head()

row_number,allcount,title,ncode,userid,writer,story,biggenre,genre,gensaku,keyword,general_firstup,general_lastup,novel_type,end,general_all_no,length,time,isstop,isr15,isbl,isgl,iszankoku,istensei,istenni,pc_or_k,global_point,daily_point,weekly_point,monthly_point,quarter_point,yearly_point,fav_novel_cnt,impression_cnt,review_cnt,all_point,all_hyoka_cnt,sasie_cnt,kaiwaritu,novelupdated_at,updated_at
u32,i64,str,str,i64,str,str,i64,i64,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str
2,,"""エノク第二部隊の遠征ごはん""","""N2542DR""",264357,"""江本マシメサ""","""遠征部隊に配属となった衛生兵...",2,201,"""""","""残酷な描写あり 日常 冒険 ...","""2016-12-05 14:...","""2022-12-20 00:...",1,1,405,1198813,2398,0,0,0,0,1,0,0,2,122227,8,154,686,2302,9510,37769,3991,6,46689,4924,3,42,"""2023-02-17 11:...","""2023-02-17 20:..."
3,,"""【連載版】双子の姉が神子とし...","""N7146EG""",64980,"""池中織奈""","""【書籍5巻＆コミック3巻20...",2,201,"""""","""神子 捨てられ 姉 獣人 双...","""2017-09-20 19:...","""2023-02-14 20:...",1,1,385,825836,1652,0,0,0,0,0,0,0,2,116418,8,92,202,776,4790,45570,758,3,25278,2926,0,31,"""2023-02-14 20:...","""2023-02-17 18:..."
4,,"""どうやら私の身体は完全無敵の...","""N3881DN""",899307,"""ちゃつふさ""","""【第5回ネット小説大賞受賞】...",2,201,"""""","""異世界転生 オリジナル戦記 ...","""2016-09-13 20:...","""2023-01-28 11:...",1,1,224,1046942,2094,0,0,0,0,0,1,0,2,96399,18,92,348,1078,6014,35794,1104,8,24811,2722,2,46,"""2023-02-16 00:...","""2023-02-17 21:..."
5,,"""大賢者の愛弟子　〜防御魔法の...","""N2159DD""",604944,"""ナカノムラアヤスケ""","""数ある魔法の中で初心者向けと...",2,201,"""""","""青春 ラブコメ 防御魔法 転...","""2016-02-14 11:...","""2023-02-13 12:...",1,1,156,433836,868,0,0,0,0,0,0,0,2,92623,20,186,1502,2300,4834,33526,895,4,25571,2769,0,40,"""2023-02-13 12:...","""2023-02-17 20:..."
6,,"""ロメリア戦記～魔王を倒した後...","""N3159FS""",785922,"""有山リョウ""","""書籍化しました。小学館ガガガ...",2,201,"""""","""残酷な描写あり オリジナル戦...","""2019-08-26 23:...","""2023-02-17 00:...",1,1,317,966877,1934,0,0,0,0,1,0,0,2,74353,20,170,780,2042,17030,22350,829,3,29653,3236,0,34,"""2023-02-17 00:...","""2023-02-17 20:..."


In [6]:
titles = df_body.select("title").get_column("title").to_list()
stories = df_body.select("story").get_column("story").to_list()

In [7]:
titles[0]

'エノク第二部隊の遠征ごはん'

In [8]:
# titleを形態素解析する
tagger = MeCab.Tagger("-Owakati")
tagger2 = MeCab.Tagger("-Owakati")

titles_node = tagger.parseToNode("".join(titles))
stories_node = tagger2.parseToNode("".join(stories))

In [9]:
titles_node_values = []
stories_node_values = []

temp_t = titles_node
while temp_t:
    parsed_nodes = [temp_t.surface]
    parsed_nodes.extend(temp_t.feature.split(","))
    parsed_nodes.append(temp_t.posid)

    titles_node_values.append([
        parsed_nodes[0],
        parsed_nodes[1],
        parsed_nodes[2],
        parsed_nodes[3],
        parsed_nodes[4],
        parsed_nodes[5],
        parsed_nodes[6],
        parsed_nodes[7]
    ])

    temp_t = temp_t.next
    
temp_t = stories_node
while temp_t:
    parsed_nodes = [temp_t.surface]
    parsed_nodes.extend(temp_t.feature.split(","))
    parsed_nodes.append(temp_t.posid)

    stories_node_values.append([
        parsed_nodes[0],
        parsed_nodes[1],
        parsed_nodes[2],
        parsed_nodes[3],
        parsed_nodes[4],
        parsed_nodes[5],
        parsed_nodes[6],
        parsed_nodes[7]
    ])

    temp_t = temp_t.next

In [10]:
titles_node_values[1]

['エノク', '名詞', '普通名詞', '一般', '*', '*', '*', 1]

In [11]:
stories_node_values[1]

['遠征', '名詞', '普通名詞', 'サ変可能', '*', '*', '*', 'エンセイ']

In [12]:
# create polars dataframe
data = titles_node_values
cols = ["surface", "品詞", "p", "a", "b", "c", "d", "e"]
df_nodes = pl.DataFrame(
    data,
    cols,
)

df_nodes.filter((pl.col("品詞") == "名詞") | (pl.col("品詞") == "形容詞")).groupby(["surface", "品詞"]).agg([
    pl.col("surface").count().alias("c")
]).sort("c", reverse=True)

surface,品詞,c
str,str,u32
"""世界""","""名詞""",43
"""転生""","""名詞""",18
"""最強""","""名詞""",11
"""魔法""","""名詞""",11
"""召喚""","""名詞""",10
"""魔王""","""名詞""",8
"""スキル""","""名詞""",8
"""勇者""","""名詞""",7
"""ライフ""","""名詞""",7
"""ゲーム""","""名詞""",6


In [13]:
# create polars dataframe
data = stories_node_values
cols = ["surface", "品詞", "p", "a", "b", "c", "d", "e"]
df_nodes = pl.DataFrame(
    data,
    cols,
)

df_nodes.filter((pl.col("品詞") == "名詞") | (pl.col("品詞") == "形容詞")).groupby(["surface", "品詞"]).agg([
    pl.col("surface").count().alias("c")
]).sort("c", reverse=True)

surface,品詞,c
str,str,u32
"""世界""","""名詞""",161
"""こと""","""名詞""",69
"""転生""","""名詞""",58
"""一""","""名詞""",43
"""主人""","""名詞""",43
"""物語""","""名詞""",36
"""ない""","""形容詞""",36
"""スキル""","""名詞""",35
"""神""","""名詞""",34
"""年""","""名詞""",33
