In [1]:
import datetime
import gzip
# mecab-python3 installed from pip
import MeCab
import polars as pl
import urllib.request

In [2]:
# なろうAPIを叩く
# APIDOC: https://dev.syosetu.com/man/api/
url = "https://api.syosetu.com/novelapi/api/?biggenre=2&out=json"

query_params = {
    # 大ジャンル指定
    "biggenre": 2,
    # gzipレベル
    "gzip": 5,
    # 出力形式
    "out": "json",
    # 過去7日間の更新のもののみ
    "lastupdate": "thisweek",
    # ピックアップ作品のみ
    "ispickup": 1,
    # R15作品は除外
    "notr15": 1,
    # 評価が高い順
    "order": "hyoka",
    # 100件
    "lim": 100
}

req = urllib.request.Request(
    "{}?{}".format(
        url,
        urllib.parse.urlencode(query_params),
    )
)

In [3]:
# API Request
with urllib.request.urlopen(req) as res:
    decoded_res = gzip.GzipFile(fileobj=res)
    body = decoded_res.read()

In [4]:
# save json to body_responces
body_json = body.decode("utf-8")
now = datetime.datetime.now()

path = "body_responses/body_json_{}.json".format(now.strftime("%Y%m%d%H%M%S"))
with open(path, mode="w") as file:
    file.write(body_json)

In [5]:
# Read JSON as polar dataframe
# 1行目は件数なので除外
df_body = pl.read_json(path).with_row_count("row_number").filter(pl.col("row_number") > 1).sort("global_point", reverse=True)
df_body.head()

row_number,allcount,title,ncode,userid,writer,story,biggenre,genre,gensaku,keyword,general_firstup,general_lastup,novel_type,end,general_all_no,length,time,isstop,isr15,isbl,isgl,iszankoku,istensei,istenni,pc_or_k,global_point,daily_point,weekly_point,monthly_point,quarter_point,yearly_point,fav_novel_cnt,impression_cnt,review_cnt,all_point,all_hyoka_cnt,sasie_cnt,kaiwaritu,novelupdated_at,updated_at
u32,i64,str,str,i64,str,str,i64,i64,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str
2,,"""【書籍版発売中】回復術士だと...","""N2614HF""",428774,"""雪車町地蔵""","""【第１０回ネット小説大賞　金...",2,201,"""""","""残酷な描写あり オリジナル戦...","""2021-09-17 17:...","""2023-02-15 21:...",1,1,77,218226,437,0,0,0,0,1,0,0,2,68898,24,344,1524,1982,7604,8793,224,5,51312,5516,0,35,"""2023-02-15 21:...","""2023-02-15 21:..."
3,,"""異世界からの企業進出！？転職...","""N1881DN""",305917,"""七士七海""","""　この度めでたく皆様の応援も...",2,202,"""""","""異世界転移 日常 青春 冒険...","""2016-09-09 22:...","""2023-02-14 23:...",1,1,632,3524181,7049,0,0,0,0,0,0,1,2,68776,16,86,426,1262,6432,20722,1647,4,27332,2924,0,35,"""2023-02-15 22:...","""2023-02-15 22:..."
4,,"""崖っぷち貴族の生き残り戦略""","""N3743FP""",352449,"""月汰元""","""【２０１９年１２月末、ＢＫブ...",2,201,"""""","""残酷な描写あり 異世界転生 ...","""2019-07-02 21:...","""2023-02-15 19:...",1,1,304,991452,1983,0,0,0,0,1,1,0,2,65074,4,78,418,1102,6616,13811,1646,2,37452,4051,1,45,"""2023-02-15 19:...","""2023-02-15 21:..."
5,,"""（2章完結）外れスキル「世界...","""N6300HD""",1612788,"""高野　ケイ""",""" 乱世の世の中、武力を全てと...",2,201,"""""","""オリジナル戦記 追放 ざまぁ...","""2021-08-15 11:...","""2023-02-15 13:...",1,1,135,322249,645,0,0,0,0,0,0,0,2,59256,4,92,214,926,4168,11313,384,0,36630,4019,0,57,"""2023-02-15 13:...","""2023-02-15 21:..."
6,,"""スキル『鑑定』に目覚めたので...","""N9122HC""",1612788,"""高野　ケイ""","""　俺こと、アルトは冒険者であ...",2,201,"""""","""オリジナル戦記 ラブコメ コ...","""2021-08-01 12:...","""2023-02-15 13:...",1,1,139,346237,693,0,0,0,0,0,0,0,2,43222,6,70,250,426,4570,9605,1250,0,24012,2618,0,56,"""2023-02-15 13:...","""2023-02-15 21:..."


In [13]:
titles = df_body.select("title").get_column("title").to_list()
stories = df_body.select("story").get_column("story").to_list()

In [49]:
# titleを形態素解析する
tagger = MeCab.Tagger("-Owakati")

titles_node = tagger.parseToNode("".join(titles))
stories_node = tagger.parseToNode("".join(stories))

In [50]:
print(titles_node)

<Swig Object of type 'MeCab::Node *' at 0x113db6ff0>


In [53]:
node_values = []
temp_t = titles_node
while temp_t:
    parsed_nodes = temp_t.feature.split(",")
    parsed_nodes.append(temp_t.posid)

    node_values.append(parsed_nodes)
    temp_t = temp_t.next

In [62]:
print(node_values[10])

['名詞', '普通名詞', 'サ変可能', '*', '*', '*', 'ジュショウ', '受賞', '受賞', 'ジュショー', '受賞', 'ジュショー', '漢', '*', '*', '*', '*', 'ジュショウ', 'ジュショウ', 'ジュショウ', 'ジュショウ', '*', '*', '0', 'C2', '*', 1]


In [72]:
# create polars dataframe
data = node_values[10]
df_nodes = pl.DataFrame(
    [data],
    # wip
    [
        "名詞", "普通名詞", "サ変可能", "a", "b", "c", "ジュショウ1", "受賞", "受賞2", "ジュショー", "受賞3", "ジュショー2", "漢",
        "d", "e", "f", "g", "ジュショウ2", "ジュショウ3", "ジュショウ4", "ジュショウ5", "h", "i", "0", "C2", "j", "1"
    ]
)
df_nodes

名詞,普通名詞,サ変可能,a,b,c,ジュショウ1,受賞,受賞2,ジュショー,受賞3,ジュショー2,漢,d,e,f,g,ジュショウ2,ジュショウ3,ジュショウ4,ジュショウ5,h,i,0,C2,j,1
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,i64
"""名詞""","""普通名詞""","""サ変可能""","""*""","""*""","""*""","""ジュショウ""","""受賞""","""受賞""","""ジュショー""","""受賞""","""ジュショー""","""漢""","""*""","""*""","""*""","""*""","""ジュショウ""","""ジュショウ""","""ジュショウ""","""ジュショウ""","""*""","""*""","""0""","""C2""","""*""",1
