In [2]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import requests

In [61]:
### Get Vtuber List from Vtbs.moe
def get_vtuber(timeout=10):
    '''Request the VTuber list from https://vtbs.moe and return it as a dataframe.

        - timeout: seconds to wait for the HTTP response before giving up

    return a dataframe with the columns uname, mid, roomid, follower and
    guardNum. Missing values are replaced by 0, mid/roomid are converted to
    strings and follower/guardNum to integers.

    Raises requests.HTTPError when the server answers with an error status.
    '''

    url = "https://api.vtbs.moe/v1/info"

    # Without a timeout, requests waits indefinitely on a stalled connection
    response = requests.get(url, timeout=timeout)
    # Surface HTTP errors here rather than as a confusing JSON parsing error
    response.raise_for_status()
    r = json.loads(response.text)

    ## Format data
    # Only the listed keys are kept; any other field of the reply is dropped
    df = pd.DataFrame(r,columns=['uname','mid','roomid','follower','guardNum'])
    df = df.fillna(0)
    # Cast to int first so the ids are rendered as "123" rather than "123.0"
    # when a column contained missing values
    df = df.astype({"mid": int, "roomid": int, "follower": int, "guardNum": int})
    df = df.astype({"mid": str, "roomid": str})
    return df

In [119]:
# Download the VTuber list once and keep it in df_v for the selection step below;
# the bare name on the last line renders the frame as the cell output.
df_v = get_vtuber()
df_v

Unnamed: 0,uname,mid,roomid,follower,guardNum
0,-狩猎西比尔-,1041786022,23310101,30653,24
1,-official-Spi酱,2051555906,23492749,827,0
2,超级小奈喵-,506184344,24885893,1776,0
3,-晚风铃-,356113776,23946255,5012,7
4,-水梨若official-,392101937,21745906,264,0
...,...,...,...,...,...
4963,崩坏星穹铁道,1340190821,0,2099548,0
4964,原神,401742377,21987615,15080260,24
4965,明日方舟,161775300,5555734,5087780,8
4966,崩坏3第一偶像爱酱,27534330,1319882,4164187,3


In [128]:
## Select Vtuber with rule
# Filter BEFORE scoring: rows with follower == 0 (missing values are filled with 0
# in get_vtuber) would otherwise go through log(0), which emits a divide-by-zero
# RuntimeWarning and yields -inf scores. The .copy() makes df_v an independent
# frame, so adding a column to it is safe even when this cell is re-run.
df_v = df_v[(df_v['guardNum']>20) & (df_v['follower']>10000)].copy()
# NOTE(review): 63.4 and 790 look like empirically chosen constants; their
# derivation is not recorded in this notebook - confirm before changing them.
# The column name 'coefficent' (sic) is kept because it is written to the saved CSV.
df_v['coefficent'] = 63.4*np.log(df_v['follower']) + df_v['guardNum'] - 790
# Keep only positively scored rows, largest guard count first
df_vtb = df_v[df_v['coefficent'] > 0].sort_values(by='guardNum',ascending=False)
df_vtb

Unnamed: 0,uname,mid,roomid,follower,guardNum,coefficent
1376,嘉然今天吃什么,672328094,22637261,1702739,27103,27222.647267
1015,冥冥meichan,45502,22650610,555398,2262,2310.619712
4565,阿萨Aza,480680646,21696950,1242699,2107,2206.679278
4551,阿梓从小就很可爱,7706705,80397,740805,1778,1844.882238
3839,美月もも,2073012767,23698286,169656,1182,1155.432884
...,...,...,...,...,...,...
3676,米哦Official,1352646,74571,192689,24,5.503998
4964,原神,401742377,21987615,15080260,24,281.932080
4906,@黎之恒,12330804,59803,625394,23,79.145094
3840,美波七海-official,692283831,22571958,230928,22,14.981203


In [63]:
## Get Top Streamers from Bilibili
def get_topStreamer(gid,page=1,page_size=100,max_page=3,timeout=10):
    '''Request the Bilibili live guard "Honor" ranking, page by page.

        - gid: type of scale
            -  241: 10000+
            -  75: 1000+
            -  76: 100+
        - page: start from page #
        - page_size: amount of streamers in one page
        - max_page: page number limit
        - timeout: seconds to wait for each HTTP response before giving up

    return a dataframe with one row per streamer, keeping only the
    room_id, uid, name, guard_num and gid fields of every entry.

    Raises requests.HTTPError when the server answers with an error status.
    '''

    headers = {
      'Cookie': 'LIVE_BUVID=AUTO1616700756065396'
    }
    attr = ['room_id','uid','name','guard_num','gid']

    def fetch_page(p):
        # Download one page of the ranking and return the "data" object of the reply
        url = f"https://api.live.bilibili.com/xlive/app-ucenter/v1/guard/Honor?target_id=0&gid={gid}&area_id=0&page={p}&page_size={page_size}"
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return json.loads(response.text)['data']

    # The first page also carries the total count, so it is fetched once and reused below
    first_page = fetch_page(page)

    ### Get total number of this type of streamers
    t_num = first_page['page']['total_count']
    # Ceiling division: int(t_num/page_size) + 1 asked for one page too many
    # whenever t_num was an exact multiple of page_size
    t_page = (t_num + page_size - 1) // page_size
    print(f"Streamer Type: {gid}, Total count: {t_num}, Total page num: {t_page}")

    ### Get data from each page
    if t_page > max_page:
        print(f"Limit page depth from {t_page} to {max_page}")
        t_page = max_page

    result = []
    for p in range(page, t_page+1):
        if p%20 == 0:
            print(f"Processing Page {p}...")
        data = first_page if p == page else fetch_page(p)
        # A page without entries may come back as null/empty; treat both as "no rows"
        users = data.get('list') or []
        result += [{k: v for k, v in u.items() if k in attr} for u in users]

    df_t = pd.DataFrame.from_dict(result)
    return df_t

In [64]:
# Fetch the three guard tiers in order (241: 10000+, 75: 1000+, 76: 100+),
# each limited to the first 3 pages, and unpack them into one frame per tier.
top10000, top1000, top100 = (
    get_topStreamer(gid=tier, max_page=3) for tier in (241, 75, 76)
)

Streamer Type: 241, Total count: 9, Total page num: 1
Streamer Type: 75, Total count: 203, Total page num: 3
Streamer Type: 76, Total count: 5886, Total page num: 59
Limit page depth from 59 to 3


In [139]:
## Merge data
pdList = [top10000, top1000, top100]  # List of dataframes
df_top = (
    pd.concat(pdList)
    # '账号已注销' is the placeholder name of deleted accounts - drop those rows
    .loc[lambda d: d['name'] != '账号已注销']
    # Keep only streamers with more than 10 guards
    .loc[lambda d: d['guard_num'] > 10]
    .loc[:, ['name', 'uid', 'room_id', 'guard_num', 'gid']]
    # Align the column names with the frame returned by get_vtuber
    .rename(columns={
        'name': 'uname',
        'uid': 'mid',
        'room_id': 'roomid',
        'guard_num': 'guardNum',
    })
    .astype({"mid": str, "roomid": str})
)
df_top

Unnamed: 0,uname,mid,roomid,guardNum,gid
0,嘉然今天吃什么,672328094,22637261,16000,241
1,老实憨厚的笑笑,8739477,545068,2604,241
2,乃琳Queen,672342685,22625027,610,241
3,贝拉kira,672353429,22632424,567,241
4,向晚大魔王,672346917,22625025,519,241
...,...,...,...,...,...
295,随一Suiii,1355412269,23075997,115,76
296,全是曲奇,17013445,11328538,115,76
297,眠特Mint,1202416628,24255210,115,76
298,K1ngAzara,235794311,6571500,115,76


In [140]:
## Merge top streamers and vtubers
df_all = (
    # df_vtb comes first, so for a mid present in both frames its row is the one kept
    pd.concat([df_vtb, df_top])
    .drop_duplicates(subset=['mid'])
    # Largest guard count first
    .sort_values(by='guardNum', ascending=False)
)
df_all


Unnamed: 0,uname,mid,roomid,follower,guardNum,coefficent,gid
1376,嘉然今天吃什么,672328094,22637261,1702739.0,27103,27222.647267,
1,老实憨厚的笑笑,8739477,545068,,2604,,241.0
1015,冥冥meichan,45502,22650610,555398.0,2262,2310.619712,
1,蒋芸Mirai,114866,952192,,2225,,75.0
4565,阿萨Aza,480680646,21696950,1242699.0,2107,2206.679278,
...,...,...,...,...,...,...,...
158,吉诺儿kino,1383815813,23221095,,15,,75.0
159,阿迷今天改名了吗,395840287,22144837,,15,,75.0
160,兔总裁s,15385187,377446,,14,,75.0
161,zettaranc,326246517,11163068,,14,,75.0


In [141]:
# df_toponly = df_vtb.merge(df_top, on=['mid'], how='right', indicator=True)
# toponly = df_toponly[df_toponly['_merge']=="right_only"]

In [142]:
## Save data
# Only streamers with more than 20 guards are exported
df_result = df_all[df_all['guardNum']>20]
out_path = Path('data/streamer_info.csv')
# to_csv does not create missing folders, so make sure data/ exists
# (e.g. on a fresh checkout) before writing
out_path.parent.mkdir(parents=True, exist_ok=True)
df_result.to_csv(out_path,index=False)