In [6]:
import pandas as pd
import json
import igraph as ig
import matplotlib.pyplot as plt
import numpy as np
import leidenalg as la

In [34]:
def data_processing(df):
    df = df.sort_values('percentage',ascending=False)
    df = df.reset_index(drop=True)

    ## Add src's & dst's mid information
    df_info = pd.read_csv("data/streamer_info.csv")
    df_info = df_info[['uname','mid']]

    df = pd.merge(df,df_info,how='left',left_on='src',right_on='uname')
    df = pd.merge(df,df_info,how='left',left_on='dst',right_on='uname')

    ## Calculate the reverse of percentage from dst to src
    dict_followers = dict(zip(df.src, df.followers))
    df['followers_dst'] = df['dst'].map(dict_followers)
    df['percentage_reverse'] = df.apply(lambda x: round(x['count'] / dict_followers[x['dst']],3), axis=1)

    ## Rename columns
    df = df[['src', 'dst', 'count', 'type', 'followers', 'percentage', 'percentage_reverse', 'mid_x', 'mid_y','followers_dst']]
    return df

def data_selection(df, threshold=[200,20,0.05]):
    '''function to select data based on threshold
    threshold = [followers, count, percentage]
    '''
    data = df[(df['followers']>1000) & (df['followers_dst']>1000)]
    data = data[(data['count']>20)]
    data = data[(data['percentage']>0.03)]
    # data = data[(data['percentage']>0.05) & (data['percentage_inv']>0.05)]
    # data = data[(data['percentage']>0.05) | (data['percentage_inv']>0.05)]
    # data = data[['src','dst','count','percentage']]
    return data

In [35]:
df = pd.read_csv("data/result.csv")
df = data_processing(df)
data = data_selection(df)
data.to_excel("data/一周data截止20231223.xlsx",index=False)

In [23]:
## Create graph
print("Number of nodes: ", len(data['src'].unique()))
tuples = [tuple(x) for x in data[['src','dst','percentage']].values]
Gm = ig.Graph.TupleList(tuples, directed = True, edge_attrs = ['percentage'])

Number of nodes:  224


In [24]:
## Clustering by CPM (Community Preserving Modularity) using optimiser
optimiser = la.Optimiser()
profile = optimiser.resolution_profile(Gm, la.CPMVertexPartition, resolution_range=(0,1))

740it [00:04, 167.74it/s, resolution_parameter=0.00123]


In [25]:
len(profile)

126

In [30]:
## Select the best partition
partition = profile[113] # manually select
# partition = profile[summary['modularity'].index(max(summary['modularity']))]
print(f"resultion: {round(partition.resolution_parameter,4)}, modularity: {round(partition.modularity,5)}")
print(partition)

resultion: 0.6573, modularity: 0.12316
Clustering with 229 elements and 76 clusters
[ 0] 宣小纸不怕火, 呜米, 咩栗, 卡缇娅也不知道鸭, 小可学妹, 伊索尔Sol, 东爱璃Lovely, 李豆沙_Channel, 星汐Seki,
     礼墨Sumi, 红晓音Akane, 帅比笙歌超可爱OvO, 吉诺儿kino, 露蒂丝, 黎歌Neeko, 雪狐桑, 美月もも, 桃几OvO,
     雾深Girimi, 艾因Eine, 秋凛子Rinco, HiiroVTuber, 桃姆Q吃手手, 雪绘Yukie, 艾露露Ailurus,
     小柔Channel, 杜松子_Gin, 扇宝, 希月萌奈, 花花Haya, 茉吱Mojuko, 花园Serena, 穆小泠Official,
     折原露露, 白神遥Haruka, 星宮汐Official, 早稻叽, 小桃Channel, 钉宫妮妮Ninico, 蕾尔娜Leona, 早凉,
     梦音茶糯, 小铃久绘Official, 黑泽诺亚NOIR, 兰音Reine, 艾尔莎_Channel, 奈姬niki, 冥冥meichan,
     还有醒着的么, 陆鳐LuLu
[ 1] 心宜不是心仪, 嘉然今天吃什么, 露米Lumi_Official, 贝拉kira, 思诺snow, 向晚大魔王, 虞莫MOMO, 莞儿睡不醒,
     乃琳Queen, 又一充电中, 沐霂是MUMU呀, 恬豆发芽了, 梨安不迷路, 露早GOGO, 柚恩不加糖, 米诺高分少女, 河黎有片小叶子,
     七海Nana7mi, 永雏塔菲, 安可anko_Official, 星瞳_Official, 明前奶绿, 牧牧白miiu
[ 2] 古守血遊official, 眞白花音_Official, 椎名菜羽Official, 乙女音Official, 魔狼咪莉娅,
     夢乃栞Yumeno_Shiori, 雫るる_Official, 九重紫Official, 夏诺雅_shanoa,
     猫雷NyaRu_Official, 花丸晴琉Official, 神楽Mea_Official, 猫宫心爱
[ 3] 烤鱼子Official, 少年Pi, Yommyko