In [1]:
import os
import pandas as pd
import re
import requests
import numpy as np
import json

In [10]:
## Get all csv files
fans_type = "fans"
directory = f"data/{fans_type}"
all_files = os.listdir(directory)
csv_files = list(filter(lambda f: f.endswith('.csv'), all_files))
# csv_files = csv_files[:50]

In [11]:
## Get all streamers' names
name_list = [f.split(f'_{fans_type}')[0] for f in csv_files]
name_list[:5]

## Get length
num = len(name_list)

In [12]:
## Calculate common fans for each pair of streamers

### Get all streamers' data
dicts = {}
for index, file in enumerate(csv_files):
    result = []
    try:
        df = pd.read_csv(f"{directory}/{file}")
    except pd.errors.EmptyDataError as e:
        print(f"No content for {index}, {name_list[index]}")
        df = pd.DataFrame(columns=['uid', 'name'],dtype=object)
    dicts[index] = {'name':name_list[index], 'data':df, 'result':result, 'type':fans_type, 'followers':df.shape[0]}

In [6]:
def transDict(dictionary, name_list):
    '''data transformation from dictionary to dataframe'''
    length = len(name_list)
    relationshipMap = {
        'src': [dictionary['name']] * length,
        'dst': name_list,
        'count': dictionary['result'],
        'type': [dictionary['type']] * length,
        'followers': [dictionary['followers']] * length
    }
    return relationshipMap

In [13]:
### Calculate common fans
for item in dicts:
    if item % 25 == 0:
        print(f"Processing {item} th, {dicts[item]['name']}")
    for idx in range(len(name_list)):
        # print(item,idx)
        if item == idx:
            dicts[item]['result'].append(0)
            continue
        duplicate_rows = pd.merge(dicts[item]['data'], dicts[idx]['data'], on=['uid'], how='inner')
        cnt = duplicate_rows.shape[0]
        dicts[item]['result'].append(cnt)
        # if cnt > 0:
            # print(item,idx,dicts[item]['name'],dicts[idx]['name'],cnt)
    
    relationshipMap = transDict(dicts[item],name_list)
    # for i in relationshipMap:
        # print(i,len(relationshipMap[i]))
    df_map = pd.DataFrame.from_dict(relationshipMap)
    df_map.to_csv(f"data/result/{dicts[item]['name']}.csv",index=False)

Processing 0 th, 一个乌龟酱
Processing 25 th, Fulgur-Official
Processing 50 th, 冰糖IO
Processing 75 th, 团小哈
Processing 100 th, 鬼叔黍
Processing 125 th, 纯爱辣妹
Processing 150 th, kk不是对儿k宝可梦
Processing 175 th, 星弥Hoshimi
Processing 200 th, 莫熙呐
Processing 225 th, 啵啵小狗341
Processing 250 th, 御酱Asahi
Processing 275 th, 乙女音Official
Processing 300 th, 温之九丶
Processing 325 th, __07_
Processing 350 th, 青寒plus
Processing 375 th, 泪腺战士
Processing 400 th, 切茜娅CheIsea
Processing 425 th, 泛式
Processing 450 th, 原神
Processing 475 th, 茂Sigeru
Processing 500 th, 小可学妹
Processing 525 th, 一米八的坤儿
Processing 550 th, 夏亚的腿毛
Processing 575 th, ❀Sakulaˇ小舞


In [14]:
## concat all csv files of streamers to one csv file
dir_path = os.listdir("data/result")    
csv_f = list(filter(lambda f: f.endswith('.csv'), dir_path))

df_all = pd.concat([pd.read_csv(f"data/result/{f}") for f in csv_f], axis=0, ignore_index=True)
df_all = df_all[df_all['count']>0]
df_all['percentage'] = round(df_all['count'] / df_all['followers'], 3)
df_all.to_csv('data/result.csv',index=False)