In [1]:
import itertools
import json
import math
import pickle
import re
import time
from datetime import datetime, timedelta
from os import walk

import dateutil.parser
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scipy.cluster.hierarchy as sch
import seaborn as sns
import simplejson
# Cache switch read by the cells below: when False they load previously written
# cache files (.ftr / .pkl); when True they rebuild the data from the raw inputs
# and rewrite those cache files.
save = False

# 1. Knowledge graph network

In [2]:
# Load the four reference tables (stocks, industries, index industries, concepts)
# from feather files in the working directory.
stock_info_list, industry_list, index_industry_list, concept_list = map(
    pd.read_feather,
    ('stock_list.ftr', 'industry_list.ftr', 'index_industry_list.ftr', 'concept_list.ftr'),
)

In [10]:
%%time
### Human: Management ###
# Hyperedges: stocks that share a manager, where a manager is identified by
# (name, birthday).

if save:
    # Rebuild the cleaned manager table from the raw CSV and refresh the cache.
    stk_managers_list = (
        pd.read_csv('raw/stk_managers_list.csv')[['ts_code', 'name', 'title', 'birthday']]
        .assign(ts_code=lambda frame: frame.ts_code.apply(lambda code: code[:6]))
        .merge(stock_info_list.ts_code)  # filter to mainboard
        .assign(
            # clean birthday: missing -> 0, then divide by 100 in two passes
            # (values >= 2100 first, then values >= 2000) and truncate to int
            birthday=lambda frame: (
                frame.birthday
                .fillna(0)
                .apply(lambda value: value if value < 2100 else value/100)
                .apply(lambda value: value if value < 2000 else value/100)
                .astype(int)
            )
        )
        .groupby(['name', 'birthday', 'ts_code'])
        .agg(list)  # aggregate the title information
        .reset_index()
    )
    stk_managers_list.to_feather('raw/stk_managers_list.ftr')
else:
    stk_managers_list = pd.read_feather('raw/stk_managers_list.ftr')

# One row per (name, birthday) with the list of stock codes that person appears in.
managers_by_person = (
    stk_managers_list
    .drop('title', axis=1)
    .groupby(['name', 'birthday'])
    .agg(list)
    .reset_index()
)
stk_managers_edges = [
    {'node': codes, 'edge': person_name}
    for person_name, _birthday, codes in managers_by_person.values
    if len(codes) > 1
]
stk_managers_edges[1]

CPU times: user 3.37 s, sys: 63.3 ms, total: 3.43 s
Wall time: 3.43 s


{'node': ['000779', '600272'], 'edge': 'Charlie Chang'}

In [12]:
%%time
### Human: Investors ###
# Hyperedges: stocks that share a holder name.
if not save:
    stk_holders_list = pd.read_feather('raw/stk_holders_list.ftr')
else:
    stk_holders_list = pd.read_csv('raw/stk_holders_list.csv')
    stk_holders_list.ts_code = stk_holders_list.ts_code.str[:6]
    # '#' is a literal character here, so do a plain (non-regex) replacement.
    stk_holders_list.holder_name = stk_holders_list.holder_name.str.replace('#', '', regex=False)
    # Keep only (stock, holder) pairs that occur in more than one row.
    stk_holders_list = stk_holders_list[stk_holders_list.duplicated(['ts_code', 'holder_name'], keep=False)]
    stk_holders_list = stk_holders_list.drop('ann_date', axis=1).groupby(['ts_code', 'holder_name']).agg(list).reset_index()
    # Write the cache to the same path the `not save` branch reads from;
    # previously it was written next to the notebook and never picked up again.
    stk_holders_list.to_feather('raw/stk_holders_list.ftr')

stk_holders_edges = [
    {'node': codes, 'edge': holder_name}
    for holder_name, codes in stk_holders_list[['ts_code', 'holder_name']].groupby('holder_name').agg(list).reset_index().values
    if len(codes) > 1
]
stk_holders_edges[0]

CPU times: user 1.09 s, sys: 52.7 ms, total: 1.15 s
Wall time: 1.12 s


{'node': ['000012', '000488', '000539', '001872'],
 'edge': 'BBH A/C VANGUARD EMERGING MARKETS STOCK INDEX FUND'}

In [13]:
%%time
### Business: Industries ###
# One hyperedge per (industry_name, level) group, labelled "<level>-<industry_name>".

stocks_by_industry = (
    industry_list[['industry_name', 'level', 'ts_code']]
    .groupby(['industry_name', 'level'])
    .agg(list)
    .reset_index()
)
industry_edges = [
    {'node': codes, 'edge': level+'-'+industry_name}
    for industry_name, level, codes in stocks_by_industry.values
    if len(codes) > 1
]
industry_edges[2]

CPU times: user 21.1 ms, sys: 1.82 ms, total: 22.9 ms
Wall time: 21.6 ms


{'node': ['000861', '000882', '000007'], 'edge': 'L3-一般物业经营'}

In [14]:
### Business: Concepts ###
# One hyperedge per concept name, joining every stock listed under that concept.

stocks_by_concept = concept_list.groupby('name')['ts_code'].agg(list)
concept_edges = [
    {'node': codes, 'edge': concept_name}
    for concept_name, codes in stocks_by_concept.items()
    if len(codes) > 1
]
concept_edges[4]

{'node': ['000821',
  '002079',
  '002129',
  '002218',
  '002350',
  '002610',
  '600438',
  '600546',
  '601137',
  '603396'],
 'edge': 'HIT电池'}

In [17]:
### Business: Main Business ###
# Hyperedges: stocks whose normalised main-business item (`bz_item`) is identical.

if not save:
    fina_mainbz = pd.read_feather('raw/fina_mainbz.ftr')
else:
    fina_mainbz = pd.read_csv('raw/fina_mainbz.csv')
    fina_mainbz.ts_code = fina_mainbz.ts_code.str[:6]

    # Regex clean-up passes for bz_item, applied strictly in this order (later
    # passes rely on what earlier ones removed). Every match is deleted.
    bz_item_patterns = [
        # Punctuation and whitespace. Backslashes are escaped explicitly (instead of
        # the invalid '\(' / '\)' escapes) so the compiled pattern is unchanged and
        # '\u3000' still denotes the literal ideographic space character.
        '\\(|\\)|（|）|-|"|、|:|#|\'| |/|%|″|、|\u3000',
        '主营|其他|其它|地区|区域|销售|服务|收入|产品|行业|产业|业务|企业|及|代理|托管|受托',
        '业$|手续费|利息|类$|.*[子母]公司.*',
        r'^\d*$',  # items consisting only of digits
        '.*[重组及之]*前.*|公司$|相关$|系列$|与',
        '[东南西北境区以山广除内外华中国大陆]*$',
        '^[行工商光物劳云农类新老印种副化二一省热辅管果内门专线实产海防和险三I政附会企态旅港保性KV梁财洲]$',
        '[东南西北区以除内陕外华中国大陆美江河海上下京]*$',
        '[非青韩浙德]$',
        '内部抵销|佣金',
    ]
    for bz_pattern in bz_item_patterns:
        fina_mainbz.bz_item = fina_mainbz.bz_item.str.replace(bz_pattern, '', regex=True)

    # Drop rows whose item was reduced to the empty string.
    fina_mainbz = fina_mainbz.query('bz_item != ""').reset_index(drop=True)
    fina_mainbz.to_feather('raw/fina_mainbz.ftr')

fina_mainbz_edges = [
    {'node': codes, 'edge': bz_item}
    for bz_item, codes in fina_mainbz[['ts_code', 'bz_item']].drop_duplicates().groupby('bz_item').agg(list).reset_index().values
    if len(codes) > 1
]
fina_mainbz_edges[0]

{'node': ['002211', '603260'], 'edge': '107胶'}

In [18]:
### Location: Province and city ###
# Stocks sharing the same `province` (resp. `city`) value form one hyperedge each.

codes_by_province = stock_info_list.groupby('province')['ts_code'].agg(list)
province_edges = [
    {'node': codes, 'edge': province_name}
    for province_name, codes in codes_by_province.items()
    if len(codes) > 1
]

codes_by_city = stock_info_list.groupby('city')['ts_code'].agg(list)
city_edges = [
    {'node': codes, 'edge': city_name}
    for city_name, codes in codes_by_city.items()
    if len(codes) > 1
]
city_edges[1]

{'node': ['000663', '000732', '002110', '002679'], 'edge': '三明市'}

In [19]:
%%time
### Knowledge Graph ###
# Nodes are stocks. Two stocks are joined by an edge when they appear together in
# at least one hyperedge; the edge stores, per relation type, the list of shared labels.

if not save:
    # Plain pickle is used instead of nx.read_gpickle, which was removed in
    # NetworkX 3.0; files written by the old write_gpickle are ordinary pickles.
    # NOTE: unpickling can execute arbitrary code -- only load a cache file that
    # this notebook produced.
    with open('knowledge_graph.pkl', 'rb') as graph_file:
        G = pickle.load(graph_file)
else:
    G = nx.Graph()
    stock_info_dict = stock_info_list[['ts_code', 'name', 'industry', 'area']].set_index('ts_code').to_dict(orient='index')
    G.add_nodes_from([(stock, stock_info_dict[stock]) for stock in stock_info_list.ts_code.values])
    edge_sets = [
        (city_edges, 'city'),
        (province_edges, 'province'),
        (industry_edges, 'industry'),
        (concept_edges, 'concept'),
        (fina_mainbz_edges, 'mainbz'),
        (stk_holders_edges, 'investor'),
        (stk_managers_edges, 'management')
    ]

    for edge_set, set_name in edge_sets:
        for edge in edge_set:
            # Expand each hyperedge into all pairwise stock-stock edges.
            # NOTE(review): G[n1] presumes n1 is already a node, i.e. every code in
            # the edge lists also occurs in stock_info_list -- confirm.
            for n1, n2 in itertools.combinations(edge['node'], 2):
                if n2 not in G[n1]:
                    G.add_edge(n1, n2)
                # Append the label under its relation type, creating the list on first use.
                G[n1][n2].setdefault(set_name, []).append(edge['edge'])
    with open('knowledge_graph.pkl', 'wb') as graph_file:
        pickle.dump(G, graph_file, protocol=pickle.HIGHEST_PROTOCOL)

print(G.number_of_nodes(), G.number_of_edges())

2875 2296568
CPU times: user 8.38 s, sys: 577 ms, total: 8.96 s
Wall time: 9.01 s


In [20]:
# Inspect the attribute dict stored on the edge between stocks 000001 and 000002.
G.adj['000001']['000002']

{'city': ['深圳市'],
 'province': ['广东'],
 'concept': ['MSCI概念', '标普道琼斯A股', '沪深300样本股', '深股通', '融资融券'],
 'mainbz': ['广州', '深圳'],
 'investor': ['中国证券金融股份有限公司', '中央汇金资产管理有限责任公司', '香港中央结算有限公司(陆股通)'],
 'management': ['孙建一', '康典']}