In [1]:
import pandas as pd
import networkx as nx

__author__ = "Yang"

In [6]:
# Subject keywords used to sort e-mails into categories.
# Garbage: subjects containing any of these are treated as spam.
# NOTE: the name `filter` shadows Python's builtin of the same name.
filter = ['新葡京', '釦', '扣', 'QQ:']
# Research & development
rd_key = [
    'ALARM', 'RECOVER', 'Emerg', '警', '文档', '项目', '崩', '系统', '设计',
    '台', '需', '端', '测', '置', '技术', '汇', '段',
]
# Human resource
hr_key = [
    '简历', '资料', '通知', '总结', 'Offer', '岗位', '考勤',
    '员', '候', '内', '迟', '旷', '早', '福',
]
# Finance
fi_key = ['财务', '资金', '报销', '会计', '税']


def add_node_edge(graph, frm, to, c):
    '''
    Add node frm, plus one node and one edge [frm, recipient] for every
    recipient in the ';'-separated string to, with edge color c = (r, g, b)
    Empty recipient entries (e.g. produced by a trailing ';') are skipped,
    and a non-string to (such as the NaN pandas yields for a blank cell)
    adds no edges
    return graph
    '''
    graph.add_node(frm)
    # A blank "to" cell is read by pandas as float NaN, which has no .split()
    if not isinstance(to, str):
        return graph
    for recipient in to.split(';'):
        # ''.split(';') yields [''] and 'a;'.split(';') yields ['a', ''];
        # without this guard a bogus '' node would be linked to the sender
        if not recipient.strip():
            continue
        graph.add_node(recipient)
        # The color is stored under the edge's 'viz' attribute, which ends
        # up in the exported .gexf file
        graph.add_edge(
            frm, recipient,
            viz={'color': {'r': c[0], 'g': c[1], 'b': c[2], 'a': 1.0}})
    return graph


def init_graph():
    '''
    Create the four empty graphs used by the pipeline
    return (email, rd, hr, fi), each a new nx.MultiGraph
    '''
    # A newly constructed MultiGraph is already empty, so no clear() calls
    # are needed (the former `rd.clear` lacked parentheses and never ran).
    email = nx.MultiGraph()  # Combined e-mail graph
    rd = nx.MultiGraph()  # Research and development department
    hr = nx.MultiGraph()  # Human resource department
    fi = nx.MultiGraph()  # Finance department
    return email, rd, hr, fi

def read_csv(path):
    '''
    Read .csv file in path, encoded as UTF-8 (with or without BOM) or GBK
    return a dataframe and number of rows
    raise UnicodeDecodeError if the file is valid in neither encoding
    '''
    # UTF-8 must be tried first: it is strict, so GBK bytes almost always
    # fail to decode and trigger the fallback. The reverse order is unsafe
    # because GBK accepts many UTF-8 byte sequences and silently yields
    # mojibake. 'utf-8-sig' also strips a leading byte-order mark.
    try:
        df = pd.read_csv(path, encoding='utf-8-sig')
    except UnicodeDecodeError:
        df = pd.read_csv(path, encoding='gbk')
    num = df.shape[0]
    return df, num

In [7]:
def generate_graph(path, l: list):
    '''
    Read the e-mails in the .csv file at path and add one sender->recipient
    edge set per mail to the graphs in l = [email, rd, hr, fi]
    Each mail is classified by its subject, first match wins:
      garbage (filter)  -> skipped entirely
      rd_key            -> rd graph and email graph, red edge
      hr_key            -> hr graph and email graph, green edge
      fi_key            -> fi graph and email graph, blue edge
      otherwise         -> email graph only, default color
    A missing (non-string) subject counts as an empty subject
    return the four graphs as a tuple (email, rd, hr, fi)
    '''
    df, _ = read_csv(path)
    # (keywords, position of the department graph in l, RGB edge color),
    # listed in the order the categories are tested
    departments = (
        (rd_key, 1, (255, 0, 0)),
        (hr_key, 2, (0, 128, 0)),
        (fi_key, 3, (65, 105, 225)),
    )
    default_color = (250, 128, 114)
    # Iterate the columns positionally rather than looking rows up by label
    for subject, frm, to in zip(df['subject'], df['from'], df['to']):
        # A blank subject is read by pandas as float NaN; `key in NaN`
        # would raise TypeError, so treat it as an empty string instead
        if not isinstance(subject, str):
            subject = ''
        # Filter garbage
        if any(key in subject for key in filter):
            continue
        for keys, idx, color in departments:
            if any(key in subject for key in keys):
                l[idx] = add_node_edge(l[idx], frm, to, color)
                break
        else:
            # No department keyword matched
            color = default_color
        # Every mail that is not garbage also goes into the combined graph
        l[0] = add_node_edge(l[0], frm, to, color)
    return l[0], l[1], l[2], l[3]

In [8]:
def write2gexf(path, l: list, suffix):
    '''
    Export the four graphs in l = [email, rd, hr, fi] as .gexf files
    Each file is named path + <graph name> + suffix
    '''
    # File stems, in the same order as the graphs in l
    for position, stem in enumerate(('email', 'rd', 'hr', 'fi')):
        nx.write_gexf(l[position], path + stem + suffix)

In [9]:
# Build the four graphs from the day-1 mail log and export them
path_d1 = 'rawdata/2017-11-01/email.csv'
email_d1, rd_d1, hr_d1, fi_d1 = generate_graph(path_d1, list(init_graph()))
write2gexf(
    'preprocessed data/',
    [email_d1, rd_d1, hr_d1, fi_d1],
    '_d1.gexf',
)

In [10]:
# Build the four graphs from the combined mail log and export them
path_all = 'preprocessed data/email_all.csv'
email_all, rd_all, hr_all, fi_all = generate_graph(
    path_all, list(init_graph()))
write2gexf(
    'preprocessed data/',
    [email_all, rd_all, hr_all, fi_all],
    '_all.gexf',
)

In [11]:
# Node and edge counts: day-1 graph first, then the all-days graph
print(
    email_d1.number_of_nodes(),
    email_d1.number_of_edges(),
    email_all.number_of_nodes(),
    email_all.number_of_edges(),
    sep='\n',
)

476
5087
4206
108246
