In [1]:
import pandas as pd
import pylab as pl
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
import re
import networkx as nx
import csv
import json

mpl_fig = plt.figure()

In [2]:
data_path = r"C:/Users/Owner/Documents/Work_transfer/Data/GCconnex_collab/"
gephi_path = r"C:/Users/Owner/Documents/Work_transfer/Gephi/GCconnex/"
bp_path = r"C:/Users/Owner/Documents/Work_transfer/Data/GCconnex/"

In [3]:
dept_emails = []
edges = []
details = {}
errors = []
members = []

In [4]:
#Importing a dataset that contains solely BP2020 group members

with open(os.path.join(bp_path+'BP2020 Users.csv'), "r") as f:
    reader = csv.reader(f, delimiter = ',')
    next(reader)

    for row in reader:
        guid = int(row[1])
        members.append(guid)
       

## Create Nodes from users

In [5]:
# Read CSV of UID and names and create nodes for each user

with open(os.path.join(data_path+"gcc_user_info_update_2015-09-03.csv"), "r") as f:
    reader = csv.reader(f, delimiter=',')
    next(reader)

    for row in reader:
        uid, email, start_date = row
        if uid in members:
            ampersand = email.find("@")
            name = email[0:ampersand]
            department = email[ampersand + 1:]
            details[uid] = {'label':name, 'department':department, 'active': "", 'end': '01-09-2015', 'start': start_date}
            dept_emails.append(department.lower())


### Clean & Filter Department Names

In [6]:
dept_emails = set(dept_emails)

In [7]:
dept_acr = [x.split('.') for x in dept_emails]

In [8]:
dept_acr = [x[0] for x in dept_acr]


In [9]:
dept_acr = set(dept_acr)

In [10]:
dept_acr

set()

In [17]:
len(members)

5176

### Blogs

In [6]:
# Read CSV blogs and create edges for connections

with open(data_path+"gccResults_blogs.csv", 'r') as f:
    reader = csv.reader(f, delimiter=',')
    next(reader)
    
    for row in reader:
        if len(row) > 1:
            blog_data = row[0].split(' ')
            blog, author, date_created, extra = blog_data

            for comment in row[1:]: # Get comments (2+ columns)
                comment_data = comment.split()
                blog, commenter, comment_date = comment_data
                edges.append([commenter, author, comment_date, 'blog'])
            

In [7]:
 # Users who comment on the same blog are associated with each other
'''for user in associations:
    for target in associations:
        if user != target:
            edges.append([user, target, date_created, 'blog'])'''
            

"for user in associations:\n   for target in associations:\n       if user != target:\n           edges.append([user, target, date_created, 'blog'])"

### Ideas

In [8]:
# Read CSV blogs and create edges for ideas

with open(data_path+"gccResults_ideas.csv", 'r') as f:
    reader = csv.reader(f, delimiter=',')
    next(reader)
    
    for row in reader:
        
        if len(row) > 1:
            idea_data = row[0].split(' ')
            idea, author, date_created, extra = idea_data

            for comment in row[1:]: # Get comments (2+ columns)
                comment_data = comment.split()
                comment_id, idea, commenter = comment_data
                edges.append([commenter, author, date_created, 'idea'])
            

### Pages

In [9]:
# Read CSV blogs and create edges for pages

with open(data_path+"gccResults_pages.csv", 'r') as f:
    reader = csv.reader(f, delimiter=',')
    next(reader)
    
    for row in reader:
        if len(row) > 1:
            page_data = row[0].split(' ')
            page, author, date_created, extra = page_data

            for comment in row[1:]: # Get comments (2+ columns)
                comment_data = comment.split()
                comment_id, page, commenter = comment_data
                edges.append([commenter, author, date_created, 'page'])

### Discussions

In [10]:
# Read CSV discussion and create edges for discussions

with open(data_path+"gccResults_topics.csv", 'r') as f:
    reader = csv.reader(f, delimiter=',')
    next(reader)
    
    for row in reader:
        if len(row) > 1:
            discussion_data = row[0].split(' ')
            discussion, author, date_created, extra = discussion_data

            for comment in row[1:]: # Get comments (2+ columns)
                comment_data = comment.split()
                discussion, commenter, comment_date = comment_data
                edges.append([commenter, author, comment_date, 'discussion'])

In [11]:
# Read CSV discussion and create edges for colleagues

with open(data_path+"gccResults_colleagues.csv", 'r') as f:
    reader = csv.reader(f, delimiter=',')
    next(reader)
    
    for row in reader:
        if len(row) > 1:
            colleague_data = row[0].split(' ')
            colleague = colleague_data[0]
            email = colleague_data[1]

            for c in row[1:]: # Get comments (2+ columns)
                data = c.split()
                source, target, date = data
                edges.append([source, target, date, 'colleague'])

In [11]:
edge_dict = {}

for e in edges:
      if int(e[0]) in members and int(e[1]) in members and e[0] != e[1]:
                try:
                    edge_dict["{}, {}".format(e[0], e[1])]['weight'] += 1
                except KeyError:
                    edge_dict["{}, {}".format(e[0], e[1])] = {'source': e[0], 'target': e[1], 'weight': 1, 'date': e[2]}


In [9]:
# Combine multiple interactions into a single edge with weigth equal to the number of interactions

edge_dict = {}

for e in edges:
    
    # Remove references when people reply to their own creations
    if e[0] == e[1]:
        pass
    else:
        # Either add +1 weight for duplicate edges or create a new edge
        try:
            edge_dict["{}, {}".format(e[0], e[1])]['weight'] += 1
        except KeyError:
            edge_dict["{}, {}".format(e[0], e[1])] = {'source': e[0], 'target': e[1], 'weight': 1, 'date': e[2]}
        

In [12]:
G=nx.DiGraph()

In [13]:
for k,v in details.items():
    G.add_node(k, label=v['label'], department=v['department'], activity=v['active'], start=v['start'], end=v['end'])
    
for e in edge_dict:
    G.add_edge(edge_dict[e]['source'], edge_dict[e]['target'], weight=edge_dict[e]['weight'], date=edge_dict[e]['date'])

In [14]:
len(edge_dict)

5059

In [15]:
# Export to JSON format
from networkx.readwrite import json_graph

d = json_graph.node_link_data(G)
json.dump(d, open('/Users/Owner/Documents/Work_transfer/Network Graph/force/GC_network_activity.json', 'w'))

In [16]:
# Write to Graphml format
nx.write_graphml(G,gephi_path + "GCconnex_BP2020_Users_idbp.graphml")

### Now open up Gephi and play with your data!