In [None]:
import sqlalchemy as sq
import pymysql

import pandas as pd
import numpy as np

import os
import csv

import datetime as dt

# Display the installed SQLAlchemy version as this cell's output.
sq.__version__

In [None]:

def convert_unixtime(stamp):
    """Format a Unix timestamp as a ``YYYY-MM-DD`` date string.

    ``stamp`` is passed through ``int()`` first, so numeric strings and
    floats are accepted (floats are truncated). The conversion uses
    ``datetime.fromtimestamp``, i.e. the machine's local time zone.
    """
    seconds = int(stamp)
    moment = dt.datetime.fromtimestamp(seconds)
    return moment.strftime('%Y-%m-%d')

In [None]:
# Set toggle to anonymize data
# When True, anonymize() returns a salted SHA-1 hex digest instead of the
# value it is given; when False it returns the value unchanged.

anonymize_toggle = False

In [None]:
# Set hash function to anonymize data
import hashlib

# Random salt drawn once when this cell runs, so digests are only comparable
# within the same kernel session.
SALT = os.urandom(16)

def anonymize(value):
    """Return ``value`` as-is, or its salted SHA-1 hex digest when anonymizing.

    The module-level ``anonymize_toggle`` decides which. When the toggle is
    on, ``value`` is concatenated with the bytes ``SALT``, so it must be
    bytes-like; the result is then a 40-character hex string.
    """
    if not anonymize_toggle:
        return value
    salted = value + SALT
    return hashlib.sha1(salted).hexdigest()

In [None]:
def find_dept(email):
    """Map an email address to a department acronym via ``dept_dict``.

    The part after the first ``'@'`` is looked up in the module-level
    ``dept_dict``. If the address has no ``'@'`` the whole string is used.

    The exact text is tried first (so every address that matched before
    still matches), then a stripped, lower-cased form, so the result does
    not depend on whether the caller remembered to lower-case the address.

    Returns ``"OTHER"`` for unknown domains and for empty/``None`` input.
    """
    if not email:
        return "OTHER"

    at_index = email.find('@')
    # find() returns -1 when '@' is missing, which makes the slice start at 0
    domain = email[at_index + 1:]

    for candidate in (domain, domain.strip().lower()):
        if candidate in dept_dict:
            return dept_dict[candidate]
    return "OTHER"

## Set up Dept List/Dict

In [None]:
# Lookup of email domain -> department acronym; filled in by the cells below.
dept_dict = {}

# Data locations. The defaults are the original hard-coded directories; set the
# GCCONNEX_DATA_PATH / GCCONNEX_OUTPUT_PATH environment variables to run the
# notebook on another machine without editing this cell.
data_path = os.environ.get('GCCONNEX_DATA_PATH', r'/Users/toferc/Documents/Data/')
output_path = os.environ.get('GCCONNEX_OUTPUT_PATH', r'/Users/toferc/Documents/Data/')

In [None]:
# Load the domain -> acronym pairs from csv_keys.csv into dept_dict.
# newline='' lets the csv module do its own newline handling, as its
# documentation recommends for files passed to csv.reader.
with open(os.path.join(data_path, 'csv_keys.csv'), "r", newline='') as f:
    reader = csv.reader(f, delimiter=',')
    next(reader, None)  # skip the header row; an empty file is not an error

    for row in reader:
        if not row:
            continue  # blank line in the file
        email, acronym = row
        dept_dict[email] = acronym

# Manual entries, applied after the file so they override any CSV value for
# the same domain.
# NOTE(review): several keys look like misspelled domains
# ('interenational.gc.ca', 'tribunbal.gc.ca', 'servicecanada.gc.ca.gc.ca');
# presumably they mirror addresses as they occur in the data — confirm before
# "correcting" them.
dept_dict['cadets.gc.ca'] = 'CADETS'
dept_dict['canada.gc.ca'] = 'CANADA'
dept_dict['canada.ca'] = 'CANADA'
dept_dict['tribunal.gc.ca'] = 'TRIBUNAL'
dept_dict['cannor.gc.ca'] = 'CED/DEC'
dept_dict['ci-oic.gc.ca'] = 'CI/OIC'
dept_dict['ccgs-ngcc.gc.ca'] = 'CCGS/NGCC'
dept_dict['god.ccgs-ngcc.gc.ca'] = 'CCGS/NGCC'
dept_dict['clo-ocol.gc.ca'] = 'OCOL/CLO'
dept_dict['csps.gc.ca'] = 'CSPS/EFPC'
dept_dict['interenational.gc.ca'] = 'DFAITD/MAECD'
dept_dict['cnb-ncw.gc.ca'] = 'CNB/NCW'
dept_dict['ncw-cnb.gc.ca'] = 'CNB/NCW'
dept_dict['nfb.gc.ca'] = 'NFB/ONF'
dept_dict['nrccan-rncan.gc.ca'] = 'NRCAN/RNCAN'
dept_dict['nserc-crsng.gc.ca'] = 'NSERC/CRSNG'
dept_dict['pbc-clcc.gc.ca'] = 'PBC/CLCC'
dept_dict['pco.bcp.gc.ca'] = 'PCO/BCP'
dept_dict['pipsc.ca'] = 'PIPSC/IPFPC'
dept_dict['ps.sp.gc.ca'] = 'PS/SP'
dept_dict['servicecanada.gc.ca.gc.ca'] = 'HRSDC/RHDSC'
dept_dict['fintrac-canafe.gc.ca'] = 'FINTRAC'
dept_dict['gmail.com'] = 'GMAIL'
dept_dict['tribunbal.gc.ca'] = 'TRIBUNAL'

In [None]:
# Distinct department acronyms found in dept_dict.
# Despite the name, dept_list is a set.
dept_list = set(dept_dict.values())

## Connect to DB

In [2]:
import getpass

# Prompt for the database password at run time so it is never stored in the notebook.
password = getpass.getpass()

In [None]:
# MariaDB = 165
# MYSQL = 117
# (presumably the last octet of each server's address; the URL below uses .117)

from urllib.parse import quote_plus

# The password is URL-encoded before being placed in the connection URL:
# characters such as '@', ':' or '/' would otherwise be read as URL
# delimiters and the connection string would be parsed incorrectly.
db_connection = "mysql+pymysql://root:{}@192.168.2.117:3306/elgg112A".format(
    quote_plus(password))

In [None]:
# Create the SQLAlchemy engine from the db_connection URL, with statement echoing off.
# NOTE(review): the `encoding` keyword is deprecated in SQLAlchemy 1.4 and
# removed in 2.0 — confirm which SQLAlchemy version this notebook is pinned to.
engine = sq.create_engine(db_connection,encoding='latin1', echo=False)

In [None]:
# Open a connection from the engine and keep a reference to it in `conn`.
conn = engine.connect()

In [None]:
# Display the connection opened in the previous cell. Calling
# engine.connect() again here would check out a second connection that is
# never bound to a name and never closed.
conn

In [None]:
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy.ext.automap import automap_base
from sqlalchemy import and_, or_
# Session factory bound to the engine; calling Session() yields a new ORM session.
Session = sessionmaker(bind=engine)

In [None]:
# Bind the factory to the engine (it was already created with bind=engine,
# so this repeats that setting) and open the session used by the queries below.
Session.configure(bind=engine)
session = Session()

In [None]:
# Build an automap base and let it reflect the database through the engine;
# the reflected tables are then reachable as Base.classes.<table_name>.
Base = automap_base()

# NOTE(review): `reflect=True` is the older calling form; newer SQLAlchemy
# releases expect `autoload_with=engine` — confirm against the pinned version.
Base.prepare(engine, reflect=True)

In [None]:
# Set up mappings

# Short aliases for the automapped classes; each is named after the table it
# is read from. The attributes listed are the ones this notebook uses.
Users = Base.classes.elggusers_entity  # guid, name, email, last_login
Groups = Base.classes.elgggroups_entity
Relationships = Base.classes.elggentity_relationships  # relationship, guid_one, guid_two, time_created
Entities = Base.classes.elggentities  # guid, owner_guid, subtype, time_created
Objects = Base.classes.elggobjects_entity  # guid
MetaData = Base.classes.elggmetadata
MetaStrings = Base.classes.elggmetastrings
Annotations = Base.classes.elggannotations  # entity_guid, owner_guid


### Guide to Elgg Entities

Blogs = Entities(subtype=5)
Group_Members = Users(relationship=member)
Discussions = Entities(subtype=7)
Pages = Entities(subtype=10)
Wire = Entities(subtype=17)

Content = Entities(subtype) -> entity_guid
    Elggmetadata(entity_guid) -> name_id, value_id
    Elggmetastrings(name_id OR value_id)
    
#### Comments
Blog is container entity - GUID = blog guid

Blog guid = 10
search container for blog guid, return container guid
elggmetadata(container_guid)
Elggmetastrings(name_id OR value_id)

#### Skills
user_GUID -> elggmetadata(container_guid) - name_id = 60

In [None]:
# Print list of table names

from sqlalchemy.engine import reflection

# Build an inspector on the engine and print the table names it reports.
# NOTE(review): Inspector.from_engine is deprecated from SQLAlchemy 1.4 in
# favour of sqlalchemy.inspect(engine) — confirm against the pinned version.
insp = reflection.Inspector.from_engine(engine)
print(insp.get_table_names())

In [None]:
# Set up subtype objects of interest

# Subtype id for each kind of content, keyed by a readable label.
subtypes = dict(
    blogs=5,
    discussions=7,
    pages=10,
    wires=17,
    files=1,
    images=19,
    bookmarks=8,
    ideas=42,
)

# The same ids as strings, in the order listed above; used to filter
# Entities.subtype in the content query.
subtype_list = [str(subtype_id) for subtype_id in subtypes.values()]

## Pull of all content

In [None]:
# Pull the core collaborative content with a single query.
# results maps each content guid to
# [guid, subtype, owner name (possibly anonymized), owner department, creation date].

results = {}

content_query = session.query(Entities, Users, Objects).filter(
    Entities.owner_guid == Users.guid,
    Entities.guid == Objects.guid,
    Entities.subtype.in_(subtype_list),
)

# The Objects row is only needed for the join, so it is not read here.
for entity, owner, _obj in content_query:
    results[entity.guid] = [
        entity.guid,
        entity.subtype,
        anonymize(bytes(owner.name, 'utf-8')),
        find_dept(owner.email.lower()),
        convert_unixtime(entity.time_created),
    ]

In [None]:
# Number of content items collected in results.
len(results)

## Pull users

In [None]:
# users maps each user guid to
# [guid, name (possibly anonymized), department, date joined, date of last login].
users = {}

for e, u in session.query(Entities, Users).filter(
    Entities.guid == Users.guid):
    users[e.guid] = [
        e.guid,
        anonymize(bytes(u.name, 'utf-8')),
        # Lower-case the address before the lookup, as the content query does,
        # so a user's department does not depend on how the email was typed.
        find_dept(u.email.lower()),
        convert_unixtime(e.time_created),
        convert_unixtime(u.last_login),
    ]

In [None]:
# Pull User Colleague Information
# Not using this - complicates the graph

# Each entry is (user name, friend name, date connected), with names taken
# from the users lookup.
colleagues = []

friend_rows = session.query(Relationships).filter(
    Relationships.relationship == 'friend')

for r in friend_rows:
    # Skip relationships where either side is missing from the users lookup.
    if r.guid_one not in users or r.guid_two not in users:
        continue
    colleagues.append((
        users[r.guid_one][1],
        users[r.guid_two][1],
        convert_unixtime(r.time_created),
    ))

In [None]:
# len(colleagues)

In [None]:
# Preview the first two colleague tuples.
colleagues[:2]

## Pull User comments

In [None]:
# Pull User comments
# Could also pull comments for analysis with MetaStrings as ms and ms.text

# Each entry is (annotating user's name, guid of the annotated entity, date).
# NOTE(review): the query joins every annotation row; nothing restricts it to
# one annotation type — confirm that all annotation kinds should count.
comments = []

for a, e, u in session.query(Annotations, Entities, Users).filter(
    Entities.guid == Annotations.entity_guid,
    Annotations.owner_guid == Users.guid):
        comments.append((
                anonymize(bytes(u.name, 'utf-8')),
                e.guid,
                # Date the interaction by when the annotation was made; using
                # e.time_created here stamped every comment with the creation
                # date of the content it was attached to.
                convert_unixtime(a.time_created)))

In [None]:
# Preview the first comment tuple.
comments[:1]

In [None]:
# Spot-check the record stored under guid 4 (raises KeyError if that guid was not pulled).
users[4]

## Create edges from comments to creators

In [None]:
# One [commenter, content owner, date] entry per comment whose target is in results.
edges = []

for user, content, created in comments:
    # Comments on anything outside the pulled content are ignored.
    if content not in results:
        continue
    owner_name = results[content][2]
    edges.append([user, owner_name, created])
    

In [None]:
# Preview two of the edges.
edges[2:4]

In [None]:
# Combine multiple interactions into a single edge with weight equal to the number of interactions

# Keyed by "source, target"; the stored date is that of the first interaction seen.
edge_dict = {}

for source, target, created in edges:
    # Skip people replying to their own creations.
    if source == target:
        continue

    key = "{}, {}".format(source, target)
    if key in edge_dict:
        # Repeat interaction: bump the weight of the existing edge.
        edge_dict[key]['weight'] += 1
    else:
        edge_dict[key] = {'source': source, 'target': target, 'weight': 1, 'date': created}
        

## Network Graphing

In [None]:
import networkx as nx

In [None]:
# Directed graph that the cells below fill with user nodes and interaction edges.
G = nx.DiGraph()

In [None]:
# Add one node per user record, keyed by the (possibly anonymized) name.
# NOTE(review): because the key is the name, users sharing a name end up on
# the same node — confirm that is acceptable.
for _guid, name, department, joined, last_login in users.values():
    G.add_node(
        name,
        department=department,
        joined=joined,
        last_login=last_login,
    )

In [None]:
# Disabled on purpose: the loop that would add the colleague pairs as edges is
# wrapped in a string literal, so it is not executed.
'''for edge in colleagues:
    user, friend, connected = edge
    
    G.add_edge(user, 
               friend, 
               date=connected, 
               weight=1,
               description='colleague')'''

In [None]:
# Add one weighted edge per aggregated interaction.
for attrs in edge_dict.values():
    G.add_edge(
        attrs['source'],
        attrs['target'],
        weight=attrs['weight'],
        date=attrs['date'],
        description='content',
    )

In [None]:
# Export the graph as GEXF, stamped with today's date. The file goes to
# output_path, which is defined for results but was otherwise unused; the
# original wrote into data_path instead.
gexf_name = "gcconnex_users_content_connections_{}.gexf".format(dt.date.today())
nx.write_gexf(G, os.path.join(output_path, gexf_name))