In [None]:
import sqlalchemy as sq
import pymysql

import pandas as pd
import numpy as np

import os
import csv

import datetime as dt

sq.__version__

In [None]:

def convert_unixtime(stamp):
    """Format a Unix timestamp as a ``YYYY-MM-DD`` date string.

    ``stamp`` is coerced with ``int()`` and then interpreted by
    ``datetime.fromtimestamp``, i.e. in the machine's local timezone.
    """
    moment = dt.datetime.fromtimestamp(int(stamp))
    return moment.strftime('%Y-%m-%d')

## Set up Dept List/Dict

In [None]:
# E-mail domain -> department acronym; filled in by the cells below.
dept_dict = dict()

# Input files and generated CSVs currently share one folder.
data_path = output_path = '/home/toferc/Documents/Data/'

In [None]:
# Load the domain -> acronym pairs from csv_keys.csv into dept_dict.
# The csv module expects files opened with newline='' so that newlines
# embedded in quoted fields are handled correctly.
with open(os.path.join(data_path, 'csv_keys.csv'), "r", newline='') as f:
    reader = csv.reader(f, delimiter=',')
    next(reader, None)  # skip the header row; tolerate an empty file

    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; unpacking it would raise.
            continue
        email, acronym = row
        dept_dict[email] = acronym

# Domains added by hand on top of the CSV contents.
# NOTE(review): several keys look like misspellings ('interenational.gc.ca',
# 'servicecanada.gc.ca.gc.ca', 'tribunbal.gc.ca', 'nrccan-rncan.gc.ca').
# Presumably they mirror addresses as they appear in the data; they are kept
# byte-for-byte here -- confirm before "correcting" any of them.
dept_dict.update({
    'cadets.gc.ca': 'CADETS',
    'canada.gc.ca': 'CANADA',
    'canada.ca': 'CANADA',
    'tribunal.gc.ca': 'TRIBUNAL',
    'cannor.gc.ca': 'CED/DEC',
    'ci-oic.gc.ca': 'CI/OIC',
    'ccgs-ngcc.gc.ca': 'CCGS/NGCC',
    'god.ccgs-ngcc.gc.ca': 'CCGS/NGCC',
    'clo-ocol.gc.ca': 'OCOL/CLO',
    'csps.gc.ca': 'CSPS/EFPC',
    'interenational.gc.ca': 'DFAITD/MAECD',
    'cnb-ncw.gc.ca': 'CNB/NCW',
    'ncw-cnb.gc.ca': 'CNB/NCW',
    'nfb.gc.ca': 'NFB/ONF',
    'nrccan-rncan.gc.ca': 'NRCAN/RNCAN',
    'nserc-crsng.gc.ca': 'NSERC/CRSNG',
    'pbc-clcc.gc.ca': 'PBC/CLCC',
    'pco.bcp.gc.ca': 'PCO/BCP',
    'pipsc.ca': 'PIPSC/IPFPC',
    'ps.sp.gc.ca': 'PS/SP',
    'servicecanada.gc.ca.gc.ca': 'HRSDC/RHDSC',
    'fintrac-canafe.gc.ca': 'FINTRAC',
    'gmail.com': 'GMAIL',
    'tribunbal.gc.ca': 'TRIBUNAL',
})

In [None]:
def find_dept(email, mapping=None):
    """Return the department acronym for an e-mail address.

    Parameters
    ----------
    email : str
        Address to classify. Everything after the first '@' is treated as
        the domain; when there is no '@' the whole string is used.
    mapping : dict, optional
        Domain -> acronym lookup. Defaults to the notebook-level
        ``dept_dict`` (resolved at call time).

    Returns
    -------
    str
        The mapped acronym, or "OTHER" when the domain is not in the mapping.
    """
    if mapping is None:
        mapping = dept_dict

    at_pos = email.find('@')
    tail = email[at_pos + 1:]  # at_pos == -1 -> the whole string

    # Exact match first, so every address that resolved before still resolves
    # to the same acronym.
    if tail in mapping:
        return mapping[tail]

    # Domain names are case-insensitive and raw addresses may carry stray
    # whitespace, so retry with a normalised domain before giving up.
    return mapping.get(tail.strip().lower(), "OTHER")

In [None]:
# Distinct department acronyms. Despite the name, the result is a set.
dept_list = set(dept_dict.values())

In [None]:
# Load CIOC members and FTE counts

cioc_depts = pd.read_csv(os.path.join(data_path, 'CIOC_depts_jan_2016.csv'),
                        thousands=',')

In [None]:
cioc_depts.describe()

## Connect to DB

In [None]:
import getpass

password = getpass.getpass('Enter Password: ')

In [None]:
# MariaDB = 165
# MySQL = 117

from urllib.parse import quote_plus

# URL-escape the password before embedding it: characters such as '@', '/'
# or ':' would otherwise be parsed as part of the URL structure.
db_connection = "mysql+pymysql://gctoolsdata:{}@192.168.1.99:3306/elgg".format(
    quote_plus(password))

In [None]:
engine = sq.create_engine(db_connection,encoding='latin1', echo=False)

In [None]:
conn = engine.connect()

In [None]:
engine.connect()

In [None]:
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy.ext.automap import automap_base
from sqlalchemy import and_, or_
Session = sessionmaker(bind=engine)

In [None]:
Session.configure(bind=engine)
session = Session()

In [None]:
Base = automap_base()

Base.prepare(engine, reflect=True)

In [None]:
# Set up mappings

Users = Base.classes.elggusers_entity
Groups = Base.classes.elgggroups_entity
Relationships = Base.classes.elggentity_relationships
Entities = Base.classes.elggentities
Objects = Base.classes.elggobjects_entity
MetaData = Base.classes.elggmetadata
MetaStrings = Base.classes.elggmetastrings
Annotations = Base.classes.elggannotations


### Guide to Elgg Entities

Blogs = Entities(subtype=5)
Group_Members = Users(relationship=member)
Discussions = Entities(subtype=7)
Pages = Entities(subtype=10)
Wire = Entities(subtype=17)

Content = Entities(subtype) -> entity_guid
    Elggmetadata(entity_guid) -> name_id, value_id
    Elggmetastrings(name_id OR value_id)
    
#### Comments
The blog is the container entity - the container GUID equals the blog GUID.

Example: blog GUID = 10
Search the container column for the blog GUID and return the container GUID
Elggmetadata(container_guid)
Elggmetastrings(name_id OR value_id)

#### Skills
user_GUID -> Elggmetadata(container_guid) - name_id = 60

In [None]:
# Subtype ids of the content types of interest.
subtypes = dict(
    blogs=5,
    discussions=7,
    pages=10,
    wires=17,
    files=1,
    images=19,
    bookmarks=8,
    ideas=42,
)

# The same ids as strings, in the same order as above.
subtype_list = ['5', '7', '10', '17', '1', '19', '8', '42']

## Pull Blogs

In [None]:
# Pull every blog (entities with subtype 5) together with the objects row
# that shares its guid, keeping (guid, title, description) per blog.

blogs = [
    (obj.guid, obj.title, obj.description)
    for _entity, obj in session.query(Entities, Objects).filter(
        Entities.subtype == 5,
        Objects.guid == Entities.guid)
]

In [None]:
# For each metadata row attached to a blog entity (subtype 5), pull the
# metastrings rows whose id matches either its name_id or its value_id.
# Each result is (entity guid, name_id, value_id, string id, string).
tags = [
    (entity.guid, meta.name_id, meta.value_id, meta_string.id,
     meta_string.string)
    for entity, meta, meta_string in session.query(
        Entities, MetaData, MetaStrings).filter(
            Entities.subtype == 5,
            Entities.guid == MetaData.entity_guid).filter(
            or_(MetaStrings.id == MetaData.value_id,
                MetaStrings.id == MetaData.name_id))
]

In [None]:
tags[:10]

In [None]:
# Build two lookups from the joined metadata rows in a single pass.

from collections import defaultdict

tag_dict = defaultdict(list)  # entity guid -> value_ids of its 'tags' metadata
strings = {}                  # metastring id -> string

for guid, name_id, value_id, string_id, string in tags:
    strings[string_id] = string
    # Keep only rows where the joined string is the metadata *name* and that
    # name is 'tags'; value_id is then the id of the tag string itself.
    if string_id == name_id and string == 'tags':
        tag_dict[guid].append(value_id)
    

def replace_string_id(tag_list, lookup=None):
    """Translate metastring ids into lower-cased strings.

    Parameters
    ----------
    tag_list : iterable
        Metastring ids to translate.
    lookup : dict, optional
        id -> string mapping. Defaults to the notebook-level ``strings``
        (resolved at call time).

    Returns
    -------
    list of str
        Lower-cased strings, in input order. Ids that are missing from the
        lookup (or map to None) are skipped; previously ``.get()`` returned
        None for them and ``.lower()`` raised AttributeError.
    """
    if lookup is None:
        lookup = strings
    return [lookup[t].lower() for t in tag_list if lookup.get(t) is not None]

# Resolve each entity's tag ids to strings; kept as a defaultdict(list).
final_tags = defaultdict(list)
final_tags.update(
    (guid, replace_string_id(tag_ids)) for guid, tag_ids in tag_dict.items()
)


In [None]:
len(tag_dict)

In [None]:
# Metastring id -> string, rebuilt from the blog metadata rows in `tags`.
# Each row is (guid, name_id, value_id, string_id, string).
strings = {row[3]: row[4] for row in tags}


In [None]:
len(strings)

In [None]:
tag_dict[1499271]

In [None]:

def replace_string_id(tag_list, lookup=None):
    """Translate metastring ids into lower-cased strings.

    Parameters
    ----------
    tag_list : iterable
        Metastring ids to translate.
    lookup : dict, optional
        id -> string mapping. Defaults to the notebook-level ``strings``
        (resolved at call time).

    Returns
    -------
    list of str
        Lower-cased strings, in input order. Ids that are missing from the
        lookup (or map to None) are skipped; previously ``.get()`` returned
        None for them and ``.lower()`` raised AttributeError.
    """
    if lookup is None:
        lookup = strings
    return [lookup[t].lower() for t in tag_list if lookup.get(t) is not None]

'''for element in tag_dict[1499271]:
    print(element)
    tag_dict[element] = strings.get(tag)'''

In [None]:
replace_string_id(tag_dict[1499271])

In [None]:
# Resolve each entity's tag ids to strings; kept as a defaultdict(list).
final_tags = defaultdict(list)
final_tags.update(
    (guid, replace_string_id(tag_ids)) for guid, tag_ids in tag_dict.items()
)

In [None]:
final_tags

In [None]:
len(final_tags)

In [None]:
# Tally how many times each tag string appears across all entities.

tag_frequency = defaultdict(int)

for tag_names in final_tags.values():
    for tag in tag_names:
        tag_frequency[tag] += 1

In [None]:
tag_frequency

In [None]:
tag_freq = pd.DataFrame.from_dict(tag_frequency, orient='index')

In [None]:
tag_freq.columns = ['frequency']
tag_freq.head()

In [None]:
tag_freq.sort_values(by='frequency', inplace=True, ascending=False)

In [None]:
tag_freq.head()

In [None]:
%matplotlib inline

tag_freq.head(50).plot()

In [None]:
tag_freq.to_csv(os.path.join(output_path, 'blog_tags.csv'))

In [None]:
from collections import OrderedDict

In [None]:
# Tags ordered from most to least frequent; equal counts keep the order in
# which they appear in tag_frequency (sorted() is stable).

sorted_tag_freq = OrderedDict(
    sorted(tag_frequency.items(), key=lambda pair: -pair[1])
)

In [None]:
sorted_tag_freq

In [None]:
blogs[7]

In [None]:
blogs[11]

## Groups

In [None]:
# Pull every group joined to its entity row (same guid), keeping
# (guid, name, description, owner guid, creation date as YYYY-MM-DD).

groups = [
    (grp.guid, grp.name, grp.description, ent.owner_guid,
     convert_unixtime(ent.time_created))
    for grp, ent in session.query(Groups, Entities).filter(
        Entities.guid == Groups.guid)
]

In [None]:
groups[:20]

In [None]:
# For each metadata row attached to a group's entity, pull the metastrings
# rows whose id matches either its name_id or its value_id.
# Each result is (entity guid, name_id, value_id, string id, string).

group_tags = [
    (entity.guid, meta.name_id, meta.value_id, meta_string.id,
     meta_string.string)
    for _group, entity, meta, meta_string in session.query(
        Groups, Entities, MetaData, MetaStrings).filter(
            Groups.guid == Entities.guid,
            Entities.guid == MetaData.entity_guid).filter(
            or_(MetaStrings.id == MetaData.value_id,
                MetaStrings.id == MetaData.name_id))
]

In [None]:
group_tags

In [None]:
# Scripts for sorting tags and linking them to guids

from collections import defaultdict


In [None]:
# Entity guid -> value_ids of that group's 'interests' metadata.
tag_dict = defaultdict(list)

for guid, name_id, value_id, string_id, string in group_tags:
    # Only rows where the joined string is the metadata name 'interests'.
    if string_id == name_id and string == 'interests':
        tag_dict[guid].append(value_id)

In [None]:
# Metastring id -> string, rebuilt from the group metadata rows.
# Each row is (guid, name_id, value_id, string_id, string).
strings = {row[3]: row[4] for row in group_tags}

In [None]:
def replace_string_id(tag_list, lookup=None):
    """Translate metastring ids into lower-cased strings.

    Parameters
    ----------
    tag_list : iterable
        Metastring ids to translate.
    lookup : dict, optional
        id -> string mapping. Defaults to the notebook-level ``strings``
        (resolved at call time).

    Returns
    -------
    list of str
        Lower-cased strings, in input order. Ids that are missing from the
        lookup (or map to None) are skipped; previously ``.get()`` returned
        None for them and ``.lower()`` raised AttributeError.
    """
    if lookup is None:
        lookup = strings
    return [lookup[t].lower() for t in tag_list if lookup.get(t) is not None]


In [None]:
# Resolve each group's interest ids to strings; kept as a defaultdict(list).
final_tags = defaultdict(list)
final_tags.update(
    (guid, replace_string_id(tag_ids)) for guid, tag_ids in tag_dict.items()
)


In [None]:
final_tags

In [None]:
# Associate groups and tags

class GCconnexGroup(object):
    """Plain record describing one group.

    Attributes mirror the constructor arguments: ``name``, ``description``,
    ``owner``, ``created`` and ``tags``.
    """

    def __init__(self, name, description, owner, created, tags):
        self.name = name
        self.description = description
        self.owner = owner
        self.created = created
        self.tags = tags

    def info(self):
        """Print a multi-line summary of the group to stdout."""
        # The template used to start with a named field ({name}) while only
        # positional arguments were supplied, so str.format raised
        # KeyError: 'name'. All five fields are positional now.
        print('''
        Name: {}
        Description: {}
        Owner: {}
        Date Created: {}
        Tags: {}'''.format(self.name, self.description, self.owner,
                          self.created, self.tags))


In [None]:
from bs4 import BeautifulSoup

In [None]:
# One [name, description, owner guid, created, tags] row per group, with the
# name and description passed through BeautifulSoup and reduced to .text.
group_info = []

for guid, raw_name, raw_description, owner_guid, created in groups:
    name_text = BeautifulSoup(raw_name, "lxml").text
    description_text = BeautifulSoup(raw_description, "lxml").text
    # Local names are deliberately distinct from the notebook-level `tags`
    # (the blog metadata rows): the loop used to rebind `tags`, `name` and
    # `description`, silently breaking any later re-run of the blog cells.
    # Groups without tags get the string "None", as before.
    interest_tags = final_tags.get(guid, "None")
    group_info.append(
        [name_text, description_text, owner_guid, created, interest_tags])

In [None]:
len(group_info)

In [None]:
group_df = pd.DataFrame(group_info)

In [None]:
group_df.head()

### Latent Dirichlet Allocation (LDA)

In [None]:
import logging
import gensim
import bz2
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
from gensim.utils import simple_preprocess, lemmatize
from gensim.parsing.preprocessing import STOPWORDS as STOPWORDS

from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize

tokenizer = RegexpTokenizer(r'\w+')

In [None]:
# Other method for stopwords - not using here.

from nltk.corpus import stopwords
import nltk

# English and French stop-word sets from the NLTK stopwords corpus.
en_stops, fr_stops = (set(stopwords.words(language))
                      for language in ('english', 'french'))

# Extra project-specific stop words (a plain list, not a set).
public_service_stops = [
    'public', 'service', 'canada', 'work', 'http',
    'https', 'travail', 'gcconnex', 'url',
]

In [None]:
public_service_stops

In [None]:
def tokenize(text):
    """Tokenize ``text`` and drop stop words and short tokens.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept only
    when it is absent from ``STOPWORDS``, ``fr_stops`` and
    ``public_service_stops`` and is longer than three characters.
    """
    kept = []
    for token in gensim.utils.simple_preprocess(text):
        if token in STOPWORDS or token in fr_stops:
            continue
        if token in public_service_stops or len(token) <= 3:
            continue
        kept.append(token)
    return kept

In [None]:
print(tokenize(blogs[1][2]))

In [None]:
#meaningful_words = [w for w in tokens if not w in en_stop if not w in fr_stop]

In [None]:
#print(meaningful_words)

In [None]:
from nltk.stem.porter import PorterStemmer

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()

In [None]:
# Didn't end up going with this.
# NOTE: kept commented out because `meaningful_words` is only assigned in a
# commented-out line earlier in the notebook, so running this cell on a fresh
# kernel raised NameError. `texts` is rebuilt from scratch further down.

# texts = [p_stemmer.stem(i) for i in meaningful_words]
# print(texts)

In [None]:
# Set up loop to do this for all blogs
from bs4 import BeautifulSoup

# Accumulates one token list per blog; filled by blog_to_words().
texts = []

def blog_to_words(raw_blog):
    """Tokenize one blog and append the tokens to the notebook-level ``texts``.

    ``raw_blog`` is a (guid, title, description) tuple as stored in ``blogs``;
    only the description (index 2) is used. It is parsed with BeautifulSoup
    and the extracted text is passed to ``tokenize()``. Returns None.

    An earlier pipeline (RegexpTokenizer + stop-word filtering + Porter
    stemming) was replaced by ``tokenize()``.
    """
    # A stray pasted path after the colon on the `def` line used to make this
    # whole cell a SyntaxError.
    clean_blog = BeautifulSoup(raw_blog[2], "lxml")
    texts.append(tokenize(clean_blog.get_text()))

In [None]:
# Tokenize every blog, reporting progress after each batch of 500.
for done, raw_blog in enumerate(blogs, start=1):
    blog_to_words(raw_blog)
    if done % 500 == 0:
        print("Converted {} of {} blogs.".format(done, len(blogs)))

print("Done!")

In [None]:
# Went to bed here
texts[9422]

In [None]:
from gensim import corpora, models

dictionary = corpora.Dictionary(texts) # could include prune_at=2000

In [None]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [None]:
dictionary.save('gcconnex_blogs_dictionary_data')

In [None]:
# Generate the LDA model for our blog corpus
# Requests 30 topics over `corpus`, with chunksize=1000 and a single pass;
# `dictionary` is supplied as the id -> word mapping.
# NOTE(review): no random_state is passed, so presumably the topics differ
# between runs -- confirm, and pin a seed if reproducibility matters.

ldamodel = models.ldamulticore.LdaMulticore(corpus, num_topics=30,
                                           id2word = dictionary,
                                            chunksize=1000,
                                            passes=1)


In [None]:
ldamodel.print_topics(num_topics=20, num_words=3)

In [None]:
# Consider trying Kaggle.com word2vec tutorial

In [None]:
ldamodel.top_topics(corpus, num_words=8)

In [None]:
from pprint import pprint

In [None]:
from collections import defaultdict, OrderedDict

In [None]:
frequency = defaultdict(int)

# Count the frequency of each token from our text.
# This has to happen BEFORE the filter below: the filter used to run first,
# when every count was still 0, so it removed every token from every text.
for text in texts:
    for token in text:
        frequency[token] += 1

# Keep only tokens that occur more than 5 times and are longer than 3 letters.
# `frequency` keeps the counts taken over the unfiltered texts.
texts = [[token for token in text
          if frequency[token] > 5 and len(token) > 3]
         for text in texts]

In [None]:
# Tokens ordered from most to least frequent; equal counts keep the order in
# which they appear in frequency (sorted() is stable).

sorted_freq = OrderedDict(
    sorted(frequency.items(), key=lambda pair: -pair[1])
)

In [None]:
sorted_freq

In [None]:
tfidf_model = models.tfidfmodel.TfidfModel(
    corpus)

In [None]:
# Each blog is a (guid, title, description) tuple, so it is written as one
# CSV row; file.write() only accepts str and raised TypeError on the tuple.
# newline='' is what the csv module expects for files it writes to.
with open('blogs.csv', 'w', encoding='latin-1', newline='') as f:
    writer = csv.writer(f)
    for blog in blogs:
        writer.writerow(blog)


In [None]:
# Prep for NLTK analysis

# `blogs` holds (guid, title, description) tuples and str.join only accepts
# strings, so join the description of each blog (skipping empty/None ones).
# NOTE(review): assumes the description is the text wanted here -- confirm.
full_text = "\n".join(blog[2] for blog in blogs if blog[2])


tokens = word_tokenize(full_text)
text = nltk.Text(tokens)
# `raw` was never defined; sentence-split the same text that was tokenized.
sens = nltk.sent_tokenize(full_text)

In [None]:
freq_long(text)

In [None]:
pos_trigrams(text)