# Using machine learning to classify the community of GCconnex blogs 
### (and other content)

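This notebook loads previously prepared artefacts (blog data and tokens, community labels, gensim dictionaries, LDA models, a count vectorizer and a multilabel classifier), tests the classifier on two sample texts both manually and through a scikit-learn pipeline, and begins setting up a Keras model for the same prediction task.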

In [None]:
# Imports for basic Python

import sqlalchemy as sq
import pymysql

from collections import OrderedDict, defaultdict

import pandas as pd
import numpy as np
import pickle

import os
import csv

import datetime as dt

sq.__version__

In [None]:
# Import Gensim

from gensim import corpora, models
from gensim.utils import simple_preprocess, lemmatize
from gensim.parsing.preprocessing import STOPWORDS as STOPWORDS

from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize

# Regex tokenizer used by tokenize(): each token is a run of \w characters,
# so punctuation and whitespace never appear in the output.
tokenizer = RegexpTokenizer(r'\w+')

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

from nltk.corpus import stopwords
import nltk

# create English stop word list
# NOTE(review): en_stops is not referenced anywhere else in the visible notebook;
# tokenize() filters English words with gensim's STOPWORDS instead.
en_stops = set(stopwords.words('english'))
# French stop word list (checked by tokenize())
fr_stops = set(stopwords.words('french'))

# Add certain additional stop words
# (the literal is whitespace-split into a plain list of words; checked by tokenize())
public_service_stops = '''public service canada work http 
https travail gcconnex url'''.split()

# Set up stemmer
# (shared English Snowball stemmer used by lemmatize_stemming())
stemmer = SnowballStemmer("english")

In [None]:
# Text Preprocessing functions

def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``) with a fresh
    ``WordNetLemmatizer`` and the lemma is then passed through the
    module-level Snowball ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def tokenize(text):
    """Split ``text`` into filtered, lemmatized and stemmed tokens.

    ``text`` is coerced with ``str()`` and split by the module-level
    ``tokenizer``. A token is kept only when it is absent from gensim's
    ``STOPWORDS``, from ``fr_stops`` and from ``public_service_stops`` and is
    longer than three characters. The checks run on the raw token, before
    ``lemmatize_stemming`` is applied, and no case folding is done first.

    Returns a list of processed tokens in their original order.
    """
    kept = []
    for raw_token in tokenizer.tokenize(str(text)):
        if raw_token in STOPWORDS:
            continue
        if raw_token in fr_stops:
            continue
        if raw_token in public_service_stops:
            continue
        if len(raw_token) <= 3:
            continue
        kept.append(lemmatize_stemming(raw_token))
    return kept

In [None]:
# Import scikit-learn

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import MultiLabelBinarizer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn import metrics

from sklearn.pipeline import Pipeline

In [None]:
# Set data paths

# The defaults are the original absolute locations. Set GCCONNEX_DATA_PATH /
# GCCONNEX_OUTPUT_PATH in the environment to run the notebook on another
# machine without editing this cell.
data_path = os.environ.get('GCCONNEX_DATA_PATH', '/home/chris/data/')
output_path = os.environ.get('GCCONNEX_OUTPUT_PATH', '/home/chris/data/')

In [None]:
# utility scripts

def save_obj(obj, name):
    """Pickle ``obj`` to ``<data_path>/<name>.pkl``.

    ".pkl" is always appended to ``name``; an existing file at that location
    is overwritten.
    """
    target = os.path.join(data_path, "{}.pkl".format(name))
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle)

def load_obj(name):
    """Unpickle and return the object stored at ``<data_path>/<name>.pkl``.

    ".pkl" is always appended, so ``name`` is expected without the extension.
    """
    # Unpickling can execute arbitrary code: only load files from a trusted source.
    source = os.path.join(data_path, "{}.pkl".format(name))
    with open(source, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [None]:
# Loading data
# See SQLAlchemy GCconnex Classifier - April 27 for initial loads

In [None]:
# Load blogs

# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0.
# read_csv with index_col=0 and parse_dates=True reproduces its defaults
# (first column as index, index parsed as dates where possible).
raw_blogs = pd.read_csv(
    os.path.join(data_path, "gcconnex_blogs_info.csv"),
    index_col=0, parse_dates=True)

# Blog tokens unpickled from <data_path>/blog_tokens.pkl
blog_tokens = load_obj("blog_tokens")

In [None]:
# Load communities data

# Each object below is unpickled from <data_path>/<name>.pkl via load_obj().

# dictionary of communities and associated tags
communities = load_obj("communities")

# dictionary of community count for each blog
blog_communities = load_obj("blog_communities")

# list of names for each community
# (indexed by position in predict_communities() to label each prediction)
community_names = load_obj("community_names")

# array of binary (0,1) labels for each community for each blog
# (stored on disk under the name "binarized_labels")
community_labels = load_obj("binarized_labels")

In [None]:
# Load dictionaries

# gensim Dictionary objects loaded from files under data_path.
# The file names carry no extension, unlike the pickles read via load_obj().

# Dictionary for blog content
content_dictionary = corpora.Dictionary.load(
    os.path.join(data_path, "gcconnex_blogs_content_dictionary"))

# Dictionary for blog tags
tag_dictionary = corpora.Dictionary.load(
    os.path.join(data_path, "gcconnex_blogs_tags_dictionary"))

# Dictionary for community tags
community_tags_dictionary = corpora.Dictionary.load(
    os.path.join(data_path, "community_tags_dictionary"))

In [None]:
# Load LDA model(s)

# NOTE(review): load_obj() already appends ".pkl", so these calls open
# "gcconnex_blogs_lda_model.pkl.pkl" and "gcconnex_blogs_tfidf_lda_model.pkl.pkl".
# Confirm the files on disk really carry the doubled extension; if not, drop
# the ".pkl" suffix from the names passed here (every other load_obj() call in
# this notebook passes the bare name).
lda_model = load_obj("gcconnex_blogs_lda_model.pkl")

lda_model_tfidf = load_obj("gcconnex_blogs_tfidf_lda_model.pkl")

In [None]:
# Load vectorizers based on the tokenized and lemmatized blogs

count_vectorizer = load_obj("count_vectorizer")

In [None]:
# Load classifiers

multilabel_clf = load_obj("multilabel_clf")

## Testing

In [None]:
blueprint = '''
Blueprint 2020 is a vision for a world-class Public Service equipped to serve Canada and Canadians now and into the future.

With around a quarter-million employees, the Public Service of Canada is the largest employer in the country. We work across more than a hundred departments and agencies, delivering important services to Canadians – from providing old age security and employment insurance benefits to protecting Canadian sovereignty to helping families save for higher education. We are responsible for regulating the safety of food and drugs, undertaking research and development to protect our shared environment, promoting Canada’s national interests around the world, and developing economic, trade and energy policies, among many other duties. We make a difference in the lives of Canadians every day. To ensure continued excellence in public service requires us to always ask how we can improve both our performance and our value to Canadians. The Blueprint 2020 initiative was devised to help us ask these questions, and to allow us to build tomorrow’s Public Service together.

Since June 2013, tens of thousands of public servants have shared their views on what it takes to ensure public service excellence. This input is redefining how we work and is making engagement part of our shared culture.

A team of employees working at the Canada Border Service Agency
By making a space to discuss our passion for public service and to act on our ideas for improvement, we are now well on our way to turning vision into reality. Building the Public Service of the future is a process that requires us all to commit to action, and to dedicate the time and effort necessary to see it through. We are all personally accountable for bringing about real change and realizing our ambitious goals.

For some examples of innovations that have arisen as a result of engaged employees working together to deliver results for Canadians, please check out the Clerk of the Privy Council’s latest Annual Report to the Prime Minister on the Public Service of Canada.

Blueprint 2020 Principles:
The Blueprint 2020 vision is guided by four principles, as outlined in the document Blueprint 2020: Getting Started – Getting Your Views, that help examine how work is done in the federal Public Service:

An open and networked environment that engages citizens and partners for the public good.
A whole-of-government approach that enhances service delivery and value for money.
A modern workplace that makes smart use of new technologies to improve networking, access to data and customer service.
A capable, confident and high-performing workforce that embraces new ways of working and mobilizes the diversity of talent to serve the country’s evolving needs.

'''

In [None]:
asprof = '''About our Administrative Professionals Competency Profiles

The AS Community Initiative has decided to align the four core competencies into a comprehensive document that enables integrated human resources practices. The objective of this approach is to:

• motivate the employees to take charge of their future and better understand what is expected of them

• help managers evaluate the core competencies effectively and fairly through the performance management cycle

• accelerate, improve and shape human resources processes

Content

The definition of a competency is the knowledge, skills, abilities and behaviours that employees use to perform their work.

Each behavioural competency presented in this document has a definition and a progressive scale of effective behaviours. The definition explains what the competency means in general, and the progressive scale has five

different levels that identify the expected behaviours to be demonstrated by the Treasury Board of Canada Secretariat’s AS and CR employees. The more you progress in the scale, the more your behaviours require a

broader perspective to take action on more complex situations.

Using the Administrative Professionals Competency Profiles

In order to fully understand the competencies in this document, it is important to first read the definition and then look at the progression of scale. Doing this will give you a complete picture of what is expected for each

competency.

There are two components to a competency: definition and scale. The definition explains what the competency means. This explanation provides a common language that everyone in the department can use. Each

competency also has a progressive scale, which is divided into five levels with a description of what behaviours are expected throughout. Each competency scale is cumulative, which means that, although behaviours at

lower levels are not repeated at higher levels, they nonetheless apply. As you progress through the scale, the expected behaviours grow from reactive to strategic. A reactive behaviour means that the employee is

responsive to a situation and may be prompted by someone else, such as a supervisor or a client. A strategic behaviour takes into consideration a broader scope in order to plan and take proactive action in a complex

situation.

NOTE: These are general guidelines. AS and CR employees perform a wide range of duties. Therefore, the levels indicated will vary depending on the position. For example, some AS-04 positions

may, in fact, require a combination of behaviours that have been linked to the AS-03 and AS-5 levels in this document. However, you can still use the competency profile to identify what would be the

next logical behavior for you to work on.

Questions

If you have any questions related to the understanding of these Administrative Professionals Competencies Profiles, we invite you to send an email to the AS-Initiative- AS mailbox.'''

In [None]:
# Keyword extraction using slightly modified nltk_rake

import nltk_rake

rake = nltk_rake.RakeKeywordExtractor()

In [None]:
keywords = rake.extract(asprof, incl_scores=True)

In [None]:
keywords[0:5]

In [None]:
# manual pre-processing of text - Need to build this into pipeline

# Tokenize the `asprof` sample text and re-join the tokens with single spaces,
# giving one pre-processed document string.
bp_tokens = " ".join(tokenize(asprof))

# Vectorize it as a one-document batch; the bare name on the last line
# displays the resulting matrix.
bp_tokens_count = count_vectorizer.transform([bp_tokens])
bp_tokens_count

In [None]:
# Classifier output for the manually pre-processed sample: probability
# estimates first, then the predicted labels.
# NOTE: `predict` is rebound further down by the pipeline prediction cell.
predict_prob = multilabel_clf.predict_proba(bp_tokens_count)
predict = multilabel_clf.predict(bp_tokens_count)

This gives us a strong read along several of the communities.

In [None]:
# Script to generate simple community outputs

def predict_communities(predict_array, names=None):
    """Print one ``"<community>: <value>"`` line per predicted community.

    Parameters
    ----------
    predict_array : array-like
        Prediction output for one or more documents. A single row (or a 1-D
        array) prints exactly as before. With several rows, each row is
        printed under its own ``"Document <i>:"`` header; the previous
        implementation flattened all rows and raised ``IndexError`` once the
        flat index ran past the end of the community names.
    names : sequence of str, optional
        Labels paired with the columns by position. Defaults to the
        module-level ``community_names``.

    Raises
    ------
    IndexError
        If a row has more columns than there are names.
    """
    if names is None:
        names = community_names

    scores = np.atleast_2d(np.asarray(predict_array))
    if scores.ndim > 2:
        # Collapse any leading axes so that each row is one document.
        scores = scores.reshape(-1, scores.shape[-1])

    several_documents = scores.shape[0] > 1
    for doc_index, row in enumerate(scores):
        if several_documents:
            print("Document {}:".format(doc_index))
        for column, value in enumerate(row):
            print("{}: {}".format(names[column], value))

In [None]:
# Communities with probabilities

predict_communities(predict_prob)

In [None]:
# Communities with binary tags

predict_communities(predict)

## Pipeline

Now to use the pipeline we developed.

In [None]:
# Set up pipeline to vectorize and classify data
# NOTE: the custom tokenize() step is not part of this pipeline; raw text is
# passed straight to the loaded count_vectorizer and then to the loaded
# multilabel classifier.
classification_pipeline = Pipeline([
    ('vectorizer', count_vectorizer),
    ('clf', multilabel_clf),
])

In [None]:
predict = classification_pipeline.predict([blueprint])

In [None]:
predict_communities(predict)

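Of note, because the pipeline skips our custom tokenization (lemmatizing, stemming and stop-word removal), it shows much weaker signals for each of the communities. (The pipeline test above also uses the `blueprint` text, whereas the manual test used `asprof`, so the two results are not directly comparable.) We need to either embed our specific tokenization (or TF-IDF) into the pipeline or set up a custom job.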

## Deep learning

Set up for using Keras and developing a deep learning algorithm for prediction

In [None]:
# Import Keras libraries

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils

In [None]:
# Need to build a better model here - CNN
# alternating dense and non-linear layers

# Two fully connected layers: a ReLU hidden layer followed by a sigmoid output.
# NOTE(review): 51658 is presumably the count_vectorizer vocabulary size and 15
# the number of communities - confirm against the loaded artefacts.
model = Sequential()
model.add(Dense(15, input_dim=51658, activation="relu", kernel_initializer="normal"))
model.add(Dense(15, activation="sigmoid", kernel_initializer="normal"))

# Compile model
# The output layer is sigmoid, i.e. one independent probability per unit, and
# the targets are assumed to be the binarized 0/1 community labels (a blog may
# belong to several communities). That multi-label setup calls for binary
# cross-entropy; categorical cross-entropy expects a single class distribution
# per sample.
print("Compiling model...")
model.compile(loss='binary_crossentropy', optimizer='adam',
             metrics=['accuracy'])

In [None]:
# Need to pull in the raw data here to do the training on
# NOTE(review): X and train_target are not defined anywhere in the visible
# notebook, so this cell raises NameError on a fresh kernel until the training
# features and targets are loaded.

# Train with 20% of the samples held out for validation, then print the
# per-epoch metrics recorded in the returned History object.
hist = model.fit(X, train_target, validation_split=0.2)
print("")
print(hist.history)