This notebook develops a prototype classifier for GCconnex blog data, following the approach described in the MonkeyLearn blog post linked below.  It started as a copy of the LDA-Blogs notebook.

https://blog.monkeylearn.com/creating-machine-learning-models-to-analyze-startup-news/

In [None]:
import sqlalchemy as sq
import pymysql

import pandas as pd
import numpy as np

import os
import csv

import datetime as dt

sq.__version__

In [None]:
def convert_unixtime(stamp):
    """Format a Unix timestamp as a ``YYYY-MM-DD`` date string.

    ``stamp`` is passed through ``int()`` first, so numeric strings and
    floats are accepted.  The conversion uses ``datetime.fromtimestamp``
    without a timezone argument, i.e. the local timezone of the machine.
    """
    seconds = int(stamp)
    moment = dt.datetime.fromtimestamp(seconds)
    return moment.strftime('%Y-%m-%d')

In [None]:
def convert_snakecase(text_list):
    """Return a new list with every string of ``text_list`` in snake_case.

    Each entry is lower-cased, split on whitespace and re-joined with
    underscores, so runs of whitespace collapse to a single ``_`` and
    leading/trailing whitespace disappears.
    """
    return ["_".join(entry.lower().split()) for entry in text_list]
    

In [None]:
import pickle

def save_obj(obj, name, directory=None):
    """Pickle ``obj`` to ``<directory>/<name>.pkl``.

    Args:
        obj: Any picklable object.
        name: File name without the ``.pkl`` extension.
        directory: Folder to write into.  When omitted, the module-level
            ``data_path`` is used (looked up at call time, so it must have
            been assigned before the first call).
    """
    if directory is None:
        directory = data_path
    target = os.path.join(directory, "{}.pkl".format(name))
    with open(target, 'wb') as f:
        pickle.dump(obj, f)

def load_obj(name, directory=None):
    """Load and return the object pickled at ``<directory>/<name>.pkl``.

    Args:
        name: File name without the ``.pkl`` extension.
        directory: Folder to read from.  When omitted, the module-level
            ``data_path`` is used (looked up at call time).

    Returns:
        The unpickled object.
    """
    if directory is None:
        directory = data_path
    source = os.path.join(directory, "{}.pkl".format(name))
    # NOTE(security): unpickling executes arbitrary code embedded in the file;
    # only load pickles that this project wrote itself.
    with open(source, 'rb') as f:
        return pickle.load(f)

In [None]:
data_path = '/home/chris/data/'
output_path = '/home/chris/data/'

## Connect to DB and pull info from GCconnex

In [None]:
import getpass

password = getpass.getpass('Enter Password: ')

In [None]:
# MariaDB = 165
# MYSQL = 117

from urllib.parse import quote_plus

# The password is percent-encoded before being placed in the URL; without
# this, characters such as '@', ':' or '/' in the password would be parsed
# as URL delimiters and the connection string would be misread.
db_connection = "mysql+pymysql://gctoolsdata:{}@192.168.1.99:3306/elgg".format(
    quote_plus(password))

In [None]:
engine = sq.create_engine(db_connection,encoding='latin1', echo=False)

In [None]:
conn = engine.connect()

In [None]:
engine.connect()

In [None]:
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy.ext.automap import automap_base
from sqlalchemy import and_, or_
Session = sessionmaker(bind=engine)

In [None]:
Session.configure(bind=engine)
session = Session()

In [None]:
Base = automap_base()

Base.prepare(engine, reflect=True)

In [None]:
# Set up mappings

Users = Base.classes.elggusers_entity
Groups = Base.classes.elgggroups_entity
Relationships = Base.classes.elggentity_relationships
Entities = Base.classes.elggentities
Objects = Base.classes.elggobjects_entity
MetaData = Base.classes.elggmetadata
MetaStrings = Base.classes.elggmetastrings
Annotations = Base.classes.elggannotations


### Guide to Elgg Entities

Blogs = Entities(subtype=5)
Group_Members = Users(relationship=member)
Discussions = Entities(subtype=7)
Pages = Entities(subtype=10)
Wire = Entities(subtype=17)

Content = Entities(subtype) -> entity_guid
    Elggmetadata(entity_guid) -> name_id, value_id
    Elggmetastrings(name_id OR value_id)
    
#Comments
Blog is container entity - GUID = blog guid

Blog guid = 10
search container for blog guid, return container guid
elggmetadata(container_guid)
Elggmetastrings(name_id OR value_id)

#Skills
user_GUID -> elggmetadata(container_guid) - name_id = 60

In [None]:
# Set up subtype objects of interest

subtypes = {'blogs': 5,
            'discussions': 7,
            'pages': 10,
            'wires': 17,
            'files': 1,
            'images': 19,
            'bookmarks': 8,
            'ideas': 42
           }

subtype_list = "5 7 10 17 1 19 8 42".split()

## Pull Users

In [None]:
class UsersObject(object):  # Pulls in the entire users database
    """Loaders that read GCconnex user records into pandas DataFrames.

    Every loader is a static method that works through the module-level
    ``session`` and ``conn`` objects, so the class is used as a namespace:
    ``UsersObject.department()``.
    """

    # Unix-timestamp columns that every loader converts to date strings.
    _TIMESTAMP_COLUMNS = (
        'last_action',
        'prev_last_action',
        'last_login',
        'prev_last_login',
    )

    def __init__(self):
        pass

    @staticmethod
    def _convert_timestamps(frame, extra_columns=()):
        """Convert the timestamp columns of ``frame`` in place and return it.

        ``extra_columns`` are converted after the standard four columns.
        """
        for column in UsersObject._TIMESTAMP_COLUMNS + tuple(extra_columns):
            frame[column] = frame[column].apply(convert_unixtime)
        return frame

    @staticmethod
    def get_all():  # Grabs entire table
        """Return every row of the users table with dates made readable."""
        user_query = session.query(Users).statement

        users = pd.read_sql(user_query, conn)

        return UsersObject._convert_timestamps(users)

    @staticmethod
    def filter_department(filter_condition):
        """Return the users matching a textual SQL filter condition.

        Args:
            filter_condition: SQL boolean expression applied to the users
                query as a text clause.
        """
        # NOTE(security): the condition is inserted into the query as raw SQL.
        # Only pass trusted, hand-written conditions, never user input.
        # `text` is reached through the `sq` alias because the notebook never
        # imports it by name.
        condition = sq.text("{}".format(filter_condition))
        statement = session.query(Users).filter(condition).statement

        users = pd.read_sql(statement, conn)

        return UsersObject._convert_timestamps(users)

    @staticmethod
    def department():  # Issue : doesn't pull all members. That's bad.
        """Return users joined with their creation time and a metadata string.

        The metadata string is also exposed under the column name
        ``organization``.
        """
        # NOTE(review): the filters below act as inner joins, so presumably
        # users without a metadata row for name_id 8667 are dropped, which
        # may be the cause of the issue noted above. TODO confirm.
        statement = session.query(
            Users.guid,
            Users.name,
            Users.email,
            Users.last_action,
            Users.prev_last_action,
            Users.last_login,
            Users.prev_last_login,
            Entities.time_created,
            MetaStrings.string
        )

        statement = statement.filter(MetaStrings.id == MetaData.value_id)
        statement = statement.filter(MetaData.name_id == 8667)
        statement = statement.filter(MetaData.entity_guid == Users.guid)
        statement = statement.filter(Entities.guid == Users.guid)
        statement = statement.statement

        users_department = pd.read_sql(statement, conn)

        UsersObject._convert_timestamps(
            users_department, extra_columns=('time_created',))
        users_department['organization'] = users_department['string']

        return users_department

In [None]:
users = UsersObject.department()

In [None]:
users.describe()

## Pull Blogs

In [None]:
# Test for pulling out blog info
# Cut out: entity.guid, entity.subtype, user.name, objects.title, 

blogs = []

for entity, objects in session.query(
    Entities, Objects).filter(
        Entities.subtype == 5,
        Objects.guid == Entities.guid):
    blogs.append((objects.guid, objects.title, objects.description))

In [None]:
tags = []

for entity, data, strings in session.query(
    Entities, MetaData, MetaStrings).filter(
        Entities.subtype == 5,
        Entities.guid == MetaData.entity_guid).filter(
        or_ (MetaStrings.id == MetaData.value_id,
        MetaStrings.id == MetaData.name_id)):
    tags.append((entity.guid, data.name_id, data.value_id, strings.id,
                 strings.string))

In [None]:
tags[:10]

## Link tags to blogs and conduct analysis of metadata tags

In [None]:
# Scripts for sorting tags and linking them to guids

from collections import defaultdict

tag_dict = defaultdict(list)

for data in tags:
    guid, name_id, value_id, string_id, string = data
    if name_id == string_id and string == 'tags':
        tag_dict.setdefault(guid, []).append(value_id)
        
strings = {}
        
for data in tags:
    guid, name_id, value_id, string_id, string = data
    strings[string_id] =  string
    

def replace_string_id(tag_list):
    """Translate metastring ids into their lower-cased strings.

    Ids are looked up in the module-level ``strings`` dict.  An id that is
    missing from that dict makes ``.get`` return ``None`` and the following
    ``.lower()`` call raise ``AttributeError``.
    """
    lowered = []
    for string_id in tag_list:
        lowered.append(strings.get(string_id).lower())
    return lowered

final_tags = defaultdict(list)

for k, v in tag_dict.items():
    final_tags[k] = convert_snakecase(replace_string_id(v))


In [None]:
processed_tags = defaultdict(list)

for k,v in final_tags.items():
    if v == ['']:
        pass
    else:
        processed_tags[k] = v

In [None]:
# NOTE: `communities` is only assigned in a later cell ("Create list of
# communities to iterate through"); that cell has to be run before this one,
# otherwise this line raises NameError.
save_obj(communities, "communities")

In [None]:
# Count the frequency of each tag from our text

from collections import defaultdict

tag_frequency = defaultdict(int)

for item in final_tags:
    for tag in final_tags[item]:
        tag_frequency[tag] += 1

In [None]:
tag_frequency

In [None]:
tag_freq = pd.DataFrame.from_dict(tag_frequency, orient='index')

In [None]:
tag_freq.columns = ['frequency']
tag_freq.head()

In [None]:
tag_freq.sort_values(by='frequency', inplace=True, ascending=False)

In [None]:
tag_freq.head()

In [None]:
%matplotlib inline

tag_freq.head(50).plot()

In [None]:
tag_freq.to_csv(os.path.join('~/data/', 'blog_tags_2017_04_25.csv'))

In [None]:
from collections import OrderedDict

In [None]:
# Create a sorted dictionary based on the frequency

sorted_tag_freq = OrderedDict(sorted(tag_frequency.items(),
                                key=lambda kv: kv[1],
                                reverse=True))

In [None]:
sorted_tag_freq

## Set communities using tags from Information Architecture Review

Martin to enter info here

In [None]:
# Categories of tags for community identification

# Each tag is listed once: a repeated tag would be counted twice when blog
# tokens are matched against this community.
ATIP = ['access to information', 'atip', 'privacy', 'censorship',
       'confidentiality', 'freedom of information', 'information requests',
       'open government', 'data breach', 'right to privacy',
       'security breach', 'personal information']

material_management = ['material management', 'material', 'supply chain management', 'supply chain',
                      'inventories', 'logistics', 'supplies']

procurement_specialists = ['procurement', 'purchasing', 'acquisition', 'government purchasing',
                          'ordering', 'public purchasing', 'buyers', 'consumerism', 'costs',
                          'standing offers', 'supplies']

real_property = ['property', 'real_property', 'real estate', 'realty', 'expropriation', 'property tax',
                'real estate industry', 'real property services']

evaluators = ['evaluators', 'assessment', 'appraisal', 'evaluations' ,'grading',
             'environmental impact assessment', 'eia', 'performance assessment',
             'benchmarks', 'comparison', 'control', 'measurement', 'merit', 'revision', 'testing',
             'pr processess', 'project management', 'program review', 'programs']

communication = ['communication', 'comms', 'communications', 'government communications', 'military communications',
                'telecommunications', 'communications equipment', 'media', 'information and communications',
                'information bulletin', 'press releases', 'outreach', 'engagement']

# Each tag is listed once: a repeated tag would be counted twice when blog
# tokens are matched against this community.
regulators = ['regulator', 'regulators', 'regulate', 'legislation', 'licensing', 'regulations', 'economic regulations',
             'safety regulations', 'by-laws', 'taxation regulation', 'legislative writing', 'legislative',
             'regulation', 'regulatory agencies', 'regulatory agency', 'price regulation']

# Each tag is listed once: a repeated tag would be counted twice when blog
# tokens are matched against this community.
financial_officers = ['financial', 'finance', 'finances', 'financial officers', 'financial management', 'international finance',
                     'public finance', 'quarterly financial reports', 'qfa', 'accounting standards', 'budget', 'budgets',
                     'chief financial officer', 'cfo', 'cfos', 'financial analysis', 'financial crisis',
                     'financial services', 'financial statements', 'portfolio',
                     'supplementary estimates', 'financial administration', 'financial planning',
                     'fiscal planning', 'money management', 'budget planning', 'expenditure management']

# Tags carry no surrounding whitespace so they compare equal to cleaned tags.
information_management = ['information management', 'cataloguing', 'data processing', 'information',
                         'information dissemination', 'information policy', 'information systems', 'knowledge management',
                         'metadata', 'records management']

information_technology = ['information technology', 'technology', 'artificial intelligence', 'computer networks',
                         'electronic data interchange', 'intelligent systems', 'multimedia', 'telecommunications',
                         ]

internal_auditors = ['audit', 'internal audit', 'auditors', 'internal auditors', 'governance', 'review', 'risk management']

security_specialist = ['security', 'security specialists', 'secure', 'computer security',
                      'human security', 'international security', 'national security', 'safety', 'investigations',
                      'safety investigations']

human_resources = ['hr', 'human resources', 'human resource', 'ressources humaines', 'rh', 'personnel',
                  'staff' ,'chief human resources officer', 'labour force', 'personnel management', 'staffing',
                  'workers']

# Each tag is listed once: a repeated tag would be counted twice when blog
# tokens are matched against this community.
policy = ['policy', 'policy specialists', 'agricultural policy', 'cultural policy', 'defence policy', 'economic policy',
         'education policy', 'energy policy', 'fiscal policy', 'foreign policy', 'environmental policy',
         'fisheries policy', 'food policy', 'forestry policy', 'government policy',
         'health policy', 'immigration policy', 'industrial policy', 'investment policy', 'language policy',
         'monetary policy', 'science policy', 'social policy', 'technology policy', 'policy development',
         'policy instruments', 'policy review']

fed_science_tech = ['science', 'technology', 'life science', 'life sciences', 'informatics', 'computer science',
                   'analytics', 'natural sciences', 'geosciences', 'chemistry', 'geography', 'geology', 'hydrology',
                   'hard rocks', 'meteorology', 'scientists' ,'scientist', 'oceanography', 'ecology',
                   'earth sciences', 'medicine', 'pharmacology', 'toxicology', 'biology', 'mathematics', 'math',
                   'statistics', 'physics', 'social science', 'social sciences', 'astronomy']

services = ['secretariat', 'technical services', 'services', 'client', 'service provider', 'client satisfaction',
           'user experience', 'design thinking', 'graphical user interface', 'service levels', 'conversions']


# Convert all community tags to snakecase

'''ATIP = convert_snakecase(ATIP)
material_management = convert_snakecase(material_management) 
procurement_specialists = convert_snakecase(procurement_specialists) 
real_property = convert_snakecase(real_property) 
evaluators = convert_snakecase(evaluators) 
communication = convert_snakecase(communication) 
regulators = convert_snakecase(regulators) 
financial_officers = convert_snakecase(financial_officers) 
information_management = convert_snakecase(information_management) 
information_technology = convert_snakecase(information_technology) 
internal_auditors = convert_snakecase(internal_auditors) 
security_specialist = convert_snakecase(security_specialist) 
human_resources = convert_snakecase(human_resources) 
policy = convert_snakecase(policy) 
fed_science_tech = convert_snakecase(fed_science_tech)'''


# Create list of communities to iterate through

communities = {'ATIP' : {'name': 'ATIP', 'tags': ATIP},
               'material_management' : {'name': 'material_management', 'tags': material_management},
               'procurement_specialists' : {'name': 'procurement_specialists', 'tags': procurement_specialists},
               'real_property' : {'name': 'real_property', 'tags': real_property},
               'evaluators' : {'name': 'evaluators', 'tags': evaluators},
               'communication' : {'name': 'communication', 'tags': communication},
               'regulators' : {'name': 'regulators', 'tags': regulators},
               'financial_officers' : {'name': 'financial_officers', 'tags': financial_officers},
               'information_management' : {'name': 'information_management', 'tags': information_management},
               'information_technology' : {'name': 'information_technology', 'tags': information_technology},
               'internal_auditors' : {'name': 'internal_auditors', 'tags': internal_auditors},
               'security_specialist' : {'name': 'security_specialist', 'tags': security_specialist},
               'human_resources' : {'name': 'human_resources', 'tags': human_resources},
               'policy' : {'name': 'policy', 'tags': policy},
               'fed_science_tech' : {'name': 'fed_science_tech', 'tags': fed_science_tech}}

## Clean data from blogs and prepare for processing

In [None]:
# Beautifulsoup to remove HTML tags
# Langdetect to ... detect languages

from bs4 import BeautifulSoup
from langdetect import detect, detect_langs
from langdetect.lang_detect_exception import LangDetectException

blog_info = []

# Build one [guid, title, content, tags, language] row per blog.
for guid, raw_title, raw_description in blogs:
    title_text = BeautifulSoup(raw_title, "lxml").text
    content_text = BeautifulSoup(raw_description, "lxml").text

    # detect() needs the extracted plain text, not the BeautifulSoup object.
    try:
        language = detect(content_text)
    except LangDetectException:
        # Raised when the text has nothing to analyse (e.g. an empty post);
        # keep the row instead of aborting the whole loop.
        language = "unknown"

    # A separate name is used so the `tags` list built from the database
    # earlier in the notebook is not overwritten.
    blog_tags = processed_tags.get(guid, "None")
    blog_info.append([guid, title_text, content_text, blog_tags, language])

In [None]:
blog_info[7]

In [None]:
# Every blog_info row has five fields (guid, title, content, tags, language),
# so five column names are required; four would raise a ValueError.
blog_df = pd.DataFrame(blog_info, columns=['guid', 'title', 'content', 'tags', 'language'])

In [None]:
blog_df.head()

In [None]:
# Write into data_path, the folder the reload cell reads this file from.
blog_df.to_csv(os.path.join(data_path, "gcconnex_blogs_info.csv"))

### Break.  Still need to fix and clean tags to communities

In [None]:
# Reload dataframe

# DataFrame.from_csv has been removed from pandas; read_csv with index_col=0
# and parse_dates=True reproduces its defaults.
blogs = pd.read_csv(os.path.join(data_path, "gcconnex_blogs_info.csv"),
                    index_col=0, parse_dates=True)

In [None]:
blogs.head()

In [None]:
# Reload community dict

#community_dict = load_obj("gcconnex_blogs_communities_dict")

In [None]:
#community_dict[11303]

### Text Pre-Processing

Using Gensim and NLTK to tokenize, lemmatize and clean text for classification

In [None]:
import logging
import gensim
import bz2

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
from gensim.utils import simple_preprocess, lemmatize
from gensim.parsing.preprocessing import STOPWORDS as STOPWORDS

from nltk.tokenize import RegexpTokenizer
from nltk import word_tokenize

tokenizer = RegexpTokenizer(r'\w+')

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)

In [None]:
# Set up stopwords

from nltk.corpus import stopwords
import nltk

# create French stop word list
fr_stops = set(stopwords.words('french'))

# Add public service specific stopwords - we could expand this, but the algorithms will do
# a lot of that for us

public_service_stops = '''public service canada work http 
https travail gcconnex url'''.split()

In [None]:
'''
Stemming example - need to set up Stemmer for French as well
'''
stemmer = SnowballStemmer("english")
plurals = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in plurals]
print(' '.join(singles))

In [None]:
# Data Pre-processing

def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb, then stem the result.

    Called by ``tokenize`` for every token it keeps.  Uses a fresh
    ``WordNetLemmatizer`` and the module-level ``stemmer``.
    """
    lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(lemma)

def tokenize(text):
    """Split ``text`` into cleaned, lemmatized and stemmed tokens.

    ``text`` is converted with ``str()`` and split by the module-level
    ``tokenizer``.  Tokens of three characters or fewer are dropped, as are
    tokens found in ``STOPWORDS``, ``fr_stops`` or ``public_service_stops``.

    Returns:
        list of processed tokens, in document order.
    """
    kept = []
    for token in tokenizer.tokenize(str(text)):
        # The stop lists are lower-case, so the token is lower-cased before
        # the comparison; otherwise capitalised stop words such as "This" or
        # "With" would pass the filter.
        token = token.lower()
        if len(token) <= 3:
            continue
        if (token in STOPWORDS
                or token in fr_stops
                or token in public_service_stops):
            continue
        kept.append(lemmatize_stemming(token))
    return kept

In [None]:
# Pre-process blog content (takes a while)

blogs['content_tokens'] = blogs.content.apply(tokenize)

In [None]:
# Pre-process blog tags

blogs['tag_tokens'] = blogs['tags'].apply(tokenize)

In [None]:
blogs.head()

In [None]:
from gensim import corpora, models

tag_dictionary = corpora.Dictionary(blogs.tag_tokens) # could include prune_at=2000
content_dictionary = corpora.Dictionary(blogs.content_tokens) # could include prune_at=2000

In [None]:
content_dictionary.save(os.path.join(
    '/home/chris/data/', 'gcconnex_blogs_content_dictionary'))

In [None]:
tag_dictionary.save(os.path.join(
    '/home/chris/data/', 'gcconnex_blogs_tags_dictionary'))

In [None]:
# Have a look at one of our dictionaries (key=#, value=word)

count = 0
for k,v in content_dictionary.items():
    print(k,v)
    count += 1
    if count > 10:
        break


In [None]:
# Remove content that either appears too frequently to matter or too rarely.

content_dictionary.filter_extremes(no_below=15, no_above=0.10)

## Bag-of-Words implementation

In [None]:
# Transform content into bag-of-words

content_bow_corpus = [content_dictionary.doc2bow(blog) for blog in blogs.content_tokens]

In [None]:
content_bow_corpus[17]

In [None]:
'''
Preview BOW for our sample preprocessed document
'''

def preview_bow(doc, dictionary):
    """Print a human-readable line for each entry of a bag-of-words document.

    Args:
        doc: Iterable of ``(token_id, count)`` pairs.
        dictionary: Mapping from token id to the word it stands for; it is
            indexed with ``dictionary[token_id]``.
    """
    for token_id, count in doc:
        print('Word {} ("{}") appears {} time.'.format(
            token_id, dictionary[token_id], count))

In [None]:
# Checking out our bag-of-words

bow_doc_17 = content_bow_corpus[17]
bow_doc_17

In [None]:
# Using preview-BOW to get more human readable output

preview_bow(bow_doc_17, content_dictionary)

In [None]:
# Calculating the overall term frequency and importance using TF-idf

# First we create the model based on the overall corpus

content_tfidf = models.TfidfModel(content_bow_corpus)

In [None]:
# Then we apply the model to each document in the corpus

content_corpus_tfidf = content_tfidf[content_bow_corpus]

In [None]:
'''
Preview TF-IDF scores for our first document --> --> (token_id, tfidf score)
'''

from pprint import pprint
for doc in content_corpus_tfidf:
    pprint(doc)
    break

# Latent variables - LDA

Looking for communities in the overall text based on Latent Dirichlet Allocation (LDA)

In [None]:
# LDA multicore 

lda_model = gensim.models.LdaMulticore(content_bow_corpus, 
                                       num_topics=20, 
                                       id2word = content_dictionary,
                                       workers=7,
                                       passes = 50)

In [None]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} Word: {}".format(idx, topic))
    print("\n")

In [None]:
'''
Define lda model using tfidf corpus
'''
content_lda_model_tfidf = gensim.models.LdaMulticore(content_corpus_tfidf, 
                                             num_topics=20, 
                                             id2word = content_dictionary, 
                                             passes = 50, 
                                             workers=7)

In [None]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in content_lda_model_tfidf.print_topics(-1):
    print("Topic: {} Word: {}".format(idx, topic))
    print("\n")

In [None]:
save_obj(lda_model, "gcconnex_blogs_lda_model")

In [None]:
save_obj(content_lda_model_tfidf, "gcconnex_blogs_tfidf_lda_model")

Seems like the regular LDA is giving better results.

### Start bow for communities

In [None]:
communities

In [None]:
# Transform communities into list (easier parsing) and create a list of labels

community_list = []
community_labels = []

for community in communities.values():
    # The fields are read by key rather than by unpacking data.values(),
    # which only worked while every dict listed 'name' before 'tags'.
    community_list.append(" ".join(community['tags']))
    community_labels.append(community['name'])
    

In [None]:
print(f"{community_labels[2]}: {community_list[2]}")

In [None]:
# Tokenize and lemmatize community tags

community_tokens = []

for tags in community_list:
    community_tokens.append([lemmatize_stemming(token) for token in tokenizer.tokenize(tags)])
    
community_tokens

In [None]:
# Save our stuff

community_tags_dictionary = corpora.Dictionary(community_tokens) # could include prune_at=2000

community_tags_bow_corpus = [community_tags_dictionary.doc2bow(tokens) for tokens in community_tokens]

In [None]:
community_tags_dictionary.save(os.path.join(data_path, "community_tags_dictionary"))

In [None]:
community_tags_bow_corpus[3]

In [None]:
document = community_tags_bow_corpus[4]

preview_bow(document, community_tags_dictionary)

Now we need to figure a mapping from the content_tags to the community_tags.

For each set of content tags:
* see if the tag is in the community tags
* if yes, add community label & counter

In [None]:
blog_tokens = blogs.content_tokens

In [None]:
blog_tokens[1][:8]

In [None]:
# Identify communities via matching user generated tags against pre-defined community tags

import re
from collections import defaultdict

def identify_community(tag_list, community_defs=None):
    """Count, per community, how many community tags contain each given tag.

    Args:
        tag_list: List of tag/token strings.  Anything that is not a ``list``
            yields all-zero counts.
        community_defs: Mapping whose values are dicts with a ``'name'`` and
            a ``'tags'`` entry.  Defaults to the module-level ``communities``.

    Returns:
        ``defaultdict(int)`` keyed by community name.  Every community is
        present, with 0 when nothing matched.  A community tag counts once
        for each entry of ``tag_list`` that occurs inside it as a substring.
    """
    if community_defs is None:
        community_defs = communities

    community_list = defaultdict(int)

    # pre-populate community list with communities
    for c in community_defs.values():
        community_list[c['name']] = 0

    if isinstance(tag_list, list):
        for tag in tag_list:
            for c in community_defs.values():
                # Literal substring test: the tag is data, not a regular
                # expression, so tags like "c++" or "(draft" must not be
                # interpreted (or rejected) as patterns.
                community_list[c['name']] += sum(
                    1 for t in c['tags'] if tag in t)

    return community_list

In [None]:
test_doc = blog_tokens[1]

In [None]:
print(test_doc)
identify_community(test_doc)

In [None]:
blog_communities = []

for blog in blog_tokens:
    blog_communities.append(identify_community(blog))

In [None]:
blog_communities[1]

In [None]:
save_obj(blog_communities, "blog_communities") # List
save_obj(blog_tokens, "blog_tokens") # List

In [None]:
# Transform blog_communities into labels we can predict against

community_labels = []

for blog in blog_communities:
    new_list = []
    for k,v in blog.items():
        new_list.append(v)
        
    community_labels.append(new_list)

In [None]:
community_labels[2]

In [None]:
# Normalize the values so that we can use them as predictors
# Arbitrary number of 4 chosen here, but let's see what happens

def binarize(lst, threshold=4):
    """Turn a list of counts into 0/1 flags.

    Args:
        lst: Iterable of numbers.
        threshold: Smallest value that maps to 1.  The default of 4 is the
            arbitrary cut-off originally chosen in this notebook.

    Returns:
        list with 1 where the item is ``>= threshold`` and 0 elsewhere.
    """
    return [1 if item >= threshold else 0 for item in lst]

In [None]:
# Create array of binarized labels

binarized_labels = np.array([binarize(labels) for labels in community_labels])

In [None]:
binarized_labels[1]

In [None]:
save_obj(binarized_labels, "binarized_labels")

In [None]:
'''# Setting up for labelling blogs

for blog, community_info in zip(new_blogs, communities_list):
    
    if community_info:
        for community in communities:
            c = community_info.get(community, 0)
            blog[community] = c
    else:
        for community in communities:
            blog[community] = 0


# Add rows for each community to the DF

for community in communities:
    c = communities.get(community)
    blogs[c['name']] = 0'''

In [None]:
community_names = []

for community in communities:
                        
        c = communities.get(community)
        community_names.append(c['name'])

In [None]:
community_names

Okay. We now have blogs and their associated communities with absolute strengths.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

count_vec = CountVectorizer()

In [None]:
# Need to put blogs back together for vectorizer to work
# FIX THIS

blogs_joined = [" ".join(tokens) for tokens in blog_tokens]

In [None]:
blogs_joined[1]

In [None]:
X_train_counts = count_vec.fit_transform(blogs_joined)

In [None]:
X_train_counts.shape

In [None]:
count_vec.vocabulary_.get(u'algorithm')

In [None]:
save_obj(count_vec, "count_vectorizer")

### Classifier - trying out one-vs-the-rest

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_train_counts, binarized_labels, test_size=0.33, random_state=42)

In [None]:
y_test[1]

In [None]:
# Set up our multi-label classifier

multilabel_clf = OneVsRestClassifier(SVC(probability=True))

In [None]:
# Fit our classifier to the data

multilabel_clf.fit(X_train, y_train)

In [None]:
# Quick test of our accuracy score against our test data

multilabel_clf.score(X_test, y_test)

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Cross Val score runs multiple tests against segments of our data
# It also takes a long time

scores = cross_val_score(
    multilabel_clf, X_train_counts, binarized_labels, cv=5)

In [None]:
print("Accuracy: {} (+/- {})".format(scores.mean(), scores.std() * 2))

In [None]:
predicted = multilabel_clf.predict(X_test)

In [None]:
from sklearn import metrics

print(metrics.classification_report(y_test, predicted,
                                    target_names=community_names))

In [None]:
#Save classifier 

save_obj(multilabel_clf, "multilabel_clf")

In [None]:
multilabel_clf = load_obj("multilabel_clf")

In [None]:
multilabel_clf

In [None]:
X_train[2]

In [None]:
# Test new blog

test_blog = '''I think innovation in the Public Service these days is fairly immature. 
So I think about a fairly disjointed community (if you can call it that) still trying to come 
to terms with what innovative public sector organizations even look like. I don't actually think 
we're alone though, as work the OECD is doing on innovation skills suggests many other jurisdictions 
are where we are. But I think our understanding of the value is fairly immature. 
I also think it'd be good to separate out innovation as a discipline ("we're doing that 
thing using innovation practices") and innovation as a product ("that thing we 
designed/built/implemented is innovative"). In the public service, I find we are generally 
quick to label our products as innovative ("that dragon's den we held was innovative"), but 
we haven't spent enough time developing innovation as a discipline ("we solved that problem 
by applying X innovative practice/method"). There's where I see the hubs/labs potentially adding 
value. But even there, I think we have a long way to go. Glad to see some work is underway to try 
to make sense of it all though. :)'''

In [None]:
test_tokens = tokenize(test_blog)
test_tokens = " ".join(test_tokens)
test_tokens_count = count_vec.transform([test_tokens])
test_tokens_count

In [None]:
import nltk_rake

In [None]:
rake = nltk_rake.RakeKeywordExtractor()
keywords = rake.extract(test_blog, incl_scores=True)

In [None]:
def isNumeric(word):
    """Report whether ``word`` parses as a number.

    Strings containing a ``.`` are parsed with ``float``, all others with
    ``int``; a ``ValueError`` from the parser means "not numeric".
    """
    parser = float if '.' in word else int
    try:
        parser(word)
    except ValueError:
        return False
    return True

In [None]:
y = "1 2 3 yes no maybe".split()

len([x for x in y if not isNumeric(x)])

In [None]:
blueprint = '''
Blueprint 2020 is a vision for a world-class Public Service equipped to serve Canada and Canadians now and into the future.

With around a quarter-million employees, the Public Service of Canada is the largest employer in the country. We work across more than a hundred departments and agencies, delivering important services to Canadians – from providing old age security and employment insurance benefits to protecting Canadian sovereignty to helping families save for higher education. We are responsible for regulating the safety of food and drugs, undertaking research and development to protect our shared environment, promoting Canada’s national interests around the world, and developing economic, trade and energy policies, among many other duties. We make a difference in the lives of Canadians every day. To ensure continued excellence in public service requires us to always ask how we can improve both our performance and our value to Canadians. The Blueprint 2020 initiative was devised to help us ask these questions, and to allow us to build tomorrow’s Public Service together.

Since June 2013, tens of thousands of public servants have shared their views on what it takes to ensure public service excellence. This input is redefining how we work and is making engagement part of our shared culture.

A team of employees working at the Canada Border Service Agency
By making a space to discuss our passion for public service and to act on our ideas for improvement, we are now well on our way to turning vision into reality. Building the Public Service of the future is a process that requires us all to commit to action, and to dedicate the time and effort necessary to see it through. We are all personally accountable for bringing about real change and realizing our ambitious goals.

For some examples of innovations that have arisen as a result of engaged employees working together to deliver results for Canadians, please check out the Clerk of the Privy Council’s latest Annual Report to the Prime Minister on the Public Service of Canada.

Blueprint 2020 Principles:
The Blueprint 2020 vision is guided by four principles, as outlined in the document Blueprint 2020: Getting Started – Getting Your Views, that help examine how work is done in the federal Public Service:

An open and networked environment that engages citizens and partners for the public good.
A whole-of-government approach that enhances service delivery and value for money.
A modern workplace that makes smart use of new technologies to improve networking, access to data and customer service.
A capable, confident and high-performing workforce that embraces new ways of working and mobilizes the diversity of talent to serve the country’s evolving needs.

'''

In [None]:
bp_tokens = tokenize(blueprint)
bp_tokens = " ".join(bp_tokens)
bp_tokens_count = count_vec.transform([bp_tokens])
bp_tokens_count

In [None]:
save_obj(community_names, "community_names") # list

In [None]:
predicted = multilabel_clf.predict(test_tokens_count)
predicted_prob = multilabel_clf.predict_proba(test_tokens_count)

In [None]:
print(predicted)
print(predicted_prob)

In [None]:
def predict_communities(predict_array):
    """Print ``<community name>: <value>`` for each element of ``predict_array``.

    Elements are visited with ``np.nditer`` and paired by position with the
    module-level ``community_names`` list.
    """
    flat_values = np.nditer(predict_array)
    for position, value in enumerate(flat_values):
        label = community_names[position]
        print("{}: {}".format(label, value))

In [None]:
predict_communities(predicted_prob)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Set up pipeline for new models

pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', multilabel_clf)])


In [None]:
from pprint import pprint

In [None]:
from collections import defaultdict, OrderedDict

In [None]:
frequency = defaultdict(int)

In [None]:
# Keep only tokens that occur more than 5 times and have more than 3 letters

texts = [[token for token in text if frequency[token] > 5 if len(token) > 3]
         for text in texts]

In [None]:
# Count the frequency of each token from our text

for text in texts:
    for token in text:
        frequency[token] += 1

In [None]:
# Create a sorted dictionary based on the frequency

sorted_freq = OrderedDict(sorted(frequency.items(),
                                key=lambda kv: kv[1],
                                reverse=True))

In [None]:
sorted_freq

In [None]:
tfidf_model = models.tfidfmodel.TfidfModel(
    corpus)

In [None]:
with open('blogs.csv', 'w+', encoding='latin-1') as f:
    for blog in blogs:
        f.write(str(blog))


In [None]:
# Prep for NLTK analysis

# Join the blog posts themselves: joining str(blogs) would iterate over the
# characters of the DataFrame's printed form and put a newline between each.
# NOTE(review): assumes `blogs` is the reloaded DataFrame with a `content`
# column, as used by the pre-processing cells. TODO confirm.
full_text = "\n".join(blogs.content.astype(str))


tokens = word_tokenize(full_text)
text = nltk.Text(tokens)
# The sentences are taken from the same text as the tokens; the name `raw`
# used here before is not defined anywhere in the notebook.
sens = nltk.sent_tokenize(full_text)

In [None]:
freq_long(text)

In [None]:
pos_trigrams(text)