## Canada Open Data Inventory using LDA

In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', 800)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
OPEN_DATA_URL = '../data/canada-open-data/inventory.csv'

import re

HANDLE = '@\w+'
LINK = 'https?://t\.co/\w+'
SPECIAL_CHARS = '&lt;|&lt;|&amp;|#'
PARA='\n+'
def clean(text):
    text = re.sub(LINK, ' ', text)
    text = re.sub(SPECIAL_CHARS, ' ', text)
    text = re.sub(PARA, '\n', text)
    return text

catalog=pd.read_csv(OPEN_DATA_URL)
catalog = catalog.dropna(subset=['description_en'])
file='../data/canada-open-data/catalog.txt'
catalog['description_en'].sample(frac=0.25,replace=False,random_state=0).to_csv(file,encoding='utf-8')
f=open(file,'r',encoding='utf-8')
text=f.read()
f.close()
text = clean(text)

In [6]:
import spacy
nlp = spacy.load('en_core_web_sm')
doc=nlp(text)
pos_list=['NOUN']
preproc_text=[]
preproc_sent=[]

for token in doc:
    if token.text!='\n':
        if not(token.is_stop) and not(token.is_punct) and token.pos_ in pos_list:
            preproc_sent.append(token.lemma_)
    else:
        preproc_text.append(preproc_sent)
        preproc_sent=[]

preproc_text.append(preproc_sent) #last sentence

print(preproc_text)

[[], ['crop', 'residue', 'year', 'census'], ['investment', 'transit', 'infrastructure', 'city', 'community', 'funding', 'environment', 'greenhouse', 'gas', 'emission', 'traffic', 'congestion', 'funding', 'province', 'territory', 'capita', 'basis'], ['need', 'background', 'soil', 'datum', 'assessment', 'site', 'region', 'data', 'region', 'background', 'soil', 'concentration', 'metal', 'area', 'concentration', 'soil', 'quality', 'guideline', 'jurisdiction', 'soil', 'database', 'database', 'region', 'background', 'soil', 'screening', 'site', 'datum', 'background', 'range'], ['vegetable', 'storage', 'factory'], ['report', 'account'], ['facility', 'region', 'location', 'truck', 'trip', 'end', 'storage', 'handling', 'facility', 'business', 'dg'], ['park', 'pitcher', 'plant', 'morpology', 'availability', 'nitrogen', 'pitcher', 'plant', 'development'], ['innovation', 'business', 'strategy', 'product', 'good', 'service', 'enterprise', 'market', 'enterprise', 'size', 'industry'], ['dataset', 're

In [15]:
import tomotopy as tp
NUM_TOPICS=20
mdl = tp.LDAModel(k=NUM_TOPICS,seed=1234)

for line in preproc_text:
    mdl.add_doc(line)

for i in range(0, 110, 10):
    mdl.train(i)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

Iteration: 0	Log-likelihood: -11.093217577268552
Iteration: 10	Log-likelihood: -6.8822797912226115
Iteration: 20	Log-likelihood: -6.317129241581733
Iteration: 30	Log-likelihood: -6.157586638884254
Iteration: 40	Log-likelihood: -6.073628903605757
Iteration: 50	Log-likelihood: -6.0291570377492905
Iteration: 60	Log-likelihood: -6.005991344426762
Iteration: 70	Log-likelihood: -5.975599517879777
Iteration: 80	Log-likelihood: -5.959173736422274
Iteration: 90	Log-likelihood: -5.939598846671805
Iteration: 100	Log-likelihood: -5.935156891936913


In [25]:
mdl.train(10)
for k in range(mdl.k):
    print('Top 10 words of topic #{}'.format(k))
    print(mdl.get_topic_words(k, top_n=7))

Top 10 words of topic #0
[('polygon', 0.036050185561180115), ('dataset', 0.033475782722234726), ('information', 0.03004324994981289), ('soil', 0.029185116291046143), ('area', 0.026610717177391052), ('surface', 0.025752583518624306), ('map', 0.024036318063735962)]
Top 10 words of topic #1
[('province', 0.04648945853114128), ('group', 0.03703557699918747), ('age', 0.03467210754752159), ('sex', 0.03309646248817444), ('territory', 0.03073299117386341), ('student', 0.02521822601556778), ('child', 0.023642580956220627)]
Top 10 words of topic #2
[('business', 0.05287355184555054), ('permit', 0.049527548253536224), ('enterprise', 0.0435047410428524), ('work', 0.041497137397527695), ('year', 0.03748193010687828), ('size', 0.03681273013353348), ('innovation', 0.03480513021349907)]
Top 10 words of topic #3
[('datum', 0.03372278809547424), ('forest', 0.020649272948503494), ('specie', 0.016520794481039047), ('ozone', 0.013768475502729416), ('abundance', 0.013768475502729416), ('area', 0.01308039575

In [33]:
bag_of_words=[word for sent in preproc_text for word in sent]
doc_inst = mdl.make_doc(bag_of_words)
mdl.infer(doc_inst)[0]
np.argsort(np.array(mdl.infer(doc_inst)[0]))[::-1]

array([11, 17, 14, 19, 12,  7,  4, 13, 10,  2,  3, 15,  1, 18, 16,  9,  0,
        6,  8,  5], dtype=int64)

In [38]:
print(mdl.get_topic_words(11, top_n=7))

[('table', 0.24849626421928406), ('census', 0.1265643984079361), ('level', 0.06526772677898407), ('series', 0.06306280940771103), ('topic', 0.062401335686445236), ('geography', 0.062401335686445236), ('country', 0.06218084320425987)]


In [39]:
print(mdl.get_topic_words(17, top_n=7))

[('datum', 0.0603327676653862), ('information', 0.057247743010520935), ('year', 0.03462424501776695), ('dataset', 0.03291034325957298), ('project', 0.01782800629734993), ('website', 0.014057422056794167), ('activity', 0.012000739574432373)]


In [41]:
print(mdl.get_topic_words(5, top_n=7))

[('survey', 0.04966237023472786), ('catch', 0.03862873837351799), ('sponge', 0.0364220105111599), ('sea', 0.0342152863740921), ('datum', 0.028698472306132317), ('fishing', 0.02759511023759842), ('matter', 0.026491746306419373)]
