In [137]:
import numpy as np
import scipy.io as io
from nonnegfac.nmf import NMF

In [138]:
# Load the MATLAB bundle; later cells read its 'V', 'patentnum', 'titlecell'
# and 'words' entries.
patent_data = io.loadmat('./wbd_1061v0(behavior)IE')

In [139]:
import plotly.plotly as py
import plotly.graph_objs as go

import igraph
from igraph import *
# Display the igraph version available in this kernel.
igraph.__version__


'0.7.1'

In [140]:
# Number of topics for the exploratory (flat) factorisation below.
k = 3

# Pull the matrix to factorise plus the per-patent metadata out of the loaded
# bundle in one go; 'words' is stored with a leading axis, so take row 0.
A, patent_no, patent_titles = (
    patent_data[key] for key in ('V', 'patentnum', 'titlecell')
)
words_ref = patent_data['words'][0]

# Factorise A into k topics; W and H are the two factor matrices and `info`
# is the run summary returned by the solver.
W, H, info = NMF().run(A, k)

[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 1061, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 3, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.2043769359588623, 
    "iterations": 50, 
    "norm_A": 199.33181515533371, 
    "rel_error": 0.8626993596917103
}


In [141]:
# Collect, for every topic, the row indices of W with the largest weights.
top_word_n = 5
# These are indices, so the table must be integer-typed: the default float64
# from np.zeros would store them as floats, which cannot be used for indexing.
topic_words = np.zeros((k, top_word_n), dtype=int)
for i in range(k):
    topic = W[:, i]
    # argsort is ascending; reverse it and keep the first top_word_n entries.
    topword_idxs = np.argsort(topic)[::-1][:top_word_n]
    topic_words[i] = topword_idxs

In [142]:
# Assign every row of H to the column (topic) holding its largest weight,
# then show the first ten assignments next to the weights they came from.
patent_topic = H.argmax(axis=1)
print(patent_topic[:10])
print(H[:10])

[0 2 1 0 1 0 2 2 1 1]
[[2.89584207 0.80521251 0.        ]
 [0.         2.93935497 3.5263411 ]
 [0.81200767 1.60131322 0.2670525 ]
 [2.13812523 1.11819975 0.        ]
 [1.7680355  2.09293812 0.94375276]
 [3.53673966 0.         0.07908588]
 [0.0876245  0.         2.38579615]
 [0.85879918 0.         1.00629015]
 [2.30069641 3.36783948 0.        ]
 [0.89567208 0.99137559 0.        ]]


In [143]:
# Exploratory check: how many patents fall into each topic and what the
# corresponding slice of A looks like.
for i in range(k):
    # np.where on a 1-D array returns a 1-tuple holding the index array.
    patent_idx = np.where(patent_topic == i)
    patent_nums = [patent_no[j] for j in patent_idx]
    # Use a dedicated name here: the original rebound `patent_data`, which
    # clobbered the loaded .mat dict and broke re-running the earlier cells.
    topic_data = A[:, patent_idx]
    print(patent_idx[0].shape)
    # Indexing with the 1-tuple adds a singleton middle axis to the result.
    print(topic_data.shape)

(529,)
(262, 1, 529)
(247,)
(262, 1, 247)
(285,)
(262, 1, 285)


In [144]:
def terminal_fn(data, n=50):
    """Decide whether a cluster is small enough to stop splitting it.

    Parameters
    ----------
    data : object with a two-entry ``shape`` (rows, columns)
        Only the column count is inspected.
    n : int, optional
        Column-count threshold. Defaults to 50, the value that used to be
        hard-coded, so existing one-argument calls behave as before.

    Returns
    -------
    bool
        True when ``data`` has strictly fewer than ``n`` columns.

    Raises
    ------
    ValueError
        If ``data.shape`` does not unpack into exactly two values.
    """
    _, n_cols = data.shape
    return n_cols < n

In [145]:
def _leaf_result(data, patent_no_ref, patent_names_ref):
    """Build the result dict for a cluster that is not split any further."""
    no_of_patents = int(data.shape[1])
    return {
        'no patents': no_of_patents,
        'patent names': patent_names_ref,
        # Each entry is unwrapped with [0][0][0], exactly as the original did
        # after removing its list wrapper -- presumably this undoes the
        # nesting scipy.io.loadmat gives cell arrays; TODO confirm.
        'patents': [patent_no_ref[i][0][0][0] for i in range(no_of_patents)],
    }


def nmf(data, k, term_fn, patent_no_ref, patent_names_ref, word_ref,
        top_word_n=5):
    """Recursively cluster the columns of ``data`` with NMF into a topic tree.

    ``patent_no_ref`` and ``patent_names_ref`` must be aligned with the
    columns of ``data``: entry ``i`` describes column ``i``. The recursion
    keeps that alignment by slicing both with the same column indices it uses
    to slice ``data``.

    Parameters
    ----------
    data : numpy.ndarray, 2-D
        Matrix whose columns are the patents to cluster.
    k : int
        Number of topics to split into at every level.
    term_fn : callable
        ``term_fn(data)`` returns True when ``data`` should become a leaf.
    patent_no_ref : numpy.ndarray
        Patent numbers, indexed along the first axis by column position.
    patent_names_ref : numpy.ndarray
        Patent titles, indexed along the first axis by column position.
    word_ref : sequence
        Maps a row index of ``data`` to its word.
    top_word_n : int, optional
        How many top-weighted words to keep per topic (default 5, the value
        that used to be hard-coded).

    Returns
    -------
    dict
        A leaf has the keys ``'no patents'``, ``'patent names'`` and
        ``'patents'``. An inner node has ``'topic_keyword_idxs'``,
        ``'topic_keywords'`` and ``'child_data'`` (one sub-result per topic).
    """
    print('running nmf with data of shape %s  and  %s  clusters'
          % (data.shape, k))

    # if we reached terminal condition, return our leaf patents
    if term_fn(data):
        return _leaf_result(data, patent_no_ref, patent_names_ref)

    # run nmf on data
    W, H, _info = NMF().run(data, k)

    result = {}

    # get topic words per topic: rows of W with the largest weight per column
    topic_word_idxs = []
    topic_words = []
    for topic in range(k):
        idxs = np.argsort(W[:, topic])[::-1][:top_word_n]
        topic_word_idxs.append(idxs)
        topic_words.append([word_ref[x] for x in idxs])
    result['topic_keyword_idxs'] = topic_word_idxs
    result['topic_keywords'] = topic_words

    # topic index of each patent: the column of H with the largest weight
    patent_topics = np.argmax(H, axis=1)

    # NOTE(review): if every column lands in one topic the child equals the
    # parent and the recursion only stops once an exception is raised and
    # caught below -- consider an explicit guard.
    children = []
    for topic in range(k):
        cols = np.where(patent_topics == topic)[0]
        child = data[:, cols]
        # Slice the references passed in, NOT the notebook-level globals:
        # `cols` are positions within *this* call's data, so indexing the
        # global `patent_no` returned the wrong patents below the first level.
        child_nos = patent_no_ref[cols]
        child_names = patent_names_ref[cols]

        print(child.shape)
        try:
            sub_result = nmf(child, k, term_fn, child_nos, child_names,
                             word_ref, top_word_n)
        except Exception:
            # Best effort: when the factorisation of a sub-matrix fails (the
            # original note mentions a non-positive-definite matrix), keep the
            # sub-matrix as a leaf cluster. Catching Exception rather than
            # everything lets KeyboardInterrupt / SystemExit through.
            sub_result = _leaf_result(child, child_nos, child_names)
        children.append(sub_result)

    result['child_data'] = children

    return result

In [146]:
import time

# Build the full topic tree with a binary split at every level and report the
# wall-clock time the recursion took.
begin = time.time()
result = nmf(A, 2, terminal_fn, patent_no, patent_titles, words_ref)
print('took  %s seconds' % (time.time() - begin))

running nmf with data of shape (262, 1061)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 1061, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.16401410102844238, 
    "iterations": 50, 
    "norm_A": 199.33181515533371, 
    "rel_error": 0.882958774420761
}
(262, 492)
running nmf with data of shape (262, 492)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 492, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.08374309539794922, 
    "iterations": 50, 
    "norm_A": 148.8675565252701, 
    "rel_error": 0.8715761992812938
}
(262, 229)
running nmf with data of shape (262, 229) 

[NMF] Completed: 
{
    "elapsed": 0.0958249568939209, 
    "iterations": 50, 
    "norm_A": 41.454855036058156, 
    "rel_error": 0.7216484708116688
}
(262, 15)
running nmf with data of shape (262, 15)  and  2  clusters
(262, 43)
running nmf with data of shape (262, 43)  and  2  clusters
(262, 5)
running nmf with data of shape (262, 5)  and  2  clusters
(262, 24)
running nmf with data of shape (262, 24)  and  2  clusters
(262, 43)
running nmf with data of shape (262, 43)  and  2  clusters
(262, 569)
running nmf with data of shape (262, 569)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 569, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.08338594436645508, 
    "iterations": 50, 
    "norm_A": 132.557999182681, 
    "rel_error": 0.8444510435830073
}
(262, 393)
running nmf with 

[NMF] Completed: 
{
    "elapsed": 0.052330970764160156, 
    "iterations": 50, 
    "norm_A": 40.300719189226925, 
    "rel_error": 0.7895593712453356
}
(262, 42)
running nmf with data of shape (262, 42)  and  2  clusters
(262, 21)
running nmf with data of shape (262, 21)  and  2  clusters
(262, 27)
running nmf with data of shape (262, 27)  and  2  clusters
(262, 86)
running nmf with data of shape (262, 86)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 86, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.04103207588195801, 
    "iterations": 50, 
    "norm_A": 54.328186458453814, 
    "rel_error": 0.7561498378441242
}
(262, 61)
running nmf with data of shape (262, 61)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 61, 
    "A_type": "<type 'numpy.ndarray

In [147]:
# Inspect the nested dict returned by nmf (the full repr is very large).
result

{'child_data': [{'child_data': [{'child_data': [{'child_data': [{'no patents': 27,
         'patent names': array([[array([u'Wind effect mitigation in cryogenic ambient air vaporizers'],
               dtype='<U242')],
                [array([u'Jet pump in natural circulation fossil fuel fired steam generator'],
               dtype='<U242')],
                [array([u'Heat recovery boiler for high pressure gas'], dtype='<U242')],
                [array([u'Turbine inlet condition controlled organic rankine cycle'],
               dtype='<U133')],
                [array([u'Direct-vented gas fireplace'], dtype='<U242')],
                [array([u'Immersible water heater'], dtype='<U242')],
                [array([u'Combustible particulate fuel heater'], dtype='<U242')],
                [array([u'Water heaters'], dtype='<U242')],
                [array([u'Apparatus for thermal treatment of moist raw material'],
               dtype='<U242')],
                [array([u'Moisture equalizer f

In [None]:
import json

In [149]:
# Scratch check: json can serialise a dict whose value is a mixed-type list.
a = dict(hello=["1", 2, 3])
json.dumps(a)

'{"hello": ["1", 2, 3]}'

In [150]:
# Fresh containers for the graph export: node and link records are collected
# in the two lists, and graph_json will hold both once they are filled.
nodes, links = [], []
graph_json = {}



In [151]:
def add_node(nodes, name, grp):
    """Append a node record with id ``name`` and group ``grp`` to ``nodes``."""
    node = dict(id=name, group=grp)
    nodes.append(node)
def add_link(links, source, target):
    """Append a ``source`` -> ``target`` link record of value 1 to ``links``."""
    link = dict(source=source, target=target, value=1)
    links.append(link)

In [152]:
# Seed the node list with the root (group 1). Note this appends, so running
# the cell again without resetting `nodes` adds a second 'root' entry.
add_node(nodes, 'root', 1)


In [153]:
def add_graph(data, parent_id, nodes, links):
    """Flatten a topic tree produced by ``nmf`` into node and link records.

    Inner nodes (dicts with a ``'child_data'`` key) add one group-2 node per
    topic, whose id is the topic's keywords joined by spaces, and then recurse
    into that topic's sub-tree. Leaves add one group-3 node per patent name.
    Every node added is linked to ``parent_id``.

    Parameters
    ----------
    data : dict
        A (sub-)tree as returned by ``nmf``.
    parent_id : str
        Id of the node the entries of ``data`` are attached to.
    nodes, links : list
        Output lists, extended in place through ``add_node`` / ``add_link``.

    Returns
    -------
    None
    """
    if 'child_data' in data:
        # Walk the children that are actually present rather than assuming
        # exactly two: the old range(2) dropped topics for k > 2 and raised
        # IndexError for k < 2.
        for keywords, child in zip(data['topic_keywords'], data['child_data']):
            # each keyword is a one-element container; its first item is the word
            child_id = ' '.join(w[0] for w in keywords)
            print(child_id)
            add_node(nodes, child_id, 2)
            add_link(links, parent_id, child_id)
            add_graph(child, child_id, nodes, links)
    else:
        for n in data['patent names']:
            # titles are nested two levels deep; unwrap down to the string
            p_name = n[0][0]
            add_node(nodes, p_name, 3)
            add_link(links, parent_id, p_name)
            

In [154]:
# Flatten the whole topic tree into `nodes` / `links`, attaching the
# top-level topics to the "root" node.
add_graph(result, "root", nodes, links)

electric cool mechanic alternative light
cool dry composite thermal acid
cool hot sectional cold thermal
hot cold downward clean sectional
cool thermal sectional alternative seal
cool sectional downward seal alternative
cool clean composite alternative flex
cool sectional downward warm rapid
thermal cold warm cryogenic solar
dry composite acid organic chemical
composite acid organic alternative aqueous
composite magnetic mechanic thermal chemical
acid aqueous dry organic fatty
dry clean chemical sectional aqueous
dry hot sectional cool wet
clean chemical seal aqueous alternative
electric light mechanic alternative electronic
electric mechanic alternative electronic hydraulic
hydraulic variable automatic mechanic alternative
hydraulic electric automatic mechanic reverse
variable alternative safe electric static
electric mechanic alternative conductive sectional
clean magnetic mechanic rotary alternative
electric conductive alternative sectional mechanic
electric conductive mechanic sect

In [155]:
# Inspect the keyword lists of the first top-level child; each keyword is
# wrapped in a one-element array.
result['child_data'][0]['topic_keywords']

[[array([u'cool'], dtype='<U4'),
  array([u'dry'], dtype='<U3'),
  array([u'composite'], dtype='<U9'),
  array([u'thermal'], dtype='<U7'),
  array([u'acid'], dtype='<U4')],
 [array([u'electric'], dtype='<U8'),
  array([u'light'], dtype='<U5'),
  array([u'mechanic'], dtype='<U8'),
  array([u'alternative'], dtype='<U11'),
  array([u'electronic'], dtype='<U10')]]

In [156]:
# Attach the collected node and link records under the two keys of the export.
graph_json.update({"nodes": nodes, "links": links})

In [157]:
# Serialise the node/link graph to a JSON string.
json.dumps(graph_json)

