In [158]:
import numpy as np
import scipy.io as io
from nonnegfac.nmf import NMF

In [159]:
patent_data = io.loadmat('./wbd_1061v0(behavior)IE')

In [160]:
import plotly.plotly as py
import plotly.graph_objs as go

import igraph
from igraph import *
igraph.__version__


'0.7.1'

In [161]:
A = patent_data['V']
patent_no = patent_data['patentnum']
patent_titles = patent_data['titlecell']
words_ref = patent_data['words'][0]
k = 3
W, H, info = NMF().run(A, k)

[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 1061, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 3, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.23614501953125, 
    "iterations": 50, 
    "norm_A": 199.33181515533371, 
    "rel_error": 0.8626993597326436
}


In [162]:
# return top keywords per new topic
top_word_n = 5
topic_words = np.zeros((k, top_word_n))
for i in range(k):
    topic = W[:, i]
    topword_idxs = np.argsort(topic)[::-1][:top_word_n]
    # debug 
    # print np.sort(topic)[::-1][:top_word_n]
    # print [words[idx] for idx in topword_idxs]
    topic_words[i] = topword_idxs

In [163]:
patent_topic = np.argmax(H, axis=1)
print patent_topic[:10]
print H[:10]

[1 0 2 1 2 1 0 0 2 2]
[[0.         2.8958441  0.80528445]
 [3.52615039 0.         2.93953306]
 [0.26704881 0.8119373  1.60142602]
 [0.         2.13807551 1.11823417]
 [0.94373631 1.76795348 2.0930831 ]
 [0.07907877 3.53677809 0.        ]
 [2.38577401 0.08759781 0.        ]
 [1.00628266 0.85878393 0.        ]
 [0.         2.30060775 3.36808014]
 [0.         0.89563327 0.99146846]]


In [164]:
for i in range(k):
    topic_idx = i
    patent_idx = np.where(patent_topic==i)
    patent_nums = [patent_no[j] for j in patent_idx]
    patent_data = A[:, patent_idx]
    print patent_idx[0].shape
    print patent_data.shape

(285,)
(262, 1, 285)
(529,)
(262, 1, 529)
(247,)
(262, 1, 247)


In [165]:
def terminal_fn(data):
    n = 20
    r, c = data.shape
    return c < n; 

In [166]:
def nmf(data, k, term_fn, patent_no_ref, patent_names_ref, word_ref):
    
    print 'running nmf with data of shape', data.shape, ' and ', k, ' clusters'
    result = {}
    
    # if we reached terminal condition, return our leaf patents
    if term_fn(data):
        no_of_patents = int(data.shape[1])
        result['no patents'] = no_of_patents
        result['patent names'] = patent_names_ref        
        result['patents'] = []
        for i in range(no_of_patents):
            result['patents'].append(patent_no_ref[0][i][0][0][0])
        return result
    
    # run nmf on data
    W, H, info = NMF().run(data, k)
 
    # get topic words per topic
    top_word_n = 5
    topic_words = []
    topic_word_idxs = []
    for i in range(k):
        topic = W[:, i]
        topword_idxs = np.argsort(topic)[::-1][:top_word_n]
        # debug 
        # print np.sort(topic)[::-1][:top_word_n]
        # print [words[idx] for idx in topword_idxs]
        topic_word_idxs.append(topword_idxs)
        topic_words.append([word_ref[x] for x in topword_idxs])
        
    result['topic_keyword_idxs'] = topic_word_idxs
    result['topic_keywords'] = topic_words

    # get topic index of each patent and their respective document number and data
    patent_topics = np.argmax(H, axis=1)
#     print 'patent topics', patent_topics[:10]
#     print 'patent topics shape', patent_topics.shape
    
    child_patent_data = []
    child_patent_no_ref = []
    child_patent_names = []

#     for i in range(k):
#         print 'child data is of shape', child
        
    for i in range(k):
        topic_idx = i
        patent_idx = np.where(patent_topics==i)
        child_patent_no_ref.append([patent_no[j] for j in patent_idx])
        child_patent_name = (patent_names_ref[patent_idx])
        child_patent_names.append(child_patent_name)
        
        child_data = data[:, patent_idx]
        w, h, d = child_data.shape
        child_data = child_data.reshape(w, d)
        child_patent_data.append(child_data) 
    
    child_data = []
    for i in range(k):
        print child_patent_data[i].shape
        # If the matrix is not positive definite then we treat it like a cluster
        try:
            nmf_result = nmf(child_patent_data[i], k, term_fn, child_patent_no_ref[i], child_patent_names[i], word_ref)
        except:
            nmf_result = {}
            no_of_patents = int(child_patent_data[i].shape[1])
            nmf_result['no patents'] = no_of_patents
            nmf_result['patent names'] = child_patent_names[i]                    
            nmf_result['patents'] = []
            for m in range(no_of_patents):
                nmf_result['patents'].append(child_patent_no_ref[i][0][m][0][0][0])
        child_data.append(nmf_result)
            
    result['child_data'] = child_data
    
    return result     

In [167]:
import time
begin = time.time()
result = nmf(A, 2, terminal_fn, patent_no, patent_titles, words_ref)
print 'took ', time.time() - begin, 'seconds'

running nmf with data of shape (262, 1061)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 1061, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.11841702461242676, 
    "iterations": 50, 
    "norm_A": 199.33181515533371, 
    "rel_error": 0.8829587744207094
}
(262, 492)
running nmf with data of shape (262, 492)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 492, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.07071590423583984, 
    "iterations": 50, 
    "norm_A": 148.8675565252701, 
    "rel_error": 0.8715761992814103
}
(262, 229)
running nmf with data of shape (262, 229)

[NMF] Completed: 
{
    "elapsed": 0.04173111915588379, 
    "iterations": 50, 
    "norm_A": 30.69279975569446, 
    "rel_error": 0.6966539077701291
}
(262, 2)
running nmf with data of shape (262, 2)  and  2  clusters
(262, 22)
running nmf with data of shape (262, 22)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 22, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.10463380813598633, 
    "iterations": 50, 
    "norm_A": 28.96607630989542, 
    "rel_error": 0.6847074848858882
}
(262, 14)
running nmf with data of shape (262, 14)  and  2  clusters
(262, 8)
running nmf with data of shape (262, 8)  and  2  clusters
(262, 8)
running nmf with data of shape (262, 8)  and  2  clusters
(262, 96)
running nmf with data of shape (262, 96)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 2

[NMF] Completed: 
{
    "elapsed": 0.07909798622131348, 
    "iterations": 50, 
    "norm_A": 39.40330058206659, 
    "rel_error": 0.6887661169157083
}
(262, 12)
running nmf with data of shape (262, 12)  and  2  clusters
(262, 27)
running nmf with data of shape (262, 27)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 27, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.061859130859375, 
    "iterations": 50, 
    "norm_A": 32.02811643601493, 
    "rel_error": 0.6317882006036253
}
(262, 14)
running nmf with data of shape (262, 14)  and  2  clusters
(262, 13)
running nmf with data of shape (262, 13)  and  2  clusters
(262, 4)
running nmf with data of shape (262, 4)  and  2  clusters
(262, 220)
running nmf with data of shape (262, 220)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1

[NMF] Completed: 
{
    "elapsed": 0.04468989372253418, 
    "iterations": 50, 
    "norm_A": 34.706451689055, 
    "rel_error": 0.7588802038122187
}
(262, 18)
running nmf with data of shape (262, 18)  and  2  clusters
(262, 16)
running nmf with data of shape (262, 16)  and  2  clusters
(262, 26)
running nmf with data of shape (262, 26)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 26, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.06045079231262207, 
    "iterations": 50, 
    "norm_A": 33.759117256090995, 
    "rel_error": 0.7108021590522557
}
(262, 23)
running nmf with data of shape (262, 23)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 23, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_ran

[NMF] Completed: 
{
    "elapsed": 0.077056884765625, 
    "iterations": 50, 
    "norm_A": 81.91534572608927, 
    "rel_error": 0.8127409746844214
}
(262, 150)
running nmf with data of shape (262, 150)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 150, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.04410409927368164, 
    "iterations": 50, 
    "norm_A": 62.728122885803174, 
    "rel_error": 0.7851745231702937
}
(262, 102)
running nmf with data of shape (262, 102)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 102, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.03998613

[NMF] Completed: 
{
    "elapsed": 0.06917905807495117, 
    "iterations": 50, 
    "norm_A": 31.452767909710577, 
    "rel_error": 0.7282672682744867
}
(262, 15)
running nmf with data of shape (262, 15)  and  2  clusters
(262, 16)
running nmf with data of shape (262, 16)  and  2  clusters
(262, 94)
running nmf with data of shape (262, 94)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 94, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.047373056411743164, 
    "iterations": 50, 
    "norm_A": 52.68117751767044, 
    "rel_error": 0.8006633682282012
}
(262, 47)
running nmf with data of shape (262, 47)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 47, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_

[NMF] Completed: 
{
    "elapsed": 0.06458306312561035, 
    "iterations": 50, 
    "norm_A": 14.985176963575649, 
    "rel_error": 0.6733777162537535
}
(262, 19)
running nmf with data of shape (262, 19)  and  2  clusters
(262, 1)
running nmf with data of shape (262, 1)  and  2  clusters
(262, 86)
running nmf with data of shape (262, 86)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 86, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.04100990295410156, 
    "iterations": 50, 
    "norm_A": 54.328186458453814, 
    "rel_error": 0.7561498573289708
}
(262, 25)
running nmf with data of shape (262, 25)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 25, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_ra

In [168]:
result

{'child_data': [{'child_data': [{'child_data': [{'child_data': [{'child_data': [{'child_data': [{'no patents': 5,
             'patent names': array([[array([u'Plants and seeds of hybrid corn variety CH811261'], dtype='<U242')],
                    [array([u'Plants and seeds of corn variety I062687'], dtype='<U242')],
                    [array([u'Maize variety hybrid X7K442'], dtype='<U242')],
                    [array([u'Modified cDNA of rat bcl-x gene and modified protein'],
                   dtype='<U242')],
                    [array([u'Malathion carboxylesterase'], dtype='<U242')]],
                   dtype=object),
             'patents': [9556782, 9482122, 8869772, 4979473, 7231894]},
            {'child_data': [{'child_data': [{'no patents': 16,
                 'patent names': array([[array([u'Aniline derivatives and their use for controlling undesirable plant     growth'],
                       dtype='<U132')],
                        [array([u'Biocidal thiadiazolylmercap

In [169]:
import json

In [170]:
a = { 'hello': ["1", 2, 3]}
json.dumps(a)

'{"hello": ["1", 2, 3]}'

In [171]:
graph_json = {}
nodes = []
links = []



In [172]:
def add_node(nodes, name, grp):
    nodes.append({"id": name, "group": grp})
def add_link(links, source, target):
    links.append({"source": source, "target": target, "value": 1})

In [173]:
add_node(nodes, 'root', 1)


In [174]:
def add_graph(data, parent_id, nodes, links):
    if 'child_data' in data.keys():
        for i in range(2):
            keywords = data['topic_keywords'][i]
            child_id = ''
            for w in keywords:
                child_id += w[0] + ' '
            child_id = child_id[:-1]
            print child_id
            add_node(nodes, child_id, 2)
            add_link(links, parent_id, child_id)
            add_graph(data['child_data'][i], child_id, nodes, links)
    else:
        for n in data['patent names']:
            p_name = n[0][0]
            add_node(nodes, p_name, 3)
            add_link(links, parent_id, p_name)
            

In [175]:
add_graph(result, "root", nodes, links)

electric cool mechanic alternative light
cool dry composite thermal acid
dry composite acid organic chemical
composite acid organic alternative aqueous
acid aqueous dry organic fatty
genetic hybrid inbred acid resistant
acid composite aqueous organic dry
acid composite organic aqueous dry
acid composite aqueous dry organic
soluble acid clean sterile medical
pharmaceutical acid composite soluble fatty
composite magnetic mechanic thermal chemical
composite thermal mechanic organic alternative
edible composite alternative cool dry
composite thermal mechanic chemical ceramic
composite thermal mechanic organic chemical
dynamic composite chemical flex elastomeric
composite thermal organic mechanic resistant
composite thermal mechanic seal hard
composite organic chemical safe alternative
composite ceramic ultrasonic wet molten
magnetic isolate composite compact inert
dry clean chemical sectional aqueous
clean chemical seal aqueous alternative
adhesive seal chemical alternative dry
clean chemi

In [176]:
result['child_data'][0]['topic_keywords']

[[array([u'cool'], dtype='<U4'),
  array([u'dry'], dtype='<U3'),
  array([u'composite'], dtype='<U9'),
  array([u'thermal'], dtype='<U7'),
  array([u'acid'], dtype='<U4')],
 [array([u'electric'], dtype='<U8'),
  array([u'light'], dtype='<U5'),
  array([u'mechanic'], dtype='<U8'),
  array([u'alternative'], dtype='<U11'),
  array([u'electronic'], dtype='<U10')]]

In [177]:
graph_json["nodes"] = nodes
graph_json["links"] = links

In [178]:
json.dumps(graph_json)

