In [1]:
import numpy as np
import scipy.io as io
from nonnegfac.nmf import NMF

In [2]:
# Load the MATLAB workspace holding the patent corpus. Later cells read the keys
# 'V', 'patentnum', 'titlecell' and 'words' from the returned dict.
# The file name carries no extension; loadmat appends '.mat' by default (appendmat=True).
patent_data = io.loadmat('./wbd_1061v0(behavior)IE')

In [3]:
import plotly.plotly as py
import plotly.graph_objs as go

import igraph
from igraph import *
igraph.__version__


'0.7.1'

In [4]:
# Matrix to factorize. The NMF log below reports it as 262 x 1061; rows are used as
# word indices and columns as patents by the cells that follow.
A = patent_data['V']
# Per-patent metadata, indexed with the same positions as the columns of A.
patent_no = patent_data['patentnum']
patent_titles = patent_data['titlecell']
# First row of the 'words' entry; looked up by word index (row of A / W).
words_ref = patent_data['words'][0]
# Number of topics for this first, flat factorization.
k = 3
# W has one column per topic, H one row per patent (see how they are indexed below).
# NOTE(review): the log shows init "uniform_random", so W/H presumably differ between
# runs unless the random state is seeded -- confirm how nonnegfac draws its init.
W, H, info = NMF().run(A, k)

[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 1061, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 3, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.6924428939819336, 
    "iterations": 50, 
    "norm_A": 199.33181515533371, 
    "rel_error": 0.8626993596897558
}


In [5]:
# Top keyword indices per topic: row i of topic_words holds the row indices of the
# top_word_n largest entries of W[:, i], strongest first.
top_word_n = 5
# Integer dtype: np.zeros defaults to float64, which would turn the indices into
# floats that can no longer be used to index the word list.
topic_words = np.zeros((k, top_word_n), dtype=int)
for i in range(k):
    topic = W[:, i]
    # argsort is ascending, so reverse it before taking the first top_word_n.
    topword_idxs = np.argsort(topic)[::-1][:top_word_n]
    topic_words[i] = topword_idxs

In [6]:
# Assign every patent (row of H) to the topic with the largest weight in that row,
# then show the first ten assignments next to the corresponding rows of H.
patent_topic = np.argmax(H, axis=1)
print patent_topic[:10]
print H[:10]

[1 0 2 1 2 1 0 0 2 2]
[[0.         2.89584137 0.80518426]
 [3.52641659 0.         2.93928359]
 [0.26705384 0.8120327  1.60127109]
 [0.         2.13814157 1.11818973]
 [0.94375815 1.76806651 2.09288262]
 [0.07908864 3.53672562 0.        ]
 [2.38580548 0.08763515 0.        ]
 [1.00629312 0.85880524 0.        ]
 [0.         2.30072731 3.36774742]
 [0.         0.89568606 0.99134016]]


In [7]:
# For each topic, report how many patents were assigned to it and the shape of the
# matching slice of A.
for i in range(k):
    # np.where returns a 1-tuple holding the array of matching column positions.
    patent_idx = np.where(patent_topic == i)
    patent_nums = [patent_no[j] for j in patent_idx]
    # Use a dedicated name for the slice: rebinding `patent_data` here would replace
    # the loaded .mat dict and break any re-run of the cells that read from it.
    # Indexing with the tuple itself (not patent_idx[0]) adds a singleton axis,
    # hence the 3-D shapes printed below.
    topic_columns = A[:, patent_idx]
    print(patent_idx[0].shape)
    print(topic_columns.shape)

(285,)
(262, 1, 285)
(529,)
(262, 1, 529)
(247,)
(262, 1, 247)


In [8]:
def terminal_fn(data, min_split_size=20):
    """Decide whether a cluster is small enough to stop splitting.

    Parameters
    ----------
    data : array-like with a ``shape`` attribute
        Matrix whose second dimension counts the items in the cluster.
    min_split_size : int, optional
        Clusters with fewer columns than this are terminal. Defaults to 20,
        the value that used to be hard-coded.

    Returns
    -------
    bool
        True when ``data`` has fewer than ``min_split_size`` columns.
    """
    return data.shape[1] < min_split_size

In [9]:
def _leaf_result(data, patent_no_ref, patent_names_ref):
    """Build the result dict for a cluster that is not split any further."""
    no_of_patents = int(data.shape[1])
    return {
        'no patents': no_of_patents,
        'patent names': patent_names_ref,
        # Each entry of patent_no_ref is unwrapped three levels down to reach the
        # bare patent number (same depth of unwrapping the notebook always used).
        'patents': [patent_no_ref[i][0][0][0] for i in range(no_of_patents)],
    }


def nmf(data, k, term_fn, patent_no_ref, patent_names_ref, word_ref):
    """Recursively cluster the columns of ``data`` with rank-``k`` NMF.

    Every column is assigned to the topic with the largest weight in its row of
    ``H``; each resulting group is clustered again until ``term_fn`` says stop.

    Parameters
    ----------
    data : 2-D array
        Matrix to factorize; columns are the items (patents) being clustered and
        rows are looked up in ``word_ref``.
    k : int
        Number of topics (children) per split.
    term_fn : callable
        ``term_fn(data) -> bool``; True means ``data`` becomes a leaf.
    patent_no_ref : array
        Patent-number entries aligned with the columns of ``data`` (entry ``i``
        belongs to column ``i``). Must support integer-array indexing.
    patent_names_ref : array
        Patent titles aligned with the columns of ``data``.
    word_ref : sequence
        Vocabulary indexed by row position of ``data``.

    Returns
    -------
    dict
        Leaf: ``'no patents'``, ``'patent names'``, ``'patents'``.
        Internal node: ``'topic_keyword_idxs'``, ``'topic_keywords'`` (one list
        of five entries per topic) and ``'child_data'`` (one result per topic).
    """
    print('running nmf with data of shape %s  and  %s  clusters' % (data.shape, k))

    # If we reached the terminal condition, return our leaf patents.
    if term_fn(data):
        return _leaf_result(data, patent_no_ref, patent_names_ref)

    W, H, _ = NMF().run(data, k)

    # Top keywords per topic: the rows of W with the largest weight in column i.
    top_word_n = 5
    topic_word_idxs = []
    topic_words = []
    for i in range(k):
        # argsort is ascending, so reverse before truncating.
        topword_idxs = np.argsort(W[:, i])[::-1][:top_word_n]
        topic_word_idxs.append(topword_idxs)
        topic_words.append([word_ref[x] for x in topword_idxs])

    result = {
        'topic_keyword_idxs': topic_word_idxs,
        'topic_keywords': topic_words,
    }

    # Topic of each patent = position of the largest weight in its row of H.
    patent_topics = np.argmax(H, axis=1)

    child_data = []
    for i in range(k):
        cols = np.where(patent_topics == i)[0]
        child_matrix = data[:, cols]
        # Slice the references passed to *this* call: `cols` is relative to the
        # current sub-matrix, so indexing the notebook-level array with it would
        # pick the wrong patents below the first level.
        child_no_ref = patent_no_ref[cols]
        child_names = patent_names_ref[cols]

        print(child_matrix.shape)
        # If the factorization of the child fails (the original note mentions a
        # matrix that is not positive definite) treat the child as a leaf cluster.
        # Catch Exception rather than everything so Ctrl-C still interrupts.
        try:
            child_result = nmf(child_matrix, k, term_fn, child_no_ref, child_names, word_ref)
        except Exception:
            child_result = _leaf_result(child_matrix, child_no_ref, child_names)
        child_data.append(child_result)

    result['child_data'] = child_data

    return result

In [10]:
import time
# Run the recursive clustering on the full matrix with two topics per split and
# report the wall-clock time it took.
begin = time.time()
result = nmf(A, 2, terminal_fn, patent_no, patent_titles, words_ref)
print 'took ', time.time() - begin, 'seconds'

running nmf with data of shape (262, 1061)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 1061, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.12380099296569824, 
    "iterations": 50, 
    "norm_A": 199.33181515533371, 
    "rel_error": 0.8829587744207933
}
(262, 492)
running nmf with data of shape (262, 492)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 492, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.06251406669616699, 
    "iterations": 50, 
    "norm_A": 148.8675565252701, 
    "rel_error": 0.8715761992813736
}
(262, 229)
running nmf with data of shape (262, 229)

[NMF] Completed: 
{
    "elapsed": 0.07061600685119629, 
    "iterations": 50, 
    "norm_A": 65.07058445730245, 
    "rel_error": 0.8082411908851171
}
(262, 31)
running nmf with data of shape (262, 31)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 31, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.04136300086975098, 
    "iterations": 50, 
    "norm_A": 44.402966440292964, 
    "rel_error": 0.730759193443987
}
(262, 5)
running nmf with data of shape (262, 5)  and  2  clusters
(262, 26)
running nmf with data of shape (262, 26)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 26, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
  

[NMF] Completed: 
{
    "elapsed": 0.05457496643066406, 
    "iterations": 50, 
    "norm_A": 80.68987943080522, 
    "rel_error": 0.8125837890442448
}
(262, 129)
running nmf with data of shape (262, 129)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 129, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.10542798042297363, 
    "iterations": 50, 
    "norm_A": 69.81752753780044, 
    "rel_error": 0.797806670833137
}
(262, 29)
running nmf with data of shape (262, 29)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 29, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.03551721572

[NMF] Completed: 
{
    "elapsed": 0.04145693778991699, 
    "iterations": 50, 
    "norm_A": 40.300719189226925, 
    "rel_error": 0.7895593711960727
}
(262, 42)
running nmf with data of shape (262, 42)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 42, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.06891083717346191, 
    "iterations": 50, 
    "norm_A": 28.106977081194284, 
    "rel_error": 0.7767241000085257
}
(262, 35)
running nmf with data of shape (262, 35)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 35, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.03616309165

[NMF] Completed: 
{
    "elapsed": 0.09789395332336426, 
    "iterations": 50, 
    "norm_A": 81.91534572608927, 
    "rel_error": 0.8127418036335665
}
(262, 93)
running nmf with data of shape (262, 93)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 93, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.04204702377319336, 
    "iterations": 50, 
    "norm_A": 52.57012169143684, 
    "rel_error": 0.8015665962974072
}
(262, 50)
running nmf with data of shape (262, 50)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 50, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.0458638668060

[NMF] Completed: 
{
    "elapsed": 0.08018016815185547, 
    "iterations": 50, 
    "norm_A": 23.964076865053784, 
    "rel_error": 0.7092048123281925
}
(262, 1)
running nmf with data of shape (262, 1)  and  2  clusters
(262, 36)
running nmf with data of shape (262, 36)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 36, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.04772210121154785, 
    "iterations": 50, 
    "norm_A": 23.12494164208928, 
    "rel_error": 0.707706471095012
}
(262, 10)
running nmf with data of shape (262, 10)  and  2  clusters
(262, 26)
running nmf with data of shape (262, 26)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 26, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_rand

In [11]:
# Display the nested clustering result. Internal nodes carry 'topic_keywords',
# 'topic_keyword_idxs' and 'child_data'; leaves carry 'no patents', 'patent names'
# and 'patents'. The full repr is very large.
result

{'child_data': [{'child_data': [{'child_data': [{'child_data': [{'child_data': [{'child_data': [{'no patents': 10,
             'patent names': array([[array([u'Turbine inlet condition controlled organic rankine cycle'],
                   dtype='<U133')],
                    [array([u'Apparatus for thermal treatment of moist raw material'],
                   dtype='<U242')],
                    [array([u'Moisture equalizer for a continuous flow grain dryer'],
                   dtype='<U242')],
                    [array([u'Rotating heat exchanger'], dtype='<U242')],
                    [array([u'Method and plant for the heat treatment of solids containing iron oxide     using a fluidized bed reactor'],
                   dtype='<U242')],
                    [array([u'Material treating apparatus'], dtype='<U242')],
                    [array([u'Tertiary heat exchanger'], dtype='<U242')],
                    [array([u'Shower bath apparatus and spray nozzle'], dtype='<U242')],
        

In [12]:
import json

In [13]:
# Scratch check of json.dumps on a small dict with mixed value types.
a = { 'hello': ["1", 2, 3]}
json.dumps(a)

'{"hello": ["1", 2, 3]}'

In [16]:
# Containers for the node/link graph export. `nodes` and `links` are filled in place
# by the cells below; re-run this cell first to start from an empty graph.
graph_json = {}
nodes = []
links = []



In [17]:
def add_node(nodes, name, grp):
    """Append a node record with keys ``id`` and ``group`` to ``nodes`` in place."""
    node = dict(id=name, group=grp)
    nodes.append(node)
def add_link(links, source, target):
    """Append an edge record from ``source`` to ``target`` (``value`` fixed at 1) to ``links`` in place."""
    edge = dict(source=source, target=target, value=1)
    links.append(edge)

In [18]:
# Seed the graph with the root node (group 1). This appends to `nodes`, so running
# the cell again adds a second 'root' entry.
add_node(nodes, 'root', 1)


In [19]:
def add_graph(data, parent_id, nodes, links):
    """Flatten a nested clustering result into node and link lists, in place.

    Parameters
    ----------
    data : dict
        An internal node (has ``'child_data'`` and ``'topic_keywords'``) or a
        leaf (has ``'patent names'``).
    parent_id : str
        Id of the node the entries of ``data`` are attached to.
    nodes, links : list
        Accumulators extended through ``add_node`` / ``add_link``.

    Topic nodes are added with group 2 and are identified by their keywords
    joined with single spaces; patent nodes are added with group 3 and are
    identified by their title.
    """
    if 'child_data' in data:
        # Walk every topic/child pair instead of assuming exactly two children.
        for keywords, child in zip(data['topic_keywords'], data['child_data']):
            # Each keyword is a one-element container; [0] extracts the word.
            child_id = ' '.join(w[0] for w in keywords)
            print(child_id)
            add_node(nodes, child_id, 2)
            add_link(links, parent_id, child_id)
            add_graph(child, child_id, nodes, links)
    else:
        for n in data['patent names']:
            # Titles are stored two containers deep.
            p_name = n[0][0]
            add_node(nodes, p_name, 3)
            add_link(links, parent_id, p_name)
            

In [21]:
# Walk the clustering result and fill `nodes` / `links` below the 'root' node.
# The lists are extended in place, so re-running this cell duplicates every entry.
add_graph(result, "root", nodes, links)

electric cool mechanic alternative light
cool dry composite thermal acid
cool hot sectional cold thermal
hot cold downward clean sectional
hot cold downward cool sectional
hot dry cold cool rotary
hot downward cold cool combustible
clean seal cool rotary safe
cool thermal sectional alternative seal
thermal cold warm cryogenic solar
cool sectional downward seal alternative
cool clean composite alternative flex
cool sectional downward warm rapid
cool sectional atmospheric stationary alternative
cool thermal gaseous mechanic electric
cool sectional atmospheric downward seal
cool sectional stationary seal evaporative
cool evaporative controllable downward electric
cool sectional stationary seal diffuse
cool atmospheric downward rapid sectional
cool warm downward cold automatic
dry composite acid organic chemical
dry clean chemical sectional aqueous
dry hot sectional cool wet
dry thermal hot sectional chemical
thermal chemical atmospheric rapid seal
dry hot cool sectional alternative
dry ho

In [22]:
# Inspect the keyword lists of the first child: one list per topic, each keyword
# wrapped in a one-element array.
result['child_data'][0]['topic_keywords']

[[array([u'cool'], dtype='<U4'),
  array([u'dry'], dtype='<U3'),
  array([u'composite'], dtype='<U9'),
  array([u'thermal'], dtype='<U7'),
  array([u'acid'], dtype='<U4')],
 [array([u'electric'], dtype='<U8'),
  array([u'light'], dtype='<U5'),
  array([u'mechanic'], dtype='<U8'),
  array([u'alternative'], dtype='<U11'),
  array([u'electronic'], dtype='<U10')]]

In [23]:
# Attach the accumulated lists under the "nodes" / "links" keys of the export dict.
graph_json["nodes"] = nodes
graph_json["links"] = links

In [24]:
# Serialize the node/link graph to a JSON string; the string is only returned as the
# cell value, nothing is written to disk here.
json.dumps(graph_json)



In [49]:
def parse_data(data, parent_name, name):
    """Convert a nested clustering result into a name/parent/children tree.

    Parameters
    ----------
    data : dict
        An internal node (has ``'child_data'`` and ``'topic_keywords'``) or a
        leaf (has ``'patent names'``).
    parent_name : str
        Name of the parent node. An empty string marks the root call; the node
        is then named ``"root"`` regardless of ``name``.
    name : str
        Name for this node (ignored for the root).

    Returns
    -------
    dict
        ``{"name": ..., "parent": ..., "children": [...]}``. Topic children are
        named by their keywords joined with single spaces; patents become
        child entries with an empty ``"children"`` list.
    """
    if parent_name == "":
        name = "root"

    node = {"name": name, "parent": parent_name, "children": []}

    if 'child_data' in data:
        # Walk every topic/child pair instead of assuming exactly two children.
        for keywords, child in zip(data['topic_keywords'], data['child_data']):
            # Each keyword is a one-element container; [0] extracts the word.
            child_name = ' '.join(w[0] for w in keywords)
            node["children"].append(parse_data(child, name, child_name))
    else:
        # Leaf cluster (this also covers a root that was never split).
        for n in data['patent names']:
            # Titles are stored two containers deep.
            p_name = n[0][0]
            node["children"].append({"name": p_name, "parent": name, "children": []})

    return node

In [50]:
# Build the hierarchical tree; the empty parent name marks this as the root call.
json_result = parse_data(result, "", "")

In [51]:
# Spot-check: name of the first topic node under the root.
json_result['children'][0]['name']

u'electric cool mechanic alternative light'

In [52]:
# Serialize the tree to a JSON string; the string is only returned as the cell value,
# nothing is written to disk here.
json.dumps(json_result)

