In [1]:
import numpy as np
import scipy.io as io
from nonnegfac.nmf import NMF

In [7]:
patent_data = io.loadmat('./wbd_1061v0(behavior)IE')

In [8]:
A = patent_data['V']
patent_no = patent_data['patentnum']
patent_titles = patent_data['titlecell']
words_ref = patent_data['words'][0]

In [9]:
patent_no[0][0][0][0]

5712454

In [10]:
k = 3
W, H, info = NMF().run(A, k)

[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 1061, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 3, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.18800020217895508, 
    "iterations": 50, 
    "norm_A": 199.33181515533371, 
    "rel_error": 0.8626993596897804
}


In [11]:
print A.shape
print W.shape
print H.shape
print info

(262L, 1061L)
(262L, 3L)
(1061L, 3L)
{'info': {'A_dim_1': 262L, 'A_dim_2': 1061L, 'init': 'uniform_random', 'verbose': 0, 'max_time': inf, 'alg': "<class 'nonnegfac.nmf.NMF'>", 'k': 3, 'max_iter': 50, 'A_type': "<type 'numpy.ndarray'>"}, 'final': {'rel_error': 0.8626993596897804, 'norm_A': 199.33181515533371, 'iterations': 50, 'elapsed': 0.18800020217895508}}


In [12]:
print W.shape
print H.shape

(262L, 3L)
(1061L, 3L)


In [13]:
# return top keywords per new topic
top_word_n = 5
topic_words = np.zeros((k, top_word_n))
for i in range(k):
    topic = W[:, i]
    topword_idxs = np.argsort(topic)[::-1][:top_word_n]
    # debug 
    # print np.sort(topic)[::-1][:top_word_n]
    # print [words[idx] for idx in topword_idxs]
    topic_words[i] = topword_idxs

In [14]:
patent_topic = np.argmax(H, axis=1)
print patent_topic[:10]
print H[:10]

[1 0 2 1 2 1 0 0 2 2]
[[0.         2.89584135 0.80518383]
 [3.52641825 0.         2.93928178]
 [0.26705383 0.81203171 1.60127148]
 [0.         2.13814028 1.11819114]
 [0.94375728 1.76806641 2.09288278]
 [0.07908872 3.53672587 0.        ]
 [2.38580629 0.08763528 0.        ]
 [1.00629314 0.85880542 0.        ]
 [0.         2.3007257  3.36774711]
 [0.         0.89568558 0.99134018]]


In [15]:
for i in range(k):
    topic_idx = i
    patent_idx = np.where(patent_topic==i)
    patent_nums = [patent_no[j] for j in patent_idx]
    patent_data = A[:, patent_idx]
    print patent_idx[0].shape
    print patent_data.shape

(285L,)
(262L, 1L, 285L)
(529L,)
(262L, 1L, 529L)
(247L,)
(262L, 1L, 247L)


In [16]:
# takes a patent matrix of N by vocab_n matrix, return all patents if terminal condition is met
# else return left and right components, patents that belong to them, and 
# their respective vocabs words, 



In [17]:
topic_idx = 0
patent_idx = np.where(patent_topic==i)
patent_nums = [patent_no[j] for j in patent_idx]
patent_data = A[:, patent_idx]
print patent_idx[0].shape
print patent_data.shape



(247L,)
(262L, 1L, 247L)


In [18]:
w, h, d = patent_data.shape

In [19]:
patent_data.reshape(w, d).shape

(262L, 247L)

In [20]:
patent_nums[0][:10]

array([[array([[5517952]])],
       [array([[9556782]])],
       [array([[4979473]])],
       [array([[6345958]])],
       [array([[5997245]])],
       [array([[5791558]])],
       [array([[6434925]])],
       [array([[6557514]])],
       [array([[5243932]])],
       [array([[3974653]])]], dtype=object)

In [21]:
patent_no[0]

array([array([[5712454]])], dtype=object)

In [22]:
patent_no[patent_idx].shape

(247L, 1L)

In [122]:
def terminal_fn(data):
    n = 50
    r, c = data.shape
    return c < n; 

In [123]:
def nmf(data, k, term_fn, patent_no_ref, patent_names_ref, word_ref):
    
    print 'running nmf with data of shape', data.shape, ' and ', k, ' clusters'
    result = {}
    
    # if we reached terminal condition, return our leaf patents
    if term_fn(data):
        no_of_patents = int(data.shape[1])
        result['no patents'] = no_of_patents
        result['patent names'] = patent_names_ref        
        result['patents'] = []
        for i in range(no_of_patents):
            result['patents'].append(patent_no_ref[0][i][0][0][0])
        return result
    
    # run nmf on data
    W, H, info = NMF().run(data, k)
 
    # get topic words per topic
    top_word_n = 5
    topic_words = []
    topic_word_idxs = []
    for i in range(k):
        topic = W[:, i]
        topword_idxs = np.argsort(topic)[::-1][:top_word_n]
        # debug 
        # print np.sort(topic)[::-1][:top_word_n]
        # print [words[idx] for idx in topword_idxs]
        topic_word_idxs.append(topword_idxs)
        topic_words.append([word_ref[x] for x in topword_idxs])
        
    result['topic_keyword_idxs'] = topic_word_idxs
    result['topic_keywords'] = topic_words

    # get topic index of each patent and their respective document number and data
    patent_topics = np.argmax(H, axis=1)
#     print 'patent topics', patent_topics[:10]
#     print 'patent topics shape', patent_topics.shape
    
    child_patent_data = []
    child_patent_no_ref = []
    child_patent_names = []

#     for i in range(k):
#         print 'child data is of shape', child
        
    for i in range(k):
        topic_idx = i
        patent_idx = np.where(patent_topics==i)
        child_patent_no_ref.append([patent_no[j] for j in patent_idx])
        child_patent_name = (patent_names_ref[patent_idx])
        child_patent_names.append(child_patent_name)
        
        child_data = data[:, patent_idx]
        w, h, d = child_data.shape
        child_data = child_data.reshape(w, d)
        child_patent_data.append(child_data) 
    
    child_data = []
    for i in range(k):
        print child_patent_data[i].shape
        # If the matrix is not positive definite then we treat it like a cluster
        try:
            nmf_result = nmf(child_patent_data[i], k, term_fn, child_patent_no_ref[i], child_patent_names[i], word_ref)
        except:
            nmf_result = {}
            no_of_patents = int(child_patent_data[i].shape[1])
            nmf_result['no patents'] = no_of_patents
            nmf_result['patent names'] = child_patent_names[i]                    
            nmf_result['patents'] = []
            for m in range(no_of_patents):
                nmf_result['patents'].append(child_patent_no_ref[i][0][m][0][0][0])
        child_data.append(nmf_result)
            
    result['child_data'] = child_data
    
    return result     

In [124]:
import time
begin = time.time()
result = nmf(A, 2, terminal_fn, patent_no, patent_titles, words_ref)
print 'took ', time.time() - begin, 'seconds'

running nmf with data of shape (262L, 1061L)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 1061, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.10700011253356934, 
    "iterations": 50, 
    "norm_A": 199.33181515533371, 
    "rel_error": 0.8829587744206883
}
(262L, 569L)
running nmf with data of shape (262L, 569L)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 569, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.11599993705749512, 
    "iterations": 50, 
    "norm_A": 132.557999182681, 
    "rel_error": 0.8444513341547072
}
(262L, 176L)
running nmf with data of shape (26

[NMF] Completed: 
{
    "elapsed": 0.07999992370605469, 
    "iterations": 50, 
    "norm_A": 46.92486837910923, 
    "rel_error": 0.7659671756743441
}
(262L, 24L)
running nmf with data of shape (262L, 24L)  and  2  clusters
(262L, 78L)
running nmf with data of shape (262L, 78L)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 78, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.07899999618530273, 
    "iterations": 50, 
    "norm_A": 40.28551097501079, 
    "rel_error": 0.7425808357484969
}
(262L, 16L)
running nmf with data of shape (262L, 16L)  and  2  clusters
(262L, 62L)
running nmf with data of shape (262L, 62L)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 62, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "in

[NMF] Completed: 
{
    "elapsed": 0.07899999618530273, 
    "iterations": 50, 
    "norm_A": 80.68987943080522, 
    "rel_error": 0.81258382130439
}
(262L, 129L)
running nmf with data of shape (262L, 129L)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 129, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.07599997520446777, 
    "iterations": 50, 
    "norm_A": 69.81752753780044, 
    "rel_error": 0.7990558630428385
}
(262L, 33L)
running nmf with data of shape (262L, 33L)  and  2  clusters
(262L, 96L)
running nmf with data of shape (262L, 96L)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 96, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time"

In [125]:
(result['child_data'][0]['child_data'][0])

{'child_data': [{'child_data': [{'no patents': 26,
     'patent names': array([[array([u'Two-step finger follower rocker arm'], dtype='<U242')],
            [array([u'Zero turn drive apparatus'], dtype='<U242')],
            [array([u'Hydraulic fluid control device for a hydraulic power-assisted steering     system'],
           dtype='<U242')],
            [array([u'Hydraulic control circuit in a hydraulic excavator'],
           dtype='<U242')],
            [array([u'Pressure compensating hydraulic control system'], dtype='<U242')],
            [array([u'Arrangement for controlling a hydraulically driven motor'],
           dtype='<U242')],
            [array([u'Fluid pressure driven rotary actuator and method of operating the same'],
           dtype='<U132')],
            [array([u'Hydraulic clutch'], dtype='<U242')],
            [array([u'Device for removing fluid from a container'], dtype='<U242')],
            [array([u'Retractable underwater turret'], dtype='<U242')],
         