In [1]:
import numpy as np
import scipy.io as io
from nonnegfac.nmf import NMF

In [2]:
patent_data = io.loadmat('./wbd_1061v0(behavior)IE')

In [4]:
import plotly.plotly as py
import plotly.graph_objs as go

import igraph
from igraph import *
igraph.__version__


'0.7.1'

In [5]:
A = patent_data['V']
patent_no = patent_data['patentnum']
patent_titles = patent_data['titlecell']
words_ref = patent_data['words'][0]
k = 3
W, H, info = NMF().run(A, k)

[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 1061, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 3, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.1533968448638916, 
    "iterations": 50, 
    "norm_A": 199.33181515533371, 
    "rel_error": 0.8626993596895006
}


In [6]:
# return top keywords per new topic
top_word_n = 5
topic_words = np.zeros((k, top_word_n))
for i in range(k):
    topic = W[:, i]
    topword_idxs = np.argsort(topic)[::-1][:top_word_n]
    # debug 
    # print np.sort(topic)[::-1][:top_word_n]
    # print [words[idx] for idx in topword_idxs]
    topic_words[i] = topword_idxs

In [7]:
patent_topic = np.argmax(H, axis=1)
print patent_topic[:10]
print H[:10]

[2 0 1 2 1 2 0 0 1 1]
[[0.         0.80519196 2.89584164]
 [3.52639654 2.93930166 0.        ]
 [0.26705337 1.60128459 0.81202339]
 [0.         1.11819581 2.13813408]
 [0.94375564 2.09289918 1.76805684]
 [0.07908793 0.         3.53673042]
 [2.38580367 0.         0.08763233]
 [1.00629235 0.         0.85880365]
 [0.         3.36777473 2.30071519]
 [0.         0.99135079 0.89568103]]


In [8]:
for i in range(k):
    topic_idx = i
    patent_idx = np.where(patent_topic==i)
    patent_nums = [patent_no[j] for j in patent_idx]
    patent_data = A[:, patent_idx]
    print patent_idx[0].shape
    print patent_data.shape

(285,)
(262, 1, 285)
(247,)
(262, 1, 247)
(529,)
(262, 1, 529)


In [13]:
def terminal_fn(data):
    n = 150
    r, c = data.shape
    return c < n; 

In [14]:
def nmf(data, k, term_fn, patent_no_ref, patent_names_ref, word_ref):
    
    print 'running nmf with data of shape', data.shape, ' and ', k, ' clusters'
    result = {}
    
    # if we reached terminal condition, return our leaf patents
    if term_fn(data):
        no_of_patents = int(data.shape[1])
        result['no patents'] = no_of_patents
        result['patent names'] = patent_names_ref        
        result['patents'] = []
        for i in range(no_of_patents):
            result['patents'].append(patent_no_ref[0][i][0][0][0])
        return result
    
    # run nmf on data
    W, H, info = NMF().run(data, k)
 
    # get topic words per topic
    top_word_n = 5
    topic_words = []
    topic_word_idxs = []
    for i in range(k):
        topic = W[:, i]
        topword_idxs = np.argsort(topic)[::-1][:top_word_n]
        # debug 
        # print np.sort(topic)[::-1][:top_word_n]
        # print [words[idx] for idx in topword_idxs]
        topic_word_idxs.append(topword_idxs)
        topic_words.append([word_ref[x] for x in topword_idxs])
        
    result['topic_keyword_idxs'] = topic_word_idxs
    result['topic_keywords'] = topic_words

    # get topic index of each patent and their respective document number and data
    patent_topics = np.argmax(H, axis=1)
#     print 'patent topics', patent_topics[:10]
#     print 'patent topics shape', patent_topics.shape
    
    child_patent_data = []
    child_patent_no_ref = []
    child_patent_names = []

#     for i in range(k):
#         print 'child data is of shape', child
        
    for i in range(k):
        topic_idx = i
        patent_idx = np.where(patent_topics==i)
        child_patent_no_ref.append([patent_no[j] for j in patent_idx])
        child_patent_name = (patent_names_ref[patent_idx])
        child_patent_names.append(child_patent_name)
        
        child_data = data[:, patent_idx]
        w, h, d = child_data.shape
        child_data = child_data.reshape(w, d)
        child_patent_data.append(child_data) 
    
    child_data = []
    for i in range(k):
        print child_patent_data[i].shape
        # If the matrix is not positive definite then we treat it like a cluster
        try:
            nmf_result = nmf(child_patent_data[i], k, term_fn, child_patent_no_ref[i], child_patent_names[i], word_ref)
        except:
            nmf_result = {}
            no_of_patents = int(child_patent_data[i].shape[1])
            nmf_result['no patents'] = no_of_patents
            nmf_result['patent names'] = child_patent_names[i]                    
            nmf_result['patents'] = []
            for m in range(no_of_patents):
                nmf_result['patents'].append(child_patent_no_ref[i][0][m][0][0][0])
        child_data.append(nmf_result)
            
    result['child_data'] = child_data
    
    return result     

In [15]:
import time
begin = time.time()
result = nmf(A, 2, terminal_fn, patent_no, patent_titles, words_ref)
print 'took ', time.time() - begin, 'seconds'

running nmf with data of shape (262, 1061)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 1061, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.11518406867980957, 
    "iterations": 50, 
    "norm_A": 199.33181515533371, 
    "rel_error": 0.882958774420715
}
(262, 492)
running nmf with data of shape (262, 492)  and  2  clusters
[NMF] Running: 
{
    "A_dim_1": 262, 
    "A_dim_2": 492, 
    "A_type": "<type 'numpy.ndarray'>", 
    "alg": "<class 'nonnegfac.nmf.NMF'>", 
    "init": "uniform_random", 
    "k": 2, 
    "max_iter": 50, 
    "max_time": Infinity, 
    "verbose": 0
}
[NMF] Completed: 
{
    "elapsed": 0.07377910614013672, 
    "iterations": 50, 
    "norm_A": 148.8675565252701, 
    "rel_error": 0.8715761992812955
}
(262, 229)
running nmf with data of shape (262, 229) 

In [121]:
labels = {0: ['root']}
def add_label(result, labels, level):
    if level not in labels.keys():
        labels[level] = []
    if 'topic_keywords' in result.keys():
        keywords = ""
        for k in result['topic_keywords'][0]:
            keywords += (k[0]) + ", "
        labels[level].append(keywords)
        for j in range(len(result['child_data'])):
            add_label(result['child_data'][j], labels, level + 1)
    else:
        patent_nos = ""
        for k in result['patents']:
            patent_nos += (k).astype(str) + ', '
        labels[level].append(patent_nos)
    


In [122]:
add_label(result, labels, 1)

In [123]:
v_label = []
for i in range(10):
    if i in labels.keys():
        v_label += labels[i]
        
print len(v_label)

['root', u'electric, cool, mechanic, alternative, light, ', u'cool, dry, composite, thermal, acid, ', u'seal, sectional, peripheral, hydraulic, movable, ', u'dry, composite, acid, organic, chemical, ', u'light, optic, sectional, alternative, reflective, ', u'seal, sectional, secure, alternative, tight, ', u'flex, rigid, alternative, resilient, elastic, ', '8869772, 6345958, 5154593, 8242655, 7683499, 6691657, 5791558, 6948466, 8556241, 6718928, 7721551, 6981655, 5564394, 4795383, 5138997, 8716877, 6364637, 6293765, 4789315, 6547544, 4558994, 3975124, 8894391, 5458457, 6616425, 7392654, 8366392, 6129512, 8777562, 8216392, 4959963, 6318243, 6931981, 8544624, 5996341, 9677579, 4345436, 5579642, 6616247, 4211151, 6845793, 6718763, 8726646, 8997473, 8387378, 5561979, 6322341, 7946266, 8235146, 5347914, 6851533, 9638253, 8339731, 5624337, 5516155, 9234592, 9453558, 7735875, 5315465, 9188257, 7338241, 9562634, 7338211, 5472216, 9157493, 4886379, 6416434, 6155941, 8943841, 7493772, 4989832, 54

In [124]:
nr_vertices = len(v_label)
G = Graph.Tree(nr_vertices, 2) # 2 stands for children number
lay = G.layout('rt')

position = {k: lay[k] for k in range(nr_vertices)}
Y = [lay[k][1] for k in range(nr_vertices)]
M = max(Y)

es = EdgeSeq(G) # sequence of edges
E = [e.tuple for e in G.es] # list of edges

L = len(position)
Xn = [position[k][0] for k in range(L)]
Yn = [2*M-position[k][1] for k in range(L)]
Xe = []
Ye = []
for edge in E:
    Xe+=[position[edge[0]][0],position[edge[1]][0], None]
    Ye+=[2*M-position[edge[0]][1],2*M-position[edge[1]][1], None] 

labels = v_label

In [125]:
lines = go.Scatter(x=Xe,
                   y=Ye,
                   mode='lines',
                   line=dict(color='rgb(210,210,210)', width=1),
                   hoverinfo='none'
                   )
dots = go.Scatter(x=Xn,
                  y=Yn,
                  mode='markers',
                  name='',
                  marker=dict(symbol='dot',
                                size=18, 
                                color='#6175c1',    #'#DB4551', 
                                line=dict(color='rgb(50,50,50)', width=1)
                                ),
                  text=labels,
                  hoverinfo='text',
                  opacity=0.8
                  )

In [126]:
def make_annotations(pos, text, font_size=10, font_color='rgb(250,250,250)'):
    print 'position length', len(pos)
    print 'text length', len(text)
    L=len(pos)
    if len(text)!=L:
        raise ValueError('The lists pos and text must have the same len')
    annotations = go.Annotations()
    for k in range(L):
        annotations.append(
            go.Annotation(
                text=labels[k], # or replace labels with a different list for the text within the circle  
                x=pos[k][0], y=2*M-position[k][1],
                xref='x1', yref='y1',
                font=dict(color=font_color, size=font_size),
                showarrow=False)
        )
    return annotations  


In [127]:
axis = dict(showline=False, # hide axis line, grid, ticklabels and  title
            zeroline=False,
            showgrid=False,
            showticklabels=False,
            )

layout = dict(title= 'Patent Behavior Data',  
              annotations=make_annotations(position, v_label),
              font=dict(size=12),
              showlegend=False,
              xaxis=go.XAxis(axis),
              yaxis=go.YAxis(axis),          
              margin=dict(l=40, r=40, b=85, t=100),
              hovermode='closest',
              plot_bgcolor='rgb(248,248,248)'          
              )


position length 26
text length 26


In [128]:
import plotly.tools as tools
data=go.Data([lines, dots])
tools.set_credentials_file(username='shiminzhang', api_key='ISAuicPfK4ING6FcRyeh')
fig=dict(data=data, layout=layout)
fig['layout'].update(annotations=make_annotations(position, v_label))
py.iplot(fig, filename='Tree-Reingold-Tilf')


position length 26
text length 26
High five! You successfully sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~shiminzhang/0 or inside your plot.ly account where it is named 'Tree-Reingold-Tilf'


In [94]:
https://github.com/vasturiano/3d-force-graph
    

SyntaxError: invalid syntax (<ipython-input-94-5b019649510a>, line 1)