In [41]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn import decomposition
from scipy import linalg
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nimfa

## Data

In [42]:
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories, remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories, remove=remove)

In [43]:
newsgroups_train.filenames.shape, newsgroups_train.target.shape

((2034,), (2034,))

In [44]:
print("\n".join(newsgroups_train.data[:3]))

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych


Seems to be, barring evidence to the contrary, that Koresh was simply
another deranged fanatic who thought it neccessary to take a whole bunch of
folks with him, children and all, to satisfy his delusional mania. Jim
Jones, circa 1993.


Nope - fruitcakes like Koresh have been demonstrating such evil corruption
for centuries.

 >In article <1993Apr19.020359.26996@sq.sq.com>, msb@sq.sq.c

In [45]:
vectorizer = CountVectorizer(stop_words='english')
vectors = vectorizer.fit_transform(newsgroups_train.data).todense() # (documents, vocab)
vectors.shape #, vectors.nnz / vectors.shape[0], row_means.shape

(2034, 26576)

In [147]:
vocab = np.array(vectorizer.get_feature_names())

In [137]:
vocab.shape

(26576,)

## NMF

In [138]:
m,n=vectors.shape
d=5  # num topics

In [139]:
clf = decomposition.NMF(n_components=d, random_state=1)

W1 = clf.fit_transform(vectors)
H1 = clf.components_

In [140]:
num_top_words=8

def show_topics(a):
    top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_top_words-1:-1]]
    topic_words = ([top_words(t) for t in a])
    return [' '.join(t) for t in topic_words]

In [141]:
H1

array([[1.21260827e-01, 0.00000000e+00, 0.00000000e+00, ...,
        3.20158583e-05, 6.40317166e-05, 2.89955054e-04],
       [1.19285748e-01, 1.20852675e-01, 1.65916014e-04, ...,
        1.16499969e-04, 2.32999937e-04, 5.18641224e-02],
       [5.76639107e-02, 4.81606444e-01, 7.99303371e-04, ...,
        2.87622789e-04, 5.75245577e-04, 0.00000000e+00],
       [0.00000000e+00, 1.47943345e-01, 0.00000000e+00, ...,
        6.58515991e-05, 1.31703198e-04, 0.00000000e+00],
       [1.27421920e-01, 1.93564170e-01, 5.04719721e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [142]:
t=H1[0]
np.argsort(t)[:-num_top_words-1:-1]
[vocab[i] for i in np.argsort(t)[:-num_top_words-1:-1]]

['jpeg', 'image', 'gif', 'file', 'color', 'images', 'format', 'quality']

In [52]:
show_topics(H1)

['jpeg image gif file color images format quality',
 'edu graphics pub mail 128 ray ftp send',
 'space launch satellite nasa commercial satellites year market',
 'jesus god people matthew atheists does atheism said',
 'image data available software processing ftp edu analysis']

## NIMFA

In [143]:
lsnmf = nimfa.Lsnmf(vectors, seed='random_vcol', rank=d, max_iter=100)
lsnmf_fit = lsnmf()

In [144]:
# fitted=lsnmf_fit.fit()

In [149]:
H = lsnmf_fit.fit.coef().T
H.shape

(26576, 5)

In [151]:
show_topics(np.array(H.T))

['jpeg image gif file color images format quality',
 'space launch satellite nasa commercial satellites year market',
 'image data available software processing ftp edu analysis',
 'edu graphics pub mail 128 ray ftp send',
 'jesus god people matthew atheists does atheism said']

## From scratch

In [58]:
lam=1e3
lr=1e-3
mu = 1e-6
def grads(M, W, H):
    R = W@H-M
    return R@H.T + penalty(W, mu)*lam, W.T@R + penalty(H, mu)*lam # dW, dH

In [59]:
def penalty(M, mu):
    return np.where(M>=mu,0, np.min(M - mu, 0))

In [60]:
def upd(M, W, H, lr):
    dW,dH = grads(M,W,H)
    W -= lr*dW 
    H -= lr*dH

In [61]:
def report(M,W,H): 
    print(np.linalg.norm(M-W@H), W.min(), H.min(), (W<0).sum(), (H<0).sum())

In [62]:
W = np.abs(np.random.normal(scale=0.01, size=(m,d)))
H = np.abs(np.random.normal(scale=0.01, size=(d,n)))

In [63]:
report(vectors, W, H)

937.394319325329 4.208720036561017e-07 1.864664115033727e-07 0 0


In [64]:
upd(vectors,W,H,lr)

In [65]:
report(vectors, W, H)

937.3019044616235 -9.790472461399145e-06 -4.557188270896063e-06 1 3


In [66]:
for i in range(50): 
    upd(vectors,W,H,lr)
    if i % 10 == 0: report(vectors,W,H)

937.1562610445386 -8.251572238422342e-05 -6.916790464710828e-06 7 4
845.2769786380487 -0.002101633764528055 -0.002994447295256277 194 13590
826.9514826848998 -0.03259065212254781 -0.08442809490244027 464 53242
782.9971186269846 -0.4848788444381057 -0.3847345867777468 1283 44853
734.9604187600945 -0.41941611133385576 -0.8112379591887059 818 43511


In [67]:
show_topics(H)

['graphics edu pub mail 128 3d ray send',
 'jpeg gif file color format image quality images',
 'image data available software images processing ftp display',
 'jesus god people matthew atheists does atheism space',
 'space nasa launch data satellite edu available commercial']

In [68]:
def nmf_fastai(X, k):
    
    m,n=X.shape
    lam=1e3
    lr=1e-3
    mu = 1e-6
    
    def penalty(M, mu):
        return np.where(M>=mu,0, np.min(M - mu, 0))
    
    def grads(M, W, H):
        R = W@H-M
        return R@H.T + penalty(W, mu)*lam, W.T@R + penalty(H, mu)*lam # dW, dH
    
    def upd(M, W, H, lr):
        dW,dH = grads(M,W,H)
        W -= lr*dW 
        H -= lr*dH
        
    def report(M,W,H): 
        print(np.linalg.norm(M-W@H), W.min(), H.min(), (W<0).sum(), (H<0).sum())
        
    W = np.abs(np.random.normal(scale=0.01, size=(m,k)))
    H = np.abs(np.random.normal(scale=0.01, size=(k,n)))
    
    for i in range(50): 
        upd(X,W,H,lr)
        if i % 10 == 0: report(X,W,H)
            
    return W, H

## Example

In [69]:
#http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/

def matrix_factorization(R, P, Q, K, steps=5000, alpha=0.0002, beta=0.02):
    Q = Q.T
    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = np.dot(P,Q)
        e = 0
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * (pow(P[i][k],2) + pow(Q[k][j],2))
        if e < 0.001:
            break
    return P, Q.T

In [70]:
R = [
     [5,3,0,1],
     [4,0,0,1],
     [1,1,0,5],
     [1,0,0,4],
     [0,1,5,4],
    ]

R = np.array(R)

N = len(R)
M = len(R[0])
K = 2

P = np.random.rand(N,K)
Q = np.random.rand(M,K)

nP, nQ = matrix_factorization(R, P, Q, K)
nR = np.dot(nP, nQ.T)

In [71]:
nR

array([[5.03948664, 2.79611505, 6.02967333, 0.99846336],
       [3.92724826, 2.18078197, 4.80751916, 0.99894544],
       [1.1194064 , 0.6593958 , 3.67327915, 4.96701983],
       [0.94005193, 0.55218392, 2.98958168, 3.97772638],
       [2.61554862, 1.47950793, 4.85349324, 4.02342868]])

## Example from here https://github.com/canerturkmen/nmflib/blob/master/nmf.py

In [120]:
def nmf(X, k, maxiter=10000, eps = 1e-7):
    """
    Euclidean distance reducing update rules for NMF, presented in Lee and Seung (2001)
    """

    m, n = X.shape
    V = X
    pdist = 1e9 #very large number

    W = np.random.rand(m, k)
    H = np.random.rand(k, n)
    print(f'W shape {W.shape}')
    print(f'H shape {H.shape}')
    print(f'V shape {V.shape}')

    dist_prev=None

    for i in range(maxiter):
        # multiplicative update steps, Euclidean error reducing
        H = H * np.asarray((( W.T @ V + eps) / (W.T @ W @ H + eps)))
        W = W * np.asarray(((V @ H.T + eps) / (W @ H @ H.T) + eps))
        # every 10 iterations, check convergence
        if i % 100 == 0:
            dist = np.linalg.norm(V-W @ H, 'fro')
            print(dist)
            if dist_prev is not None and np.absolute(dist-dist_prev)<0.0001:
                break
            dist_prev=dist
        
    return W, H

In [121]:
R = [
     [5,3,0,1],
     [4,0,0,1],
     [1,1,0,5],
     [1,0,0,4],
     [0,1,5,4],
    ]

R = np.array(R)

W, H=nmf(R, 2)

W shape (5, 2)
H shape (2, 4)
V shape (5, 4)
7.531827480813318
4.276530106063982
4.27652983055678


In [122]:
np.round(W@H,3)

array([[5.256, 1.993, 0.   , 1.455],
       [3.504, 1.329, 0.   , 0.97 ],
       [1.313, 0.944, 1.95 , 3.946],
       [0.981, 0.722, 1.528, 3.079],
       [0.   , 0.65 , 2.84 , 5.219]])

In [123]:
#compare with sklearn
clf = decomposition.NMF(n_components=2, random_state=1)

W1 = clf.fit_transform(R)
H1 = clf.components_

In [124]:
np.round(W1@H1,3)

array([[5.256, 1.993, 0.   , 1.455],
       [3.504, 1.329, 0.   , 0.97 ],
       [1.313, 0.944, 1.95 , 3.946],
       [0.981, 0.722, 1.528, 3.079],
       [0.   , 0.65 , 2.84 , 5.219]])

In [125]:
#compare with fastai
W2,H2=nmf_fastai(R, 3)

11.746743814598913 0.0005687431216813925 0.0004519828946395782 0 0
11.746588952462163 0.0012181328996124967 0.001966773056243306 0 0
11.746409327330042 0.0018954938136096855 0.0021821744870477873 0 0
11.746199399556309 0.0026058082716954515 0.00242456923499874 0 0
11.745952651604146 0.003354346783311343 0.002699778027101388 0 0


In [126]:
np.round(W2@H2,3)

array([[0.001, 0.001, 0.   , 0.001],
       [0.   , 0.001, 0.   , 0.001],
       [0.   , 0.001, 0.   , 0.001],
       [0.   , 0.001, 0.   , 0.001],
       [0.001, 0.001, 0.   , 0.001]])

In [127]:
vectors.shape

(2034, 26576)

In [131]:
#for topics
_, H_topic=nmf(vectors, 5, maxiter=1000)
show_topics(H_topic)

W shape (2034, 5)
H shape (5, 26576)
V shape (2034, 26576)
874.8311928437424
683.4242772527566
683.4054974846455
683.4015503759783
683.4003360760464
683.3999265257044
683.399721275439
683.3995992009104
683.3995270224874


['edu graphics pub mail 128 ray ftp send',
 'space launch satellite nasa commercial satellites year market',
 'jesus god people matthew atheists does atheism said',
 'jpeg image gif file color images format quality',
 'image data available software processing ftp edu analysis']

In [132]:
clf = decomposition.NMF(n_components=5, random_state=1)

W1_topic = clf.fit_transform(vectors)
H1_topic = clf.components_

In [133]:
show_topics(H1_topic)

['jpeg image gif file color images format quality',
 'edu graphics pub mail 128 ray ftp send',
 'space launch satellite nasa commercial satellites year market',
 'jesus god people matthew atheists does atheism said',
 'image data available software processing ftp edu analysis']